diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 06b45b2..8f2bbf7 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -4216,6 +4216,30 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
     pagecopy((void *)src, (void *)dst);
 }
 
+void
+pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
+    vm_offset_t b_offset, int xfersize)
+{
+    void *a_cp, *b_cp;
+    vm_offset_t a_pg_offset, b_pg_offset;
+    int cnt;
+
+    while (xfersize > 0) {
+        a_pg_offset = a_offset & PAGE_MASK;
+        cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
+        a_cp = (char *)PHYS_TO_DMAP(ma[a_offset >> PAGE_SHIFT]->
+            phys_addr) + a_pg_offset;
+        b_pg_offset = b_offset & PAGE_MASK;
+        cnt = min(cnt, PAGE_SIZE - b_pg_offset);
+        b_cp = (char *)PHYS_TO_DMAP(mb[b_offset >> PAGE_SHIFT]->
+            phys_addr) + b_pg_offset;
+        bcopy(a_cp, b_cp, cnt);
+        a_offset += cnt;
+        b_offset += cnt;
+        xfersize -= cnt;
+    }
+}
+
 /*
  * Returns true if the pmap's pv is one of the first
  * 16 pvs linked to from this page.  This count may
diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c
index 4252197..6668e48 100644
--- a/sys/cam/ata/ata_da.c
+++ b/sys/cam/ata/ata_da.c
@@ -1431,13 +1431,14 @@ adastart(struct cam_periph *periph, union ccb *start_ccb)
             return;
         }
 #endif
-        cam_fill_ataio(ataio,
+        cam_fill_ataio_U(ataio,
             ada_retry_count,
             adadone,
             bp->bio_cmd == BIO_READ ?
                 CAM_DIR_IN : CAM_DIR_OUT,
             tag_code,
             bp->bio_data,
+            bp->bio_ma,
             bp->bio_bcount,
             ada_default_timeout*1000);
diff --git a/sys/cam/cam_ccb.h b/sys/cam/cam_ccb.h
index 1f12d91..fbc06e1 100644
--- a/sys/cam/cam_ccb.h
+++ b/sys/cam/cam_ccb.h
@@ -42,6 +42,7 @@
 #include
 #include
 
+struct vm_page;
 
 /* General allocation length definitions for CCB structures */
 #define IOCDBLEN    CAM_MAX_CDBLEN  /* Space for CDB bytes/pointer */
@@ -713,6 +714,7 @@ struct ccb_ataio {
     struct ata_cmd  cmd;        /* ATA command register set */
     struct ata_res  res;        /* ATA result register set */
     u_int8_t    *data_ptr;      /* Ptr to the data buf/SG list */
+    struct vm_page  **ma;
     u_int32_t   dxfer_len;      /* Data transfer length */
     u_int32_t   resid;          /* Transfer residual length: 2's comp */
     u_int8_t    tag_action;     /* What to do for tag queueing */
@@ -1201,6 +1203,13 @@ cam_fill_ataio(struct ccb_ataio *ataio, u_int32_t retries,
           u_int32_t timeout);
 
 static __inline void
+cam_fill_ataio_U(struct ccb_ataio *ataio, u_int32_t retries,
+          void (*cbfcnp)(struct cam_periph *, union ccb *),
+          u_int32_t flags, u_int tag_action,
+          u_int8_t *data_ptr, struct vm_page **ma, u_int32_t dxfer_len,
+          u_int32_t timeout);
+
+static __inline void
 cam_fill_smpio(struct ccb_smpio *smpio, uint32_t retries,
           void (*cbfcnp)(struct cam_periph *, union ccb *),
           uint32_t flags, uint8_t *smp_request, int smp_request_len,
@@ -1248,6 +1257,24 @@ cam_fill_ctio(struct ccb_scsiio *csio, u_int32_t retries,
 }
 
 static __inline void
+cam_fill_ataio_U(struct ccb_ataio *ataio, u_int32_t retries,
+          void (*cbfcnp)(struct cam_periph *, union ccb *),
+          u_int32_t flags, u_int tag_action,
+          u_int8_t *data_ptr, struct vm_page **ma, u_int32_t dxfer_len,
+          u_int32_t timeout)
+{
+    ataio->ccb_h.func_code = XPT_ATA_IO;
+    ataio->ccb_h.flags = flags;
+    ataio->ccb_h.retry_count = retries;
+    ataio->ccb_h.cbfcnp = cbfcnp;
+    ataio->ccb_h.timeout = timeout;
+    ataio->data_ptr = data_ptr;
+    ataio->ma = ma;
+    ataio->dxfer_len = dxfer_len;
+    ataio->tag_action = tag_action;
+}
+
+static __inline void
 cam_fill_ataio(struct ccb_ataio *ataio, u_int32_t retries,
           void (*cbfcnp)(struct cam_periph *, union ccb *),
           u_int32_t flags, u_int tag_action,
           u_int8_t *data_ptr, u_int32_t dxfer_len,
           u_int32_t timeout)
@@ -1260,6 +1287,7 @@ cam_fill_ataio(struct ccb_ataio *ataio, u_int32_t retries,
     ataio->ccb_h.cbfcnp = cbfcnp;
     ataio->ccb_h.timeout = timeout;
     ataio->data_ptr = data_ptr;
+    ataio->ma = NULL;
     ataio->dxfer_len = dxfer_len;
     ataio->tag_action = tag_action;
 }
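The CAM changes above add an optional page-array description of the data buffer: struct ccb_ataio gains an ma field, cam_fill_ataio_U() fills it, and the existing cam_fill_ataio() now clears it so that mapped callers keep working unchanged. A minimal sketch of how a periph driver could dispatch between the two fill routines, assuming bio_ma is valid whenever BIO_NOTMAPPED is set; the helper name is hypothetical and not part of the patch:

    /* Sketch only; fill_ata_ccb() is a hypothetical helper. */
    static void
    fill_ata_ccb(struct ccb_ataio *ataio, struct bio *bp)
    {
        uint32_t dir;

        dir = bp->bio_cmd == BIO_READ ? CAM_DIR_IN : CAM_DIR_OUT;
        if ((bp->bio_flags & BIO_NOTMAPPED) != 0)
            cam_fill_ataio_U(ataio, ada_retry_count, adadone, dir,
                0, NULL, bp->bio_ma, bp->bio_bcount,
                ada_default_timeout * 1000);
        else
            cam_fill_ataio(ataio, ada_retry_count, adadone, dir,
                0, bp->bio_data, bp->bio_bcount,
                ada_default_timeout * 1000);
    }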
diff --git a/sys/dev/ahci/ahci.c b/sys/dev/ahci/ahci.c
index 4a3c355..0fd3c84 100644
--- a/sys/dev/ahci/ahci.c
+++ b/sys/dev/ahci/ahci.c
@@ -51,6 +51,9 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 
+#include
+#include
+
 /* local prototypes */
 static int ahci_setup_interrupt(device_t dev);
 static void ahci_intr(void *data);
@@ -95,6 +98,10 @@ static void ahci_process_request_sense(device_t dev, union ccb *ccb);
 static void ahciaction(struct cam_sim *sim, union ccb *ccb);
 static void ahcipoll(struct cam_sim *sim);
 
+struct vm_page;
+static void ahci_unmappedprd(struct ahci_slot *slot, struct vm_page **ma,
+    bus_size_t size);
+
 static MALLOC_DEFINE(M_AHCI, "AHCI driver", "AHCI driver data buffers");
 
 static struct {
@@ -1670,12 +1677,18 @@ ahci_begin_transaction(device_t dev, union ccb *ccb)
     /* If request moves data, setup and load SG list */
     if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
         void *buf;
+        struct vm_page **ma;
         bus_size_t size;
 
         slot->state = AHCI_SLOT_LOADING;
         if (ccb->ccb_h.func_code == XPT_ATA_IO) {
-            buf = ccb->ataio.data_ptr;
+            ma = ccb->ataio.ma;
             size = ccb->ataio.dxfer_len;
+            if (ma != NULL) {
+                ahci_unmappedprd(slot, ma, size);
+                return;
+            }
+            buf = ccb->ataio.data_ptr;
         } else {
             buf = ccb->csio.data_ptr;
             size = ccb->csio.dxfer_len;
@@ -1686,6 +1699,34 @@ ahci_begin_transaction(device_t dev, union ccb *ccb)
     ahci_execute_transaction(slot);
 }
 
+static void
+ahci_unmappedprd(struct ahci_slot *slot, struct vm_page **ma, bus_size_t size)
+{
+    struct ahci_channel *ch = device_get_softc(slot->dev);
+    struct ahci_cmd_tab *ctp;
+    struct ahci_dma_prd *prd;
+    bus_size_t left, c;
+    int i, npages;
+
+    npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+    KASSERT(npages <= AHCI_SG_ENTRIES, ("too many DMA segment entries"));
+    ctp = (struct ahci_cmd_tab *)
+        (ch->dma.work + AHCI_CT_OFFSET + (AHCI_CT_SIZE * slot->slot));
+    prd = &ctp->prd_tab[0];
+    for (i = 0, left = size; i < npages; i++, left -= PAGE_SIZE) {
+        prd[i].dba = htole64(VM_PAGE_TO_PHYS(ma[i]));
+        c = (left > PAGE_SIZE) ? PAGE_SIZE : left;
+        prd[i].dbc = htole32((c - 1) & AHCI_PRD_MASK);
+    }
+    slot->dma.nsegs = npages;
+#if 0
+    bus_dmamap_sync(ch->dma.data_tag, slot->dma.data_map,
+        ((slot->ccb->ccb_h.flags & CAM_DIR_IN) ?
+        BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE));
+#endif
+    ahci_execute_transaction(slot);
+}
+
 /* Locked by busdma engine. */
 static void
 ahci_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
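ahci_unmappedprd() bypasses busdma and builds one physical region descriptor per page straight from the vm_page array. Because the pages need not be physically contiguous, every descriptor covers at most PAGE_SIZE bytes, and the dbc field stores the byte count minus one, per the AHCI PRD format. A worked example of the split, assuming 4 KB pages:

    /* Illustration of the loop above for an unmapped 10240-byte I/O. */
    npages = (10240 + PAGE_SIZE - 1) / PAGE_SIZE;   /* == 3 entries */
    /* left: 10240 -> 6144 -> 2048; c: 4096, 4096, 2048  */
    /* dbc stored per entry: 4095, 4095, 2047 (c - 1)    */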
diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c
index 443c127..6a8f243 100644
--- a/sys/dev/md/md.c
+++ b/sys/dev/md/md.c
@@ -629,11 +629,12 @@ mdstart_vnode(struct md_s *sc, struct bio *bp)
 static int
 mdstart_swap(struct md_s *sc, struct bio *bp)
 {
-    struct sf_buf *sf;
     int rv, offs, len, lastend;
     vm_pindex_t i, lastp;
     vm_page_t m;
     u_char *p;
+    struct uio uio;
+    struct iovec iov[1];
 
     switch (bp->bio_cmd) {
     case BIO_READ:
@@ -662,19 +663,12 @@ mdstart_swap(struct md_s *sc, struct bio *bp)
     vm_object_pip_add(sc->object, 1);
     for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
         len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
-
-        m = vm_page_grab(sc->object, i,
-            VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
-        VM_OBJECT_UNLOCK(sc->object);
-        sched_pin();
-        sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
-        VM_OBJECT_LOCK(sc->object);
+        m = vm_page_grab(sc->object, i, VM_ALLOC_NORMAL |
+            VM_ALLOC_RETRY);
         if (bp->bio_cmd == BIO_READ) {
             if (m->valid != VM_PAGE_BITS_ALL)
                 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
             if (rv == VM_PAGER_ERROR) {
-                sf_buf_free(sf);
-                sched_unpin();
                 vm_page_wakeup(m);
                 break;
             } else if (rv == VM_PAGER_FAIL) {
@@ -684,40 +678,62 @@ mdstart_swap(struct md_s *sc, struct bio *bp)
                  * valid. Do not set dirty, the page
                  * can be recreated if thrown out.
                  */
-                bzero((void *)sf_buf_kva(sf), PAGE_SIZE);
+                pmap_zero_page(m);
                 m->valid = VM_PAGE_BITS_ALL;
             }
-            bcopy((void *)(sf_buf_kva(sf) + offs), p, len);
-            cpu_flush_dcache(p, len);
+            if ((bp->bio_flags & BIO_NOTMAPPED) != 0) {
+                pmap_copy_pages(&m, offs, &bp->bio_ma[i],
+                    offs, len);
+            } else {
+                uio.uio_resid = len;
+                uio.uio_offset = offs;
+                uio.uio_segflg = UIO_SYSSPACE;
+                uio.uio_rw = UIO_READ;
+                uio.uio_td = curthread;
+                uio.uio_iov = iov;
+                uio.uio_iovcnt = 1;
+                iov[0].iov_base = p;
+                iov[0].iov_len = len;
+                uiomove_fromphys(&m, offs, len, &uio);
+                cpu_flush_dcache(p, len);
+            }
         } else if (bp->bio_cmd == BIO_WRITE) {
             if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
                 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
             if (rv == VM_PAGER_ERROR) {
-                sf_buf_free(sf);
-                sched_unpin();
                 vm_page_wakeup(m);
                 break;
             }
-            bcopy(p, (void *)(sf_buf_kva(sf) + offs), len);
-            m->valid = VM_PAGE_BITS_ALL;
+            if ((bp->bio_flags & BIO_NOTMAPPED) != 0) {
+                pmap_copy_pages(&bp->bio_ma[i], offs, &m,
+                    offs, len);
+            } else {
+                uio.uio_resid = len;
+                uio.uio_offset = offs;
+                uio.uio_segflg = UIO_SYSSPACE;
+                uio.uio_rw = UIO_WRITE;
+                uio.uio_td = curthread;
+                uio.uio_iov = iov;
+                uio.uio_iovcnt = 1;
+                iov[0].iov_base = p;
+                iov[0].iov_len = len;
+                uiomove_fromphys(&m, offs, len, &uio);
+                m->valid = VM_PAGE_BITS_ALL;
+            }
         } else if (bp->bio_cmd == BIO_DELETE) {
             if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
                 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
             if (rv == VM_PAGER_ERROR) {
-                sf_buf_free(sf);
-                sched_unpin();
                 vm_page_wakeup(m);
                 break;
             }
             if (len != PAGE_SIZE) {
-                bzero((void *)(sf_buf_kva(sf) + offs), len);
+                pmap_zero_page_area(m, offs, len);
                 vm_page_clear_dirty(m, offs, len);
                 m->valid = VM_PAGE_BITS_ALL;
             } else
                 vm_pager_page_unswapped(m);
         }
-        sf_buf_free(sf);
-        sched_unpin();
         vm_page_wakeup(m);
         vm_page_lock(m);
         if (bp->bio_cmd == BIO_DELETE && len == PAGE_SIZE)
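mdstart_swap() no longer creates transient sf_buf mappings: the mapped case copies through uiomove_fromphys() with a one-entry UIO_SYSSPACE uio, and the unmapped case copies page-to-page with pmap_copy_pages(), never touching KVA. The repeated uio setup could be factored along these lines; a sketch only, "md_phys_copy" is a hypothetical name that does not exist in the patch:

    static int
    md_phys_copy(vm_page_t *m, vm_offset_t offs, u_char *p, int len,
        enum uio_rw rw)
    {
        struct iovec iov;
        struct uio uio;

        iov.iov_base = p;
        iov.iov_len = len;
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_offset = offs;
        uio.uio_resid = len;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_rw = rw;        /* UIO_READ fills p from the page */
        uio.uio_td = curthread;
        return (uiomove_fromphys(m, offs, len, &uio));
    }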
diff --git a/sys/fs/cd9660/cd9660_vnops.c b/sys/fs/cd9660/cd9660_vnops.c
index 21ee0fc..47d4f75 100644
--- a/sys/fs/cd9660/cd9660_vnops.c
+++ b/sys/fs/cd9660/cd9660_vnops.c
@@ -329,7 +329,7 @@ cd9660_read(ap)
             if (lblktosize(imp, rablock) < ip->i_size)
                 error = cluster_read(vp, (off_t)ip->i_size,
                     lbn, size, NOCRED, uio->uio_resid,
-                    (ap->a_ioflag >> 16), &bp);
+                    (ap->a_ioflag >> 16), 0, &bp);
             else
                 error = bread(vp, lbn, size, NOCRED, &bp);
         } else {
diff --git a/sys/fs/ext2fs/ext2_balloc.c b/sys/fs/ext2fs/ext2_balloc.c
index 6e60c6e..ba91ac5 100644
--- a/sys/fs/ext2fs/ext2_balloc.c
+++ b/sys/fs/ext2fs/ext2_balloc.c
@@ -281,7 +281,7 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags)
         if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
             error = cluster_read(vp, ip->i_size, lbn,
                 (int)fs->e2fs_bsize, NOCRED,
-                MAXBSIZE, seqcount, &nbp);
+                MAXBSIZE, seqcount, 0, &nbp);
         } else {
             error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp);
         }
diff --git a/sys/fs/ext2fs/ext2_vnops.c b/sys/fs/ext2fs/ext2_vnops.c
index 4f8f6a9..5d3d279 100644
--- a/sys/fs/ext2fs/ext2_vnops.c
+++ b/sys/fs/ext2fs/ext2_vnops.c
@@ -1748,10 +1748,11 @@ ext2_read(ap)
 
         if (lblktosize(fs, nextlbn) >= ip->i_size)
             error = bread(vp, lbn, size, NOCRED, &bp);
-        else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0)
+        else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
             error = cluster_read(vp, ip->i_size, lbn, size,
-                NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
-        else if (seqcount > 1) {
+                NOCRED, blkoffset + uio->uio_resid, seqcount,
+                0, &bp);
+        } else if (seqcount > 1) {
             int nextsize = blksize(fs, ip, nextlbn);
             error = breadn(vp, lbn,
                 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
@@ -1967,7 +1968,7 @@ ext2_write(ap)
         } else if (xfersize + blkoffset == fs->e2fs_fsize) {
             if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
                 bp->b_flags |= B_CLUSTEROK;
-                cluster_write(vp, bp, ip->i_size, seqcount);
+                cluster_write(vp, bp, ip->i_size, seqcount, 0);
             } else {
                 bawrite(bp);
             }
diff --git a/sys/fs/msdosfs/msdosfs_vnops.c b/sys/fs/msdosfs/msdosfs_vnops.c
index a90ee8d..e18d0b1 100644
--- a/sys/fs/msdosfs/msdosfs_vnops.c
+++ b/sys/fs/msdosfs/msdosfs_vnops.c
@@ -600,7 +600,7 @@ msdosfs_read(ap)
             error = bread(vp, lbn, blsize, NOCRED, &bp);
         } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
             error = cluster_read(vp, dep->de_FileSize, lbn, blsize,
-                NOCRED, on + uio->uio_resid, seqcount, &bp);
+                NOCRED, on + uio->uio_resid, seqcount, 0, &bp);
         } else if (seqcount > 1) {
             rasize = blsize;
             error = breadn(vp, lbn,
@@ -820,7 +820,7 @@ msdosfs_write(ap)
         else if (n + croffset == pmp->pm_bpcluster) {
             if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0)
                 cluster_write(vp, bp, dep->de_FileSize,
-                    seqcount);
+                    seqcount, 0);
             else
                 bawrite(bp);
         } else
diff --git a/sys/fs/udf/udf_vnops.c b/sys/fs/udf/udf_vnops.c
index b1a3b1d..abe073e 100644
--- a/sys/fs/udf/udf_vnops.c
+++ b/sys/fs/udf/udf_vnops.c
@@ -478,8 +478,9 @@ udf_read(struct vop_read_args *ap)
         rablock = lbn + 1;
         if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
             if (lblktosize(udfmp, rablock) < fsize) {
-                error = cluster_read(vp, fsize, lbn, size, NOCRED,
-                    uio->uio_resid, (ap->a_ioflag >> 16), &bp);
+                error = cluster_read(vp, fsize, lbn, size,
+                    NOCRED, uio->uio_resid,
+                    (ap->a_ioflag >> 16), 0, &bp);
             } else {
                 error = bread(vp, lbn, size, NOCRED, &bp);
             }
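The cd9660, ext2fs, msdosfs and udf hunks above are mechanical: cluster_read() and cluster_write() grew a gbflags argument, and these callers pass 0 to keep their buffers mapped exactly as before. A filesystem that opts in passes GB_NOTMAPPED instead, as the ffs_read() hunk later in this patch does:

    error = cluster_read(vp, ip->i_size, lbn, size, NOCRED,
        blkoffset + uio->uio_resid, seqcount, GB_NOTMAPPED, &bp);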
diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c
index 0294887..60e9ed3 100644
--- a/sys/geom/geom_io.c
+++ b/sys/geom/geom_io.c
@@ -182,10 +182,11 @@ g_clone_bio(struct bio *bp)
          * ordering restrictions, so this flag needs to be cloned.
          * Other bio flags are not suitable for cloning.
          */
-        bp2->bio_flags = bp->bio_flags & BIO_ORDERED;
+        bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_NOTMAPPED);
         bp2->bio_length = bp->bio_length;
         bp2->bio_offset = bp->bio_offset;
         bp2->bio_data = bp->bio_data;
+        bp2->bio_ma = bp->bio_ma;
         bp2->bio_attribute = bp->bio_attribute;
         /* Inherit classification info from the parent */
         bp2->bio_classifier1 = bp->bio_classifier1;
@@ -210,11 +211,13 @@ g_duplicate_bio(struct bio *bp)
     struct bio *bp2;
 
     bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
+    bp2->bio_flags = bp->bio_flags & BIO_NOTMAPPED;
     bp2->bio_parent = bp;
     bp2->bio_cmd = bp->bio_cmd;
     bp2->bio_length = bp->bio_length;
     bp2->bio_offset = bp->bio_offset;
     bp2->bio_data = bp->bio_data;
+    bp2->bio_ma = bp->bio_ma;
     bp2->bio_attribute = bp->bio_attribute;
     bp->bio_children++;
 #ifdef KTR
diff --git a/sys/geom/geom_vfs.c b/sys/geom/geom_vfs.c
index afc6549..cb80215 100644
--- a/sys/geom/geom_vfs.c
+++ b/sys/geom/geom_vfs.c
@@ -188,7 +188,14 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp)
     bip = g_alloc_bio();
     bip->bio_cmd = bp->b_iocmd;
     bip->bio_offset = bp->b_iooffset;
-    bip->bio_data = bp->b_data;
+    if ((bp->b_flags & B_NOTMAPPED) != 0) {
+        bip->bio_ma = bp->b_pages;
+        bip->bio_data = unmapped_buf;
+        bip->bio_flags |= BIO_NOTMAPPED;
+    } else {
+        bip->bio_data = bp->b_data;
+        bip->bio_ma = NULL;
+    }
     bip->bio_done = g_vfs_done;
     bip->bio_caller2 = bp;
     bip->bio_length = bp->b_bcount;
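For an unmapped request to survive a GEOM stack, both the page array and the flag must follow the bio: g_clone_bio() and g_duplicate_bio() now propagate bio_ma and BIO_NOTMAPPED, and g_vfs_strategy() translates an unmapped buf (B_NOTMAPPED, b_pages[]) into an unmapped bio, pointing bio_data at the unmapped_buf sentinel. A consumer at the bottom of the stack would dispatch roughly as below; the xxx_* names are hypothetical:

    static void
    xxx_strategy(struct bio *bp)
    {
        if ((bp->bio_flags & BIO_NOTMAPPED) != 0) {
            /* bio_data is the unmapped_buf sentinel here;
               only the page array describes the data. */
            xxx_start_pages(bp->bio_ma, bp->bio_length);
        } else
            xxx_start_kva(bp->bio_data, bp->bio_length);
    }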
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index f14f215..10e2521 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -4224,6 +4224,48 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
     mtx_unlock(&sysmaps->lock);
 }
 
+void
+pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
+    vm_offset_t b_offset, int xfersize)
+{
+    struct sysmaps *sysmaps;
+    vm_page_t a_pg, b_pg;
+    char *a_cp, *b_cp;
+    vm_offset_t a_pg_offset, b_pg_offset;
+    int cnt;
+
+    sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
+    mtx_lock(&sysmaps->lock);
+    if (*sysmaps->CMAP1)
+        panic("pmap_copy_pages: CMAP1 busy");
+    if (*sysmaps->CMAP2)
+        panic("pmap_copy_pages: CMAP2 busy");
+    sched_pin();
+    while (xfersize > 0) {
+        invlpg((u_int)sysmaps->CADDR1);
+        invlpg((u_int)sysmaps->CADDR2);
+        a_pg = ma[a_offset >> PAGE_SHIFT];
+        a_pg_offset = a_offset & PAGE_MASK;
+        cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
+        b_pg = mb[b_offset >> PAGE_SHIFT];
+        b_pg_offset = b_offset & PAGE_MASK;
+        cnt = min(cnt, PAGE_SIZE - b_pg_offset);
+        *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
+            pmap_cache_bits(a_pg->md.pat_mode, 0);
+        *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
+            PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
+        a_cp = sysmaps->CADDR1 + a_pg_offset;
+        b_cp = sysmaps->CADDR2 + b_pg_offset;
+        bcopy(a_cp, b_cp, cnt);
+        *sysmaps->CMAP1 = 0;
+        *sysmaps->CMAP2 = 0;
+        a_offset += cnt;
+        b_offset += cnt;
+        xfersize -= cnt;
+    }
+    sched_unpin();
+    mtx_unlock(&sysmaps->lock);
+}
+
 /*
  * Returns true if the pmap's pv is one of the first
  * 16 pvs linked to from this page.  This count may
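Unlike amd64, i386 has no direct map, so pmap_copy_pages() cycles the source and destination pages through the per-CPU CMAP1/CMAP2 windows under sysmaps->lock, pinned to one CPU. Each chunk is limited by whichever page boundary comes first on either side. A worked example, assuming 4 KB pages:

    /*
     * Copy 6144 bytes from offset 1024 in page array ma to offset 0
     * in mb: the loop performs three bcopy() calls of 3072, 1024 and
     * 2048 bytes, since no chunk may cross a source or a destination
     * page boundary.
     */
    pmap_copy_pages(ma, 1024, mb, 0, 6144);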
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 0480bd4..06d3167 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -91,6 +91,7 @@ struct buf_ops buf_ops_bio = {
  * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
  */
 struct buf *buf;        /* buffer header pool */
+caddr_t unmapped_buf;
 
 static struct proc *bufdaemonproc;
 
@@ -671,6 +672,21 @@ bufinit(void)
     bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
         VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+    unmapped_buf = (caddr_t)kmem_alloc_nofault(kernel_map, MAXPHYS);
+}
+
+static void
+bpmap_qenter(struct buf *bp)
+{
+
+    /*
+     * bp->b_data is relative to bp->b_offset, but
+     * bp->b_offset may be offset into the first page.
+     */
+    bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
+    pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+    bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+        (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
 /*
@@ -682,11 +698,13 @@ static void
 bfreekva(struct buf *bp)
 {
 
-    if (bp->b_kvasize) {
+    if (bp->b_kvasize != 0) {
         atomic_add_int(&buffreekvacnt, 1);
         atomic_subtract_long(&bufspace, bp->b_kvasize);
-        vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase,
-            (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
+        if ((bp->b_flags & B_NOTMAPPED) == 0) {
+            vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase,
+                (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
+        }
         bp->b_kvasize = 0;
         bufspacewakeup();
     }
@@ -826,9 +844,9 @@ breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
  * getblk(). Also starts asynchronous I/O on read-ahead blocks.
  */
 int
-breadn_flags(struct vnode * vp, daddr_t blkno, int size,
-    daddr_t * rablkno, int *rabsize, int cnt,
-    struct ucred * cred, int flags, struct buf **bpp)
+breadn_flags(struct vnode *vp, daddr_t blkno, int size,
+    daddr_t *rablkno, int *rabsize, int cnt,
+    struct ucred *cred, int flags, struct buf **bpp)
 {
     struct buf *bp;
     int rv = 0, readwait = 0;
@@ -1363,7 +1381,7 @@ brelse(struct buf *bp)
                 }
             }
 
-            if ((bp->b_flags & B_INVAL) == 0) {
+            if ((bp->b_flags & (B_INVAL | B_NOTMAPPED)) == 0) {
                 pmap_qenter(
                     trunc_page((vm_offset_t)bp->b_data),
                     bp->b_pages, bp->b_npages);
@@ -1612,7 +1630,8 @@ vfs_vmio_release(struct buf *bp)
     int i;
     vm_page_t m;
 
-    pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+    if ((bp->b_flags & B_NOTMAPPED) == 0)
+        pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
     VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
     for (i = 0; i < bp->b_npages; i++) {
         m = bp->b_pages[i];
@@ -1716,8 +1735,10 @@ vfs_bio_awrite(struct buf *bp)
     int nwritten;
     int size;
     int maxcl;
+    int gbflags;
 
     bo = &vp->v_bufobj;
+    gbflags = (bp->b_flags & B_NOTMAPPED) != 0 ? GB_NOTMAPPED : 0;
     /*
      * right now we support clustered writing only to regular files.  If
      * we find a clusterable block we could be in the middle of a cluster
@@ -1748,8 +1769,9 @@ vfs_bio_awrite(struct buf *bp)
          */
         if (ncl != 1) {
             BUF_UNLOCK(bp);
-            nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
-            return nwritten;
+            nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
+                gbflags);
+            return (nwritten);
         }
     }
     bremfree(bp);
@@ -1793,7 +1815,7 @@ getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
     struct buf *bp;
     struct buf *nbp;
     int defrag = 0;
-    int nqindex;
+    int nqindex, notmapped;
     static int flushingbufs;
 
     td = curthread;
@@ -1969,6 +1991,7 @@ restart:
         if (bp->b_bufsize)
             allocbuf(bp, 0);
 
+        notmapped = (bp->b_flags & B_NOTMAPPED) != 0;
         bp->b_flags = 0;
         bp->b_ioflags = 0;
         bp->b_xflags = 0;
@@ -1996,7 +2019,7 @@ restart:
          * If we are defragging then free the buffer.
          */
         if (defrag) {
-            bp->b_flags |= B_INVAL;
+            bp->b_flags |= B_INVAL | (notmapped ? B_NOTMAPPED : 0);
             bfreekva(bp);
             brelse(bp);
             defrag = 0;
@@ -2008,7 +2031,7 @@ restart:
          * identity change by freeing the buffer.
          */
         if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
-            bp->b_flags |= B_INVAL;
+            bp->b_flags |= B_INVAL | (notmapped ? B_NOTMAPPED : 0);
             bfreekva(bp);
             brelse(bp);
             goto restart;
@@ -2022,7 +2045,7 @@ restart:
         if (bufspace >= hibufspace)
             flushingbufs = 1;
         if (flushingbufs && bp->b_kvasize != 0) {
-            bp->b_flags |= B_INVAL;
+            bp->b_flags |= B_INVAL | (notmapped ? B_NOTMAPPED : 0);
             bfreekva(bp);
             brelse(bp);
             goto restart;
@@ -2096,6 +2119,11 @@ restart:
             }
         }
         mtx_unlock(&nblock);
+    } else if ((gbflags & GB_NOTMAPPED) != 0) {
+        bfreekva(bp);
+        bp->b_flags |= B_NOTMAPPED;
+        bp->b_data = unmapped_buf;
+        bp->b_kvasize = maxsize;
     } else {
         /*
          * We finally have a valid bp.  We aren't quite out of the
@@ -2105,7 +2133,7 @@ restart:
          */
         maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 
-        if (maxsize != bp->b_kvasize) {
+        if (maxsize != bp->b_kvasize || notmapped) {
             vm_offset_t addr = 0;
 
             bfreekva(bp);
@@ -2124,17 +2152,15 @@ restart:
                 brelse(bp);
                 goto restart;
             }
-            if (addr) {
-                vm_map_insert(buffer_map, NULL, 0,
-                    addr, addr + maxsize,
-                    VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
-
-                bp->b_kvabase = (caddr_t) addr;
-                bp->b_kvasize = maxsize;
-                atomic_add_long(&bufspace, bp->b_kvasize);
-                atomic_add_int(&bufreusecnt, 1);
-            }
+            KASSERT(addr != 0, ("addr == 0"));
+            vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize,
+                VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
             vm_map_unlock(buffer_map);
+
+            bp->b_kvabase = (caddr_t)addr;
+            bp->b_kvasize = maxsize;
+            atomic_add_long(&bufspace, bp->b_kvasize);
+            atomic_add_int(&bufreusecnt, 1);
         }
         bp->b_saveaddr = bp->b_kvabase;
         bp->b_data = bp->b_saveaddr;
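In getnewbuf(), a GB_NOTMAPPED request short-circuits all KVA handling: the recycled header sheds any old mapping via bfreekva(), b_data is pointed at the unmapped_buf sentinel, and no buffer_map space is consumed, which is the point of the exercise on KVA-starved configurations. Callers obtain such a buffer simply by passing the flag down through getblk():

    /* The buffer comes back with B_NOTMAPPED set; b_data is not
       dereferenceable and only b_pages[] describes the data. */
    bp = getblk(vp, lbn, size, 0, 0, GB_NOTMAPPED);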
@@ -2589,7 +2615,8 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo,
 {
     struct buf *bp;
     struct bufobj *bo;
-    int error;
+    vm_offset_t addr;
+    int error, maxsize;
 
     CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
     ASSERT_VOP_LOCKED(vp, "getblk");
@@ -2605,8 +2632,6 @@ loop:
      * If this check ever becomes a bottleneck it may be better to
      * move it into the else, when gbincore() fails.  At the moment
      * it isn't a problem.
-     *
-     * XXX remove if 0 sections (clean this up after its proven)
      */
     if (numfreebuffers == 0) {
         if (TD_IS_IDLETHREAD(curthread))
@@ -2696,13 +2721,34 @@ loop:
             }
         }
 
+        if ((bp->b_flags & B_NOTMAPPED) != 0 &&
+            (flags & GB_NOTMAPPED) == 0) {
+            maxsize = (size + BKVAMASK) & ~BKVAMASK;
+            vm_map_lock(buffer_map);
+            if (vm_map_findspace(buffer_map,
+                vm_map_min(buffer_map), maxsize, &addr)) {
+                /* XXXKIB implement */
+                panic("resizing: fragmented");
+            }
+            KASSERT(addr != 0, ("addr == 0"));
+            vm_map_insert(buffer_map, NULL, 0, addr,
+                addr + maxsize, VM_PROT_ALL, VM_PROT_ALL,
+                MAP_NOFAULT);
+            vm_map_unlock(buffer_map);
+            bp->b_kvabase = (caddr_t)addr;
+            bp->b_kvasize = maxsize;
+            bp->b_saveaddr = bp->b_kvabase;
+            bp->b_data = bp->b_saveaddr;
+            bp->b_flags &= ~B_NOTMAPPED;
+            bpmap_qenter(bp);
+        }
+
         /*
          * If the size is inconsistant in the VMIO case, we can resize
          * the buffer.  This might lead to B_CACHE getting set or
          * cleared.  If the size has not changed, B_CACHE remains
          * unchanged from its previous state.
          */
-
         if (bp->b_bcount != size)
             allocbuf(bp, size);
@@ -2991,10 +3037,12 @@ allocbuf(struct buf *bp, int size)
             if (desiredpages < bp->b_npages) {
                 vm_page_t m;
 
-                pmap_qremove((vm_offset_t)trunc_page(
-                    (vm_offset_t)bp->b_data) +
-                    (desiredpages << PAGE_SHIFT),
-                    (bp->b_npages - desiredpages));
+                if ((bp->b_flags & B_NOTMAPPED) == 0) {
+                    pmap_qremove((vm_offset_t)trunc_page(
+                        (vm_offset_t)bp->b_data) +
+                        (desiredpages << PAGE_SHIFT),
+                        (bp->b_npages - desiredpages));
+                }
                 VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
                 for (i = desiredpages; i < bp->b_npages; i++) {
                     /*
@@ -3100,21 +3148,10 @@ allocbuf(struct buf *bp, int size)
             VM_OBJECT_UNLOCK(obj);
 
             /*
-             * Step 3, fixup the KVM pmap.  Remember that
-             * bp->b_data is relative to bp->b_offset, but
-             * bp->b_offset may be offset into the first page.
+             * Step 3, fixup the KVM pmap.
              */
-
-            bp->b_data = (caddr_t)
-                trunc_page((vm_offset_t)bp->b_data);
-            pmap_qenter(
-                (vm_offset_t)bp->b_data,
-                bp->b_pages,
-                bp->b_npages
-            );
-
-            bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
-                (vm_offset_t)(bp->b_offset & PAGE_MASK));
+            if ((bp->b_flags & B_NOTMAPPED) == 0)
+                bpmap_qenter(bp);
         }
     }
     if (newbsize < bp->b_bufsize)
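getblk() also handles the mismatch case: when a cached buffer is unmapped but the caller did not pass GB_NOTMAPPED, fresh buffer_map space is allocated and bpmap_qenter() wires b_pages into it, restoring the page offset of b_offset in b_data (with b_offset = 0x1234 and 4 KB pages, b_data ends up as the page-aligned KVA ORed with 0x234). Fragmentation of buffer_map during this remap is still a panic, marked XXXKIB above. A sketch of the resulting caller-visible behavior:

    /* No GB_NOTMAPPED: even a previously unmapped buffer comes back
       mapped, so b_data is usable as before. */
    bp = getblk(vp, lbn, size, 0, 0, 0);
    bcopy(bp->b_data, dst, size);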
@@ -3395,7 +3432,7 @@ bufdone_finish(struct buf *bp)
         }
         vm_object_pip_wakeupn(obj, 0);
         VM_OBJECT_UNLOCK(obj);
-        if (bogus)
+        if (bogus && (bp->b_flags & B_NOTMAPPED) == 0)
             pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
                 bp->b_pages, bp->b_npages);
     }
@@ -3440,8 +3477,10 @@ vfs_unbusy_pages(struct buf *bp)
             if (!m)
                 panic("vfs_unbusy_pages: page missing\n");
             bp->b_pages[i] = m;
-            pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
-                bp->b_pages, bp->b_npages);
+            if ((bp->b_flags & B_NOTMAPPED) == 0) {
+                pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+                    bp->b_pages, bp->b_npages);
+            }
         }
         vm_object_pip_subtract(obj, 1);
         vm_page_io_finish(m);
@@ -3606,7 +3645,7 @@ vfs_busy_pages(struct buf *bp, int clear_modify)
         foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
     }
     VM_OBJECT_UNLOCK(obj);
-    if (bogus)
+    if (bogus && (bp->b_flags & B_NOTMAPPED) == 0)
         pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
             bp->b_pages, bp->b_npages);
 }
@@ -3664,8 +3703,7 @@ vfs_bio_set_valid(struct buf *bp, int base, int size)
 void
 vfs_bio_clrbuf(struct buf *bp)
 {
-    int i, j, mask;
-    caddr_t sa, ea;
+    int i, j, mask, sa, ea;
 
     if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
         clrbuf(bp);
@@ -3683,30 +3721,30 @@ vfs_bio_clrbuf(struct buf *bp)
         if ((bp->b_pages[0]->valid & mask) == mask)
             goto unlock;
         if ((bp->b_pages[0]->valid & mask) == 0) {
-            bzero(bp->b_data, bp->b_bufsize);
+            pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
             bp->b_pages[0]->valid |= mask;
             goto unlock;
         }
     }
-    ea = sa = bp->b_data;
-    for(i = 0; i < bp->b_npages; i++, sa = ea) {
-        ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
-        ea = (caddr_t)(vm_offset_t)ulmin(
-            (u_long)(vm_offset_t)ea,
-            (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
+    ea = sa = 0;
+    for (i = 0; i < bp->b_npages; i++, sa = ea) {
+        ea = trunc_page(sa + PAGE_SIZE);
+        ea = imin(ea, bp->b_bufsize);
         if (bp->b_pages[i] == bogus_page)
             continue;
-        j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
+        j = (sa & PAGE_MASK) / DEV_BSIZE;
         mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
         VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED);
         if ((bp->b_pages[i]->valid & mask) == mask)
             continue;
         if ((bp->b_pages[i]->valid & mask) == 0)
-            bzero(sa, ea - sa);
+            pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
         else {
             for (; sa < ea; sa += DEV_BSIZE, j++) {
-                if ((bp->b_pages[i]->valid & (1 << j)) == 0)
-                    bzero(sa, DEV_BSIZE);
+                if ((bp->b_pages[i]->valid &
+                    (1 << j)) == 0) {
+                    pmap_zero_page_area(bp->b_pages[i],
+                        sa, DEV_BSIZE);
+                }
             }
         }
         bp->b_pages[i]->valid |= mask;
@@ -3716,6 +3754,30 @@ unlock:
     bp->b_resid = 0;
 }
 
+void
+vfs_bio_bzero_buf(struct buf *bp, int base, int size)
+{
+    vm_page_t m;
+    int i, n;
+
+    if ((bp->b_flags & B_NOTMAPPED) == 0)
+        bzero(bp->b_data + base, size);
+    else {
+        n = PAGE_SIZE - (base & PAGE_MASK);
+        VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+        for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+            m = bp->b_pages[i];
+            if (n > size)
+                n = size;
+            pmap_zero_page_area(m, base & PAGE_MASK, n);
+            base += n;
+            size -= n;
+            n = PAGE_SIZE;
+        }
+        VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+    }
+}
+
 /*
  * vm_hold_load_pages and vm_hold_free_pages get pages into
  * a buffers address space.  The pages are anonymous and are
@@ -3728,6 +3790,8 @@ vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
     vm_page_t p;
     int index;
 
+    KASSERT((bp->b_flags & B_NOTMAPPED) == 0, ("Not mapped malloc buffer"));
+
     to = round_page(to);
     from = round_page(from);
     index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
@@ -3759,9 +3823,11 @@ vm_hold_free_pages(struct buf *bp, int newbsize)
     vm_page_t p;
     int index, newnpages;
 
+    KASSERT((bp->b_flags & B_NOTMAPPED) == 0, ("Not mapped malloc buffer"));
+
     from = round_page((vm_offset_t)bp->b_data + newbsize);
     newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
-    if (bp->b_npages > newnpages)
+    if (bp->b_npages > newnpages && (bp->b_flags & B_NOTMAPPED) == 0)
         pmap_qremove(from, bp->b_npages - newnpages);
     for (index = newnpages; index < bp->b_npages; index++) {
         p = bp->b_pages[index];
@@ -3795,6 +3861,8 @@ vmapbuf(struct buf *bp)
     vm_prot_t prot;
     int pidx;
 
+    KASSERT((bp->b_flags & B_NOTMAPPED) == 0, ("Not mapped buffer"));
+
     if (bp->b_bufsize < 0)
         return (-1);
     prot = VM_PROT_READ;
@@ -3822,8 +3890,11 @@ vunmapbuf(struct buf *bp)
 {
     int npages;
 
+    KASSERT((bp->b_flags & B_NOTMAPPED) == 0, ("Not mapped buffer"));
+
     npages = bp->b_npages;
-    pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
+    if ((bp->b_flags & B_NOTMAPPED) == 0)
+        pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
     vm_page_unhold_pages(bp->b_pages, npages);
 
     bp->b_data = bp->b_saveaddr;
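vfs_bio_bzero_buf() is the flag-aware replacement for zeroing through b_data; the ffs_realloccg() hunks later in this patch convert their bzero() calls to it. For a mapped buffer the two forms are equivalent:

    vfs_bio_bzero_buf(bp, osize, nsize - osize);
    /* is, when (bp->b_flags & B_NOTMAPPED) == 0, the same as: */
    bzero(bp->b_data + osize, nsize - osize);

For an unmapped buffer it zeroes the same byte range page by page with pmap_zero_page_area() under the buffer object lock.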
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 19ee05f..d36cdee 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -62,9 +62,9 @@ static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
 
 static struct cluster_save *
     cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
-static struct buf *
-    cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
-        daddr_t blkno, long size, int run, struct buf *fbp);
+static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
+        daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
+        struct buf *fbp);
 static void cluster_callback(struct buf *);
 
 static int write_behind = 1;
@@ -83,15 +83,9 @@ extern vm_page_t bogus_page;
  * cluster_read replaces bread.
  */
 int
-cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
-    struct vnode *vp;
-    u_quad_t filesize;
-    daddr_t lblkno;
-    long size;
-    struct ucred *cred;
-    long totread;
-    int seqcount;
-    struct buf **bpp;
+cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
+    struct ucred *cred, long totread, int seqcount, int gbflags,
+    struct buf **bpp)
 {
     struct buf *bp, *rbp, *reqbp;
     struct bufobj *bo;
@@ -117,7 +111,7 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
     /*
      * get the requested block
      */
-    *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0);
+    *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
     origblkno = lblkno;
 
     /*
@@ -208,7 +202,7 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
             if (ncontig < nblks)
                 nblks = ncontig;
             bp = cluster_rbuild(vp, filesize, lblkno,
-                blkno, size, nblks, bp);
+                blkno, size, nblks, gbflags, bp);
             lblkno += (bp->b_bufsize / size);
         } else {
             bp->b_flags |= B_RAM;
@@ -252,14 +246,14 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
         if (ncontig) {
             ncontig = min(ncontig + 1, racluster);
             rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
-                size, ncontig, NULL);
+                size, ncontig, gbflags, NULL);
             lblkno += (rbp->b_bufsize / size);
             if (rbp->b_flags & B_DELWRI) {
                 bqrelse(rbp);
                 continue;
             }
         } else {
-            rbp = getblk(vp, lblkno, size, 0, 0, 0);
+            rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
             lblkno += 1;
             if (rbp->b_flags & B_DELWRI) {
                 bqrelse(rbp);
@@ -298,14 +292,8 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
  * and then parcel them up into logical blocks in the buffer hash table.
  */
 static struct buf *
-cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
-    struct vnode *vp;
-    u_quad_t filesize;
-    daddr_t lbn;
-    daddr_t blkno;
-    long size;
-    int run;
-    struct buf *fbp;
+cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+    daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
 {
     struct bufobj *bo;
     struct buf *bp, *tbp;
@@ -329,7 +317,7 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
         tbp = fbp;
         tbp->b_iocmd = BIO_READ;
     } else {
-        tbp = getblk(vp, lbn, size, 0, 0, 0);
+        tbp = getblk(vp, lbn, size, 0, 0, gbflags);
         if (tbp->b_flags & B_CACHE)
             return tbp;
         tbp->b_flags |= B_ASYNC | B_RAM;
@@ -350,9 +338,13 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
      * address may not be either.  Inherit the b_data offset
      * from the original buffer.
      */
-    bp->b_data = (char *)((vm_offset_t)bp->b_data |
-        ((vm_offset_t)tbp->b_data & PAGE_MASK));
     bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
+    if ((gbflags & GB_NOTMAPPED) != 0)
+        bp->b_flags |= B_NOTMAPPED;
+    else {
+        bp->b_data = (char *)((vm_offset_t)bp->b_data |
+            ((vm_offset_t)tbp->b_data & PAGE_MASK));
+    }
     bp->b_iocmd = BIO_READ;
     bp->b_iodone = cluster_callback;
     bp->b_blkno = blkno;
@@ -499,8 +491,10 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
             bp->b_bufsize, bp->b_kvasize);
     bp->b_kvasize = bp->b_bufsize;
 
-    pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
-        (vm_page_t *)bp->b_pages, bp->b_npages);
+    if ((bp->b_flags & B_NOTMAPPED) == 0) {
+        pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+            (vm_page_t *)bp->b_pages, bp->b_npages);
+    }
     return (bp);
 }
@@ -523,7 +517,10 @@ cluster_callback(bp)
     if (bp->b_ioflags & BIO_ERROR)
         error = bp->b_error;
 
-    pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+    if ((bp->b_flags & B_NOTMAPPED) == 0) {
+        pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
+            bp->b_npages);
+    }
     /*
      * Move memory from the large cluster buffer into the component
      * buffers and mark IO as done on these.
@@ -565,18 +562,19 @@ cluster_callback(bp)
  */
 static __inline int
-cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
+cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
+    int gbflags)
 {
     int r = 0;
 
-    switch(write_behind) {
+    switch (write_behind) {
     case 2:
         if (start_lbn < len)
             break;
         start_lbn -= len;
         /* FALLTHROUGH */
     case 1:
-        r = cluster_wbuild(vp, size, start_lbn, len);
+        r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
         /* FALLTHROUGH */
     default:
         /* FALLTHROUGH */
@@ -596,7 +594,8 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
  *  4.  end of a cluster - asynchronously write cluster
  */
 void
-cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount)
+cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
+    int gbflags)
 {
     daddr_t lbn;
     int maxclen, cursize;
@@ -642,7 +641,7 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount)
             lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
             if (!async && seqcount > 0) {
                 cluster_wbuild_wb(vp, lblocksize,
-                    vp->v_cstart, cursize);
+                    vp->v_cstart, cursize, gbflags);
             }
         } else {
             struct buf **bpp, **endbp;
@@ -667,7 +666,7 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount)
                 if (seqcount > 1) {
                     cluster_wbuild_wb(vp,
                         lblocksize, vp->v_cstart,
-                        cursize);
+                        cursize, gbflags);
                 }
             } else {
                 /*
@@ -715,8 +714,10 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount)
          * update daemon handle it.
          */
         bdwrite(bp);
-        if (seqcount > 1)
-            cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
+        if (seqcount > 1) {
+            cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
+                vp->v_clen + 1, gbflags);
+        }
         vp->v_clen = 0;
         vp->v_cstart = lbn + 1;
     } else if (vm_page_count_severe()) {
@@ -742,11 +743,8 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount)
  * the current block (if last_bp == NULL).
  */
 int
-cluster_wbuild(vp, size, start_lbn, len)
-    struct vnode *vp;
-    long size;
-    daddr_t start_lbn;
-    int len;
+cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
+    int gbflags)
 {
     struct buf *bp, *tbp;
     struct bufobj *bo;
@@ -832,10 +830,14 @@ cluster_wbuild(vp, size, start_lbn, len)
          * address may not be either.  Inherit the b_data offset
          * from the original buffer.
          */
-        bp->b_data = (char *)((vm_offset_t)bp->b_data |
-            ((vm_offset_t)tbp->b_data & PAGE_MASK));
-        bp->b_flags |= B_CLUSTER |
-                (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
+        if ((gbflags & GB_NOTMAPPED) == 0 ||
+            (tbp->b_flags & B_VMIO) != 0) {
+            bp->b_data = (char *)((vm_offset_t)bp->b_data |
+                ((vm_offset_t)tbp->b_data & PAGE_MASK));
+        } else
+            bp->b_flags |= B_NOTMAPPED;
+        bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
+            B_NEEDCOMMIT));
         bp->b_iodone = cluster_callback;
         pbgetvp(vp, bp);
         /*
@@ -956,8 +958,10 @@ cluster_wbuild(vp, size, start_lbn, len)
                 tbp, b_cluster.cluster_entry);
         }
     finishcluster:
-        pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
-            (vm_page_t *) bp->b_pages, bp->b_npages);
+        if ((bp->b_flags & B_NOTMAPPED) == 0) {
+            pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+                (vm_page_t *)bp->b_pages, bp->b_npages);
+        }
         if (bp->b_bufsize > bp->b_kvasize)
             panic(
                 "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
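The clustering code now threads gbflags from the caller into every getblk() it performs and only builds or tears down KVA mappings for cluster pbufs when B_NOTMAPPED is clear. Write clustering is driven the same way; the ffs_write() hunk below passes GB_NOTMAPPED when it schedules a full block:

    bp->b_flags |= B_CLUSTEROK;
    cluster_write(vp, bp, ip->i_size, seqcount, GB_NOTMAPPED);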
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index ea42f9d..9bc6723 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -1117,6 +1117,46 @@ vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
     return (error);
 }
 
+int
+vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
+    struct uio *uio)
+{
+    struct thread *td;
+    vm_offset_t iov_base;
+    int adv, cnt, pgadv;
+
+    td = curthread;
+    if ((td->td_pflags & TDP_UIOHELD) == 0 ||
+        uio->uio_segflg != UIO_USERSPACE)
+        return (uiomove_fromphys(ma, offset, xfersize, uio));
+
+    KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
+    cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
+    iov_base = (vm_offset_t)uio->uio_iov->iov_base;
+    switch (uio->uio_rw) {
+    case UIO_WRITE:
+        pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
+            offset, cnt);
+        break;
+    case UIO_READ:
+        pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
+            cnt);
+        break;
+    }
+    adv = cnt;
+    pgadv = ((iov_base + adv) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
+    td->td_ma += pgadv;
+    KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
+        pgadv));
+    td->td_ma_cnt -= pgadv;
+    uio->uio_iov->iov_base = (char *)(iov_base + adv);
+    uio->uio_iov->iov_len -= adv;
+    uio->uio_resid -= adv;
+    uio->uio_offset += adv;
+    return (0);
+}
+
 /*
  * File table truncate routine.
  */
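vn_io_fault_pgmove() is the page-array counterpart of vn_io_fault_uiomove(): when the vn_io_fault handler has already busied the user pages (TDP_UIOHELD), data moves page-to-page with pmap_copy_pages(), so neither the buffer nor the user pages need a kernel mapping; otherwise it falls back to uiomove_fromphys(). Filesystems select between the two based on the buffer flag, as the ffs hunks later in this patch do:

    if ((bp->b_flags & B_NOTMAPPED) == 0)
        error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
            (int)xfersize, uio);
    else
        error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
            (int)xfersize, uio);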
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index c016ee6..8c5fc2d 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -55,6 +55,7 @@
 #define BIO_DONE    0x02
 #define BIO_ONQUEUE 0x04
 #define BIO_ORDERED 0x08
+#define BIO_NOTMAPPED   0x10
 
 #ifdef _KERNEL
 struct disk;
@@ -78,6 +79,7 @@ struct bio {
     off_t   bio_offset;     /* Offset into file. */
     long    bio_bcount;     /* Valid bytes in buffer. */
     caddr_t bio_data;       /* Memory, superblocks, indirect etc. */
+    struct vm_page **bio_ma;    /* Or unmapped. */
     int     bio_error;      /* Errno for BIO_ERROR. */
     long    bio_resid;      /* Remaining I/O in bytes. */
     void    (*bio_done)(struct bio *);
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 418d6c5..0e576c2 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -202,7 +202,7 @@ struct buf {
 #define B_PERSISTENT    0x00000100  /* Perm. ref'ed while EXT2FS mounted. */
 #define B_DONE      0x00000200  /* I/O completed. */
 #define B_EINTR     0x00000400  /* I/O was interrupted */
-#define B_00000800  0x00000800  /* Available flag. */
+#define B_NOTMAPPED 0x00000800  /* KVA is not mapped. */
 #define B_00001000  0x00001000  /* Available flag. */
 #define B_INVAL     0x00002000  /* Does not contain valid info. */
 #define B_00004000  0x00004000  /* Available flag. */
@@ -453,7 +453,8 @@ buf_countdeps(struct buf *bp, int i)
  */
 #define GB_LOCK_NOWAIT  0x0001      /* Fail if we block on a buf lock. */
 #define GB_NOCREAT  0x0002      /* Don't create a buf if not found. */
-#define GB_NOWAIT_BD    0x0004      /* Do not wait for bufdaemon */
+#define GB_NOWAIT_BD    0x0004      /* Do not wait for bufdaemon. */
+#define GB_NOTMAPPED    0x0008      /* Do not map buffer pages. */
 
 #ifdef _KERNEL
 extern int  nbuf;           /* The number of buffer headers */
@@ -470,6 +471,7 @@ extern struct   buf *swbuf; /* Swap I/O buffer headers. */
 extern int  nswbuf;     /* Number of swap I/O buffer headers. */
 extern int  cluster_pbuf_freecnt;   /* Number of pbufs for clusters */
 extern int  vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */
+extern caddr_t  unmapped_buf;
 
 void    runningbufwakeup(struct buf *);
 void    waitrunningbufspace(void);
@@ -480,7 +482,7 @@ int buf_dirty_count_severe(void);
 void    bremfree(struct buf *);
 void    bremfreef(struct buf *);    /* XXX Force bremfree, only for nfs. */
 #define bread(vp, blkno, size, cred, bpp) \
-    breadn_flags(vp, blkno, size, 0, 0, 0, cred, 0, bpp)
+    breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, 0, bpp)
 #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \
     breadn_flags(vp, blkno, size, rablkno, rabsize, cnt, cred, 0, bpp)
 int breadn_flags(struct vnode *, daddr_t, int, daddr_t *, int *, int,
@@ -506,9 +508,10 @@ void    bufdone_finish(struct buf *);
 void    bd_speedup(void);
 
 int cluster_read(struct vnode *, u_quad_t, daddr_t, long,
-        struct ucred *, long, int, struct buf **);
-int cluster_wbuild(struct vnode *, long, daddr_t, int);
-void    cluster_write(struct vnode *, struct buf *, u_quad_t, int);
+        struct ucred *, long, int, int, struct buf **);
+int cluster_wbuild(struct vnode *, long, daddr_t, int, int);
+void    cluster_write(struct vnode *, struct buf *, u_quad_t, int, int);
+void    vfs_bio_bzero_buf(struct buf *bp, int base, int size);
 void    vfs_bio_set_valid(struct buf *, int base, int size);
 void    vfs_bio_clrbuf(struct buf *);
 void    vfs_busy_pages(struct buf *, int clear_modify);
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 029458f..249efa7 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -686,6 +686,8 @@ int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags,
         struct vnode **rvp);
 
 int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio);
+int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
+        struct uio *uio);
 
 #define vn_rangelock_unlock(vp, cookie)                 \
     rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp))
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index 5ad5775..995c1a4 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -254,7 +254,7 @@ ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp)
     struct buf *bp;
     struct ufsmount *ump;
     u_int cg, request, reclaimed;
-    int error;
+    int error, gbflags;
     ufs2_daddr_t bno;
     static struct timeval lastfail;
     static int curfail;
@@ -265,6 +265,8 @@ ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp)
     fs = ip->i_fs;
     bp = NULL;
     ump = ip->i_ump;
+    gbflags = (flags & BA_NOTMAPPED) != 0 ? GB_NOTMAPPED : 0;
+
     mtx_assert(UFS_MTX(ump), MA_OWNED);
 #ifdef INVARIANTS
     if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
@@ -296,7 +298,8 @@ retry:
     /*
     * Allocate the extra space in the buffer.
     */
-    error = bread(vp, lbprev, osize, NOCRED, &bp);
+    error = breadn_flags(vp, lbprev, osize, NULL, NULL, 0, NOCRED,
+        gbflags, &bp);
     if (error) {
         brelse(bp);
         return (error);
@@ -332,7 +335,7 @@ retry:
         ip->i_flag |= IN_CHANGE | IN_UPDATE;
         allocbuf(bp, nsize);
         bp->b_flags |= B_DONE;
-        bzero(bp->b_data + osize, nsize - osize);
+        vfs_bio_bzero_buf(bp, osize, nsize - osize);
         if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
             vfs_bio_set_valid(bp, osize, nsize - osize);
         *bpp = bp;
@@ -400,7 +403,7 @@ retry:
         ip->i_flag |= IN_CHANGE | IN_UPDATE;
         allocbuf(bp, nsize);
         bp->b_flags |= B_DONE;
-        bzero(bp->b_data + osize, nsize - osize);
+        vfs_bio_bzero_buf(bp, osize, nsize - osize);
         if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
             vfs_bio_set_valid(bp, osize, nsize - osize);
         *bpp = bp;
diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c
index 0e29be87f..4019b01 100644
--- a/sys/ufs/ffs/ffs_balloc.c
+++ b/sys/ufs/ffs/ffs_balloc.c
@@ -107,7 +107,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
     int saved_inbdflush;
     static struct timeval lastfail;
     static int curfail;
-    int reclaimed;
+    int gbflags, reclaimed;
 
     ip = VTOI(vp);
     dp = ip->i_din1;
@@ -123,6 +123,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
         return (EOPNOTSUPP);
     if (lbn < 0)
         return (EFBIG);
+    gbflags = (flags & BA_NOTMAPPED) != 0 ? GB_NOTMAPPED : 0;
 
     if (DOINGSOFTDEP(vp))
         softdep_prealloc(vp, MNT_WAIT);
@@ -211,7 +212,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
                 nsize, flags, cred, &newb);
             if (error)
                 return (error);
-            bp = getblk(vp, lbn, nsize, 0, 0, 0);
+            bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
             bp->b_blkno = fsbtodb(fs, newb);
             if (flags & BA_CLRBUF)
                 vfs_bio_clrbuf(bp);
@@ -255,7 +256,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
         nb = newb;
         *allocblk++ = nb;
         *lbns_remfree++ = indirs[1].in_lbn;
-        bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
+        bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
         bp->b_blkno = fsbtodb(fs, nb);
         vfs_bio_clrbuf(bp);
         if (DOINGSOFTDEP(vp)) {
@@ -389,7 +390,7 @@ retry:
         nb = newb;
         *allocblk++ = nb;
         *lbns_remfree++ = lbn;
-        nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
+        nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
         nbp->b_blkno = fsbtodb(fs, nb);
         if (flags & BA_CLRBUF)
             vfs_bio_clrbuf(nbp);
@@ -418,16 +419,17 @@ retry:
         if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
             error = cluster_read(vp, ip->i_size, lbn,
                 (int)fs->fs_bsize, NOCRED,
-                MAXBSIZE, seqcount, &nbp);
+                MAXBSIZE, seqcount, gbflags, &nbp);
         } else {
-            error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
+            error = breadn_flags(vp, lbn, (int)fs->fs_bsize, NULL,
+                NULL, 0, NOCRED, gbflags, &nbp);
         }
         if (error) {
             brelse(nbp);
             goto fail;
         }
     } else {
-        nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
+        nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
         nbp->b_blkno = fsbtodb(fs, nb);
     }
     curthread_pflags_restore(saved_inbdflush);
@@ -539,7 +541,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
     int saved_inbdflush;
     static struct timeval lastfail;
     static int curfail;
-    int reclaimed;
+    int gbflags, reclaimed;
 
     ip = VTOI(vp);
     dp = ip->i_din2;
@@ -553,6 +555,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
     *bpp = NULL;
     if (lbn < 0)
         return (EFBIG);
+    gbflags = (flags & BA_NOTMAPPED) != 0 ? GB_NOTMAPPED : 0;
 
     if (DOINGSOFTDEP(vp))
         softdep_prealloc(vp, MNT_WAIT);
@@ -620,7 +623,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
             osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
             nsize = fragroundup(fs, size);
             if (nsize <= osize) {
-                error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
+                error = breadn_flags(vp, -1 - lbn, osize, NULL,
+                    NULL, 0, NOCRED, gbflags, &bp);
                 if (error) {
                     brelse(bp);
                     return (error);
@@ -653,7 +657,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
                     nsize, flags, cred, &newb);
                 if (error)
                     return (error);
-                bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0);
+                bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
                 bp->b_blkno = fsbtodb(fs, newb);
                 bp->b_xflags |= BX_ALTDATA;
                 if (flags & BA_CLRBUF)
@@ -707,7 +711,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
             panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
         nb = dp->di_db[lbn];
         if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
-            error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
+            error = breadn_flags(vp, lbn, fs->fs_bsize, NULL, NULL,
+                0, NOCRED, gbflags, &bp);
             if (error) {
                 brelse(bp);
                 return (error);
@@ -723,7 +728,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
             osize = fragroundup(fs, blkoff(fs, ip->i_size));
             nsize = fragroundup(fs, size);
             if (nsize <= osize) {
-                error = bread(vp, lbn, osize, NOCRED, &bp);
+                error = breadn_flags(vp, lbn, osize, NULL,
+                    NULL, 0, NOCRED, gbflags, &bp);
                 if (error) {
                     brelse(bp);
                     return (error);
@@ -753,7 +759,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
                     &dp->di_db[0]), nsize, flags, cred, &newb);
                 if (error)
                     return (error);
-                bp = getblk(vp, lbn, nsize, 0, 0, 0);
+                bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
                 bp->b_blkno = fsbtodb(fs, newb);
                 if (flags & BA_CLRBUF)
                     vfs_bio_clrbuf(bp);
@@ -931,7 +937,7 @@ retry:
         nb = newb;
         *allocblk++ = nb;
         *lbns_remfree++ = lbn;
-        nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
+        nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
         nbp->b_blkno = fsbtodb(fs, nb);
         if (flags & BA_CLRBUF)
             vfs_bio_clrbuf(nbp);
@@ -966,16 +972,17 @@ retry:
         if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
             error = cluster_read(vp, ip->i_size, lbn,
                 (int)fs->fs_bsize, NOCRED,
-                MAXBSIZE, seqcount, &nbp);
+                MAXBSIZE, seqcount, gbflags, &nbp);
         } else {
-            error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
+            error = breadn_flags(vp, lbn, (int)fs->fs_bsize,
+                NULL, NULL, 0, NOCRED, gbflags, &nbp);
         }
         if (error) {
             brelse(nbp);
             goto fail;
         }
     } else {
-        nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
+        nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
         nbp->b_blkno = fsbtodb(fs, nb);
     }
     curthread_pflags_restore(saved_inbdflush);
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 83ae202..87bd6f4 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -2095,6 +2095,7 @@ ffs_bufwrite(struct buf *bp)
      * set b_lblkno and BKGRDMARKER before calling bgetvp()
      * to avoid confusing the splay tree and gbincore().
      */
+    KASSERT((bp->b_flags & B_NOTMAPPED) == 0, ("Unmapped cg"));
     memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
     newbp->b_lblkno = bp->b_lblkno;
     newbp->b_xflags |= BX_BKGRDMARKER;
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index 5c99d5b..3cd3110 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -508,7 +508,8 @@ ffs_read(ap)
             /*
              * Don't do readahead if this is the end of the file.
              */
-            error = bread(vp, lbn, size, NOCRED, &bp);
+            error = breadn_flags(vp, lbn, size, NULL, NULL, 0,
+                NOCRED, GB_NOTMAPPED, &bp);
         } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
             /*
              * Otherwise if we are allowed to cluster,
@@ -518,7 +519,8 @@ ffs_read(ap)
              * doing sequential access.
              */
             error = cluster_read(vp, ip->i_size, lbn,
-                size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
+                size, NOCRED, blkoffset + uio->uio_resid,
+                seqcount, GB_NOTMAPPED, &bp);
         } else if (seqcount > 1) {
             /*
              * If we are NOT allowed to cluster, then
@@ -529,15 +531,16 @@ ffs_read(ap)
              * the 6th argument.
              */
             int nextsize = blksize(fs, ip, nextlbn);
-            error = breadn(vp, lbn,
-                size, &nextlbn, &nextsize, 1, NOCRED, &bp);
+            error = breadn_flags(vp, lbn, size, &nextlbn,
+                &nextsize, 1, NOCRED, GB_NOTMAPPED, &bp);
         } else {
             /*
              * Failing all of the above, just read what the
              * user asked for. Interestingly, the same as
              * the first option above.
              */
-            error = bread(vp, lbn, size, NOCRED, &bp);
+            error = breadn_flags(vp, lbn, size, NULL, NULL, 0,
+                NOCRED, GB_NOTMAPPED, &bp);
         }
         if (error) {
             brelse(bp);
@@ -568,8 +571,13 @@ ffs_read(ap)
             xfersize = size;
         }
 
-        error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
-            (int)xfersize, uio);
+        if ((bp->b_flags & B_NOTMAPPED) == 0) {
+            error = vn_io_fault_uiomove((char *)bp->b_data +
+                blkoffset, (int)xfersize, uio);
+        } else {
+            error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
+                (int)xfersize, uio);
+        }
         if (error)
             break;
 
@@ -700,6 +708,7 @@ ffs_write(ap)
     flags = seqcount << BA_SEQSHIFT;
     if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
         flags |= IO_SYNC;
+    flags |= BA_NOTMAPPED;
 
     for (error = 0; uio->uio_resid > 0;) {
         lbn = lblkno(fs, uio->uio_offset);
@@ -739,8 +748,13 @@ ffs_write(ap)
         if (size < xfersize)
             xfersize = size;
 
-        error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
-            (int)xfersize, uio);
+        if ((bp->b_flags & B_NOTMAPPED) == 0) {
+            error = vn_io_fault_uiomove((char *)bp->b_data +
+                blkoffset, (int)xfersize, uio);
+        } else {
+            error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
+                (int)xfersize, uio);
+        }
         /*
          * If the buffer is not already filled and we encounter an
          * error while trying to fill it, we have to clear out any
@@ -783,7 +797,8 @@ ffs_write(ap)
         } else if (xfersize + blkoffset == fs->fs_bsize) {
             if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
                 bp->b_flags |= B_CLUSTEROK;
-                cluster_write(vp, bp, ip->i_size, seqcount);
+                cluster_write(vp, bp, ip->i_size, seqcount,
+                    GB_NOTMAPPED);
             } else {
                 bawrite(bp);
             }
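FFS is the first consumer of the machinery: reads request GB_NOTMAPPED buffers directly, writes set BA_NOTMAPPED so ffs_balloc() allocates unmapped buffers, and ffs_bufwrite() asserts that cylinder-group buffers, which are still accessed through b_data, stay mapped. Another filesystem could opt in with the same two-step pattern; a sketch using only identifiers introduced by this patch:

    /* 1. Ask for an unmapped buffer. */
    error = breadn_flags(vp, lbn, size, NULL, NULL, 0, NOCRED,
        GB_NOTMAPPED, &bp);
    /* 2. Copy without assuming b_data is usable. */
    if (error == 0)
        error = (bp->b_flags & B_NOTMAPPED) == 0 ?
            vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
            (int)xfersize, uio) :
            vn_io_fault_pgmove(bp->b_pages, blkoffset,
            (int)xfersize, uio);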
diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h
index c590748..82973a2 100644
--- a/sys/ufs/ufs/ufs_extern.h
+++ b/sys/ufs/ufs/ufs_extern.h
@@ -121,6 +121,7 @@ void    softdep_revert_rmdir(struct inode *, struct inode *);
  */
 #define BA_CLRBUF   0x00010000  /* Clear invalid areas of buffer. */
 #define BA_METAONLY 0x00020000  /* Return indirect block buffer. */
+#define BA_NOTMAPPED    0x00040000  /* Do not map the resulting buffer. */
 #define BA_SEQMASK  0x7F000000  /* Bits holding seq heuristic. */
 #define BA_SEQSHIFT 24
 #define BA_SEQMAX   0x7F
diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h
index d06c22b..c64a549 100644
--- a/sys/vm/pmap.h
+++ b/sys/vm/pmap.h
@@ -108,6 +108,8 @@ void         pmap_clear_modify(vm_page_t m);
 void         pmap_clear_reference(vm_page_t m);
 void         pmap_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
 void         pmap_copy_page(vm_page_t, vm_page_t);
+void         pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset,
+                vm_page_t mb[], vm_offset_t b_offset, int xfersize);
 void         pmap_enter(pmap_t, vm_offset_t, vm_prot_t, vm_page_t,
                 vm_prot_t, boolean_t);
 void         pmap_enter_object(pmap_t pmap, vm_offset_t start,