diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 8e06ff9..e432d84 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -4216,6 +4216,30 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst) pagecopy((void *)src, (void *)dst); } +void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + void *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + a_cp = (char *)PHYS_TO_DMAP(ma[a_offset >> PAGE_SHIFT]-> + phys_addr) + a_pg_offset; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + b_cp = (char *)PHYS_TO_DMAP(mb[b_offset >> PAGE_SHIFT]-> + phys_addr) + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } +} + /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c index 4252197..eef0a43 100644 --- a/sys/cam/ata/ata_da.c +++ b/sys/cam/ata/ata_da.c @@ -1167,6 +1167,7 @@ adaregister(struct cam_periph *periph, void *arg) ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT))) softc->disk->d_flags |= DISKFLAG_CANDELETE; + softc->disk->d_flags |= DISKFLAG_NOTMAPPED_BIO; strlcpy(softc->disk->d_descr, cgd->ident_data.model, MIN(sizeof(softc->disk->d_descr), sizeof(cgd->ident_data.model))); strlcpy(softc->disk->d_ident, cgd->ident_data.serial, @@ -1431,14 +1432,16 @@ adastart(struct cam_periph *periph, union ccb *start_ccb) return; } #endif - cam_fill_ataio(ataio, + cam_fill_ataio_U(ataio, ada_retry_count, adadone, bp->bio_cmd == BIO_READ ? CAM_DIR_IN : CAM_DIR_OUT, tag_code, bp->bio_data, + (bp->bio_flags & BIO_NOTMAPPED) ? 
bp->bio_ma : NULL, bp->bio_bcount, + bp->bio_ma_offset, ada_default_timeout*1000); if ((softc->flags & ADA_FLAG_CAN_NCQ) && tag_code) { diff --git a/sys/cam/cam_ccb.h b/sys/cam/cam_ccb.h index 1f12d91..6b9ca9f 100644 --- a/sys/cam/cam_ccb.h +++ b/sys/cam/cam_ccb.h @@ -42,6 +42,7 @@ #include #include +struct vm_page; /* General allocation length definitions for CCB structures */ #define IOCDBLEN CAM_MAX_CDBLEN /* Space for CDB bytes/pointer */ @@ -713,7 +714,9 @@ struct ccb_ataio { struct ata_cmd cmd; /* ATA command register set */ struct ata_res res; /* ATA result register set */ u_int8_t *data_ptr; /* Ptr to the data buf/SG list */ + struct vm_page **ma; u_int32_t dxfer_len; /* Data transfer length */ + int ma_offset; u_int32_t resid; /* Transfer residual length: 2's comp */ u_int8_t tag_action; /* What to do for tag queueing */ /* @@ -1201,6 +1204,13 @@ cam_fill_ataio(struct ccb_ataio *ataio, u_int32_t retries, u_int32_t timeout); static __inline void +cam_fill_ataio_U(struct ccb_ataio *ataio, u_int32_t retries, + void (*cbfcnp)(struct cam_periph *, union ccb *), + u_int32_t flags, u_int tag_action, + u_int8_t *data_ptr, struct vm_page **ma, u_int32_t dxfer_len, + int ma_offset, u_int32_t timeout); + +static __inline void cam_fill_smpio(struct ccb_smpio *smpio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint32_t flags, uint8_t *smp_request, int smp_request_len, @@ -1248,6 +1258,25 @@ cam_fill_ctio(struct ccb_scsiio *csio, u_int32_t retries, } static __inline void +cam_fill_ataio_U(struct ccb_ataio *ataio, u_int32_t retries, + void (*cbfcnp)(struct cam_periph *, union ccb *), + u_int32_t flags, u_int tag_action, + u_int8_t *data_ptr, struct vm_page **ma, u_int32_t dxfer_len, + int ma_offset, u_int32_t timeout) +{ + ataio->ccb_h.func_code = XPT_ATA_IO; + ataio->ccb_h.flags = flags; + ataio->ccb_h.retry_count = retries; + ataio->ccb_h.cbfcnp = cbfcnp; + ataio->ccb_h.timeout = timeout; + ataio->data_ptr = data_ptr; + ataio->ma = ma; + ataio->ma_offset = ma_offset; + ataio->dxfer_len = dxfer_len; + ataio->tag_action = tag_action; +} + +static __inline void cam_fill_ataio(struct ccb_ataio *ataio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int32_t flags, u_int tag_action, @@ -1260,6 +1289,7 @@ cam_fill_ataio(struct ccb_ataio *ataio, u_int32_t retries, ataio->ccb_h.cbfcnp = cbfcnp; ataio->ccb_h.timeout = timeout; ataio->data_ptr = data_ptr; + ataio->ma = NULL; ataio->dxfer_len = dxfer_len; ataio->tag_action = tag_action; } diff --git a/sys/dev/ahci/ahci.c b/sys/dev/ahci/ahci.c index 43ec254..787f0f4 100644 --- a/sys/dev/ahci/ahci.c +++ b/sys/dev/ahci/ahci.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -51,6 +52,14 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include + +static int ahci_unmapped; +SYSCTL_INT(_debug, OID_AUTO, ahci_unmapped, CTLFLAG_RD, + &ahci_unmapped, 0, + ""); + /* local prototypes */ static int ahci_setup_interrupt(device_t dev); static void ahci_intr(void *data); @@ -95,6 +104,10 @@ static void ahci_process_request_sense(device_t dev, union ccb *ccb); static void ahciaction(struct cam_sim *sim, union ccb *ccb); static void ahcipoll(struct cam_sim *sim); +struct vm_page; +static void ahci_unmappedprd(struct ahci_slot *slot, struct vm_page **ma, + int ma_offset, bus_size_t size); + static MALLOC_DEFINE(M_AHCI, "AHCI driver", "AHCI driver data buffers"); static struct { @@ -1675,12 +1688,19 @@ ahci_begin_transaction(device_t dev, union ccb *ccb) /* If 
request moves data, setup and load SG list */ if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) { void *buf; + struct vm_page **ma; bus_size_t size; slot->state = AHCI_SLOT_LOADING; if (ccb->ccb_h.func_code == XPT_ATA_IO) { - buf = ccb->ataio.data_ptr; + ma = ccb->ataio.ma; size = ccb->ataio.dxfer_len; + if (ma != NULL) { + ahci_unmappedprd(slot, ma, ccb->ataio.ma_offset, + size); + return; + } + buf = ccb->ataio.data_ptr; } else { buf = ccb->csio.data_ptr; size = ccb->csio.dxfer_len; @@ -1691,6 +1711,37 @@ ahci_begin_transaction(device_t dev, union ccb *ccb) ahci_execute_transaction(slot); } +static void +ahci_unmappedprd(struct ahci_slot *slot, struct vm_page **ma, int ma_offset, + bus_size_t size) +{ + struct ahci_channel *ch = device_get_softc(slot->dev); + struct ahci_cmd_tab *ctp; + struct ahci_dma_prd *prd; + bus_size_t left, c; + int i, npages; + + npages = (ma_offset + size + PAGE_SIZE - 1) / PAGE_SIZE; + KASSERT(npages <= AHCI_SG_ENTRIES, ("too many DMA segment entries")); + ctp = (struct ahci_cmd_tab *) + (ch->dma.work + AHCI_CT_OFFSET + (AHCI_CT_SIZE * slot->slot)); + prd = &ctp->prd_tab[0]; + for (i = 0, left = size; i < npages; i++, left -= PAGE_SIZE) { + prd[i].dba = htole64(VM_PAGE_TO_PHYS(ma[i]) + ma_offset); + c = min(PAGE_SIZE - ma_offset, left); + ma_offset = 0; + prd[i].dbc = htole32((c - 1) & AHCI_PRD_MASK); + } + slot->dma.nsegs = npages; +#if 0 + bus_dmamap_sync(ch->dma.data_tag, slot->dma.data_map, + ((slot->ccb->ccb_h.flags & CAM_DIR_IN) ? + BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE)); +#endif + atomic_add_int(&ahci_unmapped, 1); + ahci_execute_transaction(slot); +} + /* Locked by busdma engine. */ static void ahci_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error) diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c index b72f294..9918564 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -110,6 +110,19 @@ static int md_malloc_wait; SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0, "Allow malloc to wait for memory allocations"); +static int md_unmapped_swap; +SYSCTL_INT(_debug, OID_AUTO, md_unmapped_swap, CTLFLAG_RD, + &md_unmapped_swap, 0, + ""); +static int md_unmapped_vnode; +SYSCTL_INT(_debug, OID_AUTO, md_unmapped_vnode, CTLFLAG_RD, + &md_unmapped_vnode, 0, + ""); +static int md_unmapped_malloc; +SYSCTL_INT(_debug, OID_AUTO, md_unmapped_malloc, CTLFLAG_RD, + &md_unmapped_malloc, 0, + ""); + #if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE) #define MD_ROOT_FSTYPE "ufs" #endif @@ -414,13 +427,103 @@ g_md_start(struct bio *bp) wakeup(sc); } +#define MD_MALLOC_MOVE_ZERO 1 +#define MD_MALLOC_MOVE_FILL 2 +#define MD_MALLOC_MOVE_READ 3 +#define MD_MALLOC_MOVE_WRITE 4 +#define MD_MALLOC_MOVE_CMP 5 + +static int +md_malloc_move(vm_page_t **mp, vm_offset_t *ma_offs, unsigned sectorsize, + void *ptr, u_char fill, int op) +{ + struct sf_buf *sf; + vm_page_t m, *mp1; + char *p, first; + vm_offset_t ma_offs1; + off_t *uc; + unsigned n; + int error, i, sz, first_read; + + m = NULL; + error = 0; + sf = NULL; + /* if (op == MD_MALLOC_MOVE_CMP) { gcc */ + first = 0; + first_read = 0; + uc = ptr; + mp1 = *mp; + ma_offs1 = *ma_offs; + /* } */ + sched_pin(); + for (n = sectorsize; n != 0; n -= sz) { + sz = imin(PAGE_SIZE - *ma_offs, n); + if (m != **mp) { + if (sf != NULL) + sf_buf_free(sf); + m = **mp; + sf = sf_buf_alloc(m, SFB_CPUPRIVATE | + (md_malloc_wait ? 
0 : SFB_NOWAIT)); + if (sf == NULL) { + error = ENOMEM; + break; + } + } + p = (char *)sf_buf_kva(sf) + *ma_offs; + switch (op) { + case MD_MALLOC_MOVE_ZERO: + bzero(p, sz); + break; + case MD_MALLOC_MOVE_FILL: + memset(p, fill, sz); + break; + case MD_MALLOC_MOVE_READ: + bcopy(ptr, p, sz); + cpu_flush_dcache(p, sz); + break; + case MD_MALLOC_MOVE_WRITE: + bcopy(p, ptr, sz); + break; + case MD_MALLOC_MOVE_CMP: + for (i = 0; i < sz; i++, p++) { + if (!first_read) { + *uc = *p; + first = *p; + first_read = 1; + } else if (*p != first) { + error = EDOOFUS; + break; + } + } + break; + } + if (error != 0) + break; + *ma_offs += sz; + *ma_offs %= PAGE_SIZE; + if (*ma_offs == 0) + (*mp)++; + } + + if (sf != NULL) + sf_buf_free(sf); + sched_unpin(); + if (op == MD_MALLOC_MOVE_CMP) { + *mp = mp1; + *ma_offs = ma_offs1; + } + return (error); +} + static int mdstart_malloc(struct md_s *sc, struct bio *bp) { - int i, error; u_char *dst; + vm_page_t *m; + int i, error, error1, notmapped; off_t secno, nsec, uc; uintptr_t sp, osp; + vm_offset_t ma_offs; switch (bp->bio_cmd) { case BIO_READ: @@ -431,9 +534,17 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) return (EOPNOTSUPP); } + notmapped = (bp->bio_flags & BIO_NOTMAPPED) != 0; + if (notmapped) { + m = bp->bio_ma; + ma_offs = bp->bio_ma_offset; + dst = NULL; + } else { + dst = bp->bio_data; + } + nsec = bp->bio_length / sc->sectorsize; secno = bp->bio_offset / sc->sectorsize; - dst = bp->bio_data; error = 0; while (nsec--) { osp = s_read(sc->indir, secno); @@ -441,21 +552,45 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) if (osp != 0) error = s_write(sc->indir, secno, 0); } else if (bp->bio_cmd == BIO_READ) { - if (osp == 0) - bzero(dst, sc->sectorsize); - else if (osp <= 255) - memset(dst, osp, sc->sectorsize); - else { - bcopy((void *)osp, dst, sc->sectorsize); - cpu_flush_dcache(dst, sc->sectorsize); + if (osp == 0) { + if (notmapped) { + error = md_malloc_move(&m, &ma_offs, + sc->sectorsize, NULL, 0, + MD_MALLOC_MOVE_ZERO); + } else + bzero(dst, sc->sectorsize); + } else if (osp <= 255) { + if (notmapped) { + error = md_malloc_move(&m, &ma_offs, + sc->sectorsize, NULL, osp, + MD_MALLOC_MOVE_FILL); + } else + memset(dst, osp, sc->sectorsize); + } else { + if (notmapped) { + error = md_malloc_move(&m, &ma_offs, + sc->sectorsize, (void *)osp, 0, + MD_MALLOC_MOVE_READ); + } else { + bcopy((void *)osp, dst, sc->sectorsize); + cpu_flush_dcache(dst, sc->sectorsize); + } } osp = 0; } else if (bp->bio_cmd == BIO_WRITE) { if (sc->flags & MD_COMPRESS) { - uc = dst[0]; - for (i = 1; i < sc->sectorsize; i++) - if (dst[i] != uc) - break; + if (notmapped) { + error1 = md_malloc_move(&m, &ma_offs, + sc->sectorsize, &uc, 0, + MD_MALLOC_MOVE_CMP); + i = error1 == 0 ? 
sc->sectorsize : 0; + } else { + uc = dst[0]; + for (i = 1; i < sc->sectorsize; i++) { + if (dst[i] != uc) + break; + } + } } else { i = 0; uc = 0; @@ -472,10 +607,26 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) error = ENOSPC; break; } - bcopy(dst, (void *)sp, sc->sectorsize); + if (notmapped) { + error = md_malloc_move(&m, + &ma_offs, sc->sectorsize, + (void *)sp, 0, + MD_MALLOC_MOVE_WRITE); + } else { + bcopy(dst, (void *)sp, + sc->sectorsize); + } error = s_write(sc->indir, secno, sp); } else { - bcopy(dst, (void *)osp, sc->sectorsize); + if (notmapped) { + error = md_malloc_move(&m, + &ma_offs, sc->sectorsize, + (void *)osp, 0, + MD_MALLOC_MOVE_WRITE); + } else { + bcopy(dst, (void *)osp, + sc->sectorsize); + } osp = 0; } } @@ -487,7 +638,8 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) if (error != 0) break; secno++; - dst += sc->sectorsize; + if (!notmapped) + dst += sc->sectorsize; } bp->bio_resid = 0; return (error); @@ -628,11 +780,12 @@ mdstart_vnode(struct md_s *sc, struct bio *bp) static int mdstart_swap(struct md_s *sc, struct bio *bp) { - struct sf_buf *sf; - int rv, offs, len, lastend; - vm_pindex_t i, lastp; vm_page_t m; u_char *p; + struct uio uio; + struct iovec iov[1]; + vm_pindex_t i, lastp; + int rv, ma_offs, offs, len, lastend, j; switch (bp->bio_cmd) { case BIO_READ: @@ -644,6 +797,17 @@ mdstart_swap(struct md_s *sc, struct bio *bp) } p = bp->bio_data; + if ((bp->bio_flags & BIO_NOTMAPPED) == 0) { + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_rw = bp->bio_cmd == BIO_READ ? UIO_READ : UIO_WRITE; + uio.uio_td = curthread; + uio.uio_iov = iov; + uio.uio_iovcnt = 1; + ma_offs = 0; + } else { + atomic_add_int(&md_unmapped_swap, 1); + ma_offs = bp->bio_ma_offset; + } /* * offs is the offset at which to start operating on the @@ -659,21 +823,14 @@ mdstart_swap(struct md_s *sc, struct bio *bp) rv = VM_PAGER_OK; VM_OBJECT_LOCK(sc->object); vm_object_pip_add(sc->object, 1); - for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) { + for (i = bp->bio_offset / PAGE_SIZE, j = 0; i <= lastp; i++, j++) { len = ((i == lastp) ? lastend : PAGE_SIZE) - offs; - - m = vm_page_grab(sc->object, i, - VM_ALLOC_NORMAL|VM_ALLOC_RETRY); - VM_OBJECT_UNLOCK(sc->object); - sched_pin(); - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - VM_OBJECT_LOCK(sc->object); + m = vm_page_grab(sc->object, i, VM_ALLOC_NORMAL | + VM_ALLOC_RETRY); if (bp->bio_cmd == BIO_READ) { if (m->valid != VM_PAGE_BITS_ALL) rv = vm_pager_get_pages(sc->object, &m, 1, 0); if (rv == VM_PAGER_ERROR) { - sf_buf_free(sf); - sched_unpin(); vm_page_wakeup(m); break; } else if (rv == VM_PAGER_FAIL) { @@ -683,40 +840,52 @@ mdstart_swap(struct md_s *sc, struct bio *bp) * valid. Do not set dirty, the page * can be recreated if thrown out. 
*/ - bzero((void *)sf_buf_kva(sf), PAGE_SIZE); + pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; } - bcopy((void *)(sf_buf_kva(sf) + offs), p, len); - cpu_flush_dcache(p, len); + if ((bp->bio_flags & BIO_NOTMAPPED) != 0) { + pmap_copy_pages(&m, offs, &bp->bio_ma[j], + ma_offs, len); + } else { + uio.uio_resid = len; + uio.uio_offset = offs; + iov[0].iov_base = p; + iov[0].iov_len = len; + uiomove_fromphys(&m, offs, len, &uio); + cpu_flush_dcache(p, len); + } } else if (bp->bio_cmd == BIO_WRITE) { if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL) rv = vm_pager_get_pages(sc->object, &m, 1, 0); if (rv == VM_PAGER_ERROR) { - sf_buf_free(sf); - sched_unpin(); vm_page_wakeup(m); break; } - bcopy(p, (void *)(sf_buf_kva(sf) + offs), len); + if ((bp->bio_flags & BIO_NOTMAPPED) != 0) { + pmap_copy_pages(&bp->bio_ma[j], ma_offs, &m, + offs, len); + } else { + uio.uio_resid = len; + uio.uio_offset = offs; + iov[0].iov_base = p; + iov[0].iov_len = len; + uiomove_fromphys(&m, offs, len, &uio); + } m->valid = VM_PAGE_BITS_ALL; } else if (bp->bio_cmd == BIO_DELETE) { if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL) rv = vm_pager_get_pages(sc->object, &m, 1, 0); if (rv == VM_PAGER_ERROR) { - sf_buf_free(sf); - sched_unpin(); vm_page_wakeup(m); break; } if (len != PAGE_SIZE) { - bzero((void *)(sf_buf_kva(sf) + offs), len); + pmap_zero_page_area(m, offs, len); vm_page_clear_dirty(m, offs, len); m->valid = VM_PAGE_BITS_ALL; } else vm_pager_page_unswapped(m); } - sf_buf_free(sf); - sched_unpin(); vm_page_wakeup(m); vm_page_lock(m); if (bp->bio_cmd == BIO_DELETE && len == PAGE_SIZE) @@ -730,6 +899,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp) /* Actions on further pages start at offset 0 */ p += PAGE_SIZE - offs; offs = 0; + ma_offs = 0; } vm_object_pip_subtract(sc->object, 1); VM_OBJECT_UNLOCK(sc->object); @@ -845,6 +1015,14 @@ mdinit(struct md_s *sc) pp = g_new_providerf(gp, "md%d", sc->unit); pp->mediasize = sc->mediasize; pp->sectorsize = sc->sectorsize; + switch (sc->type) { + case MD_SWAP: + case MD_MALLOC: + pp->flags |= G_PF_ACCEPT_UNMAPPED; + break; + default: + break; + } sc->gp = gp; sc->pp = pp; g_error_provider(pp, 0); diff --git a/sys/fs/cd9660/cd9660_vnops.c b/sys/fs/cd9660/cd9660_vnops.c index 21ee0fc..47d4f75 100644 --- a/sys/fs/cd9660/cd9660_vnops.c +++ b/sys/fs/cd9660/cd9660_vnops.c @@ -329,7 +329,7 @@ cd9660_read(ap) if (lblktosize(imp, rablock) < ip->i_size) error = cluster_read(vp, (off_t)ip->i_size, lbn, size, NOCRED, uio->uio_resid, - (ap->a_ioflag >> 16), &bp); + (ap->a_ioflag >> 16), 0, &bp); else error = bread(vp, lbn, size, NOCRED, &bp); } else { diff --git a/sys/fs/ext2fs/ext2_balloc.c b/sys/fs/ext2fs/ext2_balloc.c index 6e60c6e..ba91ac5 100644 --- a/sys/fs/ext2fs/ext2_balloc.c +++ b/sys/fs/ext2fs/ext2_balloc.c @@ -281,7 +281,7 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags) if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, (int)fs->e2fs_bsize, NOCRED, - MAXBSIZE, seqcount, &nbp); + MAXBSIZE, seqcount, 0, &nbp); } else { error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp); } diff --git a/sys/fs/ext2fs/ext2_vnops.c b/sys/fs/ext2fs/ext2_vnops.c index 4f8f6a9..5d3d279 100644 --- a/sys/fs/ext2fs/ext2_vnops.c +++ b/sys/fs/ext2fs/ext2_vnops.c @@ -1748,10 +1748,11 @@ ext2_read(ap) if (lblktosize(fs, nextlbn) >= ip->i_size) error = bread(vp, lbn, size, NOCRED, &bp); - else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) + else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, 
ip->i_size, lbn, size, - NOCRED, blkoffset + uio->uio_resid, seqcount, &bp); - else if (seqcount > 1) { + NOCRED, blkoffset + uio->uio_resid, seqcount, + 0, &bp); + } else if (seqcount > 1) { int nextsize = blksize(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); @@ -1967,7 +1968,7 @@ ext2_write(ap) } else if (xfersize + blkoffset == fs->e2fs_fsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; - cluster_write(vp, bp, ip->i_size, seqcount); + cluster_write(vp, bp, ip->i_size, seqcount, 0); } else { bawrite(bp); } diff --git a/sys/fs/msdosfs/msdosfs_vnops.c b/sys/fs/msdosfs/msdosfs_vnops.c index a90ee8d..e18d0b1 100644 --- a/sys/fs/msdosfs/msdosfs_vnops.c +++ b/sys/fs/msdosfs/msdosfs_vnops.c @@ -600,7 +600,7 @@ msdosfs_read(ap) error = bread(vp, lbn, blsize, NOCRED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, dep->de_FileSize, lbn, blsize, - NOCRED, on + uio->uio_resid, seqcount, &bp); + NOCRED, on + uio->uio_resid, seqcount, 0, &bp); } else if (seqcount > 1) { rasize = blsize; error = breadn(vp, lbn, @@ -820,7 +820,7 @@ msdosfs_write(ap) else if (n + croffset == pmp->pm_bpcluster) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) cluster_write(vp, bp, dep->de_FileSize, - seqcount); + seqcount, 0); else bawrite(bp); } else diff --git a/sys/fs/udf/udf_vnops.c b/sys/fs/udf/udf_vnops.c index b1a3b1d..abe073e 100644 --- a/sys/fs/udf/udf_vnops.c +++ b/sys/fs/udf/udf_vnops.c @@ -478,8 +478,9 @@ udf_read(struct vop_read_args *ap) rablock = lbn + 1; if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { if (lblktosize(udfmp, rablock) < fsize) { - error = cluster_read(vp, fsize, lbn, size, NOCRED, - uio->uio_resid, (ap->a_ioflag >> 16), &bp); + error = cluster_read(vp, fsize, lbn, size, + NOCRED, uio->uio_resid, + (ap->a_ioflag >> 16), 0, &bp); } else { error = bread(vp, lbn, size, NOCRED, &bp); } diff --git a/sys/geom/geom.h b/sys/geom/geom.h index 351b05d..660bf6e 100644 --- a/sys/geom/geom.h +++ b/sys/geom/geom.h @@ -205,6 +205,7 @@ struct g_provider { u_int flags; #define G_PF_WITHER 0x2 #define G_PF_ORPHAN 0x4 +#define G_PF_ACCEPT_UNMAPPED 0x8 /* Two fields for the implementing class to use */ void *private; diff --git a/sys/geom/geom_disk.c b/sys/geom/geom_disk.c index 72e9162..adf9e5e 100644 --- a/sys/geom/geom_disk.c +++ b/sys/geom/geom_disk.c @@ -488,6 +488,8 @@ g_disk_create(void *arg, int flag) pp->sectorsize = dp->d_sectorsize; pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; + if ((dp->d_flags & DISKFLAG_NOTMAPPED_BIO) != 0) + pp->flags |= G_PF_ACCEPT_UNMAPPED; if (bootverbose) printf("GEOM: new disk %s\n", gp->name); sysctl_ctx_init(&sc->sysctl_ctx); diff --git a/sys/geom/geom_disk.h b/sys/geom/geom_disk.h index 33d8eb2..cb81883 100644 --- a/sys/geom/geom_disk.h +++ b/sys/geom/geom_disk.h @@ -103,6 +103,7 @@ struct disk { #define DISKFLAG_OPEN 0x2 #define DISKFLAG_CANDELETE 0x4 #define DISKFLAG_CANFLUSHCACHE 0x8 +#define DISKFLAG_NOTMAPPED_BIO 0x10 struct disk *disk_alloc(void); void disk_create(struct disk *disk, int version); diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c index 0294887..eed4f9b 100644 --- a/sys/geom/geom_io.c +++ b/sys/geom/geom_io.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -51,6 +52,13 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include +#include +#include +#include +#include +#include static struct g_bioq g_bio_run_down; static struct g_bioq 
g_bio_run_up; @@ -182,10 +190,12 @@ g_clone_bio(struct bio *bp) * ordering restrictions, so this flag needs to be cloned. * Other bio flags are not suitable for cloning. */ - bp2->bio_flags = bp->bio_flags & BIO_ORDERED; + bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_NOTMAPPED); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; + bp2->bio_ma = bp->bio_ma; + bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; /* Inherit classification info from the parent */ bp2->bio_classifier1 = bp->bio_classifier1; @@ -210,11 +220,14 @@ g_duplicate_bio(struct bio *bp) struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); + bp2->bio_flags = bp->bio_flags & BIO_NOTMAPPED; bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; + bp2->bio_ma = bp->bio_ma; + bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR @@ -572,12 +585,33 @@ g_io_deliver(struct bio *bp, int error) return; } +SYSCTL_DECL(_kern_geom); + +static long transient_maps; +SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, + &transient_maps, 0, + ""); +int transient_map_retries; +SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RD, + &transient_map_retries, 0, + ""); +int transient_map_failures; +SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_failures, CTLFLAG_RD, + &transient_map_failures, 0, + ""); +int inflight_transient_maps; +SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, + &inflight_transient_maps, 0, + ""); + void g_io_schedule_down(struct thread *tp __unused) { struct bio *bp; off_t excess; - int error; + vm_offset_t addr; + long size; + int error, retried, rv; for(;;) { g_bioq_lock(&g_bio_run_down); @@ -633,6 +667,56 @@ g_io_schedule_down(struct thread *tp __unused) default: break; } + if ((bp->bio_flags & BIO_NOTMAPPED) != 0 && + (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && + (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { + size = round_page(bp->bio_ma_offset + bp->bio_length); + addr = 0; + retried = 0; + atomic_add_long(&transient_maps, 1); +retry: + vm_map_lock(bio_transient_map); + if (vm_map_findspace(bio_transient_map, + vm_map_min(bio_transient_map), size, &addr)) { + vm_map_unlock(bio_transient_map); + if (retried >= 3) { + g_io_deliver(bp, EDEADLK/* XXXKIB */); + CTR2(KTR_GEOM, "g_down cannot map " + "bp %p provider %s", bp, + bp->bio_to->name); + atomic_add_int(&transient_map_failures, + 1); + continue; + } else { + /* + * Naive attempt to quisce the + * I/O to get more in-flight + * requests completed and + * defragment the bio_transient_map. 
+ */ + CTR3(KTR_GEOM, "g_down retry map " + "bp %p provider %s r %d", bp, + bp->bio_to->name, retried); + pause("g_d_tra", hz / 10); + retried++; + atomic_add_int(&transient_map_retries, + 1); + goto retry; + } + } + rv = vm_map_insert(bio_transient_map, NULL, 0, addr, + addr + size, VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); + KASSERT(rv == KERN_SUCCESS, + ("vm_map_insert(bio_transient_map) rv %d %jx %lx", + rv, (uintmax_t)addr, size)); + vm_map_unlock(bio_transient_map); + atomic_add_int(&inflight_transient_maps, 1); + pmap_qenter((vm_offset_t)addr, bp->bio_ma, + OFF_TO_IDX(size)); + bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; + bp->bio_flags |= BIO_TRANSIENT_MAPPING; + bp->bio_flags &= ~BIO_NOTMAPPED; + } THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, diff --git a/sys/geom/geom_vfs.c b/sys/geom/geom_vfs.c index afc6549..c8fa189 100644 --- a/sys/geom/geom_vfs.c +++ b/sys/geom/geom_vfs.c @@ -188,7 +188,15 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp) bip = g_alloc_bio(); bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; - bip->bio_data = bp->b_data; + if ((bp->b_flags & B_NOTMAPPED) != 0) { + bip->bio_ma = bp->b_pages; + bip->bio_data = unmapped_buf; + bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; + bip->bio_flags |= BIO_NOTMAPPED; + } else { + bip->bio_data = bp->b_data; + bip->bio_ma = NULL; + } bip->bio_done = g_vfs_done; bip->bio_caller2 = bp; bip->bio_length = bp->b_bcount; diff --git a/sys/geom/part/g_part.c b/sys/geom/part/g_part.c index e2ba79e..7650499 100644 --- a/sys/geom/part/g_part.c +++ b/sys/geom/part/g_part.c @@ -427,6 +427,7 @@ g_part_new_provider(struct g_geom *gp, struct g_part_table *table, entry->gpe_pp->stripeoffset = pp->stripeoffset + entry->gpe_offset; if (pp->stripesize > 0) entry->gpe_pp->stripeoffset %= pp->stripesize; + entry->gpe_pp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED; g_error_provider(entry->gpe_pp, 0); } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index f14f215..a7f6520 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -4224,6 +4224,49 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) mtx_unlock(&sysmaps->lock); } +void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + struct sysmaps *sysmaps; + vm_page_t a_pg, b_pg; + char *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP1 != 0) + panic("pmap_copy_pages: CMAP1 busy"); + if (*sysmaps->CMAP2 != 0) + panic("pmap_copy_pages: CMAP2 busy"); + sched_pin(); + while (xfersize > 0) { + invlpg((u_int)sysmaps->CADDR1); + invlpg((u_int)sysmaps->CADDR2); + a_pg = ma[a_offset >> PAGE_SHIFT]; + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + b_pg = mb[b_offset >> PAGE_SHIFT]; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | + pmap_cache_bits(b_pg->md.pat_mode, 0); + *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | + PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); + a_cp = sysmaps->CADDR1 + a_pg_offset; + b_cp = sysmaps->CADDR2 + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } + *sysmaps->CMAP1 = 0; + *sysmaps->CMAP2 = 0; + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + /* * Returns true if the pmap's pv is one 
of the first * 16 pvs linked to from this page. This count may diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 32a1089..929e730 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -91,6 +91,7 @@ struct buf_ops buf_ops_bio = { * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. */ struct buf *buf; /* buffer header pool */ +caddr_t unmapped_buf; static struct proc *bufdaemonproc; @@ -131,6 +132,10 @@ SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Virtual memory used for buffers"); #endif +static long unmapped_bufspace; +SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD, + &unmapped_bufspace, 0, + "Amount of unmapped buffers, inclusive in the bufspace"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, "Maximum allowed value of bufspace (including buf_daemon)"); @@ -200,6 +205,10 @@ SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer aquisition"); +static int mappingrestarts; +SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, + "Number of times getblk has had to restart a buffer mapping for " + "unmapped buffer"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); @@ -671,6 +680,55 @@ bufinit(void) bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); + unmapped_buf = (caddr_t)kmem_alloc_nofault(kernel_map, MAXPHYS); +} + +#ifdef INVARIANTS +static inline void +vfs_buf_check_mapped(struct buf *bp) +{ + + KASSERT((bp->b_flags & B_NOTMAPPED) == 0, + ("mapped buf %p %x", bp, bp->b_flags)); + KASSERT(bp->b_kvabase != unmapped_buf, + ("mapped buf: b_kvabase was not updated %p", bp)); + KASSERT(bp->b_data != unmapped_buf, + ("mapped buf: b_data was not updated %p", bp)); +} + +static inline void +vfs_buf_check_unmapped(struct buf *bp) +{ + + KASSERT((bp->b_flags & B_NOTMAPPED) == B_NOTMAPPED, + ("unmapped buf %p %x", bp, bp->b_flags)); + KASSERT(bp->b_kvabase == unmapped_buf, + ("unmapped buf: corrupted b_kvabase %p", bp)); + KASSERT(bp->b_data == unmapped_buf, + ("unmapped buf: corrupted b_data %p", bp)); +} + +#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) +#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) +#else +#define BUF_CHECK_MAPPED(bp) do {} while (0) +#define BUF_CHECK_UNMAPPED(bp) do {} while (0) +#endif + +static void +bpmap_qenter(struct buf *bp) +{ + + BUF_CHECK_MAPPED(bp); + + /* + * bp->b_data is relative to bp->b_offset, but + * bp->b_offset may be offset into the first page. 
+ */ + bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); + pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); + bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | + (vm_offset_t)(bp->b_offset & PAGE_MASK)); } /* @@ -682,14 +740,26 @@ static void bfreekva(struct buf *bp) { - if (bp->b_kvasize) { - atomic_add_int(&buffreekvacnt, 1); - atomic_subtract_long(&bufspace, bp->b_kvasize); - vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase, - (vm_offset_t) bp->b_kvabase + bp->b_kvasize); - bp->b_kvasize = 0; - bufspacewakeup(); + if (bp->b_kvasize == 0) + return; + + atomic_add_int(&buffreekvacnt, 1); + atomic_subtract_long(&bufspace, bp->b_kvasize); + if ((bp->b_flags & B_NOTMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvabase, + (vm_offset_t)bp->b_kvabase + bp->b_kvasize); + } else { + BUF_CHECK_UNMAPPED(bp); + if ((bp->b_flags & B_KVAALLOC) != 0) { + vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvaalloc, + (vm_offset_t)bp->b_kvaalloc + bp->b_kvasize); + } + atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize); + bp->b_flags &= ~(B_NOTMAPPED | B_KVAALLOC); } + bp->b_kvasize = 0; + bufspacewakeup(); } /* @@ -826,9 +896,8 @@ breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, * getblk(). Also starts asynchronous I/O on read-ahead blocks. */ int -breadn_flags(struct vnode * vp, daddr_t blkno, int size, - daddr_t * rablkno, int *rabsize, int cnt, - struct ucred * cred, int flags, struct buf **bpp) +breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, + int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp) { struct buf *bp; int rv = 0, readwait = 0; @@ -1363,7 +1432,8 @@ brelse(struct buf *bp) } } - if ((bp->b_flags & B_INVAL) == 0) { + if ((bp->b_flags & (B_INVAL | B_NOTMAPPED)) == 0) { + BUF_CHECK_MAPPED(bp); pmap_qenter( trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); @@ -1612,7 +1682,11 @@ vfs_vmio_release(struct buf *bp) int i; vm_page_t m; - pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); + if ((bp->b_flags & B_NOTMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); + } else + BUF_CHECK_UNMAPPED(bp); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; @@ -1716,8 +1790,10 @@ vfs_bio_awrite(struct buf *bp) int nwritten; int size; int maxcl; + int gbflags; bo = &vp->v_bufobj; + gbflags = (bp->b_flags & B_NOTMAPPED) != 0 ? GB_NOTMAPPED : 0; /* * right now we support clustered writing only to regular files. 
If * we find a clusterable block we could be in the middle of a cluster @@ -1748,8 +1824,9 @@ vfs_bio_awrite(struct buf *bp) */ if (ncl != 1) { BUF_UNLOCK(bp); - nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); - return nwritten; + nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, + gbflags); + return (nwritten); } } bremfree(bp); @@ -1796,6 +1873,9 @@ getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, int nqindex; static int flushingbufs; + KASSERT((gbflags & (GB_NOTMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, + ("GB_KVAALLOC only makes sense with GB_NOTMAPPED")); + td = curthread; /* * We can't afford to block since we might be holding a vnode lock, @@ -1969,7 +2049,7 @@ restart: if (bp->b_bufsize) allocbuf(bp, 0); - bp->b_flags = 0; + bp->b_flags &= B_NOTMAPPED; bp->b_ioflags = 0; bp->b_xflags = 0; KASSERT((bp->b_vflags & BV_INFREECNT) == 0, @@ -2096,6 +2176,14 @@ restart: } } mtx_unlock(&nblock); + } else if ((gbflags & (GB_NOTMAPPED | GB_KVAALLOC)) == GB_NOTMAPPED) { + bfreekva(bp); + bp->b_flags |= B_NOTMAPPED; + bp->b_kvabase = bp->b_data = unmapped_buf; + bp->b_kvasize = maxsize; + atomic_add_long(&bufspace, bp->b_kvasize); + atomic_add_long(&unmapped_bufspace, bp->b_kvasize); + atomic_add_int(&bufreusecnt, 1); } else { /* * We finally have a valid bp. We aren't quite out of the @@ -2105,7 +2193,8 @@ restart: */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; - if (maxsize != bp->b_kvasize) { + if (maxsize != bp->b_kvasize || + (bp->b_flags & (B_NOTMAPPED | B_KVAALLOC)) == B_NOTMAPPED) { vm_offset_t addr = 0; int rv; @@ -2126,18 +2215,37 @@ restart: goto restart; } rv = vm_map_insert(buffer_map, NULL, 0, addr, - addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, + addr + maxsize, VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); KASSERT(rv == KERN_SUCCESS, ("vm_map_insert(buffer_map) rv %d", rv)); vm_map_unlock(buffer_map); - bp->b_kvabase = (caddr_t)addr; + if ((gbflags & GB_NOTMAPPED) == 0) { + bp->b_kvabase = (caddr_t)addr; + } else if ((gbflags & GB_KVAALLOC) != 0) { + KASSERT((gbflags & GB_NOTMAPPED) != 0, + ("GB_KVAALLOC without GB_NOTMAPPED")); + bp->b_kvaalloc = (caddr_t)addr; + bp->b_flags |= B_KVAALLOC; + atomic_add_long(&unmapped_bufspace, + bp->b_kvasize); + } bp->b_kvasize = maxsize; atomic_add_long(&bufspace, bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); + } else if ((bp->b_flags & B_KVAALLOC) != 0 && + (gbflags & (GB_NOTMAPPED | GB_KVAALLOC)) == 0) { + bp->b_kvabase = bp->b_kvaalloc; + bp->b_flags &= ~B_KVAALLOC; + atomic_subtract_long(&unmapped_bufspace, + bp->b_kvasize); + } + if ((gbflags & GB_NOTMAPPED) == 0) { + bp->b_saveaddr = bp->b_kvabase; + bp->b_data = bp->b_saveaddr; + bp->b_flags &= ~B_NOTMAPPED; + BUF_CHECK_MAPPED(bp); } - bp->b_saveaddr = bp->b_kvabase; - bp->b_data = bp->b_saveaddr; } return (bp); } @@ -2584,14 +2692,18 @@ vfs_setdirty_locked_object(struct buf *bp) * prior to issuing the READ. biodone() will *not* clear B_INVAL. 
*/ struct buf * -getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo, +getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { - struct buf *bp; + struct buf *bp, *scratch_bp; struct bufobj *bo; - int error; + vm_offset_t addr; + int bsize, error, maxsize, need_mapping, need_kva, vmio; + off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); + KASSERT((flags & (GB_NOTMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, + ("GB_KVAALLOC only makes sense with GB_NOTMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); @@ -2659,9 +2771,8 @@ loop: } /* - * check for size inconsistancies for non-VMIO case. + * check for size inconsistencies for non-VMIO case. */ - if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { @@ -2694,13 +2805,85 @@ loop: } } + need_mapping = (bp->b_flags & B_NOTMAPPED) != 0 && + (flags & GB_NOTMAPPED) == 0; + need_kva = (bp->b_flags & (B_KVAALLOC | B_NOTMAPPED)) == + B_NOTMAPPED && (flags & GB_KVAALLOC) != 0; + if (need_mapping || need_kva) { + BUF_CHECK_UNMAPPED(bp); + + if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) { + bp->b_flags &= ~B_KVAALLOC; + bp->b_kvabase = bp->b_kvaalloc; + atomic_subtract_long(&unmapped_bufspace, + bp->b_kvasize); + goto has_addr; + } + + bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; + offset = blkno * bsize; + maxsize = size + (offset & PAGE_MASK); + maxsize = imax(maxsize, bsize); +mapping_loop: + vm_map_lock(buffer_map); + if (vm_map_findspace(buffer_map, + vm_map_min(buffer_map), maxsize, &addr)) { + vm_map_unlock(buffer_map); + scratch_bp = getnewbuf(vp, 0, 0, size, + maxsize, flags | (GB_NOTMAPPED | + GB_KVAALLOC)); + if (scratch_bp == NULL) { + if ((flags & GB_NOWAIT_BD) != 0) { + /* + * XXXKIB: not sure + * what else to do. + */ + panic("GB_NOWAIT_BD and " + "B_NOTMAPPED %p", bp); + } + atomic_add_int(&mappingrestarts, 1); + goto mapping_loop; + } + atomic_subtract_long(&bufspace, bp->b_kvasize); + bufspacewakeup(); + addr = (vm_offset_t)scratch_bp->b_kvaalloc; + maxsize = scratch_bp->b_kvasize; + scratch_bp->b_kvasize = 0; + scratch_bp->b_flags |= B_INVAL | B_NOTMAPPED; + brelse(scratch_bp); + } else { + KASSERT(addr != 0, ("addr == 0")); + vm_map_insert(buffer_map, NULL, 0, addr, + addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, + MAP_NOFAULT); + vm_map_unlock(buffer_map); + } + if (need_mapping) { + bp->b_kvabase = (caddr_t)addr; + bp->b_flags &= ~B_KVAALLOC; + atomic_subtract_long(&unmapped_bufspace, + bp->b_kvasize); + } else /* if (need_kva) */ { + bp->b_kvaalloc = (caddr_t)addr; + bp->b_flags |= B_KVAALLOC; + } + bp->b_kvasize = maxsize; + if (need_mapping) { +has_addr: + bp->b_saveaddr = bp->b_kvabase; + bp->b_data = bp->b_saveaddr; + bp->b_flags &= ~B_NOTMAPPED; + BUF_CHECK_MAPPED(bp); + bpmap_qenter(bp); + } + } + /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ - if (bp->b_bcount != size) allocbuf(bp, size); @@ -2741,9 +2924,6 @@ loop: } bp->b_flags &= ~B_DONE; } else { - int bsize, maxsize, vmio; - off_t offset; - /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. 
Note that the returned @@ -2989,10 +3169,14 @@ allocbuf(struct buf *bp, int size) if (desiredpages < bp->b_npages) { vm_page_t m; - pmap_qremove((vm_offset_t)trunc_page( - (vm_offset_t)bp->b_data) + - (desiredpages << PAGE_SHIFT), - (bp->b_npages - desiredpages)); + if ((bp->b_flags & B_NOTMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + pmap_qremove((vm_offset_t)trunc_page( + (vm_offset_t)bp->b_data) + + (desiredpages << PAGE_SHIFT), + (bp->b_npages - desiredpages)); + } else + BUF_CHECK_UNMAPPED(bp); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); for (i = desiredpages; i < bp->b_npages; i++) { /* @@ -3098,21 +3282,12 @@ allocbuf(struct buf *bp, int size) VM_OBJECT_UNLOCK(obj); /* - * Step 3, fixup the KVM pmap. Remember that - * bp->b_data is relative to bp->b_offset, but - * bp->b_offset may be offset into the first page. + * Step 3, fixup the KVM pmap. */ - - bp->b_data = (caddr_t) - trunc_page((vm_offset_t)bp->b_data); - pmap_qenter( - (vm_offset_t)bp->b_data, - bp->b_pages, - bp->b_npages - ); - - bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | - (vm_offset_t)(bp->b_offset & PAGE_MASK)); + if ((bp->b_flags & B_NOTMAPPED) == 0) + bpmap_qenter(bp); + else + BUF_CHECK_UNMAPPED(bp); } } if (newbsize < bp->b_bufsize) @@ -3122,21 +3297,38 @@ allocbuf(struct buf *bp, int size) return 1; } +extern int inflight_transient_maps; + void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); + vm_offset_t start, end; + int transient; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; + if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { + start = trunc_page((vm_offset_t)bp->bio_data); + end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); + transient = 1; + } else { + transient = 0; + start = end = 0; + } done = bp->bio_done; if (done == NULL) wakeup(bp); mtx_unlock(mtxp); if (done != NULL) done(bp); + if (transient) { + pmap_qremove(start, OFF_TO_IDX(end - start)); + vm_map_remove(bio_transient_map, start, end); + atomic_add_int(&inflight_transient_maps, -1); + } } /* @@ -3393,9 +3585,11 @@ bufdone_finish(struct buf *bp) } vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); - if (bogus) + if (bogus && (bp->b_flags & B_NOTMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } } /* @@ -3438,8 +3632,12 @@ vfs_unbusy_pages(struct buf *bp) if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; - pmap_qenter(trunc_page((vm_offset_t)bp->b_data), - bp->b_pages, bp->b_npages); + if ((bp->b_flags & B_NOTMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), + bp->b_pages, bp->b_npages); + } else + BUF_CHECK_UNMAPPED(bp); } vm_object_pip_subtract(obj, 1); vm_page_io_finish(m); @@ -3604,9 +3802,11 @@ vfs_busy_pages(struct buf *bp, int clear_modify) foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_UNLOCK(obj); - if (bogus) + if (bogus && (bp->b_flags & B_NOTMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } } /* @@ -3662,8 +3862,7 @@ vfs_bio_set_valid(struct buf *bp, int base, int size) void vfs_bio_clrbuf(struct buf *bp) { - int i, j, mask; - caddr_t sa, ea; + int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); @@ -3681,39 +3880,69 @@ vfs_bio_clrbuf(struct buf *bp) if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { - bzero(bp->b_data, bp->b_bufsize); + 
pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } - ea = sa = bp->b_data; - for(i = 0; i < bp->b_npages; i++, sa = ea) { - ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); - ea = (caddr_t)(vm_offset_t)ulmin( - (u_long)(vm_offset_t)ea, - (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); + sa = bp->b_offset & PAGE_MASK; + slide = 0; + for (i = 0; i < bp->b_npages; i++) { + slide = imin(slide + PAGE_SIZE, bp->b_bufsize + sa); + ea = slide & PAGE_MASK; + if (ea == 0) + ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; - j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; + j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) - bzero(sa, ea - sa); + pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { - if ((bp->b_pages[i]->valid & (1 << j)) == 0) - bzero(sa, DEV_BSIZE); + if ((bp->b_pages[i]->valid & (1 << j)) == 0) { + pmap_zero_page_area(bp->b_pages[i], + sa, DEV_BSIZE); + } } } bp->b_pages[i]->valid |= mask; + sa = 0; } unlock: VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } +void +vfs_bio_bzero_buf(struct buf *bp, int base, int size) +{ + vm_page_t m; + int i, n; + + if ((bp->b_flags & B_NOTMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + bzero(bp->b_data + base, size); + } else { + BUF_CHECK_UNMAPPED(bp); + n = PAGE_SIZE - (base & PAGE_MASK); + VM_OBJECT_LOCK(bp->b_bufobj->bo_object); + for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { + m = bp->b_pages[i]; + if (n > size) + n = size; + pmap_zero_page_area(m, base & PAGE_MASK, n); + base += n; + size -= n; + n = PAGE_SIZE; + } + VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); + } +} + /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are @@ -3726,6 +3955,8 @@ vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) vm_page_t p; int index; + BUF_CHECK_MAPPED(bp); + to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; @@ -3757,6 +3988,8 @@ vm_hold_free_pages(struct buf *bp, int newbsize) vm_page_t p; int index, newnpages; + BUF_CHECK_MAPPED(bp); + from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 19ee05f..9aa5ba0 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -60,11 +60,11 @@ SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer"); -static struct cluster_save * - cluster_collectbufs(struct vnode *vp, struct buf *last_bp); -static struct buf * - cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, - daddr_t blkno, long size, int run, struct buf *fbp); +static struct cluster_save *cluster_collectbufs(struct vnode *vp, + struct buf *last_bp, int gbflags); +static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize, + daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, + struct buf *fbp); static void cluster_callback(struct buf *); static int write_behind = 1; @@ -83,15 +83,9 @@ extern vm_page_t bogus_page; * cluster_read replaces bread. 
*/ int -cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) - struct vnode *vp; - u_quad_t filesize; - daddr_t lblkno; - long size; - struct ucred *cred; - long totread; - int seqcount; - struct buf **bpp; +cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, + struct ucred *cred, long totread, int seqcount, int gbflags, + struct buf **bpp) { struct buf *bp, *rbp, *reqbp; struct bufobj *bo; @@ -117,7 +111,7 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) /* * get the requested block */ - *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0); + *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags); origblkno = lblkno; /* @@ -208,7 +202,7 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) if (ncontig < nblks) nblks = ncontig; bp = cluster_rbuild(vp, filesize, lblkno, - blkno, size, nblks, bp); + blkno, size, nblks, gbflags, bp); lblkno += (bp->b_bufsize / size); } else { bp->b_flags |= B_RAM; @@ -252,14 +246,14 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) if (ncontig) { ncontig = min(ncontig + 1, racluster); rbp = cluster_rbuild(vp, filesize, lblkno, blkno, - size, ncontig, NULL); + size, ncontig, gbflags, NULL); lblkno += (rbp->b_bufsize / size); if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } } else { - rbp = getblk(vp, lblkno, size, 0, 0, 0); + rbp = getblk(vp, lblkno, size, 0, 0, gbflags); lblkno += 1; if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); @@ -298,14 +292,8 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) * and then parcel them up into logical blocks in the buffer hash table. */ static struct buf * -cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) - struct vnode *vp; - u_quad_t filesize; - daddr_t lbn; - daddr_t blkno; - long size; - int run; - struct buf *fbp; +cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, + daddr_t blkno, long size, int run, int gbflags, struct buf *fbp) { struct bufobj *bo; struct buf *bp, *tbp; @@ -329,7 +317,7 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) tbp = fbp; tbp->b_iocmd = BIO_READ; } else { - tbp = getblk(vp, lbn, size, 0, 0, 0); + tbp = getblk(vp, lbn, size, 0, 0, gbflags); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_flags |= B_ASYNC | B_RAM; @@ -350,9 +338,14 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) * address may not be either. Inherit the b_data offset * from the original buffer. 
*/ - bp->b_data = (char *)((vm_offset_t)bp->b_data | - ((vm_offset_t)tbp->b_data & PAGE_MASK)); bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; + if ((gbflags & GB_NOTMAPPED) != 0) { + bp->b_flags |= B_NOTMAPPED; + bp->b_data = unmapped_buf; + } else { + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + } bp->b_iocmd = BIO_READ; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; @@ -499,8 +492,10 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) bp->b_bufsize, bp->b_kvasize); bp->b_kvasize = bp->b_bufsize; - pmap_qenter(trunc_page((vm_offset_t) bp->b_data), - (vm_page_t *)bp->b_pages, bp->b_npages); + if ((bp->b_flags & B_NOTMAPPED) == 0) { + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + } return (bp); } @@ -523,7 +518,10 @@ cluster_callback(bp) if (bp->b_ioflags & BIO_ERROR) error = bp->b_error; - pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + if ((bp->b_flags & B_NOTMAPPED) == 0) { + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), + bp->b_npages); + } /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. @@ -565,18 +563,19 @@ cluster_callback(bp) */ static __inline int -cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) +cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len, + int gbflags) { int r = 0; - switch(write_behind) { + switch (write_behind) { case 2: if (start_lbn < len) break; start_lbn -= len; /* FALLTHROUGH */ case 1: - r = cluster_wbuild(vp, size, start_lbn, len); + r = cluster_wbuild(vp, size, start_lbn, len, gbflags); /* FALLTHROUGH */ default: /* FALLTHROUGH */ @@ -596,7 +595,8 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) * 4. end of a cluster - asynchronously write cluster */ void -cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount) +cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, + int gbflags) { daddr_t lbn; int maxclen, cursize; @@ -642,13 +642,13 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount) lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async && seqcount > 0) { cluster_wbuild_wb(vp, lblocksize, - vp->v_cstart, cursize); + vp->v_cstart, cursize, gbflags); } } else { struct buf **bpp, **endbp; struct cluster_save *buflist; - buflist = cluster_collectbufs(vp, bp); + buflist = cluster_collectbufs(vp, bp, gbflags); endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { @@ -667,7 +667,7 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount) if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, - cursize); + cursize, gbflags); } } else { /* @@ -715,8 +715,10 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount) * update daemon handle it. */ bdwrite(bp); - if (seqcount > 1) - cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); + if (seqcount > 1) { + cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, + vp->v_clen + 1, gbflags); + } vp->v_clen = 0; vp->v_cstart = lbn + 1; } else if (vm_page_count_severe()) { @@ -742,11 +744,8 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount) * the current block (if last_bp == NULL). 
*/ int -cluster_wbuild(vp, size, start_lbn, len) - struct vnode *vp; - long size; - daddr_t start_lbn; - int len; +cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, + int gbflags) { struct buf *bp, *tbp; struct bufobj *bo; @@ -832,10 +831,16 @@ cluster_wbuild(vp, size, start_lbn, len) * address may not be either. Inherit the b_data offset * from the original buffer. */ - bp->b_data = (char *)((vm_offset_t)bp->b_data | - ((vm_offset_t)tbp->b_data & PAGE_MASK)); - bp->b_flags |= B_CLUSTER | - (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); + if ((gbflags & GB_NOTMAPPED) == 0 || + (tbp->b_flags & B_VMIO) == 0) { + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + } else { + bp->b_flags |= B_NOTMAPPED; + bp->b_data = unmapped_buf; + } + bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | + B_NEEDCOMMIT)); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); /* @@ -956,8 +961,10 @@ cluster_wbuild(vp, size, start_lbn, len) tbp, b_cluster.cluster_entry); } finishcluster: - pmap_qenter(trunc_page((vm_offset_t) bp->b_data), - (vm_page_t *) bp->b_pages, bp->b_npages); + if ((bp->b_flags & B_NOTMAPPED) == 0) { + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + } if (bp->b_bufsize > bp->b_kvasize) panic( "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", @@ -978,9 +985,7 @@ cluster_wbuild(vp, size, start_lbn, len) * Plus add one additional buffer. */ static struct cluster_save * -cluster_collectbufs(vp, last_bp) - struct vnode *vp; - struct buf *last_bp; +cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags) { struct cluster_save *buflist; struct buf *bp; @@ -993,7 +998,8 @@ cluster_collectbufs(vp, last_bp) buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { - (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); + (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, + gbflags, &bp); buflist->bs_children[i] = bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 3f65b05..97db8ab 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1121,6 +1121,45 @@ vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) return (error); } +int +vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, + struct uio *uio) +{ + struct thread *td; + vm_offset_t iov_base; + int cnt, pgadv; + + td = curthread; + if ((td->td_pflags & TDP_UIOHELD) == 0 || + uio->uio_segflg != UIO_USERSPACE) + return (uiomove_fromphys(ma, offset, xfersize, uio)); + + KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); + cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize; + iov_base = (vm_offset_t)uio->uio_iov->iov_base; + switch (uio->uio_rw) { + case UIO_WRITE: + pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, + offset, cnt); + break; + case UIO_READ: + pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, + cnt); + break; + } + pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); + td->td_ma += pgadv; + KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, + pgadv)); + td->td_ma_cnt -= pgadv; + uio->uio_iov->iov_base = (char *)(iov_base + cnt); + uio->uio_iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + return (0); +} + + /* * File table truncate routine. 
*/ diff --git a/sys/sys/bio.h b/sys/sys/bio.h index c016ee6..d121e9a 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -55,10 +55,13 @@ #define BIO_DONE 0x02 #define BIO_ONQUEUE 0x04 #define BIO_ORDERED 0x08 +#define BIO_NOTMAPPED 0x10 +#define BIO_TRANSIENT_MAPPING 0x20 #ifdef _KERNEL struct disk; struct bio; +struct vm_map; /* Empty classifier tag, to prevent further classification. */ #define BIO_NOTCLASSIFIED (void *)(~0UL) @@ -78,6 +81,8 @@ struct bio { off_t bio_offset; /* Offset into file. */ long bio_bcount; /* Valid bytes in buffer. */ caddr_t bio_data; /* Memory, superblocks, indirect etc. */ + struct vm_page **bio_ma; /* Or unmapped. */ + int bio_ma_offset; /* Offset in the first page of bio_ma. */ int bio_error; /* Errno for BIO_ERROR. */ long bio_resid; /* Remaining I/O in bytes. */ void (*bio_done)(struct bio *); @@ -121,6 +126,8 @@ struct bio_queue_head { struct bio *insert_point; }; +extern struct vm_map *bio_transient_map; + void biodone(struct bio *bp); void biofinish(struct bio *bp, struct devstat *stat, int error); int biowait(struct bio *bp, const char *wchan); diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 418d6c5..792f2b5 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -117,6 +117,7 @@ struct buf { long b_bufsize; /* Allocated buffer size. */ long b_runningbufspace; /* when I/O is running, pipelining */ caddr_t b_kvabase; /* base kva for buffer */ + caddr_t b_kvaalloc; /* allocated kva for B_KVAALLOC */ int b_kvasize; /* size of kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ @@ -202,8 +203,8 @@ struct buf { #define B_PERSISTENT 0x00000100 /* Perm. ref'ed while EXT2FS mounted. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ -#define B_00000800 0x00000800 /* Available flag. */ -#define B_00001000 0x00001000 /* Available flag. */ +#define B_NOTMAPPED 0x00000800 /* KVA is not mapped. */ +#define B_KVAALLOC 0x00001000 /* But allocated. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_00004000 0x00004000 /* Available flag. */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ @@ -453,7 +454,9 @@ buf_countdeps(struct buf *bp, int i) */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ -#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon */ +#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */ +#define GB_NOTMAPPED 0x0008 /* Do not mmap buffer pages. */ +#define GB_KVAALLOC 0x0010 /* But allocate KVA. */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ @@ -470,6 +473,7 @@ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ +extern caddr_t unmapped_buf; void runningbufwakeup(struct buf *); void waitrunningbufspace(void); @@ -480,7 +484,10 @@ int buf_dirty_count_severe(void); void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. 
*/ #define bread(vp, blkno, size, cred, bpp) \ - breadn_flags(vp, blkno, size, 0, 0, 0, cred, 0, bpp) + breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, 0, bpp) +#define bread_gb(vp, blkno, size, cred, gbflags, bpp) \ + breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, \ + gbflags, bpp) #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \ breadn_flags(vp, blkno, size, rablkno, rabsize, cnt, cred, 0, bpp) int breadn_flags(struct vnode *, daddr_t, int, daddr_t *, int *, int, @@ -506,9 +513,10 @@ void bufdone_finish(struct buf *); void bd_speedup(void); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, - struct ucred *, long, int, struct buf **); -int cluster_wbuild(struct vnode *, long, daddr_t, int); -void cluster_write(struct vnode *, struct buf *, u_quad_t, int); + struct ucred *, long, int, int, struct buf **); +int cluster_wbuild(struct vnode *, long, daddr_t, int, int); +void cluster_write(struct vnode *, struct buf *, u_quad_t, int, int); +void vfs_bio_bzero_buf(struct buf *bp, int base, int size); void vfs_bio_set_valid(struct buf *, int base, int size); void vfs_bio_clrbuf(struct buf *); void vfs_busy_pages(struct buf *, int clear_modify); diff --git a/sys/sys/mount.h b/sys/sys/mount.h index ed2b002..fceaa10 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -374,6 +374,7 @@ void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp); #define MNTK_VGONE_WAITER 0x00000400 #define MNTK_LOOKUP_EXCL_DOTDOT 0x00000800 #define MNTK_MARKER 0x00001000 +#define MNTK_UNMAPPED_BUFS 0x00002000 #define MNTK_NOASYNC 0x00800000 /* disable async */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 42f9e5f..032a7fa 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -687,6 +687,8 @@ int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); +int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, + struct uio *uio); #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 5ad5775..f5a5253 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -254,7 +254,7 @@ ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) struct buf *bp; struct ufsmount *ump; u_int cg, request, reclaimed; - int error; + int error, gbflags; ufs2_daddr_t bno; static struct timeval lastfail; static int curfail; @@ -265,6 +265,8 @@ ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) fs = ip->i_fs; bp = NULL; ump = ip->i_ump; + gbflags = (flags & BA_NOTMAPPED) != 0 ? GB_NOTMAPPED : 0; + mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) @@ -296,7 +298,7 @@ retry: /* * Allocate the extra space in the buffer. 
*/ - error = bread(vp, lbprev, osize, NOCRED, &bp); + error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); @@ -332,7 +334,7 @@ retry: ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; - bzero(bp->b_data + osize, nsize - osize); + vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; @@ -400,7 +402,7 @@ retry: ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; - bzero(bp->b_data + osize, nsize - osize); + vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 0e29be87f..667e101 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -107,7 +107,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, int saved_inbdflush; static struct timeval lastfail; static int curfail; - int reclaimed; + int gbflags, reclaimed; ip = VTOI(vp); dp = ip->i_din1; @@ -123,6 +123,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, return (EOPNOTSUPP); if (lbn < 0) return (EFBIG); + gbflags = (flags & BA_NOTMAPPED) != 0 ? GB_NOTMAPPED : 0; if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); @@ -211,7 +212,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, nsize, flags, cred, &newb); if (error) return (error); - bp = getblk(vp, lbn, nsize, 0, 0, 0); + bp = getblk(vp, lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); @@ -255,7 +256,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, nb = newb; *allocblk++ = nb; *lbns_remfree++ = indirs[1].in_lbn; - bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0); + bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) { @@ -389,7 +390,7 @@ retry: nb = newb; *allocblk++ = nb; *lbns_remfree++ = lbn; - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0); + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); @@ -418,16 +419,17 @@ retry: if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, (int)fs->fs_bsize, NOCRED, - MAXBSIZE, seqcount, &nbp); + MAXBSIZE, seqcount, gbflags, &nbp); } else { - error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp); + error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED, + gbflags, &nbp); } if (error) { brelse(nbp); goto fail; } } else { - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0); + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); } curthread_pflags_restore(saved_inbdflush); @@ -539,7 +541,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, int saved_inbdflush; static struct timeval lastfail; static int curfail; - int reclaimed; + int gbflags, reclaimed; ip = VTOI(vp); dp = ip->i_din2; @@ -553,6 +555,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, *bpp = NULL; if (lbn < 0) return (EFBIG); + gbflags = (flags & BA_NOTMAPPED) != 0 ? 
GB_NOTMAPPED : 0; if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); @@ -603,7 +606,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, panic("ffs_balloc_ufs2: BA_METAONLY for ext block"); nb = dp->di_extb[lbn]; if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) { - error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp); + error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED, + gbflags, &bp); if (error) { brelse(bp); return (error); @@ -620,7 +624,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, osize = fragroundup(fs, blkoff(fs, dp->di_extsize)); nsize = fragroundup(fs, size); if (nsize <= osize) { - error = bread(vp, -1 - lbn, osize, NOCRED, &bp); + error = bread_gb(vp, -1 - lbn, osize, NOCRED, + gbflags, &bp); if (error) { brelse(bp); return (error); @@ -653,7 +658,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, nsize, flags, cred, &newb); if (error) return (error); - bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0); + bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); bp->b_xflags |= BX_ALTDATA; if (flags & BA_CLRBUF) @@ -679,9 +684,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, if (osize < fs->fs_bsize && osize > 0) { UFS_LOCK(ump); error = ffs_realloccg(ip, nb, dp->di_db[nb], - ffs_blkpref_ufs2(ip, lastlbn, (int)nb, - &dp->di_db[0]), osize, (int)fs->fs_bsize, - flags, cred, &bp); + ffs_blkpref_ufs2(ip, lastlbn, (int)nb, + &dp->di_db[0]), osize, (int)fs->fs_bsize, + flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) @@ -707,7 +712,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, panic("ffs_balloc_ufs2: BA_METAONLY for direct block"); nb = dp->di_db[lbn]; if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { - error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); + error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED, + gbflags, &bp); if (error) { brelse(bp); return (error); @@ -723,7 +729,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, osize = fragroundup(fs, blkoff(fs, ip->i_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { - error = bread(vp, lbn, osize, NOCRED, &bp); + error = bread_gb(vp, lbn, osize, NOCRED, + gbflags, &bp); if (error) { brelse(bp); return (error); @@ -733,7 +740,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, UFS_LOCK(ump); error = ffs_realloccg(ip, lbn, dp->di_db[lbn], ffs_blkpref_ufs2(ip, lbn, (int)lbn, - &dp->di_db[0]), osize, nsize, flags, + &dp->di_db[0]), osize, nsize, flags, cred, &bp); if (error) return (error); @@ -753,7 +760,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, &dp->di_db[0]), nsize, flags, cred, &newb); if (error) return (error); - bp = getblk(vp, lbn, nsize, 0, 0, 0); + bp = getblk(vp, lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); @@ -797,7 +804,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, nb = newb; *allocblk++ = nb; *lbns_remfree++ = indirs[1].in_lbn; - bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0); + bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, + GB_NOTMAPPED); bp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) { @@ -862,7 +870,8 @@ retry: nb = newb; *allocblk++ = nb; *lbns_remfree++ = indirs[i].in_lbn; - nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0); + nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, + GB_NOTMAPPED); nbp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(nbp); if 
(DOINGSOFTDEP(vp)) { @@ -931,7 +940,7 @@ retry: nb = newb; *allocblk++ = nb; *lbns_remfree++ = lbn; - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0); + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); @@ -966,16 +975,17 @@ retry: if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, (int)fs->fs_bsize, NOCRED, - MAXBSIZE, seqcount, &nbp); + MAXBSIZE, seqcount, gbflags, &nbp); } else { - error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp); + error = bread_gb(vp, lbn, (int)fs->fs_bsize, + NOCRED, gbflags, &nbp); } if (error) { brelse(nbp); goto fail; } } else { - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0); + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); } curthread_pflags_restore(saved_inbdflush); diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 83ae202..54dd140 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1076,7 +1076,7 @@ ffs_mountfs(devvp, mp, td) */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | - MNTK_NO_IOPF; + MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART @@ -2095,6 +2095,7 @@ ffs_bufwrite(struct buf *bp) * set b_lblkno and BKGRDMARKER before calling bgetvp() * to avoid confusing the splay tree and gbincore(). */ + KASSERT((bp->b_flags & B_NOTMAPPED) == 0, ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); newbp->b_lblkno = bp->b_lblkno; newbp->b_xflags |= BX_BKGRDMARKER; diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 5c99d5b..42d5263 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -508,7 +508,8 @@ ffs_read(ap) /* * Don't do readahead if this is the end of the file. */ - error = bread(vp, lbn, size, NOCRED, &bp); + error = bread_gb(vp, lbn, size, NOCRED, + GB_NOTMAPPED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { /* * Otherwise if we are allowed to cluster, @@ -518,7 +519,8 @@ ffs_read(ap) * doing sequential access. */ error = cluster_read(vp, ip->i_size, lbn, - size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp); + size, NOCRED, blkoffset + uio->uio_resid, + seqcount, GB_NOTMAPPED, &bp); } else if (seqcount > 1) { /* * If we are NOT allowed to cluster, then @@ -529,15 +531,16 @@ ffs_read(ap) * the 6th argument. */ int nextsize = blksize(fs, ip, nextlbn); - error = breadn(vp, lbn, - size, &nextlbn, &nextsize, 1, NOCRED, &bp); + error = breadn_flags(vp, lbn, size, &nextlbn, + &nextsize, 1, NOCRED, GB_NOTMAPPED, &bp); } else { /* * Failing all of the above, just read what the * user asked for. Interestingly, the same as * the first option above. 
*/ - error = bread(vp, lbn, size, NOCRED, &bp); + error = bread_gb(vp, lbn, size, NOCRED, + GB_NOTMAPPED, &bp); } if (error) { brelse(bp); @@ -568,8 +571,13 @@ ffs_read(ap) xfersize = size; } - error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, - (int)xfersize, uio); + if ((bp->b_flags & B_NOTMAPPED) == 0) { + error = vn_io_fault_uiomove((char *)bp->b_data + + blkoffset, (int)xfersize, uio); + } else { + error = vn_io_fault_pgmove(bp->b_pages, blkoffset, + (int)xfersize, uio); + } if (error) break; @@ -700,6 +708,7 @@ ffs_write(ap) flags = seqcount << BA_SEQSHIFT; if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) flags |= IO_SYNC; + flags |= BA_NOTMAPPED; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); @@ -739,8 +748,13 @@ ffs_write(ap) if (size < xfersize) xfersize = size; - error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, - (int)xfersize, uio); + if ((bp->b_flags & B_NOTMAPPED) == 0) { + error = vn_io_fault_uiomove((char *)bp->b_data + + blkoffset, (int)xfersize, uio); + } else { + error = vn_io_fault_pgmove(bp->b_pages, blkoffset, + (int)xfersize, uio); + } /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any @@ -783,7 +797,8 @@ ffs_write(ap) } else if (xfersize + blkoffset == fs->fs_bsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; - cluster_write(vp, bp, ip->i_size, seqcount); + cluster_write(vp, bp, ip->i_size, seqcount, + GB_NOTMAPPED); } else { bawrite(bp); } diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h index c590748..82973a2 100644 --- a/sys/ufs/ufs/ufs_extern.h +++ b/sys/ufs/ufs/ufs_extern.h @@ -121,6 +121,7 @@ void softdep_revert_rmdir(struct inode *, struct inode *); */ #define BA_CLRBUF 0x00010000 /* Clear invalid areas of buffer. */ #define BA_METAONLY 0x00020000 /* Return indirect block buffer. */ +#define BA_NOTMAPPED 0x00040000 /* Do not mmap resulted buffer. */ #define BA_SEQMASK 0x7F000000 /* Bits holding seq heuristic. 
*/ #define BA_SEQSHIFT 24 #define BA_SEQMAX 0x7F diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h index d06c22b..c64a549 100644 --- a/sys/vm/pmap.h +++ b/sys/vm/pmap.h @@ -108,6 +108,8 @@ void pmap_clear_modify(vm_page_t m); void pmap_clear_reference(vm_page_t m); void pmap_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); void pmap_copy_page(vm_page_t, vm_page_t); +void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, + vm_page_t mb[], vm_offset_t b_offset, int xfersize); void pmap_enter(pmap_t, vm_offset_t, vm_prot_t, vm_page_t, vm_prot_t, boolean_t); void pmap_enter_object(pmap_t pmap, vm_offset_t start, diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 44bff25..95f4707 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -758,6 +758,16 @@ swp_pager_strategy(struct buf *bp) TAILQ_FOREACH(sp, &swtailq, sw_list) { if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) { mtx_unlock(&sw_dev_mtx); + if ((sp->sw_flags & SW_NOTMAPPED) != 0) { + bp->b_kvaalloc = bp->b_data; + bp->b_data = unmapped_buf; + bp->b_kvabase = unmapped_buf; + bp->b_offset = 0; + bp->b_flags |= B_NOTMAPPED; + } else { + pmap_qenter((vm_offset_t)bp->b_data, + &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); + } sp->sw_strategy(bp, sp); return; } @@ -1155,11 +1165,6 @@ swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) bp = getpbuf(&nsw_rcount); bp->b_flags |= B_PAGING; - /* - * map our page(s) into kva for input - */ - pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i); - bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); @@ -1371,8 +1376,6 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; - pmap_qenter((vm_offset_t)bp->b_data, &m[i], n); - bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; @@ -1484,7 +1487,12 @@ swp_pager_async_iodone(struct buf *bp) /* * remove the mapping for kernel virtual */ - pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); + if ((bp->b_flags & B_NOTMAPPED) != 0) { + bp->b_data = bp->b_kvaalloc; + bp->b_kvabase = bp->b_kvaalloc; + bp->b_flags &= ~B_NOTMAPPED; + } else + pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); if (bp->b_npages) { object = bp->b_pages[0]->object; @@ -2144,7 +2152,8 @@ swapon_check_swzone(unsigned long npages) } static void -swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev) +swaponsomething(struct vnode *vp, void *id, u_long nblks, + sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) { struct swdevt *sp, *tsp; swblk_t dvbase; @@ -2180,6 +2189,7 @@ swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strateg sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; + sp->sw_flags = flags; sp->sw_blist = blist_create(nblks, M_WAITOK); /* @@ -2537,10 +2547,18 @@ swapgeom_strategy(struct buf *bp, struct swdevt *sp) bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; - bio->bio_data = bp->b_data; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; + if ((bp->b_flags & B_NOTMAPPED) != 0) { + bio->bio_ma = bp->b_pages; + bio->bio_data = unmapped_buf; + bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; + bio->bio_flags |= BIO_NOTMAPPED; + } else { + bio->bio_data = bp->b_data; + bio->bio_ma = NULL; + } g_io_request(bio, cp); return; } @@ -2630,9 +2648,9 @@ 
swapongeom_ev(void *arg, int flags) } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(swh->vp, cp, nblks, swapgeom_strategy, - swapgeom_close, dev2udev(swh->dev)); + swapgeom_close, dev2udev(swh->dev), + (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_NOTMAPPED : 0); swh->error = 0; - return; } static int @@ -2721,6 +2739,6 @@ swaponvp(struct thread *td, struct vnode *vp, u_long nblks) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, - NODEV); + NODEV, 0); return (0); } diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h index 5c716d9..bd6d8e3 100644 --- a/sys/vm/swap_pager.h +++ b/sys/vm/swap_pager.h @@ -68,6 +68,7 @@ struct swdevt { sw_close_t *sw_close; }; +#define SW_NOTMAPPED 0x01 #define SW_CLOSING 0x04 #ifdef _KERNEL diff --git a/sys/vm/vm.h b/sys/vm/vm.h index 132c10e..106c510 100644 --- a/sys/vm/vm.h +++ b/sys/vm/vm.h @@ -136,6 +136,8 @@ struct kva_md_info { vm_offset_t clean_eva; vm_offset_t pager_sva; vm_offset_t pager_eva; + vm_offset_t bio_transient_sva; + vm_offset_t bio_transient_eva; }; extern struct kva_md_info kmi; diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c index c507691..089c827 100644 --- a/sys/vm/vm_init.c +++ b/sys/vm/vm_init.c @@ -186,10 +186,14 @@ again: panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &kmi->clean_sva, &kmi->clean_eva, - (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS, TRUE); + (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS + + (long)/*XXXKIB*/1024 * BKVASIZE, TRUE); buffer_map = kmem_suballoc(clean_map, &kmi->buffer_sva, &kmi->buffer_eva, (long)nbuf * BKVASIZE, FALSE); buffer_map->system_map = 1; + bio_transient_map = kmem_suballoc(clean_map, &kmi->bio_transient_sva, + &kmi->bio_transient_eva, /*XXXKIB*/1024 * BKVASIZE, FALSE); + bio_transient_map->system_map = 1; pager_map = kmem_suballoc(clean_map, &kmi->pager_sva, &kmi->pager_eva, (long)nswbuf * MAXPHYS, FALSE); pager_map->system_map = 1; diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index f731895..1d78a7d 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -85,11 +85,12 @@ __FBSDID("$FreeBSD$"); #include #include -vm_map_t kernel_map=0; -vm_map_t kmem_map=0; -vm_map_t exec_map=0; +vm_map_t kernel_map; +vm_map_t kmem_map; +vm_map_t exec_map; vm_map_t pipe_map; -vm_map_t buffer_map=0; +vm_map_t buffer_map; +vm_map_t bio_transient_map; const void *zero_region; CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0); diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index a6d78f4..0031b9f 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -697,6 +697,7 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage) int runpg; int runend; struct buf *bp; + struct mount *mp; int count; int error; @@ -899,12 +900,22 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage) } bp = getpbuf(&vnode_pbuf_freecnt); - kva = (vm_offset_t) bp->b_data; + kva = (vm_offset_t)bp->b_data; /* - * and map the pages to be read into the kva + * and map the pages to be read into the kva, if the filesystem + * requires mapped buffers. 
 	 */
-	pmap_qenter(kva, m, count);
+	mp = vp->v_mount;
+	if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) {
+		bp->b_data = unmapped_buf;
+		bp->b_kvabase = unmapped_buf;
+		bp->b_offset = 0;
+		bp->b_flags |= B_NOTMAPPED;
+		for (i = 0; i < count; i++)
+			bp->b_pages[i] = m[i];
+	} else
+		pmap_qenter(kva, m, count);
 
 	/* build a minimal buffer header */
 	bp->b_iocmd = BIO_READ;
@@ -933,11 +944,17 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
 	if ((bp->b_ioflags & BIO_ERROR) != 0)
 		error = EIO;
 
-	if (!error) {
-		if (size != count * PAGE_SIZE)
-			bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
+	if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) {
+		bp->b_data = (caddr_t)kva;
+		bp->b_kvabase = (caddr_t)kva;
+		bp->b_flags &= ~B_NOTMAPPED;
+		for (i = 0; i < count; i++)
+			bp->b_pages[i] = NULL;
+	} else {
+		if (error == 0 && size != count * PAGE_SIZE)
+			bzero((caddr_t)kva + size, PAGE_SIZE * count - size);
+		pmap_qremove(kva, count);
 	}
-	pmap_qremove(kva, count);
 
 	/*
 	 * free the buffer header back to the swap buffer pool
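
A few stand-alone sketches of the mechanisms this patch relies on follow, in roughly the order the files appear above. First, vn_io_fault_pgmove() (sys/kern/vfs_vnops.c) copies between the caller's prefaulted user pages (td_ma) and the buffer's pages without mapping either side, then advances the held-page cursor by the number of page boundaries the copy crossed. The sketch below models only that advance computation; the 4 KB page size and the helper name are assumptions for illustration, not kernel code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	PAGE_SHIFT	12
#define	PAGE_SIZE	(1UL << PAGE_SHIFT)

/*
 * After copying 'cnt' bytes starting at user address 'iov_base', how many
 * entries of the held page array were fully consumed?  Mirrors the pgadv
 * computation in vn_io_fault_pgmove().
 */
static int
pgadv(uintptr_t iov_base, size_t cnt)
{

	return ((int)(((iov_base + cnt) >> PAGE_SHIFT) -
	    (iov_base >> PAGE_SHIFT)));
}

int
main(void)
{

	assert(pgadv(0x1200, 0x100) == 0);	/* stays inside one page */
	assert(pgadv(0x1f00, 0x200) == 1);	/* crosses one boundary */
	assert(pgadv(0x2000, PAGE_SIZE) == 1);	/* aligned, page-sized copy */
	printf("pgadv model ok\n");
	return (0);
}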
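
With bio_ma and bio_ma_offset added to struct bio, a BIO_NOTMAPPED-aware consumer addresses the transfer as a page array plus a starting offset inside the first page. The userland model below shows how byte N of the transfer maps to a (page index, in-page offset) pair; the 4 KB page size and the function name are assumptions of the sketch.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define	PAGE_SHIFT	12
#define	PAGE_SIZE	(1UL << PAGE_SHIFT)
#define	PAGE_MASK	(PAGE_SIZE - 1)

/*
 * Locate byte 'done' of an unmapped transfer: 'ma_offset' is the offset of
 * the first valid byte within the first page of the array.
 */
static void
bio_ma_locate(size_t ma_offset, size_t done, size_t *idx, size_t *off)
{

	*idx = (ma_offset + done) >> PAGE_SHIFT;
	*off = (ma_offset + done) & PAGE_MASK;
}

int
main(void)
{
	size_t idx, off;

	bio_ma_locate(0x800, 0, &idx, &off);
	assert(idx == 0 && off == 0x800);
	bio_ma_locate(0x800, 0x900, &idx, &off);	/* crosses into page 1 */
	assert(idx == 1 && off == 0x100);
	printf("bio_ma addressing model ok\n");
	return (0);
}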
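
bread() and the new bread_gb() are both thin fronts over breadn_flags(); they differ only in whether a gbflags value is forwarded, with NULL read-ahead arrays filled in as defaults. The sketch below reproduces that one-worker, many-macros pattern in isolation; all names and flag values here are invented for the example.

#include <stddef.h>
#include <stdio.h>

/* The single worker takes the full argument list... */
static int
readn_flags(int blkno, int size, const int *rablkno, const int *rabsize,
    int racnt, int gbflags)
{

	(void)rablkno;
	(void)rabsize;
	printf("blk %d size %d readahead %d gbflags %d\n",
	    blkno, size, racnt, gbflags);
	return (0);
}

/* ...and the common cases are macros that fill in the defaults. */
#define	read_blk(blkno, size)						\
	readn_flags((blkno), (size), NULL, NULL, 0, 0)
#define	read_blk_gb(blkno, size, gbflags)				\
	readn_flags((blkno), (size), NULL, NULL, 0, (gbflags))

int
main(void)
{

	read_blk(10, 512);
	read_blk_gb(11, 512, 0x8);	/* e.g. a "do not map" request */
	return (0);
}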
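
vfs_bio_bzero_buf() replaces the direct bzero(bp->b_data + osize, ...) calls in ffs_realloccg() because an unmapped buffer has no usable b_data to poke. Whatever the kernel helper does internally, it has to split the logical range at page boundaries; the model below exercises only that splitting, with plain memset standing in for however the pages are really zeroed.

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define	PAGE_SIZE	4096UL

/*
 * Zero 'size' bytes starting at logical offset 'base' of a buffer that is
 * reachable only as an array of page-sized chunks.
 */
static void
pages_bzero(char **pages, size_t base, size_t size)
{
	size_t cnt, off;

	while (size > 0) {
		off = base % PAGE_SIZE;
		cnt = PAGE_SIZE - off;
		if (cnt > size)
			cnt = size;
		memset(pages[base / PAGE_SIZE] + off, 0, cnt);
		base += cnt;
		size -= cnt;
	}
}

int
main(void)
{
	char *pages[2];
	int i;

	for (i = 0; i < 2; i++) {
		pages[i] = malloc(PAGE_SIZE);
		memset(pages[i], 0xff, PAGE_SIZE);
	}
	pages_bzero(pages, 4000, 200);		/* spans the page boundary */
	assert((unsigned char)pages[0][3999] == 0xff);
	assert(pages[0][4000] == 0 && pages[1][103] == 0);
	assert((unsigned char)pages[1][104] == 0xff);
	for (i = 0; i < 2; i++)
		free(pages[i]);
	return (0);
}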
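
Both ffs_balloc_ufs1() and ffs_balloc_ufs2() derive gbflags from BA_NOTMAPPED once at the top and thread it through the data-block getblk()/bread_gb() calls, while the indirect-block buffers are requested with GB_NOTMAPPED unconditionally. The trivial sketch below captures just the flag translation; the X-prefixed values are stand-ins, not the kernel's definitions.

#include <assert.h>

#define	XBA_NOTMAPPED	0x00040000	/* balloc-level request (stand-in) */
#define	XGB_NOTMAPPED	0x0008		/* getblk-level flag (stand-in) */

/* Translate the allocation request into the buffer-cache flag. */
static int
balloc_to_gbflags(int flags)
{

	return ((flags & XBA_NOTMAPPED) != 0 ? XGB_NOTMAPPED : 0);
}

int
main(void)
{

	assert(balloc_to_gbflags(0) == 0);
	assert(balloc_to_gbflags(XBA_NOTMAPPED) == XGB_NOTMAPPED);
	assert(balloc_to_gbflags(XBA_NOTMAPPED | 0x1) == XGB_NOTMAPPED);
	return (0);
}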
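
ffs_bufwrite()'s background-write path copies the cylinder-group buffer through its kernel mapping, so the patch asserts that the source is not a B_NOTMAPPED buffer before the memcpy(). A minimal model of that guard; the struct layout, names and flag value are hypothetical.

#include <assert.h>
#include <string.h>

#define	XB_NOTMAPPED	0x00000800	/* stand-in for B_NOTMAPPED */

struct xbuf {
	int	flags;
	char	*data;		/* meaningful only when mapped */
	long	bufsize;
};

/* The background copy must go through a mapped source buffer. */
static void
bg_copy(struct xbuf *dst, const struct xbuf *src)
{

	assert((src->flags & XB_NOTMAPPED) == 0 && "unmapped cg buffer");
	memcpy(dst->data, src->data, (size_t)src->bufsize);
}

int
main(void)
{
	char a[16] = "cg block", b[16];
	struct xbuf src = { 0, a, sizeof(a) };
	struct xbuf dst = { 0, b, sizeof(b) };

	bg_copy(&dst, &src);
	assert(memcmp(a, b, sizeof(a)) == 0);
	return (0);
}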
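
ffs_read() and ffs_write() now pick the copy routine per buffer: vn_io_fault_uiomove() on b_data when the buffer is mapped, vn_io_fault_pgmove() on b_pages when it is not. The sketch below models that dispatch and the page-boundary splitting needed on the unmapped side; the struct and field names are illustrative only.

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define	PAGE_SIZE	4096UL

struct xbuf {
	int	notmapped;	/* models B_NOTMAPPED */
	char	*data;		/* valid when mapped */
	char	**pages;	/* valid when not mapped */
};

/* Copy out of a page array, splitting at page boundaries. */
static void
copy_from_pages(char **pages, size_t off, char *dst, size_t len)
{
	size_t cnt;

	while (len > 0) {
		cnt = PAGE_SIZE - off % PAGE_SIZE;
		if (cnt > len)
			cnt = len;
		memcpy(dst, pages[off / PAGE_SIZE] + off % PAGE_SIZE, cnt);
		dst += cnt;
		off += cnt;
		len -= cnt;
	}
}

/* The read loop's dispatch: one buffer, two possible representations. */
static void
buf_copyout(struct xbuf *bp, size_t blkoffset, char *dst, size_t len)
{

	if (!bp->notmapped)
		memcpy(dst, bp->data + blkoffset, len);
	else
		copy_from_pages(bp->pages, blkoffset, dst, len);
}

int
main(void)
{
	char *pages[2], out[8];
	struct xbuf bp;

	pages[0] = calloc(1, PAGE_SIZE);
	pages[1] = calloc(1, PAGE_SIZE);
	memcpy(pages[0] + PAGE_SIZE - 4, "abcd", 4);
	memcpy(pages[1], "efgh", 4);

	bp.notmapped = 1;
	bp.data = NULL;
	bp.pages = pages;
	buf_copyout(&bp, PAGE_SIZE - 4, out, 8);
	assert(memcmp(out, "abcdefgh", 8) == 0);
	free(pages[0]);
	free(pages[1]);
	return (0);
}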
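
For SW_NOTMAPPED swap devices, swp_pager_strategy() parks the pbuf's KVA in b_kvaalloc, points b_data at unmapped_buf and sets B_NOTMAPPED; swp_pager_async_iodone() undoes this so the rest of the system sees a normally mapped buffer again. A stand-alone model of that save/restore pairing; the field names, flag value and sentinel are placeholders.

#include <assert.h>
#include <stddef.h>

#define	XB_NOTMAPPED	0x00000800	/* stand-in for B_NOTMAPPED */

static char unmapped_sentinel[1];	/* stands in for unmapped_buf */

struct xbuf {
	int	flags;
	char	*data;		/* what consumers normally look at */
	char	*kvaalloc;	/* stashed KVA while the buffer is unmapped */
};

/* Strategy side: hide the mapping but remember where it was. */
static void
strategy_unmap(struct xbuf *bp)
{

	bp->kvaalloc = bp->data;
	bp->data = unmapped_sentinel;
	bp->flags |= XB_NOTMAPPED;
}

/* Completion side: restore the mapping exactly as it was before the I/O. */
static void
iodone_remap(struct xbuf *bp)
{

	if ((bp->flags & XB_NOTMAPPED) != 0) {
		bp->data = bp->kvaalloc;
		bp->flags &= ~XB_NOTMAPPED;
	}
}

int
main(void)
{
	char kva[4];
	struct xbuf bp = { 0, kva, NULL };

	strategy_unmap(&bp);
	assert(bp.data == unmapped_sentinel && (bp.flags & XB_NOTMAPPED) != 0);
	iodone_remap(&bp);
	assert(bp.data == kva && bp.flags == 0);
	return (0);
}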
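
swapgeom_strategy() publishes exactly one representation per bio: bio_data for a mapped buffer, or bio_ma plus bio_ma_offset (b_offset masked down to the page) together with BIO_NOTMAPPED. The sketch below models that either/or fill and what a consumer can rely on afterwards; the flag value and struct layout are assumptions.

#include <assert.h>
#include <stddef.h>

#define	PAGE_MASK	4095UL
#define	XBIO_NOTMAPPED	0x10		/* stand-in for BIO_NOTMAPPED */

struct xbio {
	int	flags;
	char	*data;		/* mapped representation */
	char	**ma;		/* unmapped representation */
	size_t	ma_offset;	/* offset into ma[0] */
};

/* Fill exactly one of the two representations, never both. */
static void
bio_fill(struct xbio *bio, char *data, char **ma, size_t b_offset)
{

	if (ma != NULL) {
		bio->ma = ma;
		bio->ma_offset = b_offset & PAGE_MASK;
		bio->data = NULL;
		bio->flags |= XBIO_NOTMAPPED;
	} else {
		bio->data = data;
		bio->ma = NULL;
	}
}

int
main(void)
{
	char buf[8], *pages[1] = { buf };
	struct xbio mapped = { 0 }, unmapped = { 0 };

	bio_fill(&mapped, buf, NULL, 0);
	assert(mapped.data == buf && mapped.ma == NULL);

	bio_fill(&unmapped, NULL, pages, 0x1234);
	assert((unmapped.flags & XBIO_NOTMAPPED) != 0);
	assert(unmapped.ma == pages && unmapped.ma_offset == 0x234);
	return (0);
}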
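
In the mapped branch of vnode_pager_generic_getpages(), a successful short read still zeroes the slack between 'size' and count * PAGE_SIZE before the pages are handed back. If the buffer is unmapped, the same cleanup has to be expressed per page; the sketch below only computes which sub-range of each page is stale and says nothing about how the kernel would actually clear it.

#include <stdio.h>

#define	PAGE_SIZE	4096UL

/*
 * After a short read, bytes [size, count * PAGE_SIZE) are stale.  Report,
 * per page, the sub-range that would need clearing.
 */
static void
report_slack(size_t size, size_t count)
{
	size_t end, i, start;

	for (i = 0; i < count; i++) {
		start = i * PAGE_SIZE;
		end = start + PAGE_SIZE;
		if (end <= size)
			continue;		/* fully valid page */
		if (start < size)
			start = size;		/* partially valid page */
		printf("page %zu: zero [%zu, %zu)\n",
		    i, start - i * PAGE_SIZE, end - i * PAGE_SIZE);
	}
}

int
main(void)
{

	/* Three pages requested, only 5000 bytes actually read. */
	report_slack(5000, 3);	/* page 1 from 904, page 2 entirely */
	return (0);
}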