diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index c6c62ae..ef4ad07 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -4274,6 +4274,30 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst) pagecopy((void *)src, (void *)dst); } +void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + void *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + a_cp = (char *)PHYS_TO_DMAP(ma[a_offset >> PAGE_SHIFT]-> + phys_addr) + a_pg_offset; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + b_cp = (char *)PHYS_TO_DMAP(mb[b_offset >> PAGE_SHIFT]-> + phys_addr) + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } +} + /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c index 72ef310..dd209ab 100644 --- a/sys/arm/arm/pmap-v6.c +++ b/sys/arm/arm/pmap-v6.c @@ -3314,6 +3314,42 @@ pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst) } void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + vm_page_t a_pg, b_pg; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + mtx_lock(&cmtx); + while (xfersize > 0) { + a_pg = ma[a_offset >> PAGE_SHIFT]; + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + b_pg = mb[b_offset >> PAGE_SHIFT]; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + *csrc_pte = L2_S_PROTO | VM_PAGE_TO_PHYS(a_pg) | + pte_l2_s_cache_mode; + pmap_set_prot(csrc_pte, VM_PROT_READ, 0); + PTE_SYNC(csrc_pte); + *cdst_pte = L2_S_PROTO | VM_PAGE_TO_PHYS(b_pg) | + pte_l2_s_cache_mode; + pmap_set_prot(cdst_pte, VM_PROT_READ | VM_PROT_WRITE, 0); + PTE_SYNC(cdst_pte); + cpu_tlb_flushD_SE(csrcp); + cpu_tlb_flushD_SE(cdstp); + cpu_cpwait(); + bcopy((char *)csrcp + a_pg_offset, (char *)cdstp + b_pg_offset, + cnt); + cpu_idcache_wbinv_range(cdstp + b_pg_offset, cnt); + pmap_l2cache_wbinv_range(cdstp + b_pg_offset, + VM_PAGE_TO_PHYS(b_pg) + b_pg_offset, cnt); + } + mtx_unlock(&cmtx); +} + +void pmap_copy_page(vm_page_t src, vm_page_t dst) { diff --git a/sys/arm/arm/pmap.c b/sys/arm/arm/pmap.c index 28b4912..9f2ec71 100644 --- a/sys/arm/arm/pmap.c +++ b/sys/arm/arm/pmap.c @@ -258,6 +258,9 @@ pt_entry_t pte_l1_c_proto; pt_entry_t pte_l2_s_proto; void (*pmap_copy_page_func)(vm_paddr_t, vm_paddr_t); +void (*pmap_copy_page_offs_func)(vm_paddr_t a_phys, + vm_offset_t a_offs, vm_paddr_t b_phys, vm_offset_t b_offs, + int cnt); void (*pmap_zero_page_func)(vm_paddr_t, int, int); struct msgbuf *msgbufp = 0; @@ -401,6 +404,13 @@ static struct vm_object pvzone_obj; static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; static struct rwlock pvh_global_lock; +void pmap_copy_page_offs_generic(vm_paddr_t a_phys, vm_offset_t a_offs, + vm_paddr_t b_phys, vm_offset_t b_offs, int cnt); +#if ARM_MMU_XSCALE == 1 +void pmap_copy_page_offs_xscale(vm_paddr_t a_phys, vm_offset_t a_offs, + vm_paddr_t b_phys, vm_offset_t b_offs, int cnt); +#endif + /* * This list exists for the benefit of pmap_map_chunk(). 
It keeps track * of the kernel L2 tables during bootstrap, so that pmap_map_chunk() can @@ -485,6 +495,7 @@ pmap_pte_init_generic(void) pte_l2_s_proto = L2_S_PROTO_generic; pmap_copy_page_func = pmap_copy_page_generic; + pmap_copy_page_offs_func = pmap_copy_page_offs_generic; pmap_zero_page_func = pmap_zero_page_generic; } @@ -661,6 +672,7 @@ pmap_pte_init_xscale(void) #ifdef CPU_XSCALE_CORE3 pmap_copy_page_func = pmap_copy_page_generic; + pmap_copy_page_offs_func = pmap_copy_page_offs_generic; pmap_zero_page_func = pmap_zero_page_generic; xscale_use_minidata = 0; /* Make sure it is L2-cachable */ @@ -673,6 +685,7 @@ pmap_pte_init_xscale(void) #else pmap_copy_page_func = pmap_copy_page_xscale; + pmap_copy_page_offs_func = pmap_copy_page_offs_xscale; pmap_zero_page_func = pmap_zero_page_xscale; #endif @@ -4301,6 +4314,29 @@ pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst) cpu_l2cache_inv_range(csrcp, PAGE_SIZE); cpu_l2cache_wbinv_range(cdstp, PAGE_SIZE); } + +void +pmap_copy_page_offs_generic(vm_paddr_t a_phys, vm_offset_t a_offs, + vm_paddr_t b_phys, vm_offset_t b_offs, int cnt) +{ + + mtx_lock(&cmtx); + *csrc_pte = L2_S_PROTO | a_phys | + L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | pte_l2_s_cache_mode; + PTE_SYNC(csrc_pte); + *cdst_pte = L2_S_PROTO | b_phys | + L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode; + PTE_SYNC(cdst_pte); + cpu_tlb_flushD_SE(csrcp); + cpu_tlb_flushD_SE(cdstp); + cpu_cpwait(); + bcopy((char *)csrcp + a_offs, (char *)cdstp + b_offs, cnt); + mtx_unlock(&cmtx); + cpu_dcache_inv_range(csrcp + a_offs, cnt); + cpu_dcache_wbinv_range(cdstp + b_offs, cnt); + cpu_l2cache_inv_range(csrcp + a_offs, cnt); + cpu_l2cache_wbinv_range(cdstp + b_offs, cnt); +} #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */ #if ARM_MMU_XSCALE == 1 @@ -4345,6 +4381,28 @@ pmap_copy_page_xscale(vm_paddr_t src, vm_paddr_t dst) mtx_unlock(&cmtx); xscale_cache_clean_minidata(); } + +void +pmap_copy_page_offs_xscale(vm_paddr_t a_phys, vm_offset_t a_offs, + vm_paddr_t b_phys, vm_offset_t b_offs, int cnt) +{ + + mtx_lock(&cmtx); + *csrc_pte = L2_S_PROTO | a_phys | + L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | + L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); + PTE_SYNC(csrc_pte); + *cdst_pte = L2_S_PROTO | b_phys | + L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | + L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); + PTE_SYNC(cdst_pte); + cpu_tlb_flushD_SE(csrcp); + cpu_tlb_flushD_SE(cdstp); + cpu_cpwait(); + bcopy((char *)csrcp + a_offs, (char *)cdstp + b_offs, cnt); + mtx_unlock(&cmtx); + xscale_cache_clean_minidata(); +} #endif /* ARM_MMU_XSCALE == 1 */ void @@ -4371,8 +4429,38 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) #endif } +void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + vm_page_t a_pg, b_pg; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; +#ifdef ARM_USE_SMALL_ALLOC + vm_offset_t a_va, b_va; +#endif - + cpu_dcache_wbinv_all(); + cpu_l2cache_wbinv_all(); + while (xfersize > 0) { + a_pg = ma[a_offset >> PAGE_SHIFT]; + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + b_pg = mb[b_offset >> PAGE_SHIFT]; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); +#ifdef ARM_USE_SMALL_ALLOC + a_va = arm_ptovirt(VM_PAGE_TO_PHYS(a_pg)) + a_pg_offset; + b_va = arm_ptovirt(VM_PAGE_TO_PHYS(b_pg)) + b_pg_offset; + bcopy((char *)a_va, (char *)b_va, cnt); + cpu_dcache_wbinv_range(b_va, cnt); + cpu_l2cache_wbinv_range(b_va, cnt); +#else + pmap_copy_page_offs_func(VM_PAGE_TO_PHYS(a_pg), a_pg_offset, + 
VM_PAGE_TO_PHYS(b_pg), b_pg_offset, cnt); +#endif + } +} /* * this routine returns true if a physical page resides diff --git a/sys/arm/include/pmap.h b/sys/arm/include/pmap.h index 523499f..7c8d073 100644 --- a/sys/arm/include/pmap.h +++ b/sys/arm/include/pmap.h @@ -533,6 +533,8 @@ extern pt_entry_t pte_l1_c_proto; extern pt_entry_t pte_l2_s_proto; extern void (*pmap_copy_page_func)(vm_paddr_t, vm_paddr_t); +extern void (*pmap_copy_page_offs_func)(vm_paddr_t a_phys, + vm_offset_t a_offs, vm_paddr_t b_phys, vm_offset_t b_offs, int cnt); extern void (*pmap_zero_page_func)(vm_paddr_t, int, int); #if (ARM_MMU_GENERIC + ARM_MMU_V6 + ARM_MMU_V7 + ARM_MMU_SA1) != 0 || defined(CPU_XSCALE_81342) diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c index 4252197..c700e7c 100644 --- a/sys/cam/ata/ata_da.c +++ b/sys/cam/ata/ata_da.c @@ -1167,6 +1167,8 @@ adaregister(struct cam_periph *periph, void *arg) ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT))) softc->disk->d_flags |= DISKFLAG_CANDELETE; + if ((cpi.hba_misc & PIM_UNMAPPED) != 0) + softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO; strlcpy(softc->disk->d_descr, cgd->ident_data.model, MIN(sizeof(softc->disk->d_descr), sizeof(cgd->ident_data.model))); strlcpy(softc->disk->d_ident, cgd->ident_data.serial, @@ -1431,13 +1433,19 @@ adastart(struct cam_periph *periph, union ccb *start_ccb) return; } #endif + KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || + round_page(bp->bio_bcount + bp->bio_ma_offset) / + PAGE_SIZE == bp->bio_ma_n, + ("Short bio %p", bp)); cam_fill_ataio(ataio, ada_retry_count, adadone, - bp->bio_cmd == BIO_READ ? - CAM_DIR_IN : CAM_DIR_OUT, + (bp->bio_cmd == BIO_READ ? CAM_DIR_IN : + CAM_DIR_OUT) | ((bp->bio_flags & BIO_UNMAPPED) + != 0 ? CAM_DATA_BIO : 0), tag_code, - bp->bio_data, + ((bp->bio_flags & BIO_UNMAPPED) != 0) ? (void *)bp : + bp->bio_data, bp->bio_bcount, ada_default_timeout*1000); diff --git a/sys/cam/cam_ccb.h b/sys/cam/cam_ccb.h index a80880a..bcbf414 100644 --- a/sys/cam/cam_ccb.h +++ b/sys/cam/cam_ccb.h @@ -42,7 +42,6 @@ #include #include - /* General allocation length definitions for CCB structures */ #define IOCDBLEN CAM_MAX_CDBLEN /* Space for CDB bytes/pointer */ #define VUHBALEN 14 /* Vendor Unique HBA length */ @@ -572,7 +571,8 @@ typedef enum { PIM_NOINITIATOR = 0x20, /* Initiator role not supported. 
*/ PIM_NOBUSRESET = 0x10, /* User has disabled initial BUS RESET */ PIM_NO_6_BYTE = 0x08, /* Do not send 6-byte commands */ - PIM_SEQSCAN = 0x04 /* Do bus scans sequentially, not in parallel */ + PIM_SEQSCAN = 0x04, /* Do bus scans sequentially, not in parallel */ + PIM_UNMAPPED = 0x02, } pi_miscflag; /* Path Inquiry CCB */ diff --git a/sys/cam/cam_periph.c b/sys/cam/cam_periph.c index 523e549..fa4fa04 100644 --- a/sys/cam/cam_periph.c +++ b/sys/cam/cam_periph.c @@ -734,6 +734,8 @@ cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) case XPT_CONT_TARGET_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); + KASSERT((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR, + ("not VADDR for SCSI_IO %p %x\n", ccb, ccb->ccb_h.flags)); data_ptrs[0] = &ccb->csio.data_ptr; lengths[0] = ccb->csio.dxfer_len; @@ -743,6 +745,8 @@ cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) case XPT_ATA_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); + KASSERT((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR, + ("not VADDR for ATA_IO %p %x\n", ccb, ccb->ccb_h.flags)); data_ptrs[0] = &ccb->ataio.data_ptr; lengths[0] = ccb->ataio.dxfer_len; @@ -846,7 +850,7 @@ cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) * into a larger area of VM, or if userland races against * vmapbuf() after the useracc() check. */ - if (vmapbuf(mapinfo->bp[i]) < 0) { + if (vmapbuf(mapinfo->bp[i], 1) < 0) { for (j = 0; j < i; ++j) { *data_ptrs[j] = mapinfo->bp[j]->b_saveaddr; vunmapbuf(mapinfo->bp[j]); diff --git a/sys/cam/scsi/scsi_all.c b/sys/cam/scsi/scsi_all.c index 9dac9c0..332732f 100644 --- a/sys/cam/scsi/scsi_all.c +++ b/sys/cam/scsi/scsi_all.c @@ -5679,7 +5679,11 @@ scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { + int read; u_int8_t cdb_len; + + read = (readop & SCSI_RW_DIRMASK) == SCSI_RW_READ; + /* * Use the smallest possible command to perform the operation * as some legacy hardware does not support the 10 byte commands. @@ -5696,7 +5700,7 @@ scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, struct scsi_rw_6 *scsi_cmd; scsi_cmd = (struct scsi_rw_6 *)&csio->cdb_io.cdb_bytes; - scsi_cmd->opcode = readop ? READ_6 : WRITE_6; + scsi_cmd->opcode = read ? READ_6 : WRITE_6; scsi_ulto3b(lba, scsi_cmd->addr); scsi_cmd->length = block_count & 0xff; scsi_cmd->control = 0; @@ -5715,7 +5719,7 @@ scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, struct scsi_rw_10 *scsi_cmd; scsi_cmd = (struct scsi_rw_10 *)&csio->cdb_io.cdb_bytes; - scsi_cmd->opcode = readop ? READ_10 : WRITE_10; + scsi_cmd->opcode = read ? READ_10 : WRITE_10; scsi_cmd->byte2 = byte2; scsi_ulto4b(lba, scsi_cmd->addr); scsi_cmd->reserved = 0; @@ -5738,7 +5742,7 @@ scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, struct scsi_rw_12 *scsi_cmd; scsi_cmd = (struct scsi_rw_12 *)&csio->cdb_io.cdb_bytes; - scsi_cmd->opcode = readop ? READ_12 : WRITE_12; + scsi_cmd->opcode = read ? READ_12 : WRITE_12; scsi_cmd->byte2 = byte2; scsi_ulto4b(lba, scsi_cmd->addr); scsi_cmd->reserved = 0; @@ -5760,7 +5764,7 @@ scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, struct scsi_rw_16 *scsi_cmd; scsi_cmd = (struct scsi_rw_16 *)&csio->cdb_io.cdb_bytes; - scsi_cmd->opcode = readop ? READ_16 : WRITE_16; + scsi_cmd->opcode = read ? 
READ_16 : WRITE_16; scsi_cmd->byte2 = byte2; scsi_u64to8b(lba, scsi_cmd->addr); scsi_cmd->reserved = 0; @@ -5771,7 +5775,8 @@ scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, cam_fill_csio(csio, retries, cbfcnp, - /*flags*/readop ? CAM_DIR_IN : CAM_DIR_OUT, + (read ? CAM_DIR_IN : CAM_DIR_OUT) | + ((readop & SCSI_RW_BIO) != 0 ? CAM_DATA_BIO : 0), tag_action, data_ptr, dxfer_len, diff --git a/sys/cam/scsi/scsi_all.h b/sys/cam/scsi/scsi_all.h index 0693e1c..330330d 100644 --- a/sys/cam/scsi/scsi_all.h +++ b/sys/cam/scsi/scsi_all.h @@ -2354,6 +2354,10 @@ void scsi_write_buffer(struct ccb_scsiio *csio, u_int32_t retries, uint8_t *data_ptr, uint32_t param_list_length, uint8_t sense_len, uint32_t timeout); +#define SCSI_RW_READ 0x0001 +#define SCSI_RW_WRITE 0x0002 +#define SCSI_RW_DIRMASK 0x0003 +#define SCSI_RW_BIO 0x1000 void scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int readop, u_int8_t byte2, diff --git a/sys/cam/scsi/scsi_cd.c b/sys/cam/scsi/scsi_cd.c index a7c4c5b..a6d340f 100644 --- a/sys/cam/scsi/scsi_cd.c +++ b/sys/cam/scsi/scsi_cd.c @@ -1575,7 +1575,8 @@ cdstart(struct cam_periph *periph, union ccb *start_ccb) /*retries*/ cd_retry_count, /* cbfcnp */ cddone, MSG_SIMPLE_Q_TAG, - /* read */bp->bio_cmd == BIO_READ, + /* read */bp->bio_cmd == BIO_READ ? + SCSI_RW_READ : SCSI_RW_WRITE, /* byte2 */ 0, /* minimum_cmd_size */ 10, /* lba */ bp->bio_offset / diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c index 7854215..c886e9e 100644 --- a/sys/cam/scsi/scsi_da.c +++ b/sys/cam/scsi/scsi_da.c @@ -1180,7 +1180,7 @@ dadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t leng /*retries*/0, dadone, MSG_ORDERED_Q_TAG, - /*read*/FALSE, + /*read*/SCSI_RW_WRITE, /*byte2*/0, /*minimum_cmd_size*/ softc->minimum_cmd_size, offset / secsize, @@ -1753,6 +1753,8 @@ daregister(struct cam_periph *periph, void *arg) softc->disk->d_flags = 0; if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE; + if ((cpi.hba_misc & PIM_UNMAPPED) != 0) + softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO; cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor, sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr)); strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr)); @@ -1981,14 +1983,18 @@ dastart(struct cam_periph *periph, union ccb *start_ccb) /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/tag_code, - /*read_op*/bp->bio_cmd - == BIO_READ, + /*read_op*/(bp->bio_cmd == BIO_READ ? + SCSI_RW_READ : SCSI_RW_WRITE) | + ((bp->bio_flags & BIO_UNMAPPED) != 0 ? + SCSI_RW_BIO : 0), /*byte2*/0, softc->minimum_cmd_size, /*lba*/bp->bio_pblkno, /*block_count*/bp->bio_bcount / softc->params.secsize, - /*data_ptr*/ bp->bio_data, + /*data_ptr*/ (bp->bio_flags & + BIO_UNMAPPED) != 0 ? 
(void *)bp : + bp->bio_data, /*dxfer_len*/ bp->bio_bcount, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); diff --git a/sys/dev/ahci/ahci.c b/sys/dev/ahci/ahci.c index 8e692bd..d03c8af 100644 --- a/sys/dev/ahci/ahci.c +++ b/sys/dev/ahci/ahci.c @@ -2903,7 +2903,7 @@ ahciaction(struct cam_sim *sim, union ccb *ccb) if (ch->caps & AHCI_CAP_SPM) cpi->hba_inquiry |= PI_SATAPM; cpi->target_sprt = 0; - cpi->hba_misc = PIM_SEQSCAN; + cpi->hba_misc = PIM_SEQSCAN | PIM_UNMAPPED; cpi->hba_eng_cnt = 0; if (ch->caps & AHCI_CAP_SPM) cpi->max_target = 15; diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c index b72f294..eccb1ee 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -110,6 +110,19 @@ static int md_malloc_wait; SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0, "Allow malloc to wait for memory allocations"); +static int md_unmapped_swap; +SYSCTL_INT(_debug, OID_AUTO, md_unmapped_swap, CTLFLAG_RD, + &md_unmapped_swap, 0, + ""); +static int md_unmapped_vnode; +SYSCTL_INT(_debug, OID_AUTO, md_unmapped_vnode, CTLFLAG_RD, + &md_unmapped_vnode, 0, + ""); +static int md_unmapped_malloc; +SYSCTL_INT(_debug, OID_AUTO, md_unmapped_malloc, CTLFLAG_RD, + &md_unmapped_malloc, 0, + ""); + #if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE) #define MD_ROOT_FSTYPE "ufs" #endif @@ -414,13 +427,103 @@ g_md_start(struct bio *bp) wakeup(sc); } +#define MD_MALLOC_MOVE_ZERO 1 +#define MD_MALLOC_MOVE_FILL 2 +#define MD_MALLOC_MOVE_READ 3 +#define MD_MALLOC_MOVE_WRITE 4 +#define MD_MALLOC_MOVE_CMP 5 + +static int +md_malloc_move(vm_page_t **mp, vm_offset_t *ma_offs, unsigned sectorsize, + void *ptr, u_char fill, int op) +{ + struct sf_buf *sf; + vm_page_t m, *mp1; + char *p, first; + vm_offset_t ma_offs1; + off_t *uc; + unsigned n; + int error, i, sz, first_read; + + m = NULL; + error = 0; + sf = NULL; + /* if (op == MD_MALLOC_MOVE_CMP) { gcc */ + first = 0; + first_read = 0; + uc = ptr; + mp1 = *mp; + ma_offs1 = *ma_offs; + /* } */ + sched_pin(); + for (n = sectorsize; n != 0; n -= sz) { + sz = imin(PAGE_SIZE - *ma_offs, n); + if (m != **mp) { + if (sf != NULL) + sf_buf_free(sf); + m = **mp; + sf = sf_buf_alloc(m, SFB_CPUPRIVATE | + (md_malloc_wait ? 
0 : SFB_NOWAIT)); + if (sf == NULL) { + error = ENOMEM; + break; + } + } + p = (char *)sf_buf_kva(sf) + *ma_offs; + switch (op) { + case MD_MALLOC_MOVE_ZERO: + bzero(p, sz); + break; + case MD_MALLOC_MOVE_FILL: + memset(p, fill, sz); + break; + case MD_MALLOC_MOVE_READ: + bcopy(ptr, p, sz); + cpu_flush_dcache(p, sz); + break; + case MD_MALLOC_MOVE_WRITE: + bcopy(p, ptr, sz); + break; + case MD_MALLOC_MOVE_CMP: + for (i = 0; i < sz; i++, p++) { + if (!first_read) { + *uc = (u_char)*p; + first = (u_char)*p; + first_read = 1; + } else if (*p != first) { + error = EDOOFUS; + break; + } + } + break; + } + if (error != 0) + break; + *ma_offs += sz; + *ma_offs %= PAGE_SIZE; + if (*ma_offs == 0) + (*mp)++; + } + + if (sf != NULL) + sf_buf_free(sf); + sched_unpin(); + if (op == MD_MALLOC_MOVE_CMP && error != 0) { + *mp = mp1; + *ma_offs = ma_offs1; + } + return (error); +} + static int mdstart_malloc(struct md_s *sc, struct bio *bp) { - int i, error; u_char *dst; + vm_page_t *m; + int i, error, error1, notmapped; off_t secno, nsec, uc; uintptr_t sp, osp; + vm_offset_t ma_offs; switch (bp->bio_cmd) { case BIO_READ: @@ -431,9 +534,17 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) return (EOPNOTSUPP); } + notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0; + if (notmapped) { + m = bp->bio_ma; + ma_offs = bp->bio_ma_offset; + dst = NULL; + } else { + dst = bp->bio_data; + } + nsec = bp->bio_length / sc->sectorsize; secno = bp->bio_offset / sc->sectorsize; - dst = bp->bio_data; error = 0; while (nsec--) { osp = s_read(sc->indir, secno); @@ -441,21 +552,45 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) if (osp != 0) error = s_write(sc->indir, secno, 0); } else if (bp->bio_cmd == BIO_READ) { - if (osp == 0) - bzero(dst, sc->sectorsize); - else if (osp <= 255) - memset(dst, osp, sc->sectorsize); - else { - bcopy((void *)osp, dst, sc->sectorsize); - cpu_flush_dcache(dst, sc->sectorsize); + if (osp == 0) { + if (notmapped) { + error = md_malloc_move(&m, &ma_offs, + sc->sectorsize, NULL, 0, + MD_MALLOC_MOVE_ZERO); + } else + bzero(dst, sc->sectorsize); + } else if (osp <= 255) { + if (notmapped) { + error = md_malloc_move(&m, &ma_offs, + sc->sectorsize, NULL, osp, + MD_MALLOC_MOVE_FILL); + } else + memset(dst, osp, sc->sectorsize); + } else { + if (notmapped) { + error = md_malloc_move(&m, &ma_offs, + sc->sectorsize, (void *)osp, 0, + MD_MALLOC_MOVE_READ); + } else { + bcopy((void *)osp, dst, sc->sectorsize); + cpu_flush_dcache(dst, sc->sectorsize); + } } osp = 0; } else if (bp->bio_cmd == BIO_WRITE) { if (sc->flags & MD_COMPRESS) { - uc = dst[0]; - for (i = 1; i < sc->sectorsize; i++) - if (dst[i] != uc) - break; + if (notmapped) { + error1 = md_malloc_move(&m, &ma_offs, + sc->sectorsize, &uc, 0, + MD_MALLOC_MOVE_CMP); + i = error1 == 0 ? 
sc->sectorsize : 0; + } else { + uc = dst[0]; + for (i = 1; i < sc->sectorsize; i++) { + if (dst[i] != uc) + break; + } + } } else { i = 0; uc = 0; @@ -472,10 +607,26 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) error = ENOSPC; break; } - bcopy(dst, (void *)sp, sc->sectorsize); + if (notmapped) { + error = md_malloc_move(&m, + &ma_offs, sc->sectorsize, + (void *)sp, 0, + MD_MALLOC_MOVE_WRITE); + } else { + bcopy(dst, (void *)sp, + sc->sectorsize); + } error = s_write(sc->indir, secno, sp); } else { - bcopy(dst, (void *)osp, sc->sectorsize); + if (notmapped) { + error = md_malloc_move(&m, + &ma_offs, sc->sectorsize, + (void *)osp, 0, + MD_MALLOC_MOVE_WRITE); + } else { + bcopy(dst, (void *)osp, + sc->sectorsize); + } osp = 0; } } @@ -487,7 +638,8 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) if (error != 0) break; secno++; - dst += sc->sectorsize; + if (!notmapped) + dst += sc->sectorsize; } bp->bio_resid = 0; return (error); @@ -628,11 +780,10 @@ mdstart_vnode(struct md_s *sc, struct bio *bp) static int mdstart_swap(struct md_s *sc, struct bio *bp) { - struct sf_buf *sf; - int rv, offs, len, lastend; - vm_pindex_t i, lastp; vm_page_t m; u_char *p; + vm_pindex_t i, lastp; + int rv, ma_offs, offs, len, lastend; switch (bp->bio_cmd) { case BIO_READ: @@ -644,6 +795,12 @@ mdstart_swap(struct md_s *sc, struct bio *bp) } p = bp->bio_data; + if ((bp->bio_flags & BIO_UNMAPPED) == 0) { + ma_offs = 0; + } else { + atomic_add_int(&md_unmapped_swap, 1); + ma_offs = bp->bio_ma_offset; + } /* * offs is the offset at which to start operating on the @@ -661,19 +818,12 @@ mdstart_swap(struct md_s *sc, struct bio *bp) vm_object_pip_add(sc->object, 1); for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) { len = ((i == lastp) ? lastend : PAGE_SIZE) - offs; - - m = vm_page_grab(sc->object, i, - VM_ALLOC_NORMAL|VM_ALLOC_RETRY); - VM_OBJECT_UNLOCK(sc->object); - sched_pin(); - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - VM_OBJECT_LOCK(sc->object); + m = vm_page_grab(sc->object, i, VM_ALLOC_NORMAL | + VM_ALLOC_RETRY); if (bp->bio_cmd == BIO_READ) { if (m->valid != VM_PAGE_BITS_ALL) rv = vm_pager_get_pages(sc->object, &m, 1, 0); if (rv == VM_PAGER_ERROR) { - sf_buf_free(sf); - sched_unpin(); vm_page_wakeup(m); break; } else if (rv == VM_PAGER_FAIL) { @@ -683,40 +833,44 @@ mdstart_swap(struct md_s *sc, struct bio *bp) * valid. Do not set dirty, the page * can be recreated if thrown out. 
*/ - bzero((void *)sf_buf_kva(sf), PAGE_SIZE); + pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; } - bcopy((void *)(sf_buf_kva(sf) + offs), p, len); - cpu_flush_dcache(p, len); + if ((bp->bio_flags & BIO_UNMAPPED) != 0) { + pmap_copy_pages(&m, offs, bp->bio_ma, + ma_offs, len); + } else { + physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len); + cpu_flush_dcache(p, len); + } } else if (bp->bio_cmd == BIO_WRITE) { if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL) rv = vm_pager_get_pages(sc->object, &m, 1, 0); if (rv == VM_PAGER_ERROR) { - sf_buf_free(sf); - sched_unpin(); vm_page_wakeup(m); break; } - bcopy(p, (void *)(sf_buf_kva(sf) + offs), len); + if ((bp->bio_flags & BIO_UNMAPPED) != 0) { + pmap_copy_pages(bp->bio_ma, ma_offs, &m, + offs, len); + } else { + physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len); + } m->valid = VM_PAGE_BITS_ALL; } else if (bp->bio_cmd == BIO_DELETE) { if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL) rv = vm_pager_get_pages(sc->object, &m, 1, 0); if (rv == VM_PAGER_ERROR) { - sf_buf_free(sf); - sched_unpin(); vm_page_wakeup(m); break; } if (len != PAGE_SIZE) { - bzero((void *)(sf_buf_kva(sf) + offs), len); + pmap_zero_page_area(m, offs, len); vm_page_clear_dirty(m, offs, len); m->valid = VM_PAGE_BITS_ALL; } else vm_pager_page_unswapped(m); } - sf_buf_free(sf); - sched_unpin(); vm_page_wakeup(m); vm_page_lock(m); if (bp->bio_cmd == BIO_DELETE && len == PAGE_SIZE) @@ -730,6 +884,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp) /* Actions on further pages start at offset 0 */ p += PAGE_SIZE - offs; offs = 0; + ma_offs += len; } vm_object_pip_subtract(sc->object, 1); VM_OBJECT_UNLOCK(sc->object); @@ -845,6 +1000,14 @@ mdinit(struct md_s *sc) pp = g_new_providerf(gp, "md%d", sc->unit); pp->mediasize = sc->mediasize; pp->sectorsize = sc->sectorsize; + switch (sc->type) { + case MD_SWAP: + case MD_MALLOC: + pp->flags |= G_PF_ACCEPT_UNMAPPED; + break; + default: + break; + } sc->gp = gp; sc->pp = pp; g_error_provider(pp, 0); diff --git a/sys/fs/cd9660/cd9660_vnops.c b/sys/fs/cd9660/cd9660_vnops.c index 21ee0fc..47d4f75 100644 --- a/sys/fs/cd9660/cd9660_vnops.c +++ b/sys/fs/cd9660/cd9660_vnops.c @@ -329,7 +329,7 @@ cd9660_read(ap) if (lblktosize(imp, rablock) < ip->i_size) error = cluster_read(vp, (off_t)ip->i_size, lbn, size, NOCRED, uio->uio_resid, - (ap->a_ioflag >> 16), &bp); + (ap->a_ioflag >> 16), 0, &bp); else error = bread(vp, lbn, size, NOCRED, &bp); } else { diff --git a/sys/fs/ext2fs/ext2_balloc.c b/sys/fs/ext2fs/ext2_balloc.c index 1c0cc0e..88ad710 100644 --- a/sys/fs/ext2fs/ext2_balloc.c +++ b/sys/fs/ext2fs/ext2_balloc.c @@ -276,7 +276,7 @@ ext2_balloc(struct inode *ip, int32_t lbn, int size, struct ucred *cred, if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, (int)fs->e2fs_bsize, NOCRED, - MAXBSIZE, seqcount, &nbp); + MAXBSIZE, seqcount, 0, &nbp); } else { error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp); } diff --git a/sys/fs/ext2fs/ext2_vnops.c b/sys/fs/ext2fs/ext2_vnops.c index 1c0b7a1..77eb74b 100644 --- a/sys/fs/ext2fs/ext2_vnops.c +++ b/sys/fs/ext2fs/ext2_vnops.c @@ -1618,10 +1618,11 @@ ext2_read(struct vop_read_args *ap) if (lblktosize(fs, nextlbn) >= ip->i_size) error = bread(vp, lbn, size, NOCRED, &bp); - else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) + else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, size, - NOCRED, blkoffset + uio->uio_resid, seqcount, &bp); - else if (seqcount > 1) { + NOCRED, blkoffset + 
uio->uio_resid, seqcount, + 0, &bp); + } else if (seqcount > 1) { int nextsize = blksize(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); @@ -1831,7 +1832,7 @@ ext2_write(struct vop_write_args *ap) } else if (xfersize + blkoffset == fs->e2fs_fsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; - cluster_write(vp, bp, ip->i_size, seqcount); + cluster_write(vp, bp, ip->i_size, seqcount, 0); } else { bawrite(bp); } diff --git a/sys/fs/msdosfs/msdosfs_vnops.c b/sys/fs/msdosfs/msdosfs_vnops.c index 8e045cb..213ae81 100644 --- a/sys/fs/msdosfs/msdosfs_vnops.c +++ b/sys/fs/msdosfs/msdosfs_vnops.c @@ -600,7 +600,7 @@ msdosfs_read(ap) error = bread(vp, lbn, blsize, NOCRED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, dep->de_FileSize, lbn, blsize, - NOCRED, on + uio->uio_resid, seqcount, &bp); + NOCRED, on + uio->uio_resid, seqcount, 0, &bp); } else if (seqcount > 1) { rasize = blsize; error = breadn(vp, lbn, @@ -820,7 +820,7 @@ msdosfs_write(ap) else if (n + croffset == pmp->pm_bpcluster) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) cluster_write(vp, bp, dep->de_FileSize, - seqcount); + seqcount, 0); else bawrite(bp); } else diff --git a/sys/fs/udf/udf_vnops.c b/sys/fs/udf/udf_vnops.c index b1a3b1d..abe073e 100644 --- a/sys/fs/udf/udf_vnops.c +++ b/sys/fs/udf/udf_vnops.c @@ -478,8 +478,9 @@ udf_read(struct vop_read_args *ap) rablock = lbn + 1; if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { if (lblktosize(udfmp, rablock) < fsize) { - error = cluster_read(vp, fsize, lbn, size, NOCRED, - uio->uio_resid, (ap->a_ioflag >> 16), &bp); + error = cluster_read(vp, fsize, lbn, size, + NOCRED, uio->uio_resid, + (ap->a_ioflag >> 16), 0, &bp); } else { error = bread(vp, lbn, size, NOCRED, &bp); } diff --git a/sys/geom/geom.h b/sys/geom/geom.h index 351b05d..660bf6e 100644 --- a/sys/geom/geom.h +++ b/sys/geom/geom.h @@ -205,6 +205,7 @@ struct g_provider { u_int flags; #define G_PF_WITHER 0x2 #define G_PF_ORPHAN 0x4 +#define G_PF_ACCEPT_UNMAPPED 0x8 /* Two fields for the implementing class to use */ void *private; diff --git a/sys/geom/geom_disk.c b/sys/geom/geom_disk.c index 72e9162..7fec9da 100644 --- a/sys/geom/geom_disk.c +++ b/sys/geom/geom_disk.c @@ -320,13 +320,29 @@ g_disk_start(struct bio *bp) do { bp2->bio_offset += off; bp2->bio_length -= off; - bp2->bio_data += off; + if ((bp->bio_flags & BIO_UNMAPPED) == 0) { + bp2->bio_data += off; + } else { + KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) + != 0, + ("unmapped bio not supported by disk %s", + dp->d_name)); + bp2->bio_ma += off / PAGE_SIZE; + bp2->bio_ma_offset += off; + bp2->bio_ma_offset %= PAGE_SIZE; + bp2->bio_ma_n -= off / PAGE_SIZE; + } if (bp2->bio_length > dp->d_maxsize) { /* * XXX: If we have a stripesize we should really * use it here. 
*/ bp2->bio_length = dp->d_maxsize; + if ((bp->bio_flags & BIO_UNMAPPED) != 0) { + bp2->bio_ma_n = howmany( + bp2->bio_ma_offset + + bp2->bio_length, PAGE_SIZE); + } off += dp->d_maxsize; /* * To avoid a race, we need to grab the next bio @@ -488,6 +504,8 @@ g_disk_create(void *arg, int flag) pp->sectorsize = dp->d_sectorsize; pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; + if ((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0) + pp->flags |= G_PF_ACCEPT_UNMAPPED; if (bootverbose) printf("GEOM: new disk %s\n", gp->name); sysctl_ctx_init(&sc->sysctl_ctx); diff --git a/sys/geom/geom_disk.h b/sys/geom/geom_disk.h index 33d8eb2..246fc49 100644 --- a/sys/geom/geom_disk.h +++ b/sys/geom/geom_disk.h @@ -103,6 +103,7 @@ struct disk { #define DISKFLAG_OPEN 0x2 #define DISKFLAG_CANDELETE 0x4 #define DISKFLAG_CANFLUSHCACHE 0x8 +#define DISKFLAG_UNMAPPED_BIO 0x10 struct disk *disk_alloc(void); void disk_create(struct disk *disk, int version); diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c index c6a5da8..4c84bcc 100644 --- a/sys/geom/geom_io.c +++ b/sys/geom/geom_io.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -51,6 +52,13 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include +#include +#include +#include +#include +#include static struct g_bioq g_bio_run_down; static struct g_bioq g_bio_run_up; @@ -180,12 +188,17 @@ g_clone_bio(struct bio *bp) /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. + * BIO_UNMAPPED should be inherited, to properly indicate + * which way the buffer is passed. * Other bio flags are not suitable for cloning. */ - bp2->bio_flags = bp->bio_flags & BIO_ORDERED; + bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; + bp2->bio_ma = bp->bio_ma; + bp2->bio_ma_n = bp->bio_ma_n; + bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; /* Inherit classification info from the parent */ bp2->bio_classifier1 = bp->bio_classifier1; @@ -210,11 +223,15 @@ g_duplicate_bio(struct bio *bp) struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); + bp2->bio_flags = bp->bio_flags & BIO_UNMAPPED; bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; + bp2->bio_ma = bp->bio_ma; + bp2->bio_ma_n = bp->bio_ma_n; + bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR @@ -575,6 +592,76 @@ g_io_deliver(struct bio *bp, int error) return; } +SYSCTL_DECL(_kern_geom); + +static long transient_maps; +SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, + &transient_maps, 0, + ""); +int transient_map_retries; +SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RD, + &transient_map_retries, 0, + ""); +int transient_map_failures; +SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_failures, CTLFLAG_RD, + &transient_map_failures, 0, + ""); +int inflight_transient_maps; +SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, + &inflight_transient_maps, 0, + ""); + +static int +g_io_transient_map_bio(struct bio *bp) +{ + vm_offset_t addr; + long size; + int retried, rv; + + size = round_page(bp->bio_ma_offset + bp->bio_length); + KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp)); + addr = 0; + retried = 0; + 
atomic_add_long(&transient_maps, 1); +retry: + vm_map_lock(bio_transient_map); + if (vm_map_findspace(bio_transient_map, vm_map_min(bio_transient_map), + size, &addr)) { + vm_map_unlock(bio_transient_map); + if (retried >= 3) { + g_io_deliver(bp, EDEADLK/* XXXKIB */); + CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s", + bp, bp->bio_to->name); + atomic_add_int(&transient_map_failures, 1); + return (1); + } else { + /* + * Naive attempt to quisce the I/O to get more + * in-flight requests completed and defragment + * the bio_transient_map. + */ + CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", + bp, bp->bio_to->name, retried); + pause("g_d_tra", hz / 10); + retried++; + atomic_add_int(&transient_map_retries, 1); + goto retry; + } + } + rv = vm_map_insert(bio_transient_map, NULL, 0, addr, addr + size, + VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); + KASSERT(rv == KERN_SUCCESS, + ("vm_map_insert(bio_transient_map) rv %d %jx %lx", + rv, (uintmax_t)addr, size)); + vm_map_unlock(bio_transient_map); + atomic_add_int(&inflight_transient_maps, 1); + pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); + bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; + bp->bio_flags |= BIO_TRANSIENT_MAPPING; + bp->bio_flags &= ~BIO_UNMAPPED; + return (0); +} + void g_io_schedule_down(struct thread *tp __unused) { @@ -636,6 +723,12 @@ g_io_schedule_down(struct thread *tp __unused) default: break; } + if ((bp->bio_flags & BIO_UNMAPPED) != 0 && + (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && + (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { + if (g_io_transient_map_bio(bp)) + continue; + } THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, diff --git a/sys/geom/geom_vfs.c b/sys/geom/geom_vfs.c index bbed550..92f1ad2 100644 --- a/sys/geom/geom_vfs.c +++ b/sys/geom/geom_vfs.c @@ -188,14 +188,14 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp) bip = g_alloc_bio(); bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; - bip->bio_data = bp->b_data; - bip->bio_done = g_vfs_done; - bip->bio_caller2 = bp; bip->bio_length = bp->b_bcount; - if (bp->b_flags & B_BARRIER) { + bdata2bio(bp, bip); + if ((bp->b_flags & B_BARRIER) != 0) { bip->bio_flags |= BIO_ORDERED; bp->b_flags &= ~B_BARRIER; } + bip->bio_done = g_vfs_done; + bip->bio_caller2 = bp; g_io_request(bip, cp); } diff --git a/sys/geom/part/g_part.c b/sys/geom/part/g_part.c index e2ba79e..7650499 100644 --- a/sys/geom/part/g_part.c +++ b/sys/geom/part/g_part.c @@ -427,6 +427,7 @@ g_part_new_provider(struct g_geom *gp, struct g_part_table *table, entry->gpe_pp->stripeoffset = pp->stripeoffset + entry->gpe_offset; if (pp->stripesize > 0) entry->gpe_pp->stripeoffset %= pp->stripesize; + entry->gpe_pp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED; g_error_provider(entry->gpe_pp, 0); } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 5fee565..27548dc 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -4240,6 +4240,49 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) mtx_unlock(&sysmaps->lock); } +void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + struct sysmaps *sysmaps; + vm_page_t a_pg, b_pg; + char *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP1 != 0) + panic("pmap_copy_pages: CMAP1 busy"); + if (*sysmaps->CMAP2 != 0) + panic("pmap_copy_pages: CMAP2 
busy"); + sched_pin(); + while (xfersize > 0) { + invlpg((u_int)sysmaps->CADDR1); + invlpg((u_int)sysmaps->CADDR2); + a_pg = ma[a_offset >> PAGE_SHIFT]; + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + b_pg = mb[b_offset >> PAGE_SHIFT]; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | + pmap_cache_bits(b_pg->md.pat_mode, 0); + *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | + PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); + a_cp = sysmaps->CADDR1 + a_pg_offset; + b_cp = sysmaps->CADDR2 + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } + *sysmaps->CMAP1 = 0; + *sysmaps->CMAP2 = 0; + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c index a8f11a4..28ba21b 100644 --- a/sys/i386/xen/pmap.c +++ b/sys/i386/xen/pmap.c @@ -3448,6 +3448,46 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) mtx_unlock(&sysmaps->lock); } +void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + struct sysmaps *sysmaps; + vm_page_t a_pg, b_pg; + char *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP1 != 0) + panic("pmap_copy_pages: CMAP1 busy"); + if (*sysmaps->CMAP2 != 0) + panic("pmap_copy_pages: CMAP2 busy"); + sched_pin(); + while (xfersize > 0) { + a_pg = ma[a_offset >> PAGE_SHIFT]; + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + b_pg = mb[b_offset >> PAGE_SHIFT]; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + PT_SET_MA(sysmaps->CADDR1, PG_V | VM_PAGE_TO_MACH(a_pg) | PG_A); + PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | + VM_PAGE_TO_MACH(b_pg) | PG_A | PG_M); + a_cp = sysmaps->CADDR1 + a_pg_offset; + b_cp = sysmaps->CADDR2 + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } + PT_SET_MA(sysmaps->CADDR1, 0); + PT_SET_MA(sysmaps->CADDR2, 0); + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may diff --git a/sys/ia64/ia64/pmap.c b/sys/ia64/ia64/pmap.c index 594f8c6..28610c6 100644 --- a/sys/ia64/ia64/pmap.c +++ b/sys/ia64/ia64/pmap.c @@ -2014,6 +2014,30 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst) bcopy(src, dst, PAGE_SIZE); } +void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + void *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + a_cp = (char *)pmap_page_to_va(ma[a_offset >> PAGE_SHIFT]) + + a_pg_offset; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + b_cp = (char *)pmap_page_to_va(mb[b_offset >> PAGE_SHIFT]) + + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } +} + /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. 
This count may diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c index 34072f3..922ebb6 100644 --- a/sys/kern/kern_physio.c +++ b/sys/kern/kern_physio.c @@ -92,7 +92,7 @@ physio(struct cdev *dev, struct uio *uio, int ioflag) bp->b_blkno = btodb(bp->b_offset); if (uio->uio_segflg == UIO_USERSPACE) - if (vmapbuf(bp) < 0) { + if (vmapbuf(bp, 0) < 0) { error = EFAULT; goto doerror; } diff --git a/sys/kern/subr_bus_dma.c b/sys/kern/subr_bus_dma.c index 773d01a..1ca1f89 100644 --- a/sys/kern/subr_bus_dma.c +++ b/sys/kern/subr_bus_dma.c @@ -126,11 +126,27 @@ static int _bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio, int *nsegs, int flags) { - int error; + vm_paddr_t paddr; + bus_size_t len, tlen; + int error, i, ma_offs; - error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data, - bio->bio_bcount, kernel_pmap, flags, NULL, nsegs); + if ((bio->bio_flags & BIO_UNMAPPED) == 0) { + error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data, + bio->bio_bcount, kernel_pmap, flags, NULL, nsegs); + return (error); + } + tlen = bio->bio_bcount; + ma_offs = bio->bio_ma_offset; + for (i = 0; tlen > 0; i++, tlen -= len) { + len = min(PAGE_SIZE - ma_offs, tlen); + paddr = VM_PAGE_TO_PHYS(bio->bio_ma[i]) + ma_offs; + error = _bus_dmamap_load_phys(dmat, map, paddr, len, + flags, NULL, nsegs); + if (error != 0) + break; + ma_offs = 0; + } return (error); } diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index f36c769..1fb344e 100644 --- a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -91,6 +91,7 @@ int maxfilesperproc; /* per-proc open files limit */ int msgbufsize; /* size of kernel message buffer */ int ncallout; /* maximum # of timer events */ int nbuf; +int bio_transient_maxcnt; int ngroups_max; /* max # groups per process */ int nswbuf; pid_t pid_max = PID_MAX; @@ -119,6 +120,9 @@ SYSCTL_LONG(_kern, OID_AUTO, maxswzone, CTLFLAG_RDTUN, &maxswzone, 0, "Maximum memory for swap metadata"); SYSCTL_LONG(_kern, OID_AUTO, maxbcache, CTLFLAG_RDTUN, &maxbcache, 0, "Maximum value of vfs.maxbufspace"); +SYSCTL_INT(_kern, OID_AUTO, bio_transient_maxcnt, CTLFLAG_RDTUN, + &bio_transient_maxcnt, 0, + "Maximum number of transient BIOs mappings"); SYSCTL_ULONG(_kern, OID_AUTO, maxtsiz, CTLFLAG_RW | CTLFLAG_TUN, &maxtsiz, 0, "Maximum text size"); SYSCTL_ULONG(_kern, OID_AUTO, dfldsiz, CTLFLAG_RW | CTLFLAG_TUN, &dfldsiz, 0, @@ -321,6 +325,7 @@ init_param2(long physpages) */ nbuf = NBUF; TUNABLE_INT_FETCH("kern.nbuf", &nbuf); + TUNABLE_INT_FETCH("kern.bio_transient_maxcnt", &bio_transient_maxcnt); /* * XXX: Does the callout wheel have to be so big? diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index 99b0197..ae6ae8e 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -1322,7 +1322,7 @@ aio_qphysio(struct proc *p, struct aiocblist *aiocbe) /* * Bring buffer into kernel space. */ - if (vmapbuf(bp) < 0) { + if (vmapbuf(bp, 1) < 0) { error = EFAULT; goto doerror; } diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 6393399..a53f16b 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -91,6 +91,7 @@ struct buf_ops buf_ops_bio = { * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. 
*/ struct buf *buf; /* buffer header pool */ +caddr_t unmapped_buf; static struct proc *bufdaemonproc; @@ -131,6 +132,10 @@ SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Virtual memory used for buffers"); #endif +static long unmapped_bufspace; +SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD, + &unmapped_bufspace, 0, + "Amount of unmapped buffers, inclusive in the bufspace"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, "Maximum allowed value of bufspace (including buf_daemon)"); @@ -200,6 +205,10 @@ SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer aquisition"); +static int mappingrestarts; +SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, + "Number of times getblk has had to restart a buffer mapping for " + "unmapped buffer"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); @@ -280,6 +289,9 @@ static struct mtx nblock; /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; +#ifdef INVARIANTS +static int bq_len[BUFFER_QUEUES]; +#endif /* Lock for the bufqueues */ static struct mtx bqlock; @@ -510,7 +522,7 @@ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; - long maxbuf; + long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * physmem_est is in pages. Convert it to kilobytes (assumes @@ -554,6 +566,52 @@ kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) } /* + * Ideal allocation size for the transient bio submap if 10% + * of the maximal space buffer map. This roughly corresponds + * to the amount of the buffer mapped for typical UFS load. + * + * Clip the buffer map to reserve space for the transient + * BIOs, if its extent is bigger than 90% of the maximum + * buffer map extent on the platform. + * + * The fall-back to the maxbuf in case of maxbcache unset, + * allows to not trim the buffer KVA for the architectures + * with ample KVA space. + */ + if (bio_transient_maxcnt == 0) { + maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; + buf_sz = (long)nbuf * BKVASIZE; + if (buf_sz < maxbuf_sz / 10 * 9) { + /* + * There is more KVA than memory. Do not + * adjust buffer map size, and assign the rest + * of maxbuf to transient map. + */ + biotmap_sz = maxbuf_sz - buf_sz; + } else { + /* + * Buffer map spans all KVA we could afford on + * this platform. Give 10% of the buffer map + * to the transient bio map. + */ + biotmap_sz = buf_sz / 10; + buf_sz -= biotmap_sz; + } + if (biotmap_sz / INT_MAX > MAXPHYS) + bio_transient_maxcnt = INT_MAX; + else + bio_transient_maxcnt = biotmap_sz / MAXPHYS; + /* + * Artifically limit to 1024 simultaneous in-flight I/Os + * using the transient mapping. + */ + if (bio_transient_maxcnt > 1024) + bio_transient_maxcnt = 1024; + if (tuned_nbuf) + nbuf = buf_sz / BKVASIZE; + } + + /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. 
*/ @@ -606,6 +664,9 @@ bufinit(void) LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); +#ifdef INVARIANTS + bq_len[QUEUE_EMPTY]++; +#endif } /* @@ -674,6 +735,55 @@ bufinit(void) bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); + unmapped_buf = (caddr_t)kmem_alloc_nofault(kernel_map, MAXPHYS); +} + +#ifdef INVARIANTS +static inline void +vfs_buf_check_mapped(struct buf *bp) +{ + + KASSERT((bp->b_flags & B_UNMAPPED) == 0, + ("mapped buf %p %x", bp, bp->b_flags)); + KASSERT(bp->b_kvabase != unmapped_buf, + ("mapped buf: b_kvabase was not updated %p", bp)); + KASSERT(bp->b_data != unmapped_buf, + ("mapped buf: b_data was not updated %p", bp)); +} + +static inline void +vfs_buf_check_unmapped(struct buf *bp) +{ + + KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED, + ("unmapped buf %p %x", bp, bp->b_flags)); + KASSERT(bp->b_kvabase == unmapped_buf, + ("unmapped buf: corrupted b_kvabase %p", bp)); + KASSERT(bp->b_data == unmapped_buf, + ("unmapped buf: corrupted b_data %p", bp)); +} + +#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) +#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) +#else +#define BUF_CHECK_MAPPED(bp) do {} while (0) +#define BUF_CHECK_UNMAPPED(bp) do {} while (0) +#endif + +static void +bpmap_qenter(struct buf *bp) +{ + + BUF_CHECK_MAPPED(bp); + + /* + * bp->b_data is relative to bp->b_offset, but + * bp->b_offset may be offset into the first page. + */ + bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); + pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); + bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | + (vm_offset_t)(bp->b_offset & PAGE_MASK)); } /* @@ -685,14 +795,26 @@ static void bfreekva(struct buf *bp) { - if (bp->b_kvasize) { - atomic_add_int(&buffreekvacnt, 1); - atomic_subtract_long(&bufspace, bp->b_kvasize); - vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase, - (vm_offset_t) bp->b_kvabase + bp->b_kvasize); - bp->b_kvasize = 0; - bufspacewakeup(); + if (bp->b_kvasize == 0) + return; + + atomic_add_int(&buffreekvacnt, 1); + atomic_subtract_long(&bufspace, bp->b_kvasize); + if ((bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvabase, + (vm_offset_t)bp->b_kvabase + bp->b_kvasize); + } else { + BUF_CHECK_UNMAPPED(bp); + if ((bp->b_flags & B_KVAALLOC) != 0) { + vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvaalloc, + (vm_offset_t)bp->b_kvaalloc + bp->b_kvasize); + } + atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize); + bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC); } + bp->b_kvasize = 0; + bufspacewakeup(); } /* @@ -759,6 +881,11 @@ bremfreel(struct buf *bp) mtx_assert(&bqlock, MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); +#ifdef INVARIANTS + KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", + bp->b_qindex)); + bq_len[bp->b_qindex]--; +#endif bp->b_qindex = QUEUE_NONE; /* * If this was a delayed bremfree() we only need to remove the buffer @@ -829,9 +956,8 @@ breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, * getblk(). Also starts asynchronous I/O on read-ahead blocks. 
*/ int -breadn_flags(struct vnode * vp, daddr_t blkno, int size, - daddr_t * rablkno, int *rabsize, int cnt, - struct ucred * cred, int flags, struct buf **bpp) +breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, + int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp) { struct buf *bp; int rv = 0, readwait = 0; @@ -1405,7 +1531,8 @@ brelse(struct buf *bp) } } - if ((bp->b_flags & B_INVAL) == 0) { + if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) { + BUF_CHECK_MAPPED(bp); pmap_qenter( trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); @@ -1506,11 +1633,17 @@ brelse(struct buf *bp) bp->b_qindex = QUEUE_DIRTY; else bp->b_qindex = QUEUE_CLEAN; - if (bp->b_flags & B_AGE) - TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); - else - TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); + if (bp->b_flags & B_AGE) { + TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, + b_freelist); + } else { + TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, + b_freelist); + } } +#ifdef INVARIANTS + bq_len[bp->b_qindex]++; +#endif mtx_unlock(&bqlock); /* @@ -1601,6 +1734,9 @@ bqrelse(struct buf *bp) if (bp->b_flags & B_DELWRI) { bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); +#ifdef INVARIANTS + bq_len[bp->b_qindex]++; +#endif } else { /* * The locking of the BO_LOCK for checking of the @@ -1613,6 +1749,9 @@ bqrelse(struct buf *bp) bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); +#ifdef INVARIANTS + bq_len[QUEUE_CLEAN]++; +#endif } else { /* * We are too low on memory, we have to try to free @@ -1654,7 +1793,11 @@ vfs_vmio_release(struct buf *bp) int i; vm_page_t m; - pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); + if ((bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); + } else + BUF_CHECK_UNMAPPED(bp); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; @@ -1758,8 +1901,10 @@ vfs_bio_awrite(struct buf *bp) int nwritten; int size; int maxcl; + int gbflags; bo = &vp->v_bufobj; + gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster @@ -1790,8 +1935,9 @@ vfs_bio_awrite(struct buf *bp) */ if (ncl != 1) { BUF_UNLOCK(bp); - nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); - return nwritten; + nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, + gbflags); + return (nwritten); } } bremfree(bp); @@ -1807,46 +1953,206 @@ vfs_bio_awrite(struct buf *bp) return nwritten; } +static void +setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags) +{ + + KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 && + bp->b_kvasize == 0, ("call bfreekva(%p)", bp)); + if ((gbflags & GB_UNMAPPED) == 0) { + bp->b_kvabase = (caddr_t)addr; + } else if ((gbflags & GB_KVAALLOC) != 0) { + KASSERT((gbflags & GB_UNMAPPED) != 0, + ("GB_KVAALLOC without GB_UNMAPPED")); + bp->b_kvaalloc = (caddr_t)addr; + bp->b_flags |= B_UNMAPPED | B_KVAALLOC; + atomic_add_long(&unmapped_bufspace, bp->b_kvasize); + } + bp->b_kvasize = maxsize; +} + /* - * getnewbuf: - * - * Find and initialize a new buffer header, freeing up existing buffers - * in the bufqueues as necessary. The new buffer is returned locked. - * - * Important: B_INVAL is not set. 
If the caller wishes to throw the - * buffer away, the caller must set B_INVAL prior to calling brelse(). - * - * We block if: - * We have insufficient buffer headers - * We have insufficient buffer space - * buffer_map is too fragmented ( space reservation fails ) - * If we have to flush dirty buffers ( but we try to avoid this ) - * - * To avoid VFS layer recursion we do not flush dirty buffers ourselves. - * Instead we ask the buf daemon to do it for us. We attempt to - * avoid piecemeal wakeups of the pageout daemon. + * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if + * needed. */ +static int +allocbufkva(struct buf *bp, int maxsize, int gbflags) +{ + vm_offset_t addr; + int rv; -static struct buf * -getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, - int gbflags) + bfreekva(bp); + addr = 0; + + vm_map_lock(buffer_map); + if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize, + &addr)) { + vm_map_unlock(buffer_map); + /* + * Buffer map is too fragmented. Request the caller + * to defragment the map. + */ + atomic_add_int(&bufdefragcnt, 1); + return (1); + } + rv = vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, + VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); + KASSERT(rv == KERN_SUCCESS, ("vm_map_insert(buffer_map) rv %d", rv)); + vm_map_unlock(buffer_map); + setbufkva(bp, addr, maxsize, gbflags); + atomic_add_long(&bufspace, bp->b_kvasize); + return (0); +} + +/* + * Ask the bufdaemon for help, or act as bufdaemon itself, when a + * locked vnode is supplied. + */ +static void +getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo, + int defrag) { struct thread *td; - struct buf *bp; - struct buf *nbp; - int defrag = 0; - int nqindex; - static int flushingbufs; + char *waitmsg; + int fl, flags, norunbuf; + + mtx_assert(&bqlock, MA_OWNED); + + if (defrag) { + flags = VFS_BIO_NEED_BUFSPACE; + waitmsg = "nbufkv"; + } else if (bufspace >= hibufspace) { + waitmsg = "nbufbs"; + flags = VFS_BIO_NEED_BUFSPACE; + } else { + waitmsg = "newbuf"; + flags = VFS_BIO_NEED_ANY; + } + mtx_lock(&nblock); + needsbuffer |= flags; + mtx_unlock(&nblock); + mtx_unlock(&bqlock); + + bd_speedup(); /* heeeelp */ + if ((gbflags & GB_NOWAIT_BD) != 0) + return; td = curthread; + mtx_lock(&nblock); + while (needsbuffer & flags) { + if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) { + mtx_unlock(&nblock); + /* + * getblk() is called with a vnode locked, and + * some majority of the dirty buffers may as + * well belong to the vnode. Flushing the + * buffers there would make a progress that + * cannot be achieved by the buf_daemon, that + * cannot lock the vnode. + */ + norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | + (td->td_pflags & TDP_NORUNNINGBUF); + /* play bufdaemon */ + td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; + fl = buf_do_flush(vp); + td->td_pflags &= norunbuf; + mtx_lock(&nblock); + if (fl != 0) + continue; + if ((needsbuffer & flags) == 0) + break; + } + if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag, + waitmsg, slptimeo)) + break; + } + mtx_unlock(&nblock); +} + +static void +getnewbuf_reuse_bp(struct buf *bp, int qindex) +{ + + CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " + "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, + bp->b_kvasize, bp->b_bufsize, qindex); + mtx_assert(&bqlock, MA_NOTOWNED); + /* - * We can't afford to block since we might be holding a vnode lock, - * which may prevent system daemons from running. 
We deal with - * low-memory situations by proactively returning memory and running - * async I/O rather then sync I/O. + * Note: we no longer distinguish between VMIO and non-VMIO + * buffers. */ - atomic_add_int(&getnewbufcalls, 1); - atomic_subtract_int(&getnewbufrestarts, 1); + KASSERT((bp->b_flags & B_DELWRI) == 0, + ("delwri buffer %p found in queue %d", bp, qindex)); + + if (qindex == QUEUE_CLEAN) { + if (bp->b_flags & B_VMIO) { + bp->b_flags &= ~B_ASYNC; + vfs_vmio_release(bp); + } + if (bp->b_vp != NULL) + brelvp(bp); + } + + /* + * Get the rest of the buffer freed up. b_kva* is still valid + * after this operation. + */ + + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (!LIST_EMPTY(&bp->b_dep)) + buf_deallocate(bp); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("losing buffer 3"); + KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d", + bp, bp->b_vp, qindex)); + KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, + ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); + + if (bp->b_bufsize) + allocbuf(bp, 0); + + bp->b_flags &= B_UNMAPPED | B_KVAALLOC; + bp->b_ioflags = 0; + bp->b_xflags = 0; + KASSERT((bp->b_vflags & BV_INFREECNT) == 0, + ("buf %p still counted as free?", bp)); + bp->b_vflags = 0; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_offset = NOOFFSET; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_bufobj = NULL; + bp->b_pin_count = 0; + bp->b_fsprivate1 = NULL; + bp->b_fsprivate2 = NULL; + bp->b_fsprivate3 = NULL; + + LIST_INIT(&bp->b_dep); +} + +static int flushingbufs; + +static struct buf * +getnewbuf_scan(int maxsize, int defrag, int unmapped) +{ + struct buf *bp, *nbp; + int nqindex, qindex; + + KASSERT(!unmapped || !defrag, ("both unmapped and defrag")); + restart: atomic_add_int(&getnewbufrestarts, 1); @@ -1856,63 +2162,80 @@ restart: * that if we are specially marked process, we are allowed to * dip into our reserves. * - * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN + * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN + * for the allocation of the mapped buffer. For unmapped, the + * easiest is to start with EMPTY outright. * * We start with EMPTYKVA. If the list is empty we backup to EMPTY. * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ + nbp = NULL; mtx_lock(&bqlock); - nqindex = QUEUE_EMPTYKVA; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); - + if (!defrag && unmapped) { + nqindex = QUEUE_EMPTY; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + } if (nbp == NULL) { - /* - * If no EMPTYKVA buffers and we are either - * defragging or reusing, locate a CLEAN buffer - * to free or reuse. If bufspace useage is low - * skip this step so we can allocate a new buffer. - */ - if (defrag || bufspace >= lobufspace) { - nqindex = QUEUE_CLEAN; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); - } + nqindex = QUEUE_EMPTYKVA; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); + } - /* - * If we could not find or were not allowed to reuse a - * CLEAN buffer, check to see if it is ok to use an EMPTY - * buffer. We can only use an EMPTY buffer if allocating - * its KVA would not otherwise run us out of buffer space. 
- */ - if (nbp == NULL && defrag == 0 && - bufspace + maxsize < hibufspace) { - nqindex = QUEUE_EMPTY; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); - } + /* + * If no EMPTYKVA buffers and we are either defragging or + * reusing, locate a CLEAN buffer to free or reuse. If + * bufspace useage is low skip this step so we can allocate a + * new buffer. + */ + if (nbp == NULL && (defrag || bufspace >= lobufspace)) { + nqindex = QUEUE_CLEAN; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); + } + + /* + * If we could not find or were not allowed to reuse a CLEAN + * buffer, check to see if it is ok to use an EMPTY buffer. + * We can only use an EMPTY buffer if allocating its KVA would + * not otherwise run us out of buffer space. No KVA is needed + * for the unmapped allocation. + */ + if (nbp == NULL && defrag == 0 && bufspace + maxsize < hibufspace) { + nqindex = QUEUE_EMPTY; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + } + + /* + * All available buffers might be clean, retry ignoring the + * lobufspace as the last resort. + */ + if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) { + nqindex = QUEUE_CLEAN; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ - while ((bp = nbp) != NULL) { - int qindex = nqindex; + qindex = nqindex; /* - * Calculate next bp ( we can only use it if we do not block - * or do other fancy things ). + * Calculate next bp (we can only use it if we do not + * block or do other fancy things). */ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch(qindex) { case QUEUE_EMPTY: nqindex = QUEUE_EMPTYKVA; - if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); + if (nbp != NULL) break; /* FALLTHROUGH */ case QUEUE_EMPTYKVA: nqindex = QUEUE_CLEAN; - if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); + if (nbp != NULL) break; /* FALLTHROUGH */ case QUEUE_CLEAN: @@ -1948,22 +2271,9 @@ restart: } BO_UNLOCK(bp->b_bufobj); } - CTR6(KTR_BUF, - "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " - "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, - bp->b_kvasize, bp->b_bufsize, qindex); - /* - * Sanity Checks - */ - KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); - - /* - * Note: we no longer distinguish between VMIO and non-VMIO - * buffers. - */ - - KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); + KASSERT(bp->b_qindex == qindex, + ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); if (bp->b_bufobj != NULL) BO_LOCK(bp->b_bufobj); @@ -1971,68 +2281,13 @@ restart: if (bp->b_bufobj != NULL) BO_UNLOCK(bp->b_bufobj); mtx_unlock(&bqlock); - - if (qindex == QUEUE_CLEAN) { - if (bp->b_flags & B_VMIO) { - bp->b_flags &= ~B_ASYNC; - vfs_vmio_release(bp); - } - if (bp->b_vp) - brelvp(bp); - } - /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. - * - * Get the rest of the buffer freed up. b_kva* is still - * valid after this operation. */ - if (bp->b_rcred != NOCRED) { - crfree(bp->b_rcred); - bp->b_rcred = NOCRED; - } - if (bp->b_wcred != NOCRED) { - crfree(bp->b_wcred); - bp->b_wcred = NOCRED; - } - if (!LIST_EMPTY(&bp->b_dep)) - buf_deallocate(bp); - if (bp->b_vflags & BV_BKGRDINPROG) - panic("losing buffer 3"); - KASSERT(bp->b_vp == NULL, - ("bp: %p still has vnode %p. 
qindex: %d", - bp, bp->b_vp, qindex)); - KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, - ("bp: %p still on a buffer list. xflags %X", - bp, bp->b_xflags)); - - if (bp->b_bufsize) - allocbuf(bp, 0); - - bp->b_flags = 0; - bp->b_ioflags = 0; - bp->b_xflags = 0; - KASSERT((bp->b_vflags & BV_INFREECNT) == 0, - ("buf %p still counted as free?", bp)); - bp->b_vflags = 0; - bp->b_vp = NULL; - bp->b_blkno = bp->b_lblkno = 0; - bp->b_offset = NOOFFSET; - bp->b_iodone = 0; - bp->b_error = 0; - bp->b_resid = 0; - bp->b_bcount = 0; - bp->b_npages = 0; - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_bufobj = NULL; - bp->b_pin_count = 0; - bp->b_fsprivate1 = NULL; - bp->b_fsprivate2 = NULL; - bp->b_fsprivate3 = NULL; - - LIST_INIT(&bp->b_dep); + getnewbuf_reuse_bp(bp, qindex); + mtx_assert(&bqlock, MA_NOTOWNED); /* * If we are defragging then free the buffer. @@ -2073,6 +2328,52 @@ restart: flushingbufs = 0; break; } + return (bp); +} + +/* + * getnewbuf: + * + * Find and initialize a new buffer header, freeing up existing buffers + * in the bufqueues as necessary. The new buffer is returned locked. + * + * Important: B_INVAL is not set. If the caller wishes to throw the + * buffer away, the caller must set B_INVAL prior to calling brelse(). + * + * We block if: + * We have insufficient buffer headers + * We have insufficient buffer space + * buffer_map is too fragmented ( space reservation fails ) + * If we have to flush dirty buffers ( but we try to avoid this ) + * + * To avoid VFS layer recursion we do not flush dirty buffers ourselves. + * Instead we ask the buf daemon to do it for us. We attempt to + * avoid piecemeal wakeups of the pageout daemon. + */ +static struct buf * +getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, + int gbflags) +{ + struct buf *bp; + int defrag; + + KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, + ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); + + defrag = 0; + /* + * We can't afford to block since we might be holding a vnode lock, + * which may prevent system daemons from running. We deal with + * low-memory situations by proactively returning memory and running + * async I/O rather then sync I/O. + */ + atomic_add_int(&getnewbufcalls, 1); + atomic_subtract_int(&getnewbufrestarts, 1); +restart: + bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED | + GB_KVAALLOC)) == GB_UNMAPPED); + if (bp != NULL) + defrag = 0; /* * If we exhausted our list, sleep as appropriate. We may have to @@ -2080,65 +2381,23 @@ restart: * * Generally we are sleeping due to insufficient buffer space. */ - if (bp == NULL) { - int flags, norunbuf; - char *waitmsg; - int fl; - - if (defrag) { - flags = VFS_BIO_NEED_BUFSPACE; - waitmsg = "nbufkv"; - } else if (bufspace >= hibufspace) { - waitmsg = "nbufbs"; - flags = VFS_BIO_NEED_BUFSPACE; - } else { - waitmsg = "newbuf"; - flags = VFS_BIO_NEED_ANY; - } - mtx_lock(&nblock); - needsbuffer |= flags; - mtx_unlock(&nblock); - mtx_unlock(&bqlock); - - bd_speedup(); /* heeeelp */ - if (gbflags & GB_NOWAIT_BD) - return (NULL); - - mtx_lock(&nblock); - while (needsbuffer & flags) { - if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) { - mtx_unlock(&nblock); - /* - * getblk() is called with a vnode - * locked, and some majority of the - * dirty buffers may as well belong to - * the vnode. Flushing the buffers - * there would make a progress that - * cannot be achieved by the - * buf_daemon, that cannot lock the - * vnode. 
- */ - norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | - (td->td_pflags & TDP_NORUNNINGBUF); - /* play bufdaemon */ - td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; - fl = buf_do_flush(vp); - td->td_pflags &= norunbuf; - mtx_lock(&nblock); - if (fl != 0) - continue; - if ((needsbuffer & flags) == 0) - break; - } - if (msleep(&needsbuffer, &nblock, - (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) { - mtx_unlock(&nblock); - return (NULL); - } - } - mtx_unlock(&nblock); + mtx_assert(&bqlock, MA_OWNED); + getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag); + mtx_assert(&bqlock, MA_NOTOWNED); + } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) { + mtx_assert(&bqlock, MA_NOTOWNED); + + bfreekva(bp); + bp->b_flags |= B_UNMAPPED; + bp->b_kvabase = bp->b_data = unmapped_buf; + bp->b_kvasize = maxsize; + atomic_add_long(&bufspace, bp->b_kvasize); + atomic_add_long(&unmapped_bufspace, bp->b_kvasize); + atomic_add_int(&bufreusecnt, 1); } else { + mtx_assert(&bqlock, MA_NOTOWNED); + /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. In order @@ -2147,39 +2406,46 @@ restart: */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; - if (maxsize != bp->b_kvasize) { - vm_offset_t addr = 0; - int rv; - - bfreekva(bp); - - vm_map_lock(buffer_map); - if (vm_map_findspace(buffer_map, - vm_map_min(buffer_map), maxsize, &addr)) { - /* - * Buffer map is too fragmented. - * We must defragment the map. - */ - atomic_add_int(&bufdefragcnt, 1); - vm_map_unlock(buffer_map); + if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED | + B_KVAALLOC)) == B_UNMAPPED) { + if (allocbufkva(bp, maxsize, gbflags)) { defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } - rv = vm_map_insert(buffer_map, NULL, 0, addr, - addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, - MAP_NOFAULT); - KASSERT(rv == KERN_SUCCESS, - ("vm_map_insert(buffer_map) rv %d", rv)); - vm_map_unlock(buffer_map); - bp->b_kvabase = (caddr_t)addr; - bp->b_kvasize = maxsize; - atomic_add_long(&bufspace, bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); + } else if ((bp->b_flags & B_KVAALLOC) != 0 && + (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) { + /* + * If the reused buffer has KVA allocated, + * reassign b_kvaalloc to b_kvabase. + */ + bp->b_kvabase = bp->b_kvaalloc; + bp->b_flags &= ~B_KVAALLOC; + atomic_subtract_long(&unmapped_bufspace, + bp->b_kvasize); + atomic_add_int(&bufreusecnt, 1); + } else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 && + (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_KVAALLOC) { + /* + * The case of reused buffer already have KVA + * mapped, but the request is for unmapped + * buffer with KVA allocated. + */ + bp->b_kvaalloc = bp->b_kvabase; + bp->b_data = bp->b_kvabase = unmapped_buf; + bp->b_flags |= B_UNMAPPED | B_KVAALLOC; + atomic_add_long(&unmapped_bufspace, + bp->b_kvasize); + atomic_add_int(&bufreusecnt, 1); + } + if ((gbflags & GB_UNMAPPED) == 0) { + bp->b_saveaddr = bp->b_kvabase; + bp->b_data = bp->b_saveaddr; + bp->b_flags &= ~B_UNMAPPED; + BUF_CHECK_MAPPED(bp); } - bp->b_saveaddr = bp->b_kvabase; - bp->b_data = bp->b_saveaddr; } return (bp); } @@ -2590,6 +2856,90 @@ vfs_setdirty_locked_object(struct buf *bp) } /* + * Allocate the KVA mapping for an existing buffer. It handles the + * cases of both B_UNMAPPED buffer, and buffer with the preallocated + * KVA which is not mapped (B_KVAALLOC). 
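The tail of getnewbuf() now has to reconcile the buffer's existing KVA state (B_UNMAPPED, B_KVAALLOC) with what the caller requested (GB_UNMAPPED, GB_KVAALLOC). The user-space sketch below models that decision ladder; the flag values are taken from the patch, but struct fakebuf, unmapped_marker and reconcile_kva() are invented stand-ins, not the kernel code.

#include <stdio.h>

/* Flag values as introduced by the patch. */
#define B_UNMAPPED  0x00000800  /* KVA is not mapped. */
#define B_KVAALLOC  0x00001000  /* But allocated. */
#define GB_UNMAPPED 0x0008      /* Do not map buffer pages. */
#define GB_KVAALLOC 0x0010      /* But allocate KVA. */

struct fakebuf {                /* invented stand-in for struct buf */
        int      flags;
        char    *kvabase;       /* mapped KVA, if any */
        char    *kvaalloc;      /* reserved-but-unmapped KVA, if any */
        char    *data;
};

static char unmapped_marker;    /* plays the role of unmapped_buf */

/*
 * Simplified decision ladder modelled on the reuse cases at the end of
 * getnewbuf(); returns a string naming the case taken.
 */
static const char *
reconcile_kva(struct fakebuf *bp, int gbflags)
{
        if ((bp->flags & B_KVAALLOC) != 0 &&
            (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
                /* Reused buffer already has spare KVA; map through it. */
                bp->kvabase = bp->data = bp->kvaalloc;
                bp->kvaalloc = NULL;
                bp->flags &= ~(B_UNMAPPED | B_KVAALLOC);
                return ("reassign reserved KVA to b_kvabase");
        }
        if ((bp->flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
            (gbflags & GB_UNMAPPED) != 0 && (gbflags & GB_KVAALLOC) != 0) {
                /* Mapped buffer reused for an unmapped request that
                 * still wants the KVA kept around. */
                bp->kvaalloc = bp->kvabase;
                bp->data = bp->kvabase = &unmapped_marker;
                bp->flags |= B_UNMAPPED | B_KVAALLOC;
                return ("stash mapping in b_kvaalloc, go unmapped");
        }
        return ("KVA already matches the request (or must be allocated)");
}

int
main(void)
{
        static char kva[16];
        struct fakebuf bp = { B_UNMAPPED | B_KVAALLOC, NULL, kva, NULL };

        puts(reconcile_kva(&bp, 0));
        return (0);
}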
+ */ +static void +bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) +{ + struct buf *scratch_bp; + int bsize, maxsize, need_mapping, need_kva; + off_t offset; + + need_mapping = (bp->b_flags & B_UNMAPPED) != 0 && + (gbflags & GB_UNMAPPED) == 0; + need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED && + (gbflags & GB_KVAALLOC) != 0; + if (!need_mapping && !need_kva) + return; + + BUF_CHECK_UNMAPPED(bp); + + if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) { + /* + * Buffer is not mapped, but the KVA was already + * reserved at the time of the instantiation. Use the + * allocated space. + */ + bp->b_flags &= ~B_KVAALLOC; + KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0")); + bp->b_kvabase = bp->b_kvaalloc; + atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize); + goto has_addr; + } + + /* + * Calculate the amount of the address space we would reserve + * if the buffer was mapped. + */ + bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; + offset = blkno * bsize; + maxsize = size + (offset & PAGE_MASK); + maxsize = imax(maxsize, bsize); + +mapping_loop: + if (allocbufkva(bp, maxsize, gbflags)) { + /* + * Request defragmentation. getnewbuf() returns us the + * allocated space by the scratch buffer KVA. + */ + scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags | + (GB_UNMAPPED | GB_KVAALLOC)); + if (scratch_bp == NULL) { + if ((gbflags & GB_NOWAIT_BD) != 0) { + /* + * XXXKIB: defragmentation cannot + * succeed, not sure what else to do. + */ + panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp); + } + atomic_add_int(&mappingrestarts, 1); + goto mapping_loop; + } + KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0, + ("scratch bp !B_KVAALLOC %p", scratch_bp)); + setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc, + scratch_bp->b_kvasize, gbflags); + + /* Get rid of the scratch buffer. */ + scratch_bp->b_kvasize = 0; + scratch_bp->b_flags |= B_INVAL | B_UNMAPPED; + scratch_bp->b_flags &= ~B_KVAALLOC; + brelse(scratch_bp); + } + if (!need_mapping) + return; + +has_addr: + bp->b_saveaddr = bp->b_kvabase; + bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */ + bp->b_flags &= ~B_UNMAPPED; + BUF_CHECK_MAPPED(bp); + bpmap_qenter(bp); +} + +/* * getblk: * * Get a block given a specified block and offset into a file/device. @@ -2626,14 +2976,17 @@ vfs_setdirty_locked_object(struct buf *bp) * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * -getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo, +getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; - int error; + int bsize, error, maxsize, vmio; + off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); + KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, + ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); @@ -2701,9 +3054,8 @@ loop: } /* - * check for size inconsistancies for non-VMIO case. + * check for size inconsistencies for non-VMIO case. */ - if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { @@ -2737,12 +3089,18 @@ loop: } /* + * Handle the case of unmapped buffer which should + * become mapped, or the buffer for which KVA + * reservation is requested. 
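Before bp_unmapped_get_kva() can reserve address space it computes how much KVA the mapped buffer would need: the block may start partway into a page, so the in-page offset is added to the request and the result is clamped to at least one block. A minimal user-space sketch of that arithmetic (PAGE_SIZE, the sample block size and kva_needed() are assumptions used only for illustration):

#include <stdio.h>

#define PAGE_SIZE       4096
#define PAGE_MASK       (PAGE_SIZE - 1)

static int
imax(int a, int b)
{
        return (a > b ? a : b);
}

/*
 * KVA needed to map "size" bytes of block "blkno" when blocks are
 * "bsize" bytes long; mirrors the maxsize computation in the patch.
 */
static int
kva_needed(long long blkno, int size, int bsize)
{
        long long offset;
        int maxsize;

        offset = blkno * bsize;                     /* byte offset of the block */
        maxsize = size + (int)(offset & PAGE_MASK); /* block may start mid-page */
        return (imax(maxsize, bsize));              /* never less than a block */
}

int
main(void)
{
        /* A 16K read of block 3 with 512-byte blocks starts 1536 bytes
         * into a page, so 16K + 1536 bytes of KVA are needed. */
        printf("%d\n", kva_needed(3, 16384, 512));
        return (0);
}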
+ */ + bp_unmapped_get_kva(bp, blkno, size, flags); + + /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ - if (bp->b_bcount != size) allocbuf(bp, size); @@ -2783,9 +3141,6 @@ loop: } bp->b_flags &= ~B_DONE; } else { - int bsize, maxsize, vmio; - off_t offset; - /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned @@ -2801,7 +3156,13 @@ loop: bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; offset = blkno * bsize; vmio = vp->v_object != NULL; - maxsize = vmio ? size + (offset & PAGE_MASK) : size; + if (vmio) { + maxsize = size + (offset & PAGE_MASK); + } else { + maxsize = size; + /* Do not allow non-VMIO notmapped buffers. */ + flags &= ~GB_UNMAPPED; + } maxsize = imax(maxsize, bsize); bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags); @@ -2857,6 +3218,7 @@ loop: KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); + BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); @@ -3031,10 +3393,14 @@ allocbuf(struct buf *bp, int size) if (desiredpages < bp->b_npages) { vm_page_t m; - pmap_qremove((vm_offset_t)trunc_page( - (vm_offset_t)bp->b_data) + - (desiredpages << PAGE_SHIFT), - (bp->b_npages - desiredpages)); + if ((bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + pmap_qremove((vm_offset_t)trunc_page( + (vm_offset_t)bp->b_data) + + (desiredpages << PAGE_SHIFT), + (bp->b_npages - desiredpages)); + } else + BUF_CHECK_UNMAPPED(bp); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); for (i = desiredpages; i < bp->b_npages; i++) { /* @@ -3140,21 +3506,12 @@ allocbuf(struct buf *bp, int size) VM_OBJECT_UNLOCK(obj); /* - * Step 3, fixup the KVM pmap. Remember that - * bp->b_data is relative to bp->b_offset, but - * bp->b_offset may be offset into the first page. + * Step 3, fixup the KVM pmap. 
*/ - - bp->b_data = (caddr_t) - trunc_page((vm_offset_t)bp->b_data); - pmap_qenter( - (vm_offset_t)bp->b_data, - bp->b_pages, - bp->b_npages - ); - - bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | - (vm_offset_t)(bp->b_offset & PAGE_MASK)); + if ((bp->b_flags & B_UNMAPPED) == 0) + bpmap_qenter(bp); + else + BUF_CHECK_UNMAPPED(bp); } } if (newbsize < bp->b_bufsize) @@ -3164,21 +3521,38 @@ allocbuf(struct buf *bp, int size) return 1; } +extern int inflight_transient_maps; + void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); + vm_offset_t start, end; + int transient; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; + if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { + start = trunc_page((vm_offset_t)bp->bio_data); + end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); + transient = 1; + } else { + transient = 0; + start = end = 0; + } done = bp->bio_done; if (done == NULL) wakeup(bp); mtx_unlock(mtxp); if (done != NULL) done(bp); + if (transient) { + pmap_qremove(start, OFF_TO_IDX(end - start)); + vm_map_remove(bio_transient_map, start, end); + atomic_add_int(&inflight_transient_maps, -1); + } } /* @@ -3281,7 +3655,7 @@ dev_strategy(struct cdev *dev, struct buf *bp) bip->bio_offset = bp->b_iooffset; bip->bio_length = bp->b_bcount; bip->bio_bcount = bp->b_bcount; /* XXX: remove */ - bip->bio_data = bp->b_data; + bdata2bio(bp, bip); bip->bio_done = bufdonebio; bip->bio_caller2 = bp; bip->bio_dev = dev; @@ -3435,9 +3809,11 @@ bufdone_finish(struct buf *bp) } vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); - if (bogus) + if (bogus && (bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } } /* @@ -3480,8 +3856,12 @@ vfs_unbusy_pages(struct buf *bp) if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; - pmap_qenter(trunc_page((vm_offset_t)bp->b_data), - bp->b_pages, bp->b_npages); + if ((bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), + bp->b_pages, bp->b_npages); + } else + BUF_CHECK_UNMAPPED(bp); } vm_object_pip_subtract(obj, 1); vm_page_io_finish(m); @@ -3646,9 +4026,11 @@ vfs_busy_pages(struct buf *bp, int clear_modify) foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_UNLOCK(obj); - if (bogus) + if (bogus && (bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } } /* @@ -3704,8 +4086,7 @@ vfs_bio_set_valid(struct buf *bp, int base, int size) void vfs_bio_clrbuf(struct buf *bp) { - int i, j, mask; - caddr_t sa, ea; + int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); @@ -3723,39 +4104,69 @@ vfs_bio_clrbuf(struct buf *bp) if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { - bzero(bp->b_data, bp->b_bufsize); + pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } - ea = sa = bp->b_data; - for(i = 0; i < bp->b_npages; i++, sa = ea) { - ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); - ea = (caddr_t)(vm_offset_t)ulmin( - (u_long)(vm_offset_t)ea, - (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); + sa = bp->b_offset & PAGE_MASK; + slide = 0; + for (i = 0; i < bp->b_npages; i++) { + slide = imin(slide + PAGE_SIZE, bp->b_bufsize + sa); + ea = slide & PAGE_MASK; + if (ea == 0) + ea = PAGE_SIZE; if (bp->b_pages[i] == 
bogus_page) continue; - j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; + j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) - bzero(sa, ea - sa); + pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { - if ((bp->b_pages[i]->valid & (1 << j)) == 0) - bzero(sa, DEV_BSIZE); + if ((bp->b_pages[i]->valid & (1 << j)) == 0) { + pmap_zero_page_area(bp->b_pages[i], + sa, DEV_BSIZE); + } } } bp->b_pages[i]->valid |= mask; + sa = 0; } unlock: VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } +void +vfs_bio_bzero_buf(struct buf *bp, int base, int size) +{ + vm_page_t m; + int i, n; + + if ((bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + bzero(bp->b_data + base, size); + } else { + BUF_CHECK_UNMAPPED(bp); + n = PAGE_SIZE - (base & PAGE_MASK); + VM_OBJECT_LOCK(bp->b_bufobj->bo_object); + for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { + m = bp->b_pages[i]; + if (n > size) + n = size; + pmap_zero_page_area(m, base & PAGE_MASK, n); + base += n; + size -= n; + n = PAGE_SIZE; + } + VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); + } +} + /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are @@ -3768,6 +4179,8 @@ vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) vm_page_t p; int index; + BUF_CHECK_MAPPED(bp); + to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; @@ -3799,6 +4212,8 @@ vm_hold_free_pages(struct buf *bp, int newbsize) vm_page_t p; int index, newnpages; + BUF_CHECK_MAPPED(bp); + from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) @@ -3829,7 +4244,7 @@ vm_hold_free_pages(struct buf *bp, int newbsize) * check the return value. */ int -vmapbuf(struct buf *bp) +vmapbuf(struct buf *bp, int mapbuf) { caddr_t kva; vm_prot_t prot; @@ -3844,12 +4259,19 @@ vmapbuf(struct buf *bp) (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); - pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx); - - kva = bp->b_saveaddr; bp->b_npages = pidx; - bp->b_saveaddr = bp->b_data; - bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK); + if (mapbuf) { + pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx); + kva = bp->b_saveaddr; + bp->b_saveaddr = bp->b_data; + bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK); + bp->b_flags &= ~B_UNMAPPED; + } else { + bp->b_flags |= B_UNMAPPED; + bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; + bp->b_saveaddr = bp->b_data; + bp->b_data = unmapped_buf; + } return(0); } @@ -3863,7 +4285,10 @@ vunmapbuf(struct buf *bp) int npages; npages = bp->b_npages; - pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); + if (bp->b_flags & B_UNMAPPED) + bp->b_flags &= ~B_UNMAPPED; + else + pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = bp->b_saveaddr; @@ -4000,6 +4425,29 @@ bunpin_wait(struct buf *bp) mtx_unlock(mtxp); } +/* + * Set bio_data or bio_ma for struct bio from the struct buf. 
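vfs_bio_clrbuf() and the new vfs_bio_bzero_buf() both stop going through b_data for unmapped buffers and instead zero page by page with pmap_zero_page_area(). The loop below is a user-space sketch of the same per-page chunking, printing which page and in-page range each step would clear (PAGE_SIZE and zero_by_pages() are illustrative stand-ins; the page-count bound checked by the real code is omitted):

#include <stdio.h>

#define PAGE_SIZE       4096

/*
 * Split a (base, size) range inside a buffer into per-page chunks, the
 * way vfs_bio_bzero_buf() feeds pmap_zero_page_area() when the buffer
 * has no mapping.
 */
static void
zero_by_pages(int base, int size)
{
        int i, n;

        n = PAGE_SIZE - (base % PAGE_SIZE);     /* room left in first page */
        for (i = base / PAGE_SIZE; size > 0; i++) {
                if (n > size)
                        n = size;
                printf("page %d: zero offset %d, length %d\n",
                    i, base % PAGE_SIZE, n);
                base += n;
                size -= n;
                n = PAGE_SIZE;                  /* later pages start at 0 */
        }
}

int
main(void)
{
        /* Zero 6000 bytes starting 3000 bytes into the buffer: the run
         * covers the tail of page 0, all of page 1 and the start of
         * page 2. */
        zero_by_pages(3000, 6000);
        return (0);
}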
+ */ +void +bdata2bio(struct buf *bp, struct bio *bip) +{ + + if ((bp->b_flags & B_UNMAPPED) != 0) { + bip->bio_ma = bp->b_pages; + bip->bio_ma_n = bp->b_npages; + bip->bio_data = unmapped_buf; + bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; + bip->bio_flags |= BIO_UNMAPPED; + KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / + PAGE_SIZE == bp->b_npages, + ("Buffer %p too short: %d %d %d", bp, bip->bio_ma_offset, + bip->bio_length, bip->bio_ma_n)); + } else { + bip->bio_data = bp->b_data; + bip->bio_ma = NULL; + } +} + #include "opt_ddb.h" #ifdef DDB #include diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 663b66f..60338b2 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -60,11 +60,11 @@ SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer"); -static struct cluster_save * - cluster_collectbufs(struct vnode *vp, struct buf *last_bp); -static struct buf * - cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, - daddr_t blkno, long size, int run, struct buf *fbp); +static struct cluster_save *cluster_collectbufs(struct vnode *vp, + struct buf *last_bp, int gbflags); +static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize, + daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, + struct buf *fbp); static void cluster_callback(struct buf *); static int write_behind = 1; @@ -83,15 +83,9 @@ extern vm_page_t bogus_page; * cluster_read replaces bread. */ int -cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) - struct vnode *vp; - u_quad_t filesize; - daddr_t lblkno; - long size; - struct ucred *cred; - long totread; - int seqcount; - struct buf **bpp; +cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, + struct ucred *cred, long totread, int seqcount, int gbflags, + struct buf **bpp) { struct buf *bp, *rbp, *reqbp; struct bufobj *bo; @@ -117,7 +111,7 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) /* * get the requested block */ - *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0); + *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags); origblkno = lblkno; /* @@ -208,7 +202,7 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) if (ncontig < nblks) nblks = ncontig; bp = cluster_rbuild(vp, filesize, lblkno, - blkno, size, nblks, bp); + blkno, size, nblks, gbflags, bp); lblkno += (bp->b_bufsize / size); } else { bp->b_flags |= B_RAM; @@ -252,14 +246,14 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) if (ncontig) { ncontig = min(ncontig + 1, racluster); rbp = cluster_rbuild(vp, filesize, lblkno, blkno, - size, ncontig, NULL); + size, ncontig, gbflags, NULL); lblkno += (rbp->b_bufsize / size); if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } } else { - rbp = getblk(vp, lblkno, size, 0, 0, 0); + rbp = getblk(vp, lblkno, size, 0, 0, gbflags); lblkno += 1; if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); @@ -298,14 +292,8 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) * and then parcel them up into logical blocks in the buffer hash table. 
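bdata2bio() is where "mapped or unmapped" becomes visible to the I/O layer: a mapped buffer hands the bio its KVA in bio_data, an unmapped one hands over the page array, the page count and the offset of the data in the first page. The condensed model below uses simplified stand-in structs (xbuf, xbio), not the real struct buf and struct bio:

#include <stddef.h>
#include <stdio.h>

#define PAGE_MASK       4095
#define B_UNMAPPED      0x0800

struct xbuf {                   /* simplified stand-in for struct buf */
        int       flags;
        char     *data;         /* b_data, valid when mapped */
        void    **pages;        /* b_pages */
        int       npages;
        long long offset;       /* b_offset: byte offset of the data */
};

struct xbio {                   /* simplified stand-in for struct bio */
        char     *data;         /* bio_data */
        void    **ma;           /* bio_ma */
        int       ma_n;         /* bio_ma_n */
        int       ma_offset;    /* bio_ma_offset */
};

static void
xdata2bio(struct xbuf *bp, struct xbio *bip)
{
        if (bp->flags & B_UNMAPPED) {
                /* Unmapped: describe the I/O by pages, not by KVA. */
                bip->ma = bp->pages;
                bip->ma_n = bp->npages;
                bip->ma_offset = (int)(bp->offset & PAGE_MASK);
                bip->data = NULL;
        } else {
                bip->data = bp->data;
                bip->ma = NULL;
                bip->ma_n = 0;
        }
}

int
main(void)
{
        void *pages[2] = { NULL, NULL };
        struct xbuf bp = { B_UNMAPPED, NULL, pages, 2, 8192 + 512 };
        struct xbio bio;

        xdata2bio(&bp, &bio);
        printf("ma_n %d ma_offset %d\n", bio.ma_n, bio.ma_offset);
        return (0);
}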
*/ static struct buf * -cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) - struct vnode *vp; - u_quad_t filesize; - daddr_t lbn; - daddr_t blkno; - long size; - int run; - struct buf *fbp; +cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, + daddr_t blkno, long size, int run, int gbflags, struct buf *fbp) { struct bufobj *bo; struct buf *bp, *tbp; @@ -329,7 +317,7 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) tbp = fbp; tbp->b_iocmd = BIO_READ; } else { - tbp = getblk(vp, lbn, size, 0, 0, 0); + tbp = getblk(vp, lbn, size, 0, 0, gbflags); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_flags |= B_ASYNC | B_RAM; @@ -350,9 +338,14 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) * address may not be either. Inherit the b_data offset * from the original buffer. */ - bp->b_data = (char *)((vm_offset_t)bp->b_data | - ((vm_offset_t)tbp->b_data & PAGE_MASK)); bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; + if ((gbflags & GB_UNMAPPED) != 0) { + bp->b_flags |= B_UNMAPPED; + bp->b_data = unmapped_buf; + } else { + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + } bp->b_iocmd = BIO_READ; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; @@ -376,7 +369,8 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) break; } - tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT); + tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT | + (gbflags & GB_UNMAPPED)); /* Don't wait around for locked bufs. */ if (tbp == NULL) @@ -499,8 +493,10 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) bp->b_bufsize, bp->b_kvasize); bp->b_kvasize = bp->b_bufsize; - pmap_qenter(trunc_page((vm_offset_t) bp->b_data), - (vm_page_t *)bp->b_pages, bp->b_npages); + if ((bp->b_flags & B_UNMAPPED) == 0) { + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + } return (bp); } @@ -523,7 +519,10 @@ cluster_callback(bp) if (bp->b_ioflags & BIO_ERROR) error = bp->b_error; - pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + if ((bp->b_flags & B_UNMAPPED) == 0) { + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), + bp->b_npages); + } /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. @@ -565,18 +564,19 @@ cluster_callback(bp) */ static __inline int -cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) +cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len, + int gbflags) { int r = 0; - switch(write_behind) { + switch (write_behind) { case 2: if (start_lbn < len) break; start_lbn -= len; /* FALLTHROUGH */ case 1: - r = cluster_wbuild(vp, size, start_lbn, len); + r = cluster_wbuild(vp, size, start_lbn, len, gbflags); /* FALLTHROUGH */ default: /* FALLTHROUGH */ @@ -596,7 +596,8 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) * 4. 
end of a cluster - asynchronously write cluster */ void -cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount) +cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, + int gbflags) { daddr_t lbn; int maxclen, cursize; @@ -642,13 +643,13 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount) lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async && seqcount > 0) { cluster_wbuild_wb(vp, lblocksize, - vp->v_cstart, cursize); + vp->v_cstart, cursize, gbflags); } } else { struct buf **bpp, **endbp; struct cluster_save *buflist; - buflist = cluster_collectbufs(vp, bp); + buflist = cluster_collectbufs(vp, bp, gbflags); endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { @@ -667,7 +668,7 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount) if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, - cursize); + cursize, gbflags); } } else { /* @@ -715,8 +716,10 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount) * update daemon handle it. */ bdwrite(bp); - if (seqcount > 1) - cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); + if (seqcount > 1) { + cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, + vp->v_clen + 1, gbflags); + } vp->v_clen = 0; vp->v_cstart = lbn + 1; } else if (vm_page_count_severe()) { @@ -742,11 +745,8 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount) * the current block (if last_bp == NULL). */ int -cluster_wbuild(vp, size, start_lbn, len) - struct vnode *vp; - long size; - daddr_t start_lbn; - int len; +cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, + int gbflags) { struct buf *bp, *tbp; struct bufobj *bo; @@ -832,10 +832,16 @@ cluster_wbuild(vp, size, start_lbn, len) * address may not be either. Inherit the b_data offset * from the original buffer. */ - bp->b_data = (char *)((vm_offset_t)bp->b_data | - ((vm_offset_t)tbp->b_data & PAGE_MASK)); - bp->b_flags |= B_CLUSTER | - (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); + if ((gbflags & GB_UNMAPPED) == 0 || + (tbp->b_flags & B_VMIO) == 0) { + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + } else { + bp->b_flags |= B_UNMAPPED; + bp->b_data = unmapped_buf; + } + bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | + B_NEEDCOMMIT)); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); /* @@ -962,8 +968,10 @@ cluster_wbuild(vp, size, start_lbn, len) tbp, b_cluster.cluster_entry); } finishcluster: - pmap_qenter(trunc_page((vm_offset_t) bp->b_data), - (vm_page_t *) bp->b_pages, bp->b_npages); + if ((bp->b_flags & B_UNMAPPED) == 0) { + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + } if (bp->b_bufsize > bp->b_kvasize) panic( "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", @@ -984,9 +992,7 @@ cluster_wbuild(vp, size, start_lbn, len) * Plus add one additional buffer. 
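When the cluster buffer is built from mapped component buffers, cluster_rbuild() and cluster_wbuild() make its b_data carry the same sub-page offset as the first component by OR-ing that offset into the page-aligned cluster KVA; with GB_UNMAPPED the cluster buffer skips KVA entirely and points b_data at unmapped_buf. A small sketch of the offset-inheritance bit trick (the addresses used in main() are fabricated for the example):

#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK       ((uintptr_t)0xfff)

/*
 * Give a page-aligned KVA the same in-page offset as another mapping,
 * the way the cluster code lines the cluster buffer's b_data up with
 * the first component buffer's b_data.
 */
static uintptr_t
inherit_offset(uintptr_t aligned_kva, uintptr_t component_data)
{
        return (aligned_kva | (component_data & PAGE_MASK));
}

int
main(void)
{
        /* Fabricated addresses: cluster KVA at 0x200000, component
         * buffer data 0x180 bytes into its first page. */
        printf("%#lx\n",
            (unsigned long)inherit_offset(0x200000, 0x345180));
        return (0);
}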
*/ static struct cluster_save * -cluster_collectbufs(vp, last_bp) - struct vnode *vp; - struct buf *last_bp; +cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags) { struct cluster_save *buflist; struct buf *bp; @@ -999,7 +1005,8 @@ cluster_collectbufs(vp, last_bp) buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { - (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); + (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, + gbflags, &bp); buflist->bs_children[i] = bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 32c0978..6b28580 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1121,6 +1121,45 @@ vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) return (error); } +int +vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, + struct uio *uio) +{ + struct thread *td; + vm_offset_t iov_base; + int cnt, pgadv; + + td = curthread; + if ((td->td_pflags & TDP_UIOHELD) == 0 || + uio->uio_segflg != UIO_USERSPACE) + return (uiomove_fromphys(ma, offset, xfersize, uio)); + + KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); + cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize; + iov_base = (vm_offset_t)uio->uio_iov->iov_base; + switch (uio->uio_rw) { + case UIO_WRITE: + pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, + offset, cnt); + break; + case UIO_READ: + pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, + cnt); + break; + } + pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); + td->td_ma += pgadv; + KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, + pgadv)); + td->td_ma_cnt -= pgadv; + uio->uio_iov->iov_base = (char *)(iov_base + cnt); + uio->uio_iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + return (0); +} + + /* * File table truncate routine. 
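vn_io_fault_pgmove() advances the held-page array td_ma by the number of page boundaries the user iovec crossed during the copy, not by the byte count. The arithmetic is a difference of page indices; a stand-alone illustration follows (PAGE_SHIFT and pages_advanced() are assumptions used only for the demo):

#include <stdio.h>

#define PAGE_SHIFT      12

/*
 * How many entries of a held-page array are consumed when the copy
 * moves the iovec base from "iov_base" forward by "cnt" bytes; mirrors
 * the pgadv computation in vn_io_fault_pgmove().
 */
static int
pages_advanced(unsigned long iov_base, int cnt)
{
        return (int)(((iov_base + cnt) >> PAGE_SHIFT) -
            (iov_base >> PAGE_SHIFT));
}

int
main(void)
{
        /* A 5000-byte copy starting 100 bytes before a page boundary
         * crosses two page boundaries, so two array entries are used. */
        printf("%d\n", pages_advanced(4096 - 100, 5000));
        return (0);
}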
*/ diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c index 7925b8c..4fdc88d 100644 --- a/sys/mips/mips/pmap.c +++ b/sys/mips/mips/pmap.c @@ -2576,6 +2576,51 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) } } +void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + char *a_cp, *b_cp; + vm_page_t a_m, b_m; + vm_offset_t a_pg_offset, b_pg_offset; + vm_paddr_t a_phys, b_phys; + int cnt; + + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + a_m = ma[a_offset >> PAGE_SHIFT]; + a_phys = VM_PAGE_TO_PHYS(a_m); + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + b_m = mb[b_offset >> PAGE_SHIFT]; + b_phys = VM_PAGE_TO_PHYS(b_m); + if (MIPS_DIRECT_MAPPABLE(a_phys) && + MIPS_DIRECT_MAPPABLE(b_phys)) { + pmap_flush_pvcache(a_m); + mips_dcache_wbinv_range_index( + MIPS_PHYS_TO_DIRECT(b_phys), PAGE_SIZE); + a_cp = (char *)MIPS_PHYS_TO_DIRECT(a_phys) + + a_pg_offset; + b_cp = (char *)MIPS_PHYS_TO_DIRECT(b_phys) + + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); + } else { + a_cp = (char *)pmap_lmem_map2(a_phys, b_phys); + b_cp = (char *)a_cp + PAGE_SIZE; + a_cp += a_pg_offset; + b_cp += b_pg_offset; + bcopy(a_cp, b_cp, cnt); + mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); + pmap_lmem_unmap(); + } + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } +} + /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may diff --git a/sys/powerpc/aim/mmu_oea.c b/sys/powerpc/aim/mmu_oea.c index b173760..9b496ac 100644 --- a/sys/powerpc/aim/mmu_oea.c +++ b/sys/powerpc/aim/mmu_oea.c @@ -276,6 +276,8 @@ void moea_change_wiring(mmu_t, pmap_t, vm_offset_t, boolean_t); void moea_clear_modify(mmu_t, vm_page_t); void moea_clear_reference(mmu_t, vm_page_t); void moea_copy_page(mmu_t, vm_page_t, vm_page_t); +void moea_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize); void moea_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, boolean_t); void moea_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); @@ -321,6 +323,7 @@ static mmu_method_t moea_methods[] = { MMUMETHOD(mmu_clear_modify, moea_clear_modify), MMUMETHOD(mmu_clear_reference, moea_clear_reference), MMUMETHOD(mmu_copy_page, moea_copy_page), + MMUMETHOD(mmu_copy_pages, moea_copy_pages), MMUMETHOD(mmu_enter, moea_enter), MMUMETHOD(mmu_enter_object, moea_enter_object), MMUMETHOD(mmu_enter_quick, moea_enter_quick), @@ -1044,6 +1047,30 @@ moea_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) bcopy((void *)src, (void *)dst, PAGE_SIZE); } +void +moea_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize) +{ + void *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + a_cp = (char *)VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]) + + a_pg_offset; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + b_cp = (char *)VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]) + + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } +} + /* * Zero a page of physical memory by temporarily mapping it into the tlb. 
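Every pmap_copy_pages() implementation added by the patch is the same chunking loop wrapped around a machine-specific copy: each iteration is limited by the bytes remaining in the current source page and in the current destination page, whichever is smaller. Below is a user-space rendering of that pattern over plain byte arrays standing in for vm_page_t arrays; it sketches the algorithm, not any one architecture's code (the tiny PAGE_SIZE just keeps the demo small):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE       256
#define PAGE_MASK       (PAGE_SIZE - 1)
#define PAGE_SHIFT      8

static int
imin(int a, int b)
{
        return (a < b ? a : b);
}

/*
 * Copy "xfersize" bytes from offset a_offset in page array "ma" to
 * offset b_offset in page array "mb", never letting one memcpy()
 * cross a page boundary on either side.
 */
static void
copy_pages(char ma[][PAGE_SIZE], int a_offset, char mb[][PAGE_SIZE],
    int b_offset, int xfersize)
{
        int a_pg_offset, b_pg_offset, cnt;

        while (xfersize > 0) {
                a_pg_offset = a_offset & PAGE_MASK;
                cnt = imin(xfersize, PAGE_SIZE - a_pg_offset);
                b_pg_offset = b_offset & PAGE_MASK;
                cnt = imin(cnt, PAGE_SIZE - b_pg_offset);
                memcpy(&mb[b_offset >> PAGE_SHIFT][b_pg_offset],
                    &ma[a_offset >> PAGE_SHIFT][a_pg_offset], cnt);
                a_offset += cnt;
                b_offset += cnt;
                xfersize -= cnt;
        }
}

int
main(void)
{
        char src[4][PAGE_SIZE], dst[4][PAGE_SIZE];

        memset(src, 'x', sizeof(src));
        memset(dst, 0, sizeof(dst));
        /* Copy 300 bytes from offset 200 to offset 700: the loop splits
         * it into 56-, 12- and 232-byte pieces so no memcpy() crosses
         * a page edge on either side. */
        copy_pages(src, 200, dst, 700, 300);
        printf("%d %d\n", dst[2][188] == 'x', dst[3][232] == 'x');
        return (0);
}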
*/ diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index 00dab9b..a7bacf4 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -291,6 +291,8 @@ void moea64_change_wiring(mmu_t, pmap_t, vm_offset_t, boolean_t); void moea64_clear_modify(mmu_t, vm_page_t); void moea64_clear_reference(mmu_t, vm_page_t); void moea64_copy_page(mmu_t, vm_page_t, vm_page_t); +void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize); void moea64_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, boolean_t); void moea64_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); @@ -335,6 +337,7 @@ static mmu_method_t moea64_methods[] = { MMUMETHOD(mmu_clear_modify, moea64_clear_modify), MMUMETHOD(mmu_clear_reference, moea64_clear_reference), MMUMETHOD(mmu_copy_page, moea64_copy_page), + MMUMETHOD(mmu_copy_pages, moea64_copy_pages), MMUMETHOD(mmu_enter, moea64_enter), MMUMETHOD(mmu_enter_object, moea64_enter_object), MMUMETHOD(mmu_enter_quick, moea64_enter_quick), @@ -1105,6 +1108,72 @@ moea64_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) } } +static inline void +moea64_copy_pages_dmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize) +{ + void *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + a_cp = (char *)VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]) + + a_pg_offset; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + b_cp = (char *)VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]) + + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } +} + +static inline void +moea64_copy_pages_nodmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize) +{ + void *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + mtx_lock(&moea64_scratchpage_mtx); + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + moea64_set_scratchpage_pa(mmu, 0, + VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); + a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + moea64_set_scratchpage_pa(mmu, 1, + VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); + b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } + mtx_unlock(&moea64_scratchpage_mtx); +} + +void +moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize) +{ + + if (hw_direct_map) { + moea64_copy_pages_dmap(mmu, ma, a_offset, mb, b_offset, + xfersize); + } else { + moea64_copy_pages_nodmap(mmu, ma, a_offset, mb, b_offset, + xfersize); + } +} + void moea64_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) { diff --git a/sys/powerpc/booke/pmap.c b/sys/powerpc/booke/pmap.c index f6e5f9c..233e1e0 100644 --- a/sys/powerpc/booke/pmap.c +++ b/sys/powerpc/booke/pmap.c @@ -275,6 +275,8 @@ static void mmu_booke_clear_reference(mmu_t, vm_page_t); static void mmu_booke_copy(mmu_t, pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); static void mmu_booke_copy_page(mmu_t, vm_page_t, vm_page_t); +static void mmu_booke_copy_pages(mmu_t, vm_page_t *, + vm_offset_t, 
vm_page_t *, vm_offset_t, int); static void mmu_booke_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, boolean_t); static void mmu_booke_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, @@ -335,6 +337,7 @@ static mmu_method_t mmu_booke_methods[] = { MMUMETHOD(mmu_clear_reference, mmu_booke_clear_reference), MMUMETHOD(mmu_copy, mmu_booke_copy), MMUMETHOD(mmu_copy_page, mmu_booke_copy_page), + MMUMETHOD(mmu_copy_pages, mmu_booke_copy_pages), MMUMETHOD(mmu_enter, mmu_booke_enter), MMUMETHOD(mmu_enter_object, mmu_booke_enter_object), MMUMETHOD(mmu_enter_quick, mmu_booke_enter_quick), @@ -2138,6 +2141,36 @@ mmu_booke_copy_page(mmu_t mmu, vm_page_t sm, vm_page_t dm) mtx_unlock(©_page_mutex); } +static inline void +mmu_booke_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize) +{ + void *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + mtx_lock(©_page_mutex); + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + mmu_booke_kenter(mmu, copy_page_src_va, + VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); + a_cp = (char *)copy_page_src_va + a_pg_offset; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + mmu_booke_kenter(mmu, copy_page_dst_va, + VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); + b_cp = (char *)copy_page_dst_va + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + mmu_booke_kremove(mmu, copy_page_dst_va); + mmu_booke_kremove(mmu, copy_page_src_va); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } + mtx_unlock(©_page_mutex); +} + /* * mmu_booke_zero_page_idle zeros the specified hardware page by mapping it * into virtual memory and using bzero to clear its contents. This is intended diff --git a/sys/powerpc/powerpc/mmu_if.m b/sys/powerpc/powerpc/mmu_if.m index 8cd6e52..0382bd8 100644 --- a/sys/powerpc/powerpc/mmu_if.m +++ b/sys/powerpc/powerpc/mmu_if.m @@ -215,6 +215,14 @@ METHOD void copy_page { vm_page_t _dst; }; +METHOD void copy_pages { + mmu_t _mmu; + vm_page_t *_ma; + vm_offset_t _a_offset; + vm_page_t *_mb; + vm_offset_t _b_offset; + int _xfersize; +}; /** * @brief Create a mapping between a virtual/physical address pair in the diff --git a/sys/powerpc/powerpc/pmap_dispatch.c b/sys/powerpc/powerpc/pmap_dispatch.c index c919196..42f1a39 100644 --- a/sys/powerpc/powerpc/pmap_dispatch.c +++ b/sys/powerpc/powerpc/pmap_dispatch.c @@ -133,6 +133,16 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) } void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + + CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma, + a_offset, mb, b_offset, xfersize); + MMU_COPY_PAGES(mmu_obj, ma, a_offset, mb, b_offset, xfersize); +} + +void pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t p, vm_prot_t prot, boolean_t wired) { diff --git a/sys/sparc64/sparc64/pmap.c b/sys/sparc64/sparc64/pmap.c index 08f008c..27947dd 100644 --- a/sys/sparc64/sparc64/pmap.c +++ b/sys/sparc64/sparc64/pmap.c @@ -1835,8 +1835,9 @@ pmap_zero_page_idle(vm_page_t m) } } -void -pmap_copy_page(vm_page_t msrc, vm_page_t mdst) +static void +pmap_copy_page_offs(vm_page_t msrc, int src_off, vm_page_t mdst, int dst_off, + int cnt) { vm_offset_t vdst; vm_offset_t vsrc; @@ -1857,16 +1858,17 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst) PMAP_STATS_INC(pmap_ncopy_page_c); vdst = TLB_PHYS_TO_DIRECT(pdst); vsrc = TLB_PHYS_TO_DIRECT(psrc); - cpu_block_copy((void *)vsrc, (void *)vdst, 
PAGE_SIZE); + cpu_block_copy((char *)vsrc + src_off, (char *)vdst + dst_off, + cnt); } else if (msrc->md.color == -1 && mdst->md.color == -1) { PMAP_STATS_INC(pmap_ncopy_page_nc); - ascopy(ASI_PHYS_USE_EC, psrc, pdst, PAGE_SIZE); + ascopy(ASI_PHYS_USE_EC, psrc + src_off, pdst + dst_off, cnt); } else if (msrc->md.color == -1) { if (mdst->md.color == DCACHE_COLOR(pdst)) { PMAP_STATS_INC(pmap_ncopy_page_dc); vdst = TLB_PHYS_TO_DIRECT(pdst); - ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst, - PAGE_SIZE); + ascopyfrom(ASI_PHYS_USE_EC, psrc + src_off, + (char *)vdst + dst_off, cnt); } else { PMAP_STATS_INC(pmap_ncopy_page_doc); PMAP_LOCK(kernel_pmap); @@ -1875,8 +1877,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst) tp->tte_data = TD_V | TD_8K | TD_PA(pdst) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vdst, TS_8K); - ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst, - PAGE_SIZE); + ascopyfrom(ASI_PHYS_USE_EC, psrc + src_off, + (char *)vdst + dst_off, cnt); tlb_page_demap(kernel_pmap, vdst); PMAP_UNLOCK(kernel_pmap); } @@ -1884,8 +1886,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst) if (msrc->md.color == DCACHE_COLOR(psrc)) { PMAP_STATS_INC(pmap_ncopy_page_sc); vsrc = TLB_PHYS_TO_DIRECT(psrc); - ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst, - PAGE_SIZE); + ascopyto((char *)vsrc + src_off, ASI_PHYS_USE_EC, + pdst + dst_off, cnt); } else { PMAP_STATS_INC(pmap_ncopy_page_soc); PMAP_LOCK(kernel_pmap); @@ -1894,8 +1896,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst) tp->tte_data = TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vsrc, TS_8K); - ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst, - PAGE_SIZE); + ascopyto((char *)vsrc + src_off, ASI_PHYS_USE_EC, + pdst + dst_off, cnt); tlb_page_demap(kernel_pmap, vsrc); PMAP_UNLOCK(kernel_pmap); } @@ -1912,13 +1914,41 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst) tp->tte_data = TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vsrc, TS_8K); - cpu_block_copy((void *)vsrc, (void *)vdst, PAGE_SIZE); + cpu_block_copy((char *)vsrc + src_off, (char *)vdst + dst_off, + cnt); tlb_page_demap(kernel_pmap, vdst); tlb_page_demap(kernel_pmap, vsrc); PMAP_UNLOCK(kernel_pmap); } } +void +pmap_copy_page(vm_page_t msrc, vm_page_t mdst) +{ + + pmap_copy_page_offs(msrc, 0, mdst, 0, PAGE_SIZE); +} + +void +pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + pmap_copy_page_offs(ma[a_offset >> PAGE_SHIFT], a_pg_offset, + mb[b_offset >> PAGE_SHIFT], b_pg_offset, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } +} + /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may diff --git a/sys/sys/bio.h b/sys/sys/bio.h index c016ee6..7678f5a 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -55,10 +55,13 @@ #define BIO_DONE 0x02 #define BIO_ONQUEUE 0x04 #define BIO_ORDERED 0x08 +#define BIO_UNMAPPED 0x10 +#define BIO_TRANSIENT_MAPPING 0x20 #ifdef _KERNEL struct disk; struct bio; +struct vm_map; /* Empty classifier tag, to prevent further classification. */ #define BIO_NOTCLASSIFIED (void *)(~0UL) @@ -78,6 +81,9 @@ struct bio { off_t bio_offset; /* Offset into file. */ long bio_bcount; /* Valid bytes in buffer. 
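BIO_TRANSIENT_MAPPING marks a bio whose KVA was borrowed from bio_transient_map, and biodone() tears that mapping down by taking trunc_page/round_page of the data range before calling pmap_qremove() and vm_map_remove(). The helper below is an illustrative user-space version of that span computation (PAGE_SIZE and span_pages() are assumptions for the example):

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_MASK       (PAGE_SIZE - 1)

#define trunc_page(x)   ((x) & ~PAGE_MASK)
#define round_page(x)   (((x) + PAGE_MASK) & ~PAGE_MASK)

/*
 * Number of pages a transient mapping must cover for "len" bytes of
 * data starting at virtual address "va": the same trunc/round pair
 * biodone() uses before tearing a BIO_TRANSIENT_MAPPING down.
 */
static unsigned long
span_pages(unsigned long va, unsigned long len)
{
        return ((round_page(va + len) - trunc_page(va)) / PAGE_SIZE);
}

int
main(void)
{
        /* 100 bytes that straddle a page boundary still need 2 pages. */
        printf("%lu\n", span_pages(4090, 100));
        return (0);
}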
*/ caddr_t bio_data; /* Memory, superblocks, indirect etc. */ + struct vm_page **bio_ma; /* Or unmapped. */ + int bio_ma_offset; /* Offset in the first page of bio_ma. */ + int bio_ma_n; /* Number of pages in bio_ma. */ int bio_error; /* Errno for BIO_ERROR. */ long bio_resid; /* Remaining I/O in bytes. */ void (*bio_done)(struct bio *); @@ -121,6 +127,9 @@ struct bio_queue_head { struct bio *insert_point; }; +extern struct vm_map *bio_transient_map; +extern int bio_transient_maxcnt; + void biodone(struct bio *bp); void biofinish(struct bio *bp, struct devstat *stat, int error); int biowait(struct bio *bp, const char *wchan); diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 672ef5a..0c7a6f4 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -117,6 +117,7 @@ struct buf { long b_bufsize; /* Allocated buffer size. */ long b_runningbufspace; /* when I/O is running, pipelining */ caddr_t b_kvabase; /* base kva for buffer */ + caddr_t b_kvaalloc; /* allocated kva for B_KVAALLOC */ int b_kvasize; /* size of kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ @@ -202,8 +203,8 @@ struct buf { #define B_PERSISTENT 0x00000100 /* Perm. ref'ed while EXT2FS mounted. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ -#define B_00000800 0x00000800 /* Available flag. */ -#define B_00001000 0x00001000 /* Available flag. */ +#define B_UNMAPPED 0x00000800 /* KVA is not mapped. */ +#define B_KVAALLOC 0x00001000 /* But allocated. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_BARRIER 0x00004000 /* Write this and all preceeding first. */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ @@ -453,7 +454,9 @@ buf_countdeps(struct buf *bp, int i) */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ -#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon */ +#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */ +#define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */ +#define GB_KVAALLOC 0x0010 /* But allocate KVA. */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ @@ -470,17 +473,22 @@ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ +extern caddr_t unmapped_buf; void runningbufwakeup(struct buf *); void waitrunningbufspace(void); caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); void bufinit(void); +void bdata2bio(struct buf *bp, struct bio *bip); void bwillwrite(void); int buf_dirty_count_severe(void); void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. 
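The new flags come in pairs: B_UNMAPPED ("KVA is not mapped") with B_KVAALLOC ("but allocated") describe the buffer, GB_UNMAPPED with GB_KVAALLOC describe the request, and both getnewbuf() and getblk() assert that GB_KVAALLOC is only passed together with GB_UNMAPPED. A trivial sketch of that validity rule, with the flag values copied from the header change:

#include <assert.h>
#include <stdio.h>

#define GB_UNMAPPED     0x0008  /* Do not map buffer pages. */
#define GB_KVAALLOC     0x0010  /* But allocate KVA. */

/* Reject the one meaningless combination: KVA reservation requested
 * for a buffer that would be mapped anyway. */
static int
gbflags_valid(int gbflags)
{
        return ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC);
}

int
main(void)
{
        assert(gbflags_valid(0));
        assert(gbflags_valid(GB_UNMAPPED));
        assert(gbflags_valid(GB_UNMAPPED | GB_KVAALLOC));
        assert(!gbflags_valid(GB_KVAALLOC));
        printf("gbflags checks passed\n");
        return (0);
}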
*/ #define bread(vp, blkno, size, cred, bpp) \ - breadn_flags(vp, blkno, size, 0, 0, 0, cred, 0, bpp) + breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, 0, bpp) +#define bread_gb(vp, blkno, size, cred, gbflags, bpp) \ + breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, \ + gbflags, bpp) #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \ breadn_flags(vp, blkno, size, rablkno, rabsize, cnt, cred, 0, bpp) int breadn_flags(struct vnode *, daddr_t, int, daddr_t *, int *, int, @@ -508,14 +516,15 @@ void bufdone_finish(struct buf *); void bd_speedup(void); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, - struct ucred *, long, int, struct buf **); -int cluster_wbuild(struct vnode *, long, daddr_t, int); -void cluster_write(struct vnode *, struct buf *, u_quad_t, int); + struct ucred *, long, int, int, struct buf **); +int cluster_wbuild(struct vnode *, long, daddr_t, int, int); +void cluster_write(struct vnode *, struct buf *, u_quad_t, int, int); +void vfs_bio_bzero_buf(struct buf *bp, int base, int size); void vfs_bio_set_valid(struct buf *, int base, int size); void vfs_bio_clrbuf(struct buf *); void vfs_busy_pages(struct buf *, int clear_modify); void vfs_unbusy_pages(struct buf *); -int vmapbuf(struct buf *); +int vmapbuf(struct buf *, int); void vunmapbuf(struct buf *); void relpbuf(struct buf *, int *); void brelvp(struct buf *); diff --git a/sys/sys/mount.h b/sys/sys/mount.h index bbbc569..f8e7662 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -351,6 +351,7 @@ void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); #define MNTK_VGONE_WAITER 0x00000400 #define MNTK_LOOKUP_EXCL_DOTDOT 0x00000800 #define MNTK_MARKER 0x00001000 +#define MNTK_UNMAPPED_BUFS 0x00002000 #define MNTK_NOASYNC 0x00800000 /* disable async */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index b54dc04..e6a41a4 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -692,6 +692,8 @@ int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); +int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, + struct uio *uio); #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index abe4073..0bdbbae 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -254,7 +254,7 @@ ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) struct buf *bp; struct ufsmount *ump; u_int cg, request, reclaimed; - int error; + int error, gbflags; ufs2_daddr_t bno; static struct timeval lastfail; static int curfail; @@ -265,6 +265,8 @@ ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) fs = ip->i_fs; bp = NULL; ump = ip->i_ump; + gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; + mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) @@ -296,7 +298,7 @@ retry: /* * Allocate the extra space in the buffer. 
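On the UFS side the pattern is uniform: translate the balloc-level flag BA_UNMAPPED into the buffer-cache flag GB_UNMAPPED once, then thread it through bread_gb() (which is just breadn_flags() with no read-ahead) and getblk()/cluster_read(). A one-line sketch of that translation; the BA_UNMAPPED value shown is an assumption, since its definition is outside this hunk:

#include <stdio.h>

#define BA_UNMAPPED     0x0020  /* assumed value, for illustration only */
#define GB_UNMAPPED     0x0008  /* from the patch */

/* The idiom used throughout ffs_realloccg()/ffs_balloc() here:
 * callers speak BA_UNMAPPED, the buffer cache speaks GB_UNMAPPED. */
static int
ba_to_gb(int flags)
{
        return ((flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0);
}

int
main(void)
{
        printf("%#x %#x\n", ba_to_gb(BA_UNMAPPED), ba_to_gb(0));
        return (0);
}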
*/ - error = bread(vp, lbprev, osize, NOCRED, &bp); + error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); @@ -332,7 +334,7 @@ retry: ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; - bzero(bp->b_data + osize, nsize - osize); + vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; @@ -400,7 +402,7 @@ retry: ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; - bzero(bp->b_data + osize, nsize - osize); + vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 0e29be87f..d20df77 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -107,7 +107,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, int saved_inbdflush; static struct timeval lastfail; static int curfail; - int reclaimed; + int gbflags, reclaimed; ip = VTOI(vp); dp = ip->i_din1; @@ -123,6 +123,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, return (EOPNOTSUPP); if (lbn < 0) return (EFBIG); + gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); @@ -211,7 +212,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, nsize, flags, cred, &newb); if (error) return (error); - bp = getblk(vp, lbn, nsize, 0, 0, 0); + bp = getblk(vp, lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); @@ -255,7 +256,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, nb = newb; *allocblk++ = nb; *lbns_remfree++ = indirs[1].in_lbn; - bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0); + bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) { @@ -389,7 +390,7 @@ retry: nb = newb; *allocblk++ = nb; *lbns_remfree++ = lbn; - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0); + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); @@ -418,16 +419,17 @@ retry: if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, (int)fs->fs_bsize, NOCRED, - MAXBSIZE, seqcount, &nbp); + MAXBSIZE, seqcount, gbflags, &nbp); } else { - error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp); + error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED, + gbflags, &nbp); } if (error) { brelse(nbp); goto fail; } } else { - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0); + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); } curthread_pflags_restore(saved_inbdflush); @@ -539,7 +541,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, int saved_inbdflush; static struct timeval lastfail; static int curfail; - int reclaimed; + int gbflags, reclaimed; ip = VTOI(vp); dp = ip->i_din2; @@ -553,6 +555,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, *bpp = NULL; if (lbn < 0) return (EFBIG); + gbflags = (flags & BA_UNMAPPED) != 0 ? 
GB_UNMAPPED : 0; if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); @@ -603,7 +606,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, panic("ffs_balloc_ufs2: BA_METAONLY for ext block"); nb = dp->di_extb[lbn]; if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) { - error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp); + error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED, + gbflags, &bp); if (error) { brelse(bp); return (error); @@ -620,7 +624,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, osize = fragroundup(fs, blkoff(fs, dp->di_extsize)); nsize = fragroundup(fs, size); if (nsize <= osize) { - error = bread(vp, -1 - lbn, osize, NOCRED, &bp); + error = bread_gb(vp, -1 - lbn, osize, NOCRED, + gbflags, &bp); if (error) { brelse(bp); return (error); @@ -653,7 +658,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, nsize, flags, cred, &newb); if (error) return (error); - bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0); + bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); bp->b_xflags |= BX_ALTDATA; if (flags & BA_CLRBUF) @@ -679,9 +684,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, if (osize < fs->fs_bsize && osize > 0) { UFS_LOCK(ump); error = ffs_realloccg(ip, nb, dp->di_db[nb], - ffs_blkpref_ufs2(ip, lastlbn, (int)nb, - &dp->di_db[0]), osize, (int)fs->fs_bsize, - flags, cred, &bp); + ffs_blkpref_ufs2(ip, lastlbn, (int)nb, + &dp->di_db[0]), osize, (int)fs->fs_bsize, + flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) @@ -707,7 +712,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, panic("ffs_balloc_ufs2: BA_METAONLY for direct block"); nb = dp->di_db[lbn]; if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { - error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); + error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED, + gbflags, &bp); if (error) { brelse(bp); return (error); @@ -723,7 +729,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, osize = fragroundup(fs, blkoff(fs, ip->i_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { - error = bread(vp, lbn, osize, NOCRED, &bp); + error = bread_gb(vp, lbn, osize, NOCRED, + gbflags, &bp); if (error) { brelse(bp); return (error); @@ -733,7 +740,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, UFS_LOCK(ump); error = ffs_realloccg(ip, lbn, dp->di_db[lbn], ffs_blkpref_ufs2(ip, lbn, (int)lbn, - &dp->di_db[0]), osize, nsize, flags, + &dp->di_db[0]), osize, nsize, flags, cred, &bp); if (error) return (error); @@ -753,7 +760,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, &dp->di_db[0]), nsize, flags, cred, &newb); if (error) return (error); - bp = getblk(vp, lbn, nsize, 0, 0, 0); + bp = getblk(vp, lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); @@ -797,7 +804,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, nb = newb; *allocblk++ = nb; *lbns_remfree++ = indirs[1].in_lbn; - bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0); + bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, + GB_UNMAPPED); bp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) { @@ -862,7 +870,8 @@ retry: nb = newb; *allocblk++ = nb; *lbns_remfree++ = indirs[i].in_lbn; - nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0); + nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, + GB_UNMAPPED); nbp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(nbp); if 
(DOINGSOFTDEP(vp)) { @@ -931,7 +940,7 @@ retry: nb = newb; *allocblk++ = nb; *lbns_remfree++ = lbn; - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0); + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); @@ -966,16 +975,17 @@ retry: if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, (int)fs->fs_bsize, NOCRED, - MAXBSIZE, seqcount, &nbp); + MAXBSIZE, seqcount, gbflags, &nbp); } else { - error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp); + error = bread_gb(vp, lbn, (int)fs->fs_bsize, + NOCRED, gbflags, &nbp); } if (error) { brelse(nbp); goto fail; } } else { - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0); + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); } curthread_pflags_restore(saved_inbdflush); diff --git a/sys/ufs/ffs/ffs_rawread.c b/sys/ufs/ffs/ffs_rawread.c index f8e3e00..45cb730 100644 --- a/sys/ufs/ffs/ffs_rawread.c +++ b/sys/ufs/ffs/ffs_rawread.c @@ -240,7 +240,7 @@ ffs_rawread_readahead(struct vnode *vp, bp->b_bcount = bsize - blockoff * DEV_BSIZE; bp->b_bufsize = bp->b_bcount; - if (vmapbuf(bp) < 0) + if (vmapbuf(bp, 1) < 0) return EFAULT; maybe_yield(); @@ -259,7 +259,7 @@ ffs_rawread_readahead(struct vnode *vp, bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; bp->b_bufsize = bp->b_bcount; - if (vmapbuf(bp) < 0) + if (vmapbuf(bp, 1) < 0) return EFAULT; BO_STRATEGY(&dp->v_bufobj, bp); diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 0204613..f1a3aab 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1076,7 +1076,7 @@ ffs_mountfs(devvp, mp, td) */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | - MNTK_NO_IOPF; + MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART @@ -2091,6 +2091,7 @@ ffs_bufwrite(struct buf *bp) * set b_lblkno and BKGRDMARKER before calling bgetvp() * to avoid confusing the splay tree and gbincore(). */ + KASSERT((bp->b_flags & B_UNMAPPED) == 0, ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); newbp->b_lblkno = bp->b_lblkno; newbp->b_xflags |= BX_BKGRDMARKER; diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 5c99d5b..ef6194c 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -508,7 +508,8 @@ ffs_read(ap) /* * Don't do readahead if this is the end of the file. */ - error = bread(vp, lbn, size, NOCRED, &bp); + error = bread_gb(vp, lbn, size, NOCRED, + GB_UNMAPPED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { /* * Otherwise if we are allowed to cluster, @@ -518,7 +519,8 @@ ffs_read(ap) * doing sequential access. */ error = cluster_read(vp, ip->i_size, lbn, - size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp); + size, NOCRED, blkoffset + uio->uio_resid, + seqcount, GB_UNMAPPED, &bp); } else if (seqcount > 1) { /* * If we are NOT allowed to cluster, then @@ -529,15 +531,16 @@ ffs_read(ap) * the 6th argument. */ int nextsize = blksize(fs, ip, nextlbn); - error = breadn(vp, lbn, - size, &nextlbn, &nextsize, 1, NOCRED, &bp); + error = breadn_flags(vp, lbn, size, &nextlbn, + &nextsize, 1, NOCRED, GB_UNMAPPED, &bp); } else { /* * Failing all of the above, just read what the * user asked for. Interestingly, the same as * the first option above. 
*/ - error = bread(vp, lbn, size, NOCRED, &bp); + error = bread_gb(vp, lbn, size, NOCRED, + GB_UNMAPPED, &bp); } if (error) { brelse(bp); @@ -568,8 +571,13 @@ xfersize = size; } - error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, - (int)xfersize, uio); + if ((bp->b_flags & B_UNMAPPED) == 0) { + error = vn_io_fault_uiomove((char *)bp->b_data + + blkoffset, (int)xfersize, uio); + } else { + error = vn_io_fault_pgmove(bp->b_pages, blkoffset, + (int)xfersize, uio); + } if (error) break; @@ -700,6 +708,7 @@ ffs_write(ap) flags = seqcount << BA_SEQSHIFT; if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) flags |= IO_SYNC; + flags |= BA_UNMAPPED; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); @@ -739,8 +748,13 @@ ffs_write(ap) if (size < xfersize) xfersize = size; - error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, - (int)xfersize, uio); + if ((bp->b_flags & B_UNMAPPED) == 0) { + error = vn_io_fault_uiomove((char *)bp->b_data + + blkoffset, (int)xfersize, uio); + } else { + error = vn_io_fault_pgmove(bp->b_pages, blkoffset, + (int)xfersize, uio); + } /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any @@ -783,7 +797,8 @@ ffs_write(ap) } else if (xfersize + blkoffset == fs->fs_bsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; - cluster_write(vp, bp, ip->i_size, seqcount); + cluster_write(vp, bp, ip->i_size, seqcount, + GB_UNMAPPED); } else { bawrite(bp); } diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h index c590748..31a2ba8 100644 --- a/sys/ufs/ufs/ufs_extern.h +++ b/sys/ufs/ufs/ufs_extern.h @@ -121,6 +121,7 @@ void softdep_revert_rmdir(struct inode *, struct inode *); */ #define BA_CLRBUF 0x00010000 /* Clear invalid areas of buffer. */ #define BA_METAONLY 0x00020000 /* Return indirect block buffer. */ +#define BA_UNMAPPED 0x00040000 /* Do not map the resulting buffer. */ #define BA_SEQMASK 0x7F000000 /* Bits holding seq heuristic. 
*/ #define BA_SEQSHIFT 24 #define BA_SEQMAX 0x7F diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h index d06c22b..c64a549 100644 --- a/sys/vm/pmap.h +++ b/sys/vm/pmap.h @@ -108,6 +108,8 @@ void pmap_clear_modify(vm_page_t m); void pmap_clear_reference(vm_page_t m); void pmap_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); void pmap_copy_page(vm_page_t, vm_page_t); +void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, + vm_page_t mb[], vm_offset_t b_offset, int xfersize); void pmap_enter(pmap_t, vm_offset_t, vm_prot_t, vm_page_t, vm_prot_t, boolean_t); void pmap_enter_object(pmap_t pmap, vm_offset_t start, diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 44bff25..10a2c28 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -758,6 +758,16 @@ swp_pager_strategy(struct buf *bp) TAILQ_FOREACH(sp, &swtailq, sw_list) { if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) { mtx_unlock(&sw_dev_mtx); + if ((sp->sw_flags & SW_UNMAPPED) != 0) { + bp->b_kvaalloc = bp->b_data; + bp->b_data = unmapped_buf; + bp->b_kvabase = unmapped_buf; + bp->b_offset = 0; + bp->b_flags |= B_UNMAPPED; + } else { + pmap_qenter((vm_offset_t)bp->b_data, + &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); + } sp->sw_strategy(bp, sp); return; } @@ -1155,11 +1165,6 @@ swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) bp = getpbuf(&nsw_rcount); bp->b_flags |= B_PAGING; - /* - * map our page(s) into kva for input - */ - pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i); - bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); @@ -1371,8 +1376,6 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; - pmap_qenter((vm_offset_t)bp->b_data, &m[i], n); - bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; @@ -1484,7 +1487,12 @@ swp_pager_async_iodone(struct buf *bp) /* * remove the mapping for kernel virtual */ - pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); + if ((bp->b_flags & B_UNMAPPED) != 0) { + bp->b_data = bp->b_kvaalloc; + bp->b_kvabase = bp->b_kvaalloc; + bp->b_flags &= ~B_UNMAPPED; + } else + pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); if (bp->b_npages) { object = bp->b_pages[0]->object; @@ -2144,7 +2152,8 @@ swapon_check_swzone(unsigned long npages) } static void -swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev) +swaponsomething(struct vnode *vp, void *id, u_long nblks, + sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) { struct swdevt *sp, *tsp; swblk_t dvbase; @@ -2180,6 +2189,7 @@ swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strateg sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; + sp->sw_flags = flags; sp->sw_blist = blist_create(nblks, M_WAITOK); /* @@ -2537,10 +2547,19 @@ swapgeom_strategy(struct buf *bp, struct swdevt *sp) bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; - bio->bio_data = bp->b_data; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; + if ((bp->b_flags & B_UNMAPPED) != 0) { + bio->bio_ma = bp->b_pages; + bio->bio_data = unmapped_buf; + bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; + bio->bio_ma_n = bp->b_npages; + bio->bio_flags |= BIO_UNMAPPED; + } else { + bio->bio_data = bp->b_data; + bio->bio_ma = NULL; + } g_io_request(bio, cp); return; } 
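The swapgeom_strategy() hunk above captures the core dispatch that every GEOM-facing consumer of unmapped buffers has to perform: when B_UNMAPPED is set there is no usable kernel virtual address, so the bio is handed the page array instead of a data pointer. Below is a minimal sketch of that pattern, assuming the post-patch struct bio/struct buf fields shown in this diff (bio_ma, bio_ma_offset, bio_ma_n, BIO_UNMAPPED, B_UNMAPPED, unmapped_buf); the helper name bio_attach_buf_data() is hypothetical and not part of the change.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <vm/vm.h>
#include <vm/vm_page.h>

extern caddr_t unmapped_buf;	/* assumed exported by the patched buffer cache */

/*
 * Sketch only: point a bio at a buf's data either through KVA (mapped
 * case) or through the page list (unmapped case), mirroring the
 * swapgeom_strategy() change above.
 */
static void
bio_attach_buf_data(struct bio *bio, struct buf *bp)
{

	if ((bp->b_flags & B_UNMAPPED) != 0) {
		/* No KVA: describe the transfer by its physical pages. */
		bio->bio_ma = bp->b_pages;
		bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
		bio->bio_ma_n = bp->b_npages;
		bio->bio_data = unmapped_buf;
		bio->bio_flags |= BIO_UNMAPPED;
	} else {
		/* Classic path: the buffer is mapped into kernel VA. */
		bio->bio_data = bp->b_data;
		bio->bio_ma = NULL;
	}
}

Note that a provider which cannot handle BIO_UNMAPPED never sees such a bio from the swap pager: the swapongeom_ev() hunk that follows only reports SW_UNMAPPED when the provider advertises G_PF_ACCEPT_UNMAPPED, and swp_pager_strategy() above only sets B_UNMAPPED for SW_UNMAPPED devices.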
@@ -2630,9 +2649,9 @@ swapongeom_ev(void *arg, int flags) } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(swh->vp, cp, nblks, swapgeom_strategy, - swapgeom_close, dev2udev(swh->dev)); + swapgeom_close, dev2udev(swh->dev), + (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); swh->error = 0; - return; } static int @@ -2721,6 +2740,6 @@ swaponvp(struct thread *td, struct vnode *vp, u_long nblks) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, - NODEV); + NODEV, 0); return (0); } diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h index 5c716d9..79f8767 100644 --- a/sys/vm/swap_pager.h +++ b/sys/vm/swap_pager.h @@ -68,6 +68,7 @@ struct swdevt { sw_close_t *sw_close; }; +#define SW_UNMAPPED 0x01 #define SW_CLOSING 0x04 #ifdef _KERNEL diff --git a/sys/vm/vm.h b/sys/vm/vm.h index 132c10e..106c510 100644 --- a/sys/vm/vm.h +++ b/sys/vm/vm.h @@ -136,6 +136,8 @@ struct kva_md_info { vm_offset_t clean_eva; vm_offset_t pager_sva; vm_offset_t pager_eva; + vm_offset_t bio_transient_sva; + vm_offset_t bio_transient_eva; }; extern struct kva_md_info kmi; diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c index c507691..2eb1070 100644 --- a/sys/vm/vm_init.c +++ b/sys/vm/vm_init.c @@ -186,10 +186,15 @@ again: panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &kmi->clean_sva, &kmi->clean_eva, - (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS, TRUE); + (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS + + (long)bio_transient_maxcnt * MAXPHYS, TRUE); buffer_map = kmem_suballoc(clean_map, &kmi->buffer_sva, &kmi->buffer_eva, (long)nbuf * BKVASIZE, FALSE); buffer_map->system_map = 1; + bio_transient_map = kmem_suballoc(clean_map, &kmi->bio_transient_sva, + &kmi->bio_transient_eva, (long)bio_transient_maxcnt * MAXPHYS, + FALSE); + bio_transient_map->system_map = 1; pager_map = kmem_suballoc(clean_map, &kmi->pager_sva, &kmi->pager_eva, (long)nswbuf * MAXPHYS, FALSE); pager_map->system_map = 1; diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index ad9aa0d..efd2bf2 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -85,11 +85,12 @@ __FBSDID("$FreeBSD$"); #include #include -vm_map_t kernel_map=0; -vm_map_t kmem_map=0; -vm_map_t exec_map=0; +vm_map_t kernel_map; +vm_map_t kmem_map; +vm_map_t exec_map; vm_map_t pipe_map; -vm_map_t buffer_map=0; +vm_map_t buffer_map; +vm_map_t bio_transient_map; const void *zero_region; CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0); diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index a6d78f4..86ca7b4 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -697,6 +697,7 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage) int runpg; int runend; struct buf *bp; + struct mount *mp; int count; int error; @@ -899,12 +900,23 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage) } bp = getpbuf(&vnode_pbuf_freecnt); - kva = (vm_offset_t) bp->b_data; + kva = (vm_offset_t)bp->b_data; /* - * and map the pages to be read into the kva + * and map the pages to be read into the kva, if the filesystem + * requires mapped buffers. 
*/ - pmap_qenter(kva, m, count); + mp = vp->v_mount; + if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) { + bp->b_data = unmapped_buf; + bp->b_kvabase = unmapped_buf; + bp->b_offset = 0; + bp->b_flags |= B_UNMAPPED; + bp->b_npages = count; + for (i = 0; i < count; i++) + bp->b_pages[i] = m[i]; + } else + pmap_qenter(kva, m, count); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; @@ -933,11 +945,22 @@ if ((bp->b_ioflags & BIO_ERROR) != 0) error = EIO; - if (!error) { - if (size != count * PAGE_SIZE) - bzero((caddr_t) kva + size, PAGE_SIZE * count - size); + if (error == 0 && size != count * PAGE_SIZE) { + if ((bp->b_flags & B_UNMAPPED) != 0) { + bp->b_flags &= ~B_UNMAPPED; + pmap_qenter(kva, m, count); + } + bzero((caddr_t)kva + size, PAGE_SIZE * count - size); + } + if ((bp->b_flags & B_UNMAPPED) == 0) + pmap_qremove(kva, count); + if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) { + bp->b_data = (caddr_t)kva; + bp->b_kvabase = (caddr_t)kva; + bp->b_flags &= ~B_UNMAPPED; + for (i = 0; i < count; i++) + bp->b_pages[i] = NULL; } - pmap_qremove(kva, count); /* * free the buffer header back to the swap buffer pool
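Taken as a whole, the read/write side of this change reduces to one recurring idiom, visible in the ffs_read() and ffs_write() hunks earlier in the patch: request the block with GB_UNMAPPED, then copy through bp->b_data only if a mapping actually exists, and through the page array otherwise. A hedged sketch of that idiom follows; it assumes the interfaces introduced by this patch (bread_gb(), GB_UNMAPPED, B_UNMAPPED, vn_io_fault_pgmove()), and the wrapper function itself is illustrative rather than part of the change.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/ucred.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_page.h>

/*
 * Illustrative only: read one logical block without requiring a KVA
 * mapping and move xfersize bytes starting at blkoffset into the uio,
 * choosing the copy primitive the same way ffs_read() does above.
 */
static int
read_block_uiomove(struct vnode *vp, daddr_t lbn, int size,
    int blkoffset, int xfersize, struct uio *uio)
{
	struct buf *bp;
	int error;

	error = bread_gb(vp, lbn, size, NOCRED, GB_UNMAPPED, &bp);
	if (error != 0) {
		brelse(bp);
		return (error);
	}
	if ((bp->b_flags & B_UNMAPPED) == 0)
		/* The buffer came back mapped; use the usual KVA copy. */
		error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
		    xfersize, uio);
	else
		/* No KVA; copy directly from the buffer's VM pages. */
		error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
		    xfersize, uio);
	bqrelse(bp);
	return (error);
}

Even with GB_UNMAPPED requested, the buffer cache is free to hand back a mapped buffer, which is why both branches are needed. The MNTK_UNMAPPED_BUFS flag that ffs_mountfs() sets above is the separate, per-mount opt-in that lets the VM side (the vnode_pager_generic_getpages() hunk just above) skip pmap_qenter() and pass bare pages to the buffer.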