Index: sys/cddl/compat/opensolaris/sys/time.h
===================================================================
--- sys/cddl/compat/opensolaris/sys/time.h	(revision 253925)
+++ sys/cddl/compat/opensolaris/sys/time.h	(working copy)
@@ -50,6 +50,8 @@
 #define	SEC_TO_TICK(sec)	((sec) * hz)
 #define	NSEC_TO_TICK(usec)	((usec) / (NANOSEC / hz))
 
+#define	NSEC_PER_USEC	1000
+
 #ifdef _KERNEL
 static __inline hrtime_t
 gethrtime(void) {
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c	(revision 253925)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c	(working copy)
@@ -33,6 +33,8 @@
 #include <sys/zio.h>
 #include <sys/fs/zfs.h>
 
+SYSCTL_DECL(_vfs_zfs_vdev);
+
 /*
  * Virtual device vector for mirroring.
  */
@@ -41,6 +43,7 @@
 	vdev_t		*mc_vd;
 	uint64_t	mc_offset;
 	int		mc_error;
+	int		mc_pending;
 	uint8_t		mc_tried;
 	uint8_t		mc_skipped;
 	uint8_t		mc_speculative;
@@ -54,7 +57,20 @@
 	mirror_child_t	mm_child[1];
 } mirror_map_t;
 
-int vdev_mirror_shift = 21;
+/*
+ * When the children are equally busy, queue incoming requests to a single
+ * child for N microseconds.
+ * Otherwise, requests are queued to the least busy device.
+ *
+ * For fast SSDs it may make sense to decrease zfs_vdev_mirror_switch_us
+ * significantly to bound the worst-case latencies. It would probably be
+ * ideal to calculate a decaying average of the last observed latencies and
+ * use that to dynamically adjust the zfs_vdev_mirror_switch_us time.
+ */
+int zfs_vdev_mirror_switch_us = 10000;
+TUNABLE_INT("vfs.zfs.vdev.mirror_switch_us", &zfs_vdev_mirror_switch_us);
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, mirror_switch_us, CTLFLAG_RW,
+    &zfs_vdev_mirror_switch_us, 0, "Switch mirrors every N usecs");
 
 static void
 vdev_mirror_map_free(zio_t *zio)
@@ -69,6 +85,19 @@
 	zio_vsd_default_cksum_report
 };
 
+static int
+vdev_mirror_pending(vdev_t *vd)
+{
+	vdev_queue_t *vq = &vd->vdev_queue;
+	int pending;
+
+	mutex_enter(&vq->vq_lock);
+	pending = avl_numnodes(&vq->vq_pending_tree);
+	mutex_exit(&vq->vq_lock);
+
+	return (pending);
+}
+
 static mirror_map_t *
 vdev_mirror_map_alloc(zio_t *zio)
 {
@@ -108,6 +137,9 @@
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 		}
 	} else {
+		int lowest_pending = INT_MAX;
+		int lowest_nr = 1;
+
 		c = vd->vdev_children;
 
 		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]),
@@ -114,8 +146,7 @@
 		    KM_SLEEP);
 		mm->mm_children = c;
 		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
 		    vd->vdev_ops == &vdev_spare_ops);
-		mm->mm_preferred = mm->mm_replacing ? 0 :
-		    (zio->io_offset >> vdev_mirror_shift) % c;
+		mm->mm_preferred = 0;
 		mm->mm_root = B_FALSE;
 
 		for (c = 0; c < mm->mm_children; c++) {
@@ -122,7 +153,40 @@
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
 			mc->mc_offset = zio->io_offset;
+
+			if (mm->mm_replacing)
+				continue;
+
+			if (!vdev_readable(mc->mc_vd)) {
+				mc->mc_error = ENXIO;
+				mc->mc_tried = 1;
+				mc->mc_skipped = 1;
+				mc->mc_pending = INT_MAX;
+				continue;
+			}
+
+			mc->mc_pending = vdev_mirror_pending(mc->mc_vd);
+			if (mc->mc_pending < lowest_pending) {
+				lowest_pending = mc->mc_pending;
+				lowest_nr = 1;
+			} else if (mc->mc_pending == lowest_pending) {
+				lowest_nr++;
+			}
 		}
+
+		d = gethrtime() / (NSEC_PER_USEC * zfs_vdev_mirror_switch_us);
+		d = (d % lowest_nr) + 1;
+
+		for (c = 0; c < mm->mm_children; c++) {
+			mc = &mm->mm_child[c];
+
+			if (mm->mm_child[c].mc_pending == lowest_pending) {
+				if (--d == 0) {
+					mm->mm_preferred = c;
+					break;
+				}
+			}
+		}
 	}
 
 	zio->io_vsd = mm;
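
For illustration, here is the child-selection logic from vdev_mirror_map_alloc()
pulled out into a small standalone userland program. This is only a sketch of
the algorithm, not part of the patch: pick_child(), now_ns() and the sample
pending counts are invented for the example, and now_ns() stands in for the
kernel's gethrtime().

/*
 * Illustrative sketch only -- not part of the patch.  pick_child() and
 * now_ns() are invented names; now_ns() stands in for gethrtime().
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define	NSEC_PER_USEC	1000

static int switch_us = 10000;	/* plays the role of zfs_vdev_mirror_switch_us */

/* Monotonic time in nanoseconds, standing in for the kernel's gethrtime(). */
static uint64_t
now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
}

/*
 * Pick the preferred mirror child given each child's pending I/O count:
 * find the lowest count and how many children share it, then rotate among
 * the tied children once every switch_us so that each gets a contiguous
 * time slice of the incoming requests.
 */
static int
pick_child(const int *pending, int children)
{
	int lowest_pending = INT_MAX;
	int lowest_nr = 1;
	uint64_t d;
	int c;

	for (c = 0; c < children; c++) {
		if (pending[c] < lowest_pending) {
			lowest_pending = pending[c];
			lowest_nr = 1;
		} else if (pending[c] == lowest_pending) {
			lowest_nr++;
		}
	}

	/* Same arithmetic as in vdev_mirror_map_alloc() above. */
	d = now_ns() / (NSEC_PER_USEC * (uint64_t)switch_us);
	d = (d % lowest_nr) + 1;

	for (c = 0; c < children; c++) {
		if (pending[c] == lowest_pending && --d == 0)
			return (c);
	}
	return (0);	/* not reached: d <= number of tied children */
}

int
main(void)
{
	int pending[] = { 4, 2, 2 };	/* children 1 and 2 tied as least busy */

	printf("preferred child: %d\n", pick_child(pending, 3));
	return (0);
}

With the sample counts above, children 1 and 2 are tied at two pending I/Os,
so the preferred child rotates between them every zfs_vdev_mirror_switch_us
(10ms by default), while child 0, with a deeper queue, is skipped. Once the
patch is applied, the interval can be adjusted at runtime, e.g.
sysctl vfs.zfs.vdev.mirror_switch_us=2000, or set at boot via loader.conf
through the TUNABLE_INT.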