FreeBSD ZFS
The Zettabyte File System
vdev_cache.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/kstat.h>

/*
 * Virtual device read-ahead caching.
 *
 * This file implements a simple LRU read-ahead cache.  Reads no larger
 * than zfs_vdev_cache_max are inflated to a full cache line of VCBS
 * bytes, on the theory that nearby data will be wanted soon; subsequent
 * small reads that fall within a cached line are satisfied from memory.
 */

/*
 * These tunables are for performance analysis.
 */
int zfs_vdev_cache_max = 1<<14;			/* 16KB */
int zfs_vdev_cache_size = 0;
int zfs_vdev_cache_bshift = 16;

#define	VCBS	(1 << zfs_vdev_cache_bshift)	/* 64KB */

SYSCTL_DECL(_vfs_zfs_vdev);
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max);
SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
    &zfs_vdev_cache_max, 0, "Maximum I/O request size that increases read size");
TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size);
SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
    &zfs_vdev_cache_size, 0, "Size of VDEV cache");
TUNABLE_INT("vfs.zfs.vdev.cache.bshift", &zfs_vdev_cache_bshift);
SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN,
    &zfs_vdev_cache_bshift, 0, "Turn too-small requests into 1 << this value");

kstat_t	*vdc_ksp = NULL;

typedef struct vdc_stats {
	kstat_named_t vdc_stat_delegations;
	kstat_named_t vdc_stat_hits;
	kstat_named_t vdc_stat_misses;
} vdc_stats_t;

static vdc_stats_t vdc_stats = {
	{ "delegations",	KSTAT_DATA_UINT64 },
	{ "hits",		KSTAT_DATA_UINT64 },
	{ "misses",		KSTAT_DATA_UINT64 }
};

#define	VDCSTAT_BUMP(stat)	atomic_add_64(&vdc_stats.stat.value.ui64, 1);

static int
vdev_cache_offset_compare(const void *a1, const void *a2)
{
	const vdev_cache_entry_t *ve1 = a1;
	const vdev_cache_entry_t *ve2 = a2;

	if (ve1->ve_offset < ve2->ve_offset)
		return (-1);
	if (ve1->ve_offset > ve2->ve_offset)
		return (1);
	return (0);
}

static int
vdev_cache_lastused_compare(const void *a1, const void *a2)
{
	const vdev_cache_entry_t *ve1 = a1;
	const vdev_cache_entry_t *ve2 = a2;

	if (ve1->ve_lastused < ve2->ve_lastused)
		return (-1);
	if (ve1->ve_lastused > ve2->ve_lastused)
		return (1);

	/*
	 * Among equally old entries, sort by offset to ensure uniqueness.
	 */
	return (vdev_cache_offset_compare(a1, a2));
}
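Both comparators back AVL trees over the same set of cache entries: vc_offset_tree indexes lines by device offset for lookup, while vc_lastused_tree keeps them in age order for LRU eviction. Because an AVL tree requires a strict total order over its keys, the age comparator falls back to the offset comparator so that two entries touched on the same clock tick never compare equal. A minimal stand-alone sketch of that tie-breaking pattern (hypothetical names, not part of this file):

#include <stdint.h>

/* Hypothetical miniature of a vdev_cache_entry_t, for illustration only. */
struct line {
	uint64_t lastused;	/* LRU clock; may collide across entries */
	uint64_t offset;	/* unique per cache line */
};

static int
line_lru_compare(const void *a1, const void *a2)
{
	const struct line *l1 = a1;
	const struct line *l2 = a2;

	if (l1->lastused != l2->lastused)
		return (l1->lastused < l2->lastused ? -1 : 1);
	/* Equal age: break the tie by offset so the key stays unique. */
	if (l1->offset != l2->offset)
		return (l1->offset < l2->offset ? -1 : 1);
	return (0);
}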
/*
 * Evict the given entry from the cache.
 */
static void
vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
{
	ASSERT(MUTEX_HELD(&vc->vc_lock));
	ASSERT(ve->ve_fill_io == NULL);
	ASSERT(ve->ve_data != NULL);

	avl_remove(&vc->vc_lastused_tree, ve);
	avl_remove(&vc->vc_offset_tree, ve);
	zio_buf_free(ve->ve_data, VCBS);
	kmem_free(ve, sizeof (vdev_cache_entry_t));
}

/*
 * Allocate an entry in the cache.  At this point we don't have the data
 * yet; the entry is a placeholder so that concurrent readers of the same
 * line don't each issue their own fill I/O.
 */
static vdev_cache_entry_t *
vdev_cache_allocate(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
	vdev_cache_entry_t *ve;

	ASSERT(MUTEX_HELD(&vc->vc_lock));

	if (zfs_vdev_cache_size == 0)
		return (NULL);

	/*
	 * If adding a new entry would exceed the cache size,
	 * evict the oldest entry (LRU).
	 */
	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
	    zfs_vdev_cache_size) {
		ve = avl_first(&vc->vc_lastused_tree);
		if (ve->ve_fill_io != NULL)
			return (NULL);
		ASSERT(ve->ve_hits != 0);
		vdev_cache_evict(vc, ve);
	}

	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
	ve->ve_offset = offset;
	ve->ve_lastused = ddi_get_lbolt();
	ve->ve_data = zio_buf_alloc(VCBS);

	avl_add(&vc->vc_offset_tree, ve);
	avl_add(&vc->vc_lastused_tree, ve);

	return (ve);
}

static void
vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
{
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);

	ASSERT(MUTEX_HELD(&vc->vc_lock));
	ASSERT(ve->ve_fill_io == NULL);

	if (ve->ve_lastused != ddi_get_lbolt()) {
		avl_remove(&vc->vc_lastused_tree, ve);
		ve->ve_lastused = ddi_get_lbolt();
		avl_add(&vc->vc_lastused_tree, ve);
	}

	ve->ve_hits++;
	bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
}

/*
 * Fill a previously allocated cache entry with data from a completed
 * fill I/O.
 */
static void
vdev_cache_fill(zio_t *fio)
{
	vdev_t *vd = fio->io_vd;
	vdev_cache_t *vc = &vd->vdev_cache;
	vdev_cache_entry_t *ve = fio->io_private;
	zio_t *pio;

	ASSERT(fio->io_size == VCBS);

	/*
	 * Add data to the cache.
	 */
	mutex_enter(&vc->vc_lock);

	ASSERT(ve->ve_fill_io == fio);
	ASSERT(ve->ve_offset == fio->io_offset);
	ASSERT(ve->ve_data == fio->io_data);

	ve->ve_fill_io = NULL;

	/*
	 * Even if this cache line was invalidated by a missed write update,
	 * any reads that were queued up before the missed update are still
	 * valid, so we can satisfy them from this line before we evict it.
	 */
	while ((pio = zio_walk_parents(fio)) != NULL)
		vdev_cache_hit(vc, ve, pio);

	if (fio->io_error || ve->ve_missed_update)
		vdev_cache_evict(vc, ve);

	mutex_exit(&vc->vc_lock);
}
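vdev_cache_allocate() and vdev_cache_hit() above, and vdev_cache_read() below, lean on the P2* alignment macros from the illumos-derived sysmacros.h. A worked example, with the macro definitions restated here on the assumption that they match sysmacros.h: at zfs_vdev_cache_bshift = 16 the cache line size (VCBS) is 64KB, so a 16KB read at device offset 0x123000 lands in the line based at 0x120000, at byte 0x3000 within that line, and does not straddle a line boundary.

#include <stdint.h>
#include <stdio.h>

/* Restated for illustration -- assumed to match sysmacros.h. */
#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2PHASE(x, align)	((x) & ((align) - 1))
#define	P2BOUNDARY(off, len, align) \
	(((off) ^ ((off) + (len) - 1)) > (align) - 1)

int
main(void)
{
	uint64_t vcbs = (uint64_t)1 << 16;	/* VCBS at bshift 16: 64KB */
	uint64_t off = 0x123000, len = 0x4000;	/* a 16KB read */

	printf("line base   0x%jx\n", (uintmax_t)P2ALIGN(off, vcbs));
	printf("line phase  0x%jx\n", (uintmax_t)P2PHASE(off, vcbs));
	printf("straddles?  %d\n", (int)P2BOUNDARY(off, len, vcbs));
	return (0);
}

This prints 0x120000, 0x3000, and 0, matching the hand calculation above; an I/O whose first and last bytes differ above the low bshift bits would straddle two lines and be rejected by the read path.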
/*
 * Read data from the cache.  Returns 0 on a hit or a delegation to an
 * in-flight fill, or an errno when the request cannot be served from
 * the cache.
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
	zio_t *fio;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
		return (EXDEV);

	ASSERT(cache_phase + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = cache_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);

	if (ve != NULL) {
		if (ve->ve_missed_update) {
			mutex_exit(&vc->vc_lock);
			return (ESTALE);
		}

		if ((fio = ve->ve_fill_io) != NULL) {
			zio_vdev_io_bypass(zio);
			zio_add_child(zio, fio);
			mutex_exit(&vc->vc_lock);
			VDCSTAT_BUMP(vdc_stat_delegations);
			return (0);
		}

		vdev_cache_hit(vc, ve, zio);
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		VDCSTAT_BUMP(vdc_stat_hits);
		return (0);
	}

	ve = vdev_cache_allocate(zio);

	if (ve == NULL) {
		mutex_exit(&vc->vc_lock);
		return (ENOMEM);
	}

	fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);

	ve->ve_fill_io = fio;
	zio_vdev_io_bypass(zio);
	zio_add_child(zio, fio);

	mutex_exit(&vc->vc_lock);
	zio_nowait(fio);
	VDCSTAT_BUMP(vdc_stat_misses);

	return (0);
}

/*
 * Update cache contents upon write completion, so that cached lines
 * stay coherent with the data on disk.
 */
void
vdev_cache_write(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t io_start = zio->io_offset;
	uint64_t io_end = io_start + zio->io_size;
	uint64_t min_offset = P2ALIGN(io_start, VCBS);
	uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
	avl_index_t where;

	ASSERT(zio->io_type == ZIO_TYPE_WRITE);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = min_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);

	if (ve == NULL)
		ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);

	while (ve != NULL && ve->ve_offset < max_offset) {
		uint64_t start = MAX(ve->ve_offset, io_start);
		uint64_t end = MIN(ve->ve_offset + VCBS, io_end);

		if (ve->ve_fill_io != NULL) {
			ve->ve_missed_update = 1;
		} else {
			bcopy((char *)zio->io_data + start - io_start,
			    ve->ve_data + start - ve->ve_offset, end - start);
		}
		ve = AVL_NEXT(&vc->vc_offset_tree, ve);
	}
	mutex_exit(&vc->vc_lock);
}

void
vdev_cache_purge(vdev_t *vd)
{
	vdev_cache_t *vc = &vd->vdev_cache;
	vdev_cache_entry_t *ve;

	mutex_enter(&vc->vc_lock);
	while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
		vdev_cache_evict(vc, ve);
	mutex_exit(&vc->vc_lock);
}
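The distinct errno values returned by vdev_cache_read() are diagnostic only; as far as this file is concerned, callers need only test for zero, and any nonzero value means the read must go to the device as usual. A hypothetical caller sketch (example_io_start() and example_issue_to_device() are illustrations, not part of ZFS):

static void example_issue_to_device(zio_t *);	/* hypothetical helper */

static void
example_io_start(zio_t *zio)
{
	/*
	 * Any nonzero return (EINVAL, EOVERFLOW, EXDEV, ESTALE, ENOMEM
	 * above) just means "not absorbed by the cache": fall through
	 * to a real device read.
	 */
	if (vdev_cache_read(zio) == 0)
		return;		/* hit, or delegated to an in-flight fill */
	example_issue_to_device(zio);
}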
void
vdev_cache_init(vdev_t *vd)
{
	vdev_cache_t *vc = &vd->vdev_cache;

	mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);

	avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
	    sizeof (vdev_cache_entry_t),
	    offsetof(struct vdev_cache_entry, ve_offset_node));

	avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
	    sizeof (vdev_cache_entry_t),
	    offsetof(struct vdev_cache_entry, ve_lastused_node));
}

void
vdev_cache_fini(vdev_t *vd)
{
	vdev_cache_t *vc = &vd->vdev_cache;

	vdev_cache_purge(vd);

	avl_destroy(&vc->vc_offset_tree);
	avl_destroy(&vc->vc_lastused_tree);

	mutex_destroy(&vc->vc_lock);
}

void
vdev_cache_stat_init(void)
{
	vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (vdc_ksp != NULL) {
		vdc_ksp->ks_data = &vdc_stats;
		kstat_install(vdc_ksp);
	}
}

void
vdev_cache_stat_fini(void)
{
	if (vdc_ksp != NULL) {
		kstat_delete(vdc_ksp);
		vdc_ksp = NULL;
	}
}
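Because zfs_vdev_cache_size defaults to 0 and vdev_cache_allocate() refuses to create entries at that setting, the cache is effectively disabled until the size tunable is raised. All three sysctls are declared CTLFLAG_RDTUN, i.e. read-only at runtime, so they can only be set as boot-time tunables. An illustrative /boot/loader.conf sketch (the values are examples, not recommendations):

# Hypothetical /boot/loader.conf entries -- values for illustration only
vfs.zfs.vdev.cache.size=10485760	# 10MB of cache per vdev; 0 disables
vfs.zfs.vdev.cache.max=16384		# inflate reads of up to 16KB
vfs.zfs.vdev.cache.bshift=16		# 64KB cache lines (VCBS)

Once enabled, the counters registered by vdev_cache_stat_init() can be watched at runtime; on FreeBSD, kstats of module "zfs" and class "misc" typically surface under the kstat sysctl tree, so something like sysctl kstat.zfs.misc.vdev_cache_stats should report the delegations, hits, and misses bumped by VDCSTAT_BUMP() in the read path.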