FreeBSD ZFS
The Zettabyte File System
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_rlock.h>
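/*
 * For reference (added to this listing, not part of the original file):
 * the rl_t manipulated below is assumed to match its declaration in
 * sys/zfs_rlock.h, roughly:
 *
 *	typedef struct rl {
 *		znode_t *r_zp;		// znode this lock applies to
 *		avl_node_t r_node;	// AVL node link
 *		uint64_t r_off;		// file range offset
 *		uint64_t r_len;		// file range length
 *		uint_t r_cnt;		// range reference count in tree
 *		rl_type_t r_type;	// RL_READER, RL_WRITER or RL_APPEND
 *		kcondvar_t r_wr_cv;	// cv for waiting writers
 *		kcondvar_t r_rd_cv;	// cv for waiting readers
 *		uint8_t r_proxy;	// acting for original range
 *		uint8_t r_write_wanted;	// writer wants to lock this range
 *		uint8_t r_read_wanted;	// reader wants to lock this range
 *	} rl_t;
 */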
/*
 * Check if a write lock can be grabbed, or wait and recheck until available.
 */
static void
zfs_range_lock_writer(znode_t *zp, rl_t *new)
{
	avl_tree_t *tree = &zp->z_range_avl;
	rl_t *rl;
	avl_index_t where;
	uint64_t end_size;
	uint64_t off = new->r_off;
	uint64_t len = new->r_len;

	for (;;) {
		/*
		 * Range locking is also used by zvol and uses a
		 * dummied up znode. However, for zvol, we don't need to
		 * append or grow blocksize, and besides we don't have
		 * a "sa" data or z_zfsvfs - so skip that processing.
		 *
		 * Yes, this is ugly, and would be solved by not handling
		 * grow or append in range lock code. If that was done then
		 * we could make the range locking code generically available
		 * to other non-zfs consumers.
		 */
		if (zp->z_vnode) { /* caller is ZPL */
			/*
			 * If in append mode pick up the current end of file.
			 * This is done under z_range_lock to avoid races.
			 */
			if (new->r_type == RL_APPEND)
				new->r_off = zp->z_size;

			/*
			 * If we need to grow the block size then grab the
			 * whole file range. This is also done under
			 * z_range_lock to avoid races.
			 */
			end_size = MAX(zp->z_size, new->r_off + len);
			if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
			    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
				new->r_off = 0;
				new->r_len = UINT64_MAX;
			}
		}

		/*
		 * First check for the usual case of no locks.
		 */
		if (avl_numnodes(tree) == 0) {
			new->r_type = RL_WRITER; /* convert to writer */
			avl_add(tree, new);
			return;
		}

		/*
		 * Look for any locks in the range.
		 */
		rl = avl_find(tree, new, &where);
		if (rl)
			goto wait; /* already locked at same offset */

		rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
		if (rl && (rl->r_off < new->r_off + new->r_len))
			goto wait;

		rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
		if (rl && rl->r_off + rl->r_len > new->r_off)
			goto wait;

		new->r_type = RL_WRITER; /* convert possible RL_APPEND */
		avl_insert(tree, new, where);
		return;
wait:
		if (!rl->r_write_wanted) {
			cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
			rl->r_write_wanted = B_TRUE;
		}
		cv_wait(&rl->r_wr_cv, &zp->z_range_lock);

		/* reset to original */
		new->r_off = off;
		new->r_len = len;
	}
}

/*
 * If this is an original (non-proxy) lock then replace it by
 * a proxy and return the proxy.
 */
static rl_t *
zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
{
	rl_t *proxy;

	if (rl->r_proxy)
		return (rl); /* already a proxy */

	ASSERT3U(rl->r_cnt, ==, 1);
	ASSERT(rl->r_write_wanted == B_FALSE);
	ASSERT(rl->r_read_wanted == B_FALSE);
	avl_remove(tree, rl);
	rl->r_cnt = 0;

	/* create a proxy range lock */
	proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	proxy->r_off = rl->r_off;
	proxy->r_len = rl->r_len;
	proxy->r_cnt = 1;
	proxy->r_type = RL_READER;
	proxy->r_proxy = B_TRUE;
	proxy->r_write_wanted = B_FALSE;
	proxy->r_read_wanted = B_FALSE;
	avl_add(tree, proxy);

	return (proxy);
}

/*
 * Split the range lock at the supplied offset, returning the *front* proxy.
 */
static rl_t *
zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
{
	rl_t *front, *rear;

	ASSERT3U(rl->r_len, >, 1);
	ASSERT3U(off, >, rl->r_off);
	ASSERT3U(off, <, rl->r_off + rl->r_len);
	ASSERT(rl->r_write_wanted == B_FALSE);
	ASSERT(rl->r_read_wanted == B_FALSE);

	/* create the rear proxy range lock */
	rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	rear->r_off = off;
	rear->r_len = rl->r_off + rl->r_len - off;
	rear->r_cnt = rl->r_cnt;
	rear->r_type = RL_READER;
	rear->r_proxy = B_TRUE;
	rear->r_write_wanted = B_FALSE;
	rear->r_read_wanted = B_FALSE;

	front = zfs_range_proxify(tree, rl);
	front->r_len = off - rl->r_off;

	avl_insert_here(tree, rear, front, AVL_AFTER);
	return (front);
}

/*
 * Create and add a new proxy range lock for the supplied range.
 */
static void
zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
{
	rl_t *rl;

	ASSERT(len);
	rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	rl->r_off = off;
	rl->r_len = len;
	rl->r_cnt = 1;
	rl->r_type = RL_READER;
	rl->r_proxy = B_TRUE;
	rl->r_write_wanted = B_FALSE;
	rl->r_read_wanted = B_FALSE;
	avl_add(tree, rl);
}
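/*
 * Worked example (added for illustration, not in the original file):
 * suppose the tree holds one reader lock R = [0, 100) with r_cnt == 1,
 * and a second reader requests [30, 80).  zfs_range_add_reader() below
 * combines the helpers above: zfs_range_split(tree, R, 30) leaves a
 * front proxy [0, 30) cnt=1 and a rear proxy [30, 100) cnt=1; the rear
 * proxy is then split again at offset 80 and its front half's count
 * bumped, giving:
 *
 *	[0, 30) cnt=1	[30, 80) cnt=2	[80, 100) cnt=1
 *
 * The second reader's own rl_t is left with r_cnt == 0, meaning "this
 * lock is represented by proxies in the tree".
 */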
static void
zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
{
	rl_t *next;
	uint64_t off = new->r_off;
	uint64_t len = new->r_len;

	/*
	 * prev arrives either:
	 * - pointing to an entry at the same offset
	 * - pointing to the entry with the closest previous offset whose
	 *   range may overlap with the new range
	 * - null, if there were no ranges starting before the new one
	 */
	if (prev) {
		if (prev->r_off + prev->r_len <= off) {
			prev = NULL;
		} else if (prev->r_off != off) {
			/*
			 * convert to proxy if needed then
			 * split this entry and bump ref count
			 */
			prev = zfs_range_split(tree, prev, off);
			prev = AVL_NEXT(tree, prev); /* move to rear range */
		}
	}
	ASSERT((prev == NULL) || (prev->r_off == off));

	if (prev)
		next = prev;
	else
		next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);

	if (next == NULL || off + len <= next->r_off) {
		/* no overlaps, use the original new rl_t in the tree */
		avl_insert(tree, new, where);
		return;
	}

	if (off < next->r_off) {
		/* Add a proxy for the initial range before the overlap */
		zfs_range_new_proxy(tree, off, next->r_off - off);
	}

	new->r_cnt = 0; /* will use proxies in tree */
	/*
	 * We now search forward through the ranges, until we go past the end
	 * of the new range. For each entry we make it a proxy if it
	 * isn't already, then bump its reference count. If there are any
	 * gaps between the ranges then we create a new proxy range.
	 */
	for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
		if (off + len <= next->r_off)
			break;
		if (prev && prev->r_off + prev->r_len < next->r_off) {
			/* there's a gap */
			ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
			zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
			    next->r_off - (prev->r_off + prev->r_len));
		}
		if (off + len == next->r_off + next->r_len) {
			/* exact overlap with end */
			next = zfs_range_proxify(tree, next);
			next->r_cnt++;
			return;
		}
		if (off + len < next->r_off + next->r_len) {
			/* new range ends in the middle of this block */
			next = zfs_range_split(tree, next, off + len);
			next->r_cnt++;
			return;
		}
		ASSERT3U(off + len, >, next->r_off + next->r_len);
		next = zfs_range_proxify(tree, next);
		next->r_cnt++;
	}

	/* Add the remaining end range. */
	zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
	    (off + len) - (prev->r_off + prev->r_len));
}

/*
 * Check if a reader lock can be grabbed, or wait and recheck until available.
 */
static void
zfs_range_lock_reader(znode_t *zp, rl_t *new)
{
	avl_tree_t *tree = &zp->z_range_avl;
	rl_t *prev, *next;
	avl_index_t where;
	uint64_t off = new->r_off;
	uint64_t len = new->r_len;

	/*
	 * Look for any writer locks in the range.
	 */
retry:
	prev = avl_find(tree, new, &where);
	if (prev == NULL)
		prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);

	/*
	 * Check the previous range for a writer lock overlap.
	 */
	if (prev && (off < prev->r_off + prev->r_len)) {
		if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
			if (!prev->r_read_wanted) {
				cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
				prev->r_read_wanted = B_TRUE;
			}
			cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
			goto retry;
		}
		if (off + len < prev->r_off + prev->r_len)
			goto got_lock;
	}

	/*
	 * Search through the following ranges to see if there's
	 * any write lock overlap.
	 */
	if (prev)
		next = AVL_NEXT(tree, prev);
	else
		next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
	for (; next; next = AVL_NEXT(tree, next)) {
		if (off + len <= next->r_off)
			goto got_lock;
		if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
			if (!next->r_read_wanted) {
				cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
				next->r_read_wanted = B_TRUE;
			}
			cv_wait(&next->r_rd_cv, &zp->z_range_lock);
			goto retry;
		}
		if (off + len <= next->r_off + next->r_len)
			goto got_lock;
	}

got_lock:
	/*
	 * Add the read lock, which may involve splitting existing
	 * locks and bumping ref counts (r_cnt).
	 */
	zfs_range_add_reader(tree, new, prev, where);
}
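/*
 * Note (added for clarity): in the reader path above, a range is treated
 * as unavailable not only when it is already held as RL_WRITER but also
 * when its r_write_wanted flag is set.  New readers therefore queue
 * behind a waiting writer instead of continually sharing the range,
 * which keeps a steady stream of readers from starving writers.
 */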
/*
 * Lock a range (offset, length) as either shared (RL_READER) or
 * exclusive (RL_WRITER).  Returns the range lock structure for later
 * unlocking, or for reducing the range (if the entire file was
 * previously locked as RL_WRITER).
 */
rl_t *
zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
{
	rl_t *new;

	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);

	new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	new->r_zp = zp;
	new->r_off = off;
	if (len + off < off) /* overflow */
		len = UINT64_MAX - off;
	new->r_len = len;
	new->r_cnt = 1; /* assume it's going to be in the tree */
	new->r_type = type;
	new->r_proxy = B_FALSE;
	new->r_write_wanted = B_FALSE;
	new->r_read_wanted = B_FALSE;

	mutex_enter(&zp->z_range_lock);
	if (type == RL_READER) {
		/*
		 * First check for the usual case of no locks.
		 */
		if (avl_numnodes(&zp->z_range_avl) == 0)
			avl_add(&zp->z_range_avl, new);
		else
			zfs_range_lock_reader(zp, new);
	} else
		zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
	mutex_exit(&zp->z_range_lock);
	return (new);
}
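/*
 * Illustrative usage sketch (not part of the original file): callers such
 * as the ZPL read/write paths bracket their I/O with a range lock.  The
 * function below is a hand-written example; the real zfs_write() logic
 * is considerably more involved.
 */
#if 0	/* example only */
static int
example_locked_write(znode_t *zp, uint64_t woff, uint64_t nbytes)
{
	rl_t *rl;

	/* Block until [woff, woff + nbytes) is exclusively ours. */
	rl = zfs_range_lock(zp, woff, nbytes, RL_WRITER);

	/* ... perform the write against the locked range ... */

	/* Drop the lock; rl is freed by zfs_range_unlock(). */
	zfs_range_unlock(rl);
	return (0);
}
#endif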
/*
 * Unlock a reader lock.
 */
static void
zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
{
	avl_tree_t *tree = &zp->z_range_avl;
	rl_t *rl, *next;
	uint64_t len;

	/*
	 * The common case is when the remove entry is in the tree
	 * (cnt == 1), meaning no other reader locks overlapped with
	 * this one. Otherwise the remove entry will have been
	 * removed from the tree and replaced by proxies (one or
	 * more ranges mapping to the entire range).
	 */
	if (remove->r_cnt == 1) {
		avl_remove(tree, remove);
		if (remove->r_write_wanted) {
			cv_broadcast(&remove->r_wr_cv);
			cv_destroy(&remove->r_wr_cv);
		}
		if (remove->r_read_wanted) {
			cv_broadcast(&remove->r_rd_cv);
			cv_destroy(&remove->r_rd_cv);
		}
	} else {
		ASSERT0(remove->r_cnt);
		ASSERT0(remove->r_write_wanted);
		ASSERT0(remove->r_read_wanted);
		/*
		 * Find start proxy representing this reader lock,
		 * then decrement ref count on all proxies
		 * that make up this range, freeing them as needed.
		 */
		rl = avl_find(tree, remove, NULL);
		ASSERT(rl);
		ASSERT(rl->r_cnt);
		ASSERT(rl->r_type == RL_READER);
		for (len = remove->r_len; len != 0; rl = next) {
			len -= rl->r_len;
			if (len) {
				next = AVL_NEXT(tree, rl);
				ASSERT(next);
				ASSERT(rl->r_off + rl->r_len == next->r_off);
				ASSERT(next->r_cnt);
				ASSERT(next->r_type == RL_READER);
			}
			rl->r_cnt--;
			if (rl->r_cnt == 0) {
				avl_remove(tree, rl);
				if (rl->r_write_wanted) {
					cv_broadcast(&rl->r_wr_cv);
					cv_destroy(&rl->r_wr_cv);
				}
				if (rl->r_read_wanted) {
					cv_broadcast(&rl->r_rd_cv);
					cv_destroy(&rl->r_rd_cv);
				}
				kmem_free(rl, sizeof (rl_t));
			}
		}
	}
	kmem_free(remove, sizeof (rl_t));
}

void
zfs_range_unlock(rl_t *rl)
{
	znode_t *zp = rl->r_zp;

	ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
	ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
	ASSERT(!rl->r_proxy);

	mutex_enter(&zp->z_range_lock);
	if (rl->r_type == RL_WRITER) {
		/* writer locks can't be shared or split */
		avl_remove(&zp->z_range_avl, rl);
		mutex_exit(&zp->z_range_lock);
		if (rl->r_write_wanted) {
			cv_broadcast(&rl->r_wr_cv);
			cv_destroy(&rl->r_wr_cv);
		}
		if (rl->r_read_wanted) {
			cv_broadcast(&rl->r_rd_cv);
			cv_destroy(&rl->r_rd_cv);
		}
		kmem_free(rl, sizeof (rl_t));
	} else {
		/*
		 * lock may be shared, let zfs_range_unlock_reader()
		 * release the lock and free the rl_t
		 */
		zfs_range_unlock_reader(zp, rl);
		mutex_exit(&zp->z_range_lock);
	}
}

/*
 * Reduce a range locked as RL_WRITER from the whole file down to the
 * specified range.  Asserts the whole file was exclusively locked, so
 * there is only one entry in the tree.
 */
void
zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
{
	znode_t *zp = rl->r_zp;

	/* Ensure there are no other locks */
	ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
	ASSERT(rl->r_off == 0);
	ASSERT(rl->r_type == RL_WRITER);
	ASSERT(!rl->r_proxy);
	ASSERT3U(rl->r_len, ==, UINT64_MAX);
	ASSERT3U(rl->r_cnt, ==, 1);

	mutex_enter(&zp->z_range_lock);
	rl->r_off = off;
	rl->r_len = len;
	mutex_exit(&zp->z_range_lock);
	if (rl->r_write_wanted)
		cv_broadcast(&rl->r_wr_cv);
	if (rl->r_read_wanted)
		cv_broadcast(&rl->r_rd_cv);
}

/*
 * AVL comparison function used to order range locks;
 * locks are ordered on the start offset of the range.
 */
int
zfs_range_compare(const void *arg1, const void *arg2)
{
	const rl_t *rl1 = arg1;
	const rl_t *rl2 = arg2;

	if (rl1->r_off > rl2->r_off)
		return (1);
	if (rl1->r_off < rl2->r_off)
		return (-1);
	return (0);
}
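/*
 * Illustrative sketch (not part of the original file): zfs_range_compare()
 * is the ordering function for the per-znode AVL tree.  Assuming rl_t
 * embeds its AVL linkage in a field named r_node (as declared in
 * sys/zfs_rlock.h), the tree would be initialized along these lines when
 * a znode is constructed:
 */
#if 0	/* example only */
#include <sys/avl.h>

static void
example_init_range_tree(znode_t *zp)
{
	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zp->z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));
}
#endif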