Index: sys/lock.h =================================================================== RCS file: /home/ncvs/src/sys/sys/lock.h,v retrieving revision 1.73 diff -u -p -r1.73 lock.h --- sys/lock.h 15 May 2008 20:10:06 -0000 1.73 +++ sys/lock.h 19 Jul 2008 02:31:23 -0000 @@ -216,6 +216,7 @@ void witness_assert(struct lock_object * void witness_display_spinlock(struct lock_object *, struct thread *); int witness_line(struct lock_object *); const char *witness_file(struct lock_object *); +void witness_thread_exit(struct thread *); #ifdef WITNESS Index: kern/kern_thread.c =================================================================== RCS file: /home/ncvs/src/sys/kern/kern_thread.c,v retrieving revision 1.274 diff -u -p -r1.274 kern_thread.c --- kern/kern_thread.c 17 Apr 2008 04:20:10 -0000 1.274 +++ kern/kern_thread.c 19 Jul 2008 02:31:23 -0000 @@ -26,6 +26,8 @@ * DAMAGE. */ +#include "opt_witness.h" + #include __FBSDID("$FreeBSD: src/sys/kern/kern_thread.c,v 1.274 2008/04/17 04:20:10 jeff Exp $"); @@ -403,6 +405,9 @@ thread_exit(void) ruxagg(&p->p_rux, td); PROC_SUNLOCK(p); td->td_state = TDS_INACTIVE; +#ifdef WITNESS + witness_thread_exit(td); +#endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); Index: kern/subr_witness.c =================================================================== RCS file: /home/ncvs/src/sys/kern/subr_witness.c,v retrieving revision 1.249 diff -u -p -r1.249 subr_witness.c --- kern/subr_witness.c 15 May 2008 20:10:06 -0000 1.249 +++ kern/subr_witness.c 19 Jul 2008 02:31:23 -0000 @@ -101,10 +101,14 @@ __FBSDID("$FreeBSD: src/sys/kern/subr_wi #include #include #include +#include #include #include +#include + +MALLOC_DEFINE(M_WITNESS, "Witness", "Witness"); /* Note that these traces do not work with KTR_ALQ. 
*/ #if 0 @@ -119,22 +123,19 @@ __FBSDID("$FreeBSD: src/sys/kern/subr_wi /* Define this to check for blessed mutexes */ #undef BLESSING -#define WITNESS_COUNT 1024 -#define WITNESS_CHILDCOUNT (WITNESS_COUNT * 4) -#define WITNESS_SBUFSIZE 32768 +#define WITNESS_COUNT 1024 +#define WITNESS_CHILDCOUNT (WITNESS_COUNT * 4) #define WITNESS_PENDLIST 512 + /* - * XXX: This is somewhat bogus, as we assume here that at most 1024 threads - * will hold LOCK_NCHILDREN * 2 locks. We handle failure ok, and we should + * XXX: This is somewhat bogus, as we assume here that at most 2048 threads + * will hold LOCK_NCHILDREN locks. We handle failure ok, and we should * probably be safe for the most part, but it's still a SWAG. */ -#define LOCK_CHILDCOUNT (MAXCPU + 1024) * 2 - -#define WITNESS_NCHILDREN 6 - -#define LOCK_NCHILDREN 3 +#define LOCK_NCHILDREN 5 +#define LOCK_CHILDCOUNT 2048 -struct witness_child_list_entry; +#define MAX_W_NAME 64 /* * Lock instances. A lock instance is the data associated with a lock while @@ -165,29 +166,129 @@ struct lock_list_entry { u_int ll_count; }; +/* + * The main witness structure. One of these per named lock type in the system + * (for example, "vnode interlock"). + */ struct witness { - const char *w_name; - struct lock_class *w_class; - STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ - STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ - struct witness_child_list_entry *w_children; /* Great evilness... */ - const char *w_file; - int w_line; - u_int w_level; - u_int w_refcount; - u_char w_Giant_squawked:1; - u_char w_other_squawked:1; - u_char w_same_squawked:1; - u_char w_displayed:1; + char w_name[MAX_W_NAME]; + u_int32_t w_index; /* Index in the relationship matrix */ + struct lock_class * w_class; + STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ + STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ + struct witness * w_hash_next; /* Linked list in hash buckets. 
*/ + const char * w_file; /* File where last acquired */ + u_int32_t w_line; /* Line where last acquired */ + u_int32_t w_refcount; + u_int16_t w_num_ancestors; /* direct/indirect + * ancestor count */ + u_int16_t w_num_descendants; /* direct/indirect + * descendant count */ + int16_t w_ddb_level; + int w_displayed:1; + int w_badmalloc:1; + int w_reversed:1; +}; + +STAILQ_HEAD(witness_list, witness); + +/* + * WITNESS hash table definitions. Hash tables are used to look up witnesses by + * name, and to look up known lock order reversals by witness index pairs. + */ + +static u_int32_t witness_hash_djb2(const u_int8_t *key, u_int32_t size); +static void witness_init_hash_tables(void); + +/* + * The witness hash table. Keys are witness names (const char *), elements are + * witness objects (struct witness *). + */ +#define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ +struct witness_hash { + u_int32_t wh_size; + u_int32_t wh_count; + struct witness *wh_array[WITNESS_HASH_SIZE]; }; -struct witness_child_list_entry { - struct witness_child_list_entry *wcl_next; - struct witness *wcl_children[WITNESS_NCHILDREN]; - u_int wcl_count; +static struct witness *witness_hash_get(const char *key); +static void witness_hash_put(struct witness *w); + +/* + * Key type for the lock order data hash table. 
+ */ +struct witness_lock_order_key { + uint16_t from; + uint16_t to; }; -STAILQ_HEAD(witness_list, witness); +static inline int +witness_lock_order_key_empty(struct witness_lock_order_key *key) +{ + return key->from == 0 && key->to == 0; +} + +static inline int +witness_lock_order_key_equal(struct witness_lock_order_key *a, + struct witness_lock_order_key *b) +{ + return a->from == b->from && a->to == b->to; +} + +struct witness_lock_order_data { + struct witness_lock_order_key wlod_key; + struct witness_lock_order_data *wlod_next; + struct stack wlod_stack; +}; + +/* Allocate 256 KB of stack data space */ +#define WITNESS_LOCK_ORDER_DATA_COUNT 2048 +/* Prime, gives load factor of ~2 at full load */ +#define WITNESS_LOCK_ORDER_HASH_SIZE 1021 + +/* + * The witness lock order data hash table. Keys are witness index tuples + * (struct witness_lock_order_key), elements are lock order data objects + * (struct witness_lock_order_data). + */ +struct witness_lock_order_hash { + unsigned int wloh_size; + unsigned int wloh_count; + struct witness_lock_order_data *wloh_array[WITNESS_LOCK_ORDER_HASH_SIZE]; +}; + +static struct witness_lock_order_data *witness_lock_order_get( + struct witness *parent, struct witness *child); +static int witness_lock_order_add(struct witness *parent, + struct witness *child); +static int witness_lock_order_check(struct witness *parent, + struct witness *child); + +/* + * These flags go in the witness relationship matrix and describe the + * relationship between any two struct witness objects. + */ +#define WITNESS_UNRELATED 0x00 /* No lock order relation. */ +#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ +#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ +#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ +#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. 
*/ +#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) +#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) +#define WITNESS_RELATED_MASK \ + (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) +#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been + * observed. */ +#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ +#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ +#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ +/* Descendant to ancestor flags */ +#define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) +/* Ancestor to descendant flags */ +#define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2) + +#define WITNESS_INDEX_ASSERT(i) \ + MPASS((i) > 0 && (i) <= w_max_used_index && (i) < WITNESS_COUNT) #ifdef BLESSING struct witness_blessed { @@ -196,11 +297,6 @@ struct witness_blessed { }; #endif -struct witness_order_list_entry { - const char *w_name; - struct lock_class *w_class; -}; - struct witness_pendhelp { const char *wh_type; struct lock_object *wh_lock; @@ -211,58 +307,63 @@ static int blessed(struct witness *, str #endif static void depart(struct witness *w); static struct witness *enroll(const char *description, - struct lock_class *lock_class); -static int insertchild(struct witness *parent, struct witness *child); + struct lock_class *lock_class); +static int adopt(struct witness *parent, struct witness *child); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static int itismychild(struct witness *parent, struct witness *child); -static void removechild(struct witness *parent, struct witness *child); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS); -static int sysctl_debug_witness_graphs(SYSCTL_HANDLER_ARGS); static const char *fixup_filename(const char *file); -static void witness_addgraph(struct sbuf *sb, struct witness *parent); static struct witness 
*witness_get(void); static void witness_free(struct witness *m); -static struct witness_child_list_entry *witness_child_get(void); -static void witness_child_free(struct witness_child_list_entry *wcl); static struct lock_list_entry *witness_lock_list_get(void); static void witness_lock_list_free(struct lock_list_entry *lle); static struct lock_instance *find_instance(struct lock_list_entry *lock_list, struct lock_object *lock); static void witness_list_lock(struct lock_instance *instance); #ifdef DDB -static void witness_leveldescendents(struct witness *parent, int level); -static void witness_levelall(void); -static void witness_displaydescendants(void(*)(const char *fmt, ...), +static void witness_ddb_level_descendants(struct witness *parent, int level); +static void witness_ddb_compute_levels(void); +static void witness_ddb_display_descendants(void(*)(const char *fmt, ...), struct witness *, int indent); -static void witness_display_list(void(*prnt)(const char *fmt, ...), +static void witness_ddb_display_list(void(*prnt)(const char *fmt, ...), struct witness_list *list); -static void witness_display(void(*)(const char *fmt, ...)); -static void witness_list(struct thread *td); +static void witness_ddb_display(void(*)(const char *fmt, ...)); +static void witness_ddb_list(struct thread *td); #endif +static void witness_increment_graph_generation(void); + +/* + * Returns 0 if one of the locks is a spin lock and the other is not. + * Returns 1 otherwise. + */ +static __inline int +witness_lock_type_equal(struct witness *w1, struct witness *w2) +{ + return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == + (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); +} SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, 0, "Witness Locking"); /* - * If set to 0, witness is disabled. If set to a non-zero value, witness - * performs full lock order checking for all locks. At runtime, this - * value may be set to 0 to turn off witness. 
witness is not allowed be - * turned on once it is turned off, however. + * If set to 0, witness is disabled. Otherwise witness performs full lock order + * checking for all locks. At runtime, witness is allowed to be turned off. + * witness is not allowed be turned on once it is turned off, however. */ static int witness_watch = 1; TUNABLE_INT("debug.witness.watch", &witness_watch); SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RW | CTLTYPE_INT, NULL, 0, sysctl_debug_witness_watch, "I", "witness is watching lock operations"); -SYSCTL_PROC(_debug_witness, OID_AUTO, graphs, CTLTYPE_STRING | CTLFLAG_RD, - NULL, 0, sysctl_debug_witness_graphs, "A", "Show locks relation graphs"); #ifdef KDB /* - * When KDB is enabled and witness_kdb is set to 1, it will cause the system + * When KDB is enabled and witness_kdb is 1, it will cause the system * to drop into kdebug() when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ +static void witness_debugger(int cond, const char *msg); #ifdef WITNESS_KDB int witness_kdb = 1; #else @@ -272,7 +373,7 @@ TUNABLE_INT("debug.witness.kdb", &witnes SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RW, &witness_kdb, 0, ""); /* - * When KDB is enabled and witness_trace is set to 1, it will cause the system + * When KDB is enabled and witness_trace is 1, it will cause the system * to print a stack trace: * - a lock hierarchy violation occurs * - locks are held when going to sleep. @@ -291,29 +392,63 @@ TUNABLE_INT("debug.witness.skipspin", &w SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, ""); +static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS); +/* + * Call this to print out the internal witness structure as a dot graph. 
+ */ +SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, sysctl_debug_witness_fullgraph, "A", "Dot graph of witness info"); + +static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS); +/* + * Call this to print out the internal witness structure as a dot graph. + */ +SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, sysctl_debug_witness_badstacks, "A", "Print bad witness stacks"); + +static int sysctl_debug_witness_cyclegraph(SYSCTL_HANDLER_ARGS); +/* + * Call this to print out the internal witness structure as a dot graph. + */ +SYSCTL_PROC(_debug_witness, OID_AUTO, cyclegraph, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, sysctl_debug_witness_cyclegraph, "A", + "Print bad part of witness graph"); + static struct mtx w_mtx; +/* w_list */ static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); +/* w_typelist */ static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); -static struct witness_child_list_entry *w_child_free = NULL; +/* lock list */ static struct lock_list_entry *w_lock_list_free = NULL; static struct witness_pendhelp pending_locks[WITNESS_PENDLIST]; static u_int pending_cnt; -static int w_free_cnt, w_spin_cnt, w_sleep_cnt, w_child_free_cnt, w_child_cnt; +static int w_free_cnt, w_spin_cnt, w_sleep_cnt; SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0, ""); -SYSCTL_INT(_debug_witness, OID_AUTO, child_free_cnt, CTLFLAG_RD, - &w_child_free_cnt, 0, ""); -SYSCTL_INT(_debug_witness, OID_AUTO, child_cnt, CTLFLAG_RD, &w_child_cnt, 0, - ""); -static struct witness w_data[WITNESS_COUNT]; -static struct witness_child_list_entry 
w_childdata[WITNESS_CHILDCOUNT]; +/* Statically allocated memory */ +static struct witness *w_data; +static u_int8_t w_rmatrix[WITNESS_COUNT+1][WITNESS_COUNT+1]; static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; +/* The witness hash */ +static struct witness_hash w_hash; +/* The lock order data hash */ +static struct witness_lock_order_data w_lodata[WITNESS_LOCK_ORDER_DATA_COUNT]; +static struct witness_lock_order_data *w_lofree = NULL; +static struct witness_lock_order_hash w_lohash; +static int w_max_used_index = 0; +static unsigned int w_generation = 0; + +struct witness_order_list_entry { + const char *w_name; + struct lock_class *w_class; +}; static struct witness_order_list_entry order_lists[] = { /* @@ -552,6 +687,10 @@ witness_initialize(void *dummy __unused) struct witness *w, *w1; int i; + MALLOC(w_data, struct witness *, + sizeof (struct witness) * WITNESS_COUNT, + M_WITNESS, M_NOWAIT | M_ZERO); + /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. @@ -562,13 +701,26 @@ witness_initialize(void *dummy __unused) CTR1(KTR_WITNESS, "%s: initializing witness", __func__); mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | MTX_NOWITNESS | MTX_NOPROFILE); - for (i = 0; i < WITNESS_COUNT; i++) - witness_free(&w_data[i]); - for (i = 0; i < WITNESS_CHILDCOUNT; i++) - witness_child_free(&w_childdata[i]); + for (i = WITNESS_COUNT - 1; i >= 0; i--) { + w = &w_data[i]; + memset(w, 0, sizeof(*w)); + w_data[i].w_index = i; /* Witness index never changes. */ + witness_free(w); + } + KASSERT(STAILQ_FIRST(&w_free)->w_index == 0, + ("%s: XXX!", __func__)); + /* Witness with index 0 is not used to aid in debugging. 
*/ + STAILQ_REMOVE_HEAD(&w_free, w_list); + w_free_cnt--; + + memset(w_rmatrix, 0, + (sizeof(**w_rmatrix) * (WITNESS_COUNT+1) * (WITNESS_COUNT+1))); + for (i = 0; i < LOCK_CHILDCOUNT; i++) witness_lock_list_free(&w_locklistdata[i]); + witness_init_hash_tables(); + /* First add in all the specified order lists. */ for (order = order_lists; order->w_name != NULL; order++) { w = enroll(order->w_name, order->w_class); @@ -622,39 +774,6 @@ sysctl_debug_witness_watch(SYSCTL_HANDLE return (0); } -static int -sysctl_debug_witness_graphs(SYSCTL_HANDLER_ARGS) -{ - struct witness *w; - struct sbuf *sb; - int error; - - KASSERT(witness_cold == 0, ("%s: witness is still cold", __func__)); - - sb = sbuf_new(NULL, NULL, WITNESS_SBUFSIZE, SBUF_FIXEDLEN); - if (sb == NULL) - return (ENOMEM); - - mtx_lock_spin(&w_mtx); - STAILQ_FOREACH(w, &w_all, w_list) - w->w_displayed = 0; - STAILQ_FOREACH(w, &w_all, w_list) - witness_addgraph(sb, w); - mtx_unlock_spin(&w_mtx); - - if (sbuf_overflowed(sb)) { - sbuf_delete(sb); - panic("%s: sbuf overflowed, bump the static buffer size\n", - __func__); - } - - sbuf_finish(sb); - error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); - sbuf_delete(sb); - - return (error); -} - void witness_init(struct lock_object *lock, const char *type) { @@ -674,7 +793,6 @@ witness_init(struct lock_object *lock, c (class->lc_flags & LC_UPGRADABLE) == 0) panic("%s: lock (%s) %s can not be upgradable", __func__, class->lc_name, lock->lo_name); - /* * If we shouldn't watch this lock, then just clear lo_witness. 
* Otherwise, if witness_cold is set, then it is too early to @@ -702,174 +820,144 @@ witness_destroy(struct lock_object *lock struct witness *w; class = LOCK_CLASS(lock); + if (witness_cold) panic("lock (%s) %s destroyed while witness_cold", class->lc_name, lock->lo_name); /* XXX: need to verify that no one holds the lock */ - if ((lock->lo_flags & LO_WITNESS) && lock->lo_witness != NULL) { - w = lock->lo_witness; - mtx_lock_spin(&w_mtx); - MPASS(w->w_refcount > 0); - w->w_refcount--; + if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL) + return; + w = lock->lo_witness; - if (w->w_refcount == 0) - depart(w); - mtx_unlock_spin(&w_mtx); - } + mtx_lock_spin(&w_mtx); + MPASS(w->w_refcount > 0); + w->w_refcount--; + + if (w->w_refcount == 0) + depart(w); + mtx_unlock_spin(&w_mtx); } #ifdef DDB static void -witness_levelall (void) +witness_ddb_compute_levels(void) { - struct witness_list *list; - struct witness *w, *w1; + struct witness *w; /* * First clear all levels. */ - STAILQ_FOREACH(w, &w_all, w_list) { - w->w_level = 0; - } + STAILQ_FOREACH(w, &w_all, w_list) + w->w_ddb_level = -1; /* - * Look for locks with no parent and level all their descendants. + * Look for locks with no parents and level all their descendants. */ STAILQ_FOREACH(w, &w_all, w_list) { - /* - * This is just an optimization, technically we could get - * away just walking the all list each time. - */ - if (w->w_class->lc_flags & LC_SLEEPLOCK) - list = &w_sleep; - else - list = &w_spin; - STAILQ_FOREACH(w1, list, w_typelist) { - if (isitmychild(w1, w)) - goto skip; - } - witness_leveldescendents(w, 0); - skip: - ; /* silence GCC 3.x */ + /* If the witness has ancestors (is not a root), skip it. 
*/ + if (w->w_num_ancestors > 0) + continue; + + witness_ddb_level_descendants(w, 0); } } static void -witness_leveldescendents(struct witness *parent, int level) +witness_ddb_level_descendants(struct witness *w, int level) { - struct witness_child_list_entry *wcl; int i; - if (parent->w_level < level) - parent->w_level = level; + if (w->w_ddb_level >= level) + return; + + w->w_ddb_level = level; level++; - for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) - for (i = 0; i < wcl->wcl_count; i++) - witness_leveldescendents(wcl->wcl_children[i], level); + + for (i = 1; i <= w_max_used_index; i++) { + if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) + witness_ddb_level_descendants(&w_data[i], level); + } } static void -witness_displaydescendants(void(*prnt)(const char *fmt, ...), - struct witness *parent, int indent) +witness_ddb_display_descendants(void(*prnt)(const char *fmt, ...), + struct witness *w, int indent) { - struct witness_child_list_entry *wcl; - int i, level; + int i; - level = parent->w_level; - prnt("%-2d", level); - for (i = 0; i < indent; i++) - prnt(" "); - if (parent->w_refcount > 0) - prnt("%s", parent->w_name); + for (i = 0; i < indent; i++) + prnt(" "); + prnt("%s (type: %s, depth: %d, active refs: %d)", + w->w_name, w->w_class->lc_name, + w->w_ddb_level, w->w_refcount); + if (w->w_displayed) { + prnt(" -- (already displayed)\n"); + return; + } + w->w_displayed = 1; + if (w->w_file != NULL && w->w_line != 0) + prnt(" -- last acquired @ %s:%d\n", w->w_file, + w->w_line); else - prnt("(dead)"); - if (parent->w_displayed) { - prnt(" -- (already displayed)\n"); - return; - } - parent->w_displayed = 1; - if (parent->w_refcount > 0) { - if (parent->w_file != NULL) - prnt(" -- last acquired @ %s:%d", parent->w_file, - parent->w_line); + prnt(" -- never acquired\n"); + indent++; + WITNESS_INDEX_ASSERT(w->w_index); + for (i = 1; i <= w_max_used_index; i++) { + if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) + 
witness_ddb_display_descendants(prnt, &w_data[i], + indent); } - prnt("\n"); - for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) - for (i = 0; i < wcl->wcl_count; i++) - witness_displaydescendants(prnt, - wcl->wcl_children[i], indent + 1); } static void -witness_display_list(void(*prnt)(const char *fmt, ...), +witness_ddb_display_list(void(*prnt)(const char *fmt, ...), struct witness_list *list) { struct witness *w; STAILQ_FOREACH(w, list, w_typelist) { - if (w->w_file == NULL || w->w_level > 0) + if (w->w_ddb_level > 0) continue; - /* - * This lock has no anscestors, display its descendants. - */ - witness_displaydescendants(prnt, w, 0); + /* This lock has no anscestors - display its descendants. */ + witness_ddb_display_descendants(prnt, w, 0); } } static void -witness_addgraph(struct sbuf *sb, struct witness *parent) -{ - struct witness_child_list_entry *wcl; - int i; - - if (parent->w_displayed != 0 || parent->w_refcount == 0 || - parent->w_file == NULL) - return; - - parent->w_displayed = 1; - for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) - for (i = 0; i < wcl->wcl_count; i++) { - sbuf_printf(sb, "\"%s\",\"%s\"\n", parent->w_name, - wcl->wcl_children[i]->w_name); - witness_addgraph(sb, wcl->wcl_children[i]); - } -} - -static void -witness_display(void(*prnt)(const char *fmt, ...)) +witness_ddb_display(void(*prnt)(const char *fmt, ...)) { struct witness *w; KASSERT(!witness_cold, ("%s: witness_cold", __func__)); - witness_levelall(); + witness_ddb_compute_levels(); /* Clear all the displayed flags. */ - STAILQ_FOREACH(w, &w_all, w_list) { + STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; - } /* * First, handle sleep locks which have been acquired at least * once. */ prnt("Sleep locks:\n"); - witness_display_list(prnt, &w_sleep); + witness_ddb_display_list(prnt, &w_sleep); /* * Now do spin locks which have been acquired at least once. 
*/ prnt("\nSpin locks:\n"); - witness_display_list(prnt, &w_spin); + witness_ddb_display_list(prnt, &w_spin); /* * Finally, any locks which have not been acquired yet. */ prnt("\nLocks which were never acquired:\n"); STAILQ_FOREACH(w, &w_all, w_list) { - if (w->w_file != NULL || w->w_refcount == 0) + if (w->w_file != NULL) continue; - prnt("%s\n", w->w_name); + prnt("%s (type: %s, depth: %d)\n", w->w_name, + w->w_class->lc_name, w->w_ddb_level); } } #endif /* DDB */ @@ -934,17 +1022,6 @@ witness_checkorder(struct lock_object *l panicstr != NULL) return; - /* - * Try locks do not block if they fail to acquire the lock, thus - * there is no danger of deadlocks or of switching while holding a - * spin lock if we acquire a lock via a try operation. This - * function shouldn't even be called for try locks, so panic if - * that happens. - */ - if (flags & LOP_TRYLOCK) - panic("%s should not be called for try lock operations", - __func__); - w = lock->lo_witness; class = LOCK_CLASS(lock); td = curthread; @@ -981,6 +1058,9 @@ witness_checkorder(struct lock_object *l return; lock_list = PCPU_PTR(spinlocks); } + /* Empty list? */ + if ((*lock_list)->ll_count == 0) + return; /* * Check to see if we are recursing on a lock we already own. If @@ -1007,41 +1087,49 @@ witness_checkorder(struct lock_object *l } return; } - + /* + * Try to perform most checks without a lock. If this succeeds we + * can skip acquiring the lock and return success. + */ + lock1 = &(*lock_list)->ll_children[(*lock_list)->ll_count - 1]; + w1 = lock1->li_lock->lo_witness; + if (witness_lock_order_check(w1, w)) + return; /* * Check for duplicate locks of the same type. Note that we only * have to check for this on the last lock we just acquired. Any * other cases will be caught as lock order violations. 
*/ - lock1 = &(*lock_list)->ll_children[(*lock_list)->ll_count - 1]; - w1 = lock1->li_lock->lo_witness; + mtx_lock_spin(&w_mtx); + witness_lock_order_add(w1, w); if (w1 == w) { - if (w->w_same_squawked || (lock->lo_flags & LO_DUPOK) || - (flags & LOP_DUPOK)) - return; - w->w_same_squawked = 1; - printf("acquiring duplicate lock of same type: \"%s\"\n", - w->w_name); - printf(" 1st %s @ %s:%d\n", lock1->li_lock->lo_name, - lock1->li_file, lock1->li_line); - printf(" 2nd %s @ %s:%d\n", lock->lo_name, file, line); + i = w->w_index; + if (!(lock->lo_flags & LO_DUPOK) && + !(w_rmatrix[i][i] & WITNESS_REVERSAL)) { + w_rmatrix[i][i] |= WITNESS_REVERSAL; + w->w_reversed = 1; + mtx_unlock_spin(&w_mtx); + printf("acquiring duplicate lock of same type: \"%s\"\n", + w->w_name); + printf(" 1st %s @ %s:%d\n", lock1->li_lock->lo_name, + lock1->li_file, lock1->li_line); + printf(" 2nd %s @ %s:%d\n", lock->lo_name, file, line); #ifdef KDB - goto debugger; -#else - return; + witness_debugger(1, __func__); #endif + } else + mtx_unlock_spin(&w_mtx); + return; } - MPASS(!mtx_owned(&w_mtx)); - mtx_lock_spin(&w_mtx); + MPASS(mtx_owned(&w_mtx)); /* * If we know that the the lock we are acquiring comes after * the lock we most recently acquired in the lock order tree, * then there is no need for any further checks. */ - if (isitmychild(w1, w)) { - mtx_unlock_spin(&w_mtx); - return; - } + if (isitmychild(w1, w)) + goto out; + for (j = 0, lle = *lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { @@ -1098,7 +1186,7 @@ witness_checkorder(struct lock_object *l * We have a lock order violation, check to see if it * is allowed or has already been yelled about. */ - mtx_unlock_spin(&w_mtx); + #ifdef BLESSING /* * If the lock order is blessed, just bail. We don't @@ -1106,19 +1194,19 @@ witness_checkorder(struct lock_object *l * may be a bug. 
*/ if (blessed(w, w1)) - return; + goto out; #endif - if (lock1->li_lock == &Giant.lock_object) { - if (w1->w_Giant_squawked) - return; - else - w1->w_Giant_squawked = 1; - } else { - if (w1->w_other_squawked) - return; - else - w1->w_other_squawked = 1; - } + /* Bail if this violation is known */ + if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL) + goto out; + + /* Record this as a violation */ + w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL; + w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL; + w->w_reversed = w1->w_reversed = 1; + witness_increment_graph_generation(); + mtx_unlock_spin(&w_mtx); + /* * Ok, yell about it. */ @@ -1166,10 +1254,9 @@ witness_checkorder(struct lock_object *l lock->lo_name, w->w_name, file, line); } #ifdef KDB - goto debugger; -#else - return; + witness_debugger(1, __func__); #endif + return; } } lock1 = &(*lock_list)->ll_children[(*lock_list)->ll_count - 1]; @@ -1187,17 +1274,11 @@ witness_checkorder(struct lock_object *l if (!itismychild(lock1->li_lock->lo_witness, w)) /* Witness is dead. 
*/ return; - } + } +out: mtx_unlock_spin(&w_mtx); - return; -#ifdef KDB -debugger: - if (witness_trace) - kdb_backtrace(); - if (witness_kdb) - kdb_enter(KDB_WHY_WITNESS, __func__); -#endif + return; } void @@ -1270,6 +1351,7 @@ witness_upgrade(struct lock_object *lock if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL) return; class = LOCK_CLASS(lock); + file = fixup_filename(file); if ((lock->lo_flags & LO_UPGRADABLE) == 0) panic("upgrade of non-upgradable lock (%s) %s @ %s:%d", @@ -1302,6 +1384,7 @@ witness_downgrade(struct lock_object *lo if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL) return; class = LOCK_CLASS(lock); + file = fixup_filename(file); if ((lock->lo_flags & LO_UPGRADABLE) == 0) panic("downgrade of non-upgradable lock (%s) %s @ %s:%d", @@ -1345,6 +1428,7 @@ witness_unlock(struct lock_object *lock, lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); + lle = *lock_list; for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) for (i = 0; i < (*lock_list)->ll_count; i++) { instance = &(*lock_list)->ll_children[i]; @@ -1393,8 +1477,10 @@ found: (*lock_list)->ll_count--; intr_restore(s); - /* If this lock list entry is now empty, free it. */ - if ((*lock_list)->ll_count == 0) { + /* + * If this lock list entry is not the first and is now empty, free it. + */ + if (*lock_list != lle && (*lock_list)->ll_count == 0) { lle = *lock_list; *lock_list = lle->ll_next; CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__, @@ -1403,6 +1489,19 @@ found: } } +void +witness_thread_exit(struct thread *td) +{ + struct lock_list_entry *lle; + + lle = td->td_sleeplocks; + if (lle == NULL) + return; + if (lle->ll_count != 0) + panic("Thread %p: lock list entry not empty", td); + witness_lock_list_free(lle); +} + /* * Warn if any locks other than 'lock' are held. Flags can be passed in to * exempt Giant and sleepable locks from the checks as well. 
If any @@ -1463,12 +1562,10 @@ witness_warn(int flags, struct lock_obje n += witness_list_locks(PCPU_PTR(spinlocks)); } if (flags & WARN_PANIC && n) - panic("witness_warn"); + panic("%s", __func__); #ifdef KDB - else if (witness_kdb && n) - kdb_enter(KDB_WHY_WITNESS, __func__); - else if (witness_trace && n) - kdb_backtrace(); + else + witness_debugger(n, __func__); #endif return (n); } @@ -1499,30 +1596,32 @@ static struct witness * enroll(const char *description, struct lock_class *lock_class) { struct witness *w; + struct witness_list *typelist; + + MPASS(description != NULL); if (witness_watch == 0 || panicstr != NULL) - return (NULL); - if ((lock_class->lc_flags & LC_SPINLOCK) && witness_skipspin) - return (NULL); + return NULL; + if ((lock_class->lc_flags & LC_SPINLOCK)) { + if (witness_skipspin) + return NULL; + else + typelist = &w_spin; + } else if ((lock_class->lc_flags & LC_SLEEPLOCK)) + typelist = &w_sleep; + else + panic("lock class %s is not sleep or spin", + lock_class->lc_name); + mtx_lock_spin(&w_mtx); - STAILQ_FOREACH(w, &w_all, w_list) { - if (w->w_name == description || (w->w_refcount > 0 && - strcmp(description, w->w_name) == 0)) { - w->w_refcount++; - mtx_unlock_spin(&w_mtx); - if (lock_class != w->w_class) - panic( - "lock (%s) %s does not match earlier (%s) lock", - description, lock_class->lc_name, - w->w_class->lc_name); - return (w); - } - } - if ((w = witness_get()) == NULL) { - printf("WITNESS: unable to allocate a new witness object\n"); - goto out; - } - w->w_name = description; + w = witness_hash_get(description); + if (w) + goto found; + + if ((w = witness_get()) == NULL) + return NULL; + MPASS(strlen(description) < MAX_W_NAME); + strcpy(w->w_name, description); w->w_class = lock_class; w->w_refcount = 1; STAILQ_INSERT_HEAD(&w_all, w, w_list); @@ -1532,36 +1631,30 @@ enroll(const char *description, struct l } else if (lock_class->lc_flags & LC_SLEEPLOCK) { STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); w_sleep_cnt++; - } else { - 
mtx_unlock_spin(&w_mtx); - panic("lock class %s is not sleep or spin", - lock_class->lc_name); } + /* Insert new witness into the hash */ + witness_hash_put(w); + witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); -out: - /* - * We issue a warning for any spin locks not defined in the static - * order list as a way to discourage their use (folks should really - * be using non-spin mutexes most of the time). However, several - * 3rd part device drivers use spin locks because that is all they - * have available on Windows and Linux and they think that normal - * mutexes are insufficient. - */ - if ((lock_class->lc_flags & LC_SPINLOCK) && witness_spin_warn) - printf("WITNESS: spin lock %s not in order list\n", - description); + return w; +found: + w->w_refcount++; + mtx_unlock_spin(&w_mtx); + if (lock_class != w->w_class) + panic( + "lock (%s) %s does not match earlier (%s) lock", + description, lock_class->lc_name, + w->w_class->lc_name); return (w); } -/* Don't let the door bang you on the way out... */ static void depart(struct witness *w) { - struct witness_child_list_entry *wcl, *nwcl; struct witness_list *list; - struct witness *parent; MPASS(w->w_refcount == 0); + if (w->w_class->lc_flags & LC_SLEEPLOCK) { list = &w_sleep; w_sleep_cnt--; @@ -1570,137 +1663,169 @@ depart(struct witness *w) w_spin_cnt--; } /* - * First, we run through the entire tree looking for any - * witnesses that the outgoing witness is a child of. For - * each parent that we find, we reparent all the direct - * children of the outgoing witness to its parent. - */ - STAILQ_FOREACH(parent, list, w_typelist) { - if (!isitmychild(parent, w)) - continue; - removechild(parent, w); - } - - /* - * Now we go through and free up the child list of the - * outgoing witness. - */ - for (wcl = w->w_children; wcl != NULL; wcl = nwcl) { - nwcl = wcl->wcl_next; - w_child_cnt--; - witness_child_free(wcl); - } - - /* - * Detach from various lists and free. 
+ * Set file to NULL as it may point into a loadable module. */ - STAILQ_REMOVE(list, w, witness, w_typelist); - STAILQ_REMOVE(&w_all, w, witness, w_list); - witness_free(w); + w->w_file = NULL; + w->w_line = 0; + witness_increment_graph_generation(); } -/* - * Add "child" as a direct child of "parent". Returns false if - * we fail due to out of memory. - */ + static int -insertchild(struct witness *parent, struct witness *child) +adopt(struct witness *parent, struct witness *child) { - struct witness_child_list_entry **wcl; + int pi, ci, i, j; - MPASS(child != NULL && parent != NULL); + MPASS(mtx_owned(&w_mtx) || witness_cold); + + /* If the relationship is already known, there's no work to be done. */ + if (isitmychild(parent, child)) + return 1; + + /* When the structure of the graph changes, bump up the generation. */ + witness_increment_graph_generation(); /* - * Insert "child" after "parent" + * The hard part ... create the direct relationship, then propagate all + * indirect relationships. + */ + pi = parent->w_index; + ci = child->w_index; + WITNESS_INDEX_ASSERT(pi); + WITNESS_INDEX_ASSERT(ci); + MPASS(pi != ci); + w_rmatrix[pi][ci] |= WITNESS_PARENT; + w_rmatrix[ci][pi] |= WITNESS_CHILD; + /* + * If parent was not already an ancestor of child, + * then we increment the descendant and ancestor counters. */ - wcl = &parent->w_children; - while (*wcl != NULL && (*wcl)->wcl_count == WITNESS_NCHILDREN) - wcl = &(*wcl)->wcl_next; - if (*wcl == NULL) { - *wcl = witness_child_get(); - if (*wcl == NULL) - return (0); - w_child_cnt++; + if (!(w_rmatrix[pi][ci] & WITNESS_ANCESTOR)) { + parent->w_num_descendants++; + child->w_num_ancestors++; } - (*wcl)->wcl_children[(*wcl)->wcl_count++] = child; + /* + * Find each ancestor of 'pi'. Note that 'pi' itself is counted as + * an ancestor of 'pi' during this loop. 
+ */ + for (i = 1; i <= w_max_used_index; i++) { + if (!(w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) && + (i != pi)) + continue; - return (1); -} + /* Find each descendant of 'i' and mark it as a descendant. */ + for (j = 1; j <= w_max_used_index; j++) { + /* + * Skip children that are already marked as + * descendants of 'i'. + */ + if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) + continue; + + /* + * We are only interested in descendants of 'ci'. Note + * that 'ci' itself is counted as a descendant of 'ci'. + */ + if (!(w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) && + (j != ci)) + continue; + w_rmatrix[i][j] |= WITNESS_ANCESTOR; + w_rmatrix[j][i] |= WITNESS_DESCENDANT; + w_data[i].w_num_descendants++; + w_data[j].w_num_ancestors++; + + /* + * Make sure we aren't marking a node as both an + * ancestor and descendant. We should have caught + * this as a lock order reversal earlier. + */ + if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && + (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { + printf("witness rmatrix paradox! [%d][%d]=%d " + "both ancestor and descendant\n", + i, j, w_rmatrix[i][j]); + kdb_backtrace(); + printf("Witness disabled.\n"); + witness_watch = 0; + } + if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) && + (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { + printf("witness rmatrix paradox! 
[%d][%d]=%d " + "both ancestor and descendant\n", + j, i, w_rmatrix[j][i]); + kdb_backtrace(); + printf("Witness disabled.\n"); + witness_watch = 0; + } + } + } + return 1; +} static int itismychild(struct witness *parent, struct witness *child) { - + MPASS(witness_cold || mtx_owned(&w_mtx)); MPASS(child != NULL && parent != NULL); - if ((parent->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) != - (child->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))) - panic( - "%s: parent (%s) and child (%s) are not the same lock type", - __func__, parent->w_class->lc_name, + if (!witness_lock_type_equal(parent, child)) { + if (mtx_owned(&w_mtx)) + mtx_unlock_spin(&w_mtx); + panic("%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " + "the same lock type", __func__, parent->w_name, + parent->w_class->lc_name, child->w_name, child->w_class->lc_name); - - return (insertchild(parent, child)); + } + return adopt(parent, child); } -static void -removechild(struct witness *parent, struct witness *child) +/* + * Generic code for the isitmy*() functions. The rmask parameter is the + * expected relationship of w1 to w2. 
+ */ +static int +_isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) { - struct witness_child_list_entry **wcl, *wcl1; - int i; + unsigned char r1, r2; + int i1, i2; - for (wcl = &parent->w_children; *wcl != NULL; wcl = &(*wcl)->wcl_next) - for (i = 0; i < (*wcl)->wcl_count; i++) - if ((*wcl)->wcl_children[i] == child) - goto found; - return; -found: - (*wcl)->wcl_count--; - if ((*wcl)->wcl_count > i) - (*wcl)->wcl_children[i] = - (*wcl)->wcl_children[(*wcl)->wcl_count]; - MPASS((*wcl)->wcl_children[i] != NULL); - if ((*wcl)->wcl_count != 0) - return; - wcl1 = *wcl; - *wcl = wcl1->wcl_next; - w_child_cnt--; - witness_child_free(wcl1); + i1 = w1->w_index; + i2 = w2->w_index; + WITNESS_INDEX_ASSERT(i1); + WITNESS_INDEX_ASSERT(i2); + r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK; + r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK; + /* The flags on one better be the inverse of the flags on the other */ + if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) || + (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) { + printf("%s: rmatrix mismatch between %s (index %d) and %s " + "(index %d): w_rmatrix[%d][%d] == %hhx but " + "w_rmatrix[%d][%d] == %hhx\n", + fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1, + i2, i1, r2); + kdb_backtrace(); + printf("Witness disabled.\n"); + witness_watch = 0; + } + return (r1 & rmask); } +/* + * Checks if @child is a direct child of @parent. + */ static int isitmychild(struct witness *parent, struct witness *child) { - struct witness_child_list_entry *wcl; - int i; - - for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) { - for (i = 0; i < wcl->wcl_count; i++) { - if (wcl->wcl_children[i] == child) - return (1); - } - } - return (0); + return _isitmyx(parent, child, WITNESS_PARENT, __func__); } +/* + * Checks if @descendant is a direct or indirect descendant of @ancestor. 
+ */ static int -isitmydescendant(struct witness *parent, struct witness *child) +isitmydescendant(struct witness *ancestor, struct witness *descendant) { - struct witness_child_list_entry *wcl; - int i, j; - - if (isitmychild(parent, child)) - return (1); - j = 0; - for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) { - MPASS(j < 1000); - for (i = 0; i < wcl->wcl_count; i++) { - if (isitmydescendant(wcl->wcl_children[i], child)) - return (1); - } - j++; - } - return (0); + return _isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__); } #ifdef BLESSING @@ -1729,6 +1854,9 @@ static struct witness * witness_get(void) { struct witness *w; + int index; + + MPASS(mtx_owned(&w_mtx) || witness_cold); if (witness_watch == 0) { mtx_unlock_spin(&w_mtx); @@ -1737,13 +1865,19 @@ witness_get(void) if (STAILQ_EMPTY(&w_free)) { witness_watch = 0; mtx_unlock_spin(&w_mtx); - printf("%s: witness exhausted\n", __func__); + printf("WITNESS: unable to allocate a new witness object\n"); return (NULL); } w = STAILQ_FIRST(&w_free); STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; + index = w->w_index; + MPASS(index > 0 && index == w_max_used_index+1 && + index < WITNESS_COUNT); bzero(w, sizeof(*w)); + w->w_index = index; + if (index > w_max_used_index) + w_max_used_index = index; return (w); } @@ -1755,37 +1889,6 @@ witness_free(struct witness *w) w_free_cnt++; } -static struct witness_child_list_entry * -witness_child_get(void) -{ - struct witness_child_list_entry *wcl; - - if (witness_watch == 0) { - mtx_unlock_spin(&w_mtx); - return (NULL); - } - wcl = w_child_free; - if (wcl == NULL) { - witness_watch = 0; - mtx_unlock_spin(&w_mtx); - printf("%s: witness exhausted\n", __func__); - return (NULL); - } - w_child_free = wcl->wcl_next; - w_child_free_cnt--; - bzero(wcl, sizeof(*wcl)); - return (wcl); -} - -static void -witness_child_free(struct witness_child_list_entry *wcl) -{ - - wcl->wcl_next = w_child_free; - w_child_free = wcl; - w_child_free_cnt++; -} - 
static struct lock_list_entry * witness_lock_list_get(void) { @@ -1966,9 +2069,11 @@ witness_assert(struct lock_object *lock, struct lock_instance *instance; struct lock_class *class; + class = LOCK_CLASS(lock); + if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL) return; - class = LOCK_CLASS(lock); + if ((class->lc_flags & LC_SLEEPLOCK) != 0) instance = find_instance(curthread->td_sleeplocks, lock); else if ((class->lc_flags & LC_SPINLOCK) != 0) @@ -2024,7 +2129,7 @@ witness_assert(struct lock_object *lock, #ifdef DDB static void -witness_list(struct thread *td) +witness_ddb_list(struct thread *td) { KASSERT(!witness_cold, ("%s: witness_cold", __func__)); @@ -2060,7 +2165,7 @@ DB_SHOW_COMMAND(locks, db_witness_list) td = db_lookup_thread(addr, TRUE); else td = kdb_thread; - witness_list(td); + witness_ddb_list(td); } DB_SHOW_COMMAND(alllocks, db_witness_list_all) @@ -2081,14 +2186,1126 @@ DB_SHOW_COMMAND(alllocks, db_witness_lis continue; db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, td->td_name, td, td->td_tid); - witness_list(td); + witness_ddb_list(td); } } } DB_SHOW_COMMAND(witness, db_witness_display) { + witness_ddb_display(db_printf); +} +#endif + +/* + * sysctl debug.witness.dotgraph and friends. + */ - witness_display(db_printf); +#if 1 //ndef INVARIANT_SUPPORT +#define WITNESS_SANE() (1) +#else +#define WITNESS_SANE() witness_sane() + +static int witness_sane(void); + +#define _LC_TYPEMASK (LC_SLEEPLOCK | LC_SPINLOCK) + +/* + * Check if a node in the witness graph got all of its pointers right. 
+ */ +static int +witness_sane() +{ + int i, j, error = 0; + unsigned char r1, r2; + struct witness *w1, *w2; + + mtx_assert(&w_mtx, MA_OWNED | MA_NOTRECURSED); + + for (i = 0; i <= WITNESS_COUNT; i++) { + for (j = 0; j <= WITNESS_COUNT; j++) { + if ((i == 0 || j == 0 || + i > w_max_used_index || j > w_max_used_index)) { + if (w_rmatrix[i][j] != WITNESS_UNRELATED) { + mtx_unlock_spin(&w_mtx); + printf("%s: rmatrix[%d][%d] is not 0!\n", + __func__, i, j); + error = 1; + goto out; + } + if (w_rmatrix[j][i] != WITNESS_UNRELATED) { + mtx_unlock_spin(&w_mtx); + printf("%s: rmatrix[%d][%d] is not 0!\n", + __func__, j, i); + error = 1; + goto out; + } + } + r1 = w_rmatrix[i][j] & WITNESS_RELATED_MASK; + r2 = w_rmatrix[j][i] & WITNESS_RELATED_MASK; + if ((WITNESS_ATOD(r1) != r2 && + WITNESS_DTOA(r1) != r2) || + (WITNESS_DTOA(r2) != r1 && + WITNESS_ATOD(r2) != r1)) { + mtx_unlock_spin(&w_mtx); + printf("%s: rmatrix[%d][%d] == %hhx and [%d][%d] == %hhx dont match!", + __func__, i, j, r1, j, i, r2); + error = 1; + goto out; + } + if (r1 == WITNESS_UNRELATED && r2 == WITNESS_UNRELATED) + continue; + + w1 = &w_data[i]; + w2 = &w_data[j]; + if ((w1->w_class->lc_flags & _LC_TYPEMASK) != + (w2->w_class->lc_flags & _LC_TYPEMASK)) { + mtx_unlock_spin(&w_mtx); + printf("%s: %s and %s have different lock classes!\n", + __func__, w1->w_name, w2->w_name); + error = 1; + goto out; + } + } + } + +out: + return !error; +} + +#undef _LC_TYPEMASK + +#endif /* INVARIANT_SUPPORT */ + +static void witness_dotty_fixname(const char *in, char *out, size_t len); +static void witness_fixup_string(const char *in, char *out, size_t len, + const char *before, size_t blen, const char *after, size_t alen); + +/* + * Iterates through the string "in". For every character in "in" that matches a + * character in the string "before", replace it with the corresponding character + * in the string "after" (i.e. before[1] -> after[1]). Store the result in the + * string "out". 
+ */ +static void +witness_fixup_string(const char *in, char *out, size_t len, const char *before, + size_t blen, const char *after, size_t alen) +{ + size_t size; + unsigned int i, j; + if (!in || !out) + return; + MPASS(blen == alen); + size = min(strlen(in), len-1); + memcpy(out, in, size); + for (i = 0; i < size; i++) { + for (j = 0; j < blen; j++) { + if (out[i] == before[j]) + out[i] = after[j]; + } + } + out[i] = '\0'; +} + +/* fix up the witness names so they're acceptable to dotty */ +static void +witness_dotty_fixname(const char *in, char *out, size_t len) +{ + const char before[] = "-./ >#"; + const char after[] = "______"; + + witness_fixup_string(in, out, len, before, strlen(before), after, + strlen(after)); +} + +static const char *w_notrunning = "Witness not running, witness_watch == 0\n"; +static const char *w_stillcold = "Witness is still cold\n"; + +#define BADSTACK_SBUF_SIZE (256 * WITNESS_COUNT) + +static int +sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) +{ + int error, generation, i, j; + struct witness *w1, *w2, *tmp_w1 = NULL, *tmp_w2 = NULL; + struct witness_lock_order_data *data1, *data2; + struct witness_lock_order_data *tmp_data1 = NULL, *tmp_data2 = NULL; + struct sbuf *sb; + unsigned int w_rmatrix1, w_rmatrix2; + + if (witness_watch == 0) { + error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); + return (error); + } + if (witness_cold) { + error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); + return (error); + } + + /* Allocate and init temporary storage space. 
*/ + tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK|M_ZERO); + tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK|M_ZERO); + tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, + M_WAITOK|M_ZERO); + tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, + M_WAITOK|M_ZERO); + stack_zero(&tmp_data1->wlod_stack); + stack_zero(&tmp_data2->wlod_stack); + error = 0; + sb = sbuf_new(NULL, NULL, BADSTACK_SBUF_SIZE, SBUF_AUTOEXTEND); + +restart: + mtx_lock_spin(&w_mtx); + generation = w_generation; + mtx_unlock_spin(&w_mtx); + sbuf_printf(sb, "Number of known direct relationships is %d\n", + w_lohash.wloh_count); + for (i = 1; i < w_max_used_index; i++) { + mtx_lock_spin(&w_mtx); + if (generation != w_generation) { + mtx_unlock_spin(&w_mtx); + /* The graph has changed, try again. */ + req->oldidx = 0; + sbuf_clear(sb); + goto restart; + } + + w1 = &w_data[i]; + if (!w1->w_badmalloc && !w1->w_reversed) { + mtx_unlock_spin(&w_mtx); + continue; + } + + /* Copy w1 locally so we can release the spin lock. */ + *tmp_w1 = *w1; + mtx_unlock_spin(&w_mtx); + + if (tmp_w1->w_badmalloc) { + sbuf_printf(sb, + "\nLock \"%s\"(%s) was held during malloc(M_WAITOK)\n", + tmp_w1->w_name, tmp_w1->w_class->lc_name); + } + if (!tmp_w1->w_reversed) + continue; + for (j = 1; j < w_max_used_index; j++) { + if (!(w_rmatrix[i][j] & WITNESS_REVERSAL) || i > j) + continue; + + mtx_lock_spin(&w_mtx); + if (generation != w_generation) { + mtx_unlock_spin(&w_mtx); + /* The graph has changed, try again. */ + req->oldidx = 0; + sbuf_clear(sb); + goto restart; + } + + w2 = &w_data[j]; + data1 = witness_lock_order_get(w1, w2); + data2 = witness_lock_order_get(w2, w1); + + /* + * Copy information locally so we can release the + * spin lock. 
+ */ + *tmp_w2 = *w2; + w_rmatrix1 = (unsigned int)w_rmatrix[i][j]; + w_rmatrix2 = (unsigned int)w_rmatrix[j][i]; + + if (data1) { + stack_zero(&tmp_data1->wlod_stack); + stack_copy(&data1->wlod_stack, + &tmp_data1->wlod_stack); + } + if (data2 && data2 != data1) { + stack_zero(&tmp_data2->wlod_stack); + stack_copy(&data2->wlod_stack, + &tmp_data2->wlod_stack); + } + mtx_unlock_spin(&w_mtx); + + sbuf_printf(sb, + "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", + tmp_w1->w_name, tmp_w1->w_class->lc_name, + tmp_w2->w_name, tmp_w2->w_class->lc_name); +#if 0 + sbuf_printf(sb, + "w_rmatrix[%s][%s] == %x, w_rmatrix[%s][%s] == %x\n", + tmp_w1->name, tmp_w2->w_name, w_rmatrix1, + tmp_w2->name, tmp_w1->w_name, w_rmatrix2); +#endif + if (data1) { + sbuf_printf(sb, + "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", + tmp_w1->w_name, tmp_w1->w_class->lc_name, + tmp_w2->w_name, tmp_w2->w_class->lc_name); + stack_sbuf_print(sb, &tmp_data1->wlod_stack); + sbuf_printf(sb, "\n"); + } + if (data2 && data2 != data1) { + sbuf_printf(sb, + "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", + tmp_w2->w_name, tmp_w2->w_class->lc_name, + tmp_w1->w_name, tmp_w1->w_class->lc_name); + stack_sbuf_print(sb, &tmp_data2->wlod_stack); + sbuf_printf(sb, "\n"); + } + } + } + mtx_lock_spin(&w_mtx); + if (generation != w_generation) { + mtx_unlock_spin(&w_mtx); + /* + * The graph changed while we were printing stack data, + * try again. + */ + req->oldidx = 0; + sbuf_clear(sb); + goto restart; + } + mtx_unlock_spin(&w_mtx); + + sbuf_finish(sb); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + + /* Free temporary storage space. */ + free(tmp_data1, M_TEMP); + free(tmp_data2, M_TEMP); + free(tmp_w1, M_TEMP); + free(tmp_w2, M_TEMP); + + return error; +} + +/* + * Witness display graph code. 
When a graph display is requested, the internal + * 2D array is converted into a more programmer-friendly graph representation, + * transformed according to the kind of graph requested (full graph / cycles + * only) and printed out using dot notation. + */ + +struct witness_dotnode { + LIST_ENTRY(witness_dotnode) wn_list; /* linked list of nodes. */ + const char *wn_name; + struct lock_class *wn_class; + const char *wn_file; + u_int32_t wn_line; + u_int32_t wn_index; + u_int32_t wn_refcount; + u_int32_t wn_badmalloc:1; + u_int32_t wn_reversed:1; + short wn_level; /* Depth from root along longest path */ + short wn_vgen; /* Visit generation */ + int wn_flags; + int wn_indegree; /* number of incoming edges. */ + int wn_outdegree; /* number of outgoing edges. */ + struct witness_dotnode **wn_inlist; /* Adjacent incoming edges. */ + struct witness_dotnode **wn_outlist; /* Adjacent outgoing edges. */ +}; + +#define WITNESS_NODE_BAD 0x01 +LIST_HEAD(witness_dotnode_list, witness_dotnode); + +static void witness_dotnode_level_descendants(struct witness_dotnode *node, + int level, int vgen); + +struct witness_dotgraph { + struct witness_lock_order_data wg_lodata[WITNESS_LOCK_ORDER_DATA_COUNT]; + struct witness_dotnode_list wg_nodes; + u_int8_t wg_matrix[WITNESS_COUNT+1][WITNESS_COUNT+1]; + int wg_generation; + int wg_nodecount; + int wg_max_used_index; + int wg_mcount; /* Count of allocated objects. */ + int wg_fcount; /* Count of freed objects. 
*/ + int wg_refcount; +}; + +static struct witness_dotgraph *w_last_full_graph = NULL; +static struct witness_dotgraph *w_last_cycle_graph = NULL; + +static struct witness_dotgraph *witness_fullgraph(void); +static struct witness_dotgraph *witness_cyclegraph(void); +//static void witness_cyclegraph(struct sbuf *sb); + +static void +witness_dotnode_free(struct witness_dotnode *node, struct malloc_type *type, int *fcount) +{ + int fc = 0; + + if (node == NULL) + return; + + if (node->wn_outlist) { + free(node->wn_outlist, type); + node->wn_outlist = NULL; + fc++; + } + if (node->wn_inlist) { + free(node->wn_inlist, type); + node->wn_inlist = NULL; + fc++; + } + free(node, type); + fc++; + if (fcount) + *fcount += fc; +} + +static struct witness_dotgraph * +witness_dotgraph_init(struct malloc_type *type, int *mcount) +{ + struct witness_dotgraph *graph; + + MPASS(!mtx_owned(&w_mtx)); + graph = malloc(sizeof(*graph), type, M_WAITOK|M_ZERO); + graph->wg_nodes.lh_first = NULL; + + if (mcount) + (*mcount)++; + return graph; +} + +static void +witness_dotgraph_destroy(struct witness_dotgraph *graph, struct malloc_type *type, int *fcount) +{ + struct witness_dotnode *node, *node2; + int fc = 0; + + if (!graph) + goto out; + node = LIST_FIRST(&graph->wg_nodes); + while (node != NULL) { + node2 = LIST_NEXT(node, wn_list); + witness_dotnode_free(node, type, &fc); + node = node2; + } + free(graph, type); + fc++; + if (fcount) + *fcount+= fc; +out: + return; +} + +static void +witness_dotgraph_print(struct sbuf *sb, const char *title, + struct witness_dotgraph *graph) +{ + struct witness_dotnode *node, *node2; + const char *color; + char buf[64], buf2[64]; + int i; + int bad_malloc = 0; + + MPASS(!mtx_owned(&w_mtx)); + + sbuf_printf(sb, "digraph \"%s\" {\n", title); + node = LIST_FIRST(&graph->wg_nodes); + while (node != NULL) { + witness_dotty_fixname(node->wn_name, buf, sizeof(buf)); + if (node->wn_reversed) + color = "red"; + else if (node->wn_badmalloc) { + color = "orange"; 
+ bad_malloc = 1; + } else if (node->wn_refcount == 0) + color = "gray"; + else + color = "green"; + sbuf_printf(sb, "\"%s\" [shape=record, color=\"%s\", label=\"%s|" + "{type=%s|refcount=%u|level=%hd}\"];\n", buf, color, + buf, node->wn_class->lc_name, + node->wn_refcount, node->wn_level); + if (node->wn_badmalloc) + sbuf_printf(sb, "%s -> _bad_malloc_ [color=orange];\n", + buf); + for (i = 0; i < node->wn_outdegree; i++) { + node2 = node->wn_outlist[i]; + witness_dotty_fixname(node2->wn_name, buf2, + sizeof(buf2)); + if (graph->wg_matrix[node->wn_index][node2->wn_index] & + WITNESS_REVERSAL) + color = "red"; + else + color = "black"; + sbuf_printf(sb, "\"%s\" -> \"%s\" [color=\"%s\"];\n", buf, + buf2, color); + } + node = LIST_NEXT(node, wn_list); + } + if (bad_malloc) + sbuf_printf(sb, "_bad_malloc_ [shape=record, color=orange, label=\"" + "malloc(M_WAITOK)\"];\n"); + sbuf_printf(sb, "}\n"); +} + +static void +witness_dotnode_level_descendants(struct witness_dotnode *node, int level, int vgen) +{ + int i; + + if (node->wn_vgen == vgen) + return; + + node->wn_vgen = vgen; + if (node->wn_level < level) + node->wn_level = level; + for (i = 0; i < node->wn_outdegree; i++) { + witness_dotnode_level_descendants(node->wn_outlist[i], level + 1, + vgen); + } + node->wn_vgen--; /* For detecting cycles. */ +} + +/* + * Helper function used by witness_fullgraph() and witness_cyclegraph() to + * generate the initial struct witness_dotgraph. 
+ */ +static struct witness_dotgraph * +witness_make_dotgraph(void) +{ + int i, n; + int mcount; /* count the number of mallocs */ + int fcount; /* count the number of frees */ + struct witness_dotnode **nodes = NULL; + struct witness_dotgraph *graph = NULL; + struct witness_dotnode *node; + + fcount = mcount = 0; + graph = witness_dotgraph_init(M_TEMP, &mcount); + + mcount++; + graph->wg_nodes.lh_first = NULL; + + mtx_lock_spin(&w_mtx); +restart: + graph->wg_generation = w_generation; + n = w_max_used_index; + mtx_unlock_spin(&w_mtx); + + if (!nodes) + mcount++; + + nodes = realloc(nodes, sizeof(struct witness_dotnode*) * (n + 1), M_TEMP, + M_WAITOK | M_ZERO); + for (i = 1; i <= n; i++) { + if (!nodes[i]) { + mcount++; + nodes[i] = malloc(sizeof(struct witness_dotnode), M_TEMP, + M_WAITOK|M_ZERO); + } + } + + mtx_lock_spin(&w_mtx); + /* + * If a new witness was added while we were allocating memory, we have + * to restart and allocate more. + */ + if (n != w_max_used_index || graph->wg_generation != w_generation) + goto restart; + for (i = 1; i <= n; i++) { + /* Copy the relevant fields into the struct witness_dotnode. */ + nodes[i]->wn_name = w_data[i].w_name; + nodes[i]->wn_index = w_data[i].w_index; + nodes[i]->wn_class = w_data[i].w_class; + nodes[i]->wn_file = w_data[i].w_file; + nodes[i]->wn_line = w_data[i].w_line; + nodes[i]->wn_refcount = w_data[i].w_refcount; + nodes[i]->wn_badmalloc = w_data[i].w_badmalloc; + nodes[i]->wn_reversed = w_data[i].w_reversed; + LIST_INSERT_HEAD(&graph->wg_nodes, nodes[i], wn_list); + graph->wg_nodecount++; + } + memcpy(graph->wg_lodata, w_lodata, sizeof(w_lodata)); + memcpy(graph->wg_matrix, w_rmatrix, sizeof(w_rmatrix)); + mtx_unlock_spin(&w_mtx); + + /* + * Ok, now we have a copy of the in-memory witness structures, and from + * them we can generate the dotty graph. + */ + + /* Examine all known edges, build sparse graph. 
*/ + for (i = 0; i < WITNESS_LOCK_ORDER_DATA_COUNT; i++) { + int from, to; + struct witness_dotnode *from_node, *to_node; + + if (witness_lock_order_key_empty(&graph->wg_lodata[i].wlod_key)) + continue; + + from = graph->wg_lodata[i].wlod_key.from; + to = graph->wg_lodata[i].wlod_key.to; + + WITNESS_INDEX_ASSERT(from); + WITNESS_INDEX_ASSERT(to); + + from_node = nodes[from]; + to_node = nodes[to]; + + /* Don't add the edge (X,X) if DUPOK was set */ + if (from == to && + !(graph->wg_matrix[from][to] & WITNESS_REVERSAL)) + continue; + + /* Don't add the edge (Giant, X) if X is sleepable. The correct + * order is always (X, Giant) in this case. */ + if (strcmp("Giant", from_node->wn_name) == 0 && + (to_node->wn_class->lc_flags & LO_SLEEPABLE)) + continue; + + /* Add the edge (from, to) to the graph. */ + from_node->wn_outdegree++; + if (!from_node->wn_outlist) + mcount++; + from_node->wn_outlist = realloc(from_node->wn_outlist, + from_node->wn_outdegree * sizeof(struct witness_dotnode*), + M_TEMP, M_WAITOK); + from_node->wn_outlist[from_node->wn_outdegree - 1] = to_node; + to_node->wn_indegree++; + if (!to_node->wn_inlist) + mcount++; + to_node->wn_inlist = realloc(to_node->wn_inlist, + to_node->wn_indegree * sizeof(struct witness_dotnode*), + M_TEMP, M_WAITOK); + to_node->wn_inlist[to_node->wn_indegree - 1] = from_node; + } + + /* Compute the level for all nodes */ + node = LIST_FIRST(&graph->wg_nodes); + while (node != NULL) { + while (node && node->wn_indegree != 0) { + node = LIST_NEXT(node, wn_list); + } + /* There are none left */ + if (node == NULL) + break; + witness_dotnode_level_descendants(node, 1, 1); + node = LIST_NEXT(node, wn_list); + } + + + fcount++; + free(nodes, M_TEMP); + + graph->wg_mcount = mcount; + graph->wg_fcount = fcount; + graph->wg_refcount = 1; + return graph; +} + +struct witness_dotgraph * +witness_fullgraph(void) +{ + int mcount, fcount; + struct witness_dotgraph *graph = NULL; + + mcount = fcount = 0; + mtx_lock_spin(&w_mtx); + if 
(w_last_full_graph != NULL) { + /* Return the last up-to-date full graph if there is one. */ + if (w_generation == w_last_full_graph->wg_generation) { + graph = w_last_full_graph; + atomic_add_int(&graph->wg_refcount, 1); + mtx_unlock_spin(&w_mtx); + goto out; + } else { + graph = w_last_full_graph; + w_last_full_graph = NULL; + mcount = graph->wg_mcount; + fcount = graph->wg_fcount; + atomic_add_int(&graph->wg_refcount, -1); + mtx_unlock_spin(&w_mtx); + if (graph->wg_refcount == 0) + witness_dotgraph_destroy(graph, M_TEMP, + &fcount); + MPASS(mcount == fcount); + mcount = fcount = 0; + graph = NULL; + } + } else + mtx_unlock_spin(&w_mtx); + + /* Have to make a new graph. */ + graph = witness_make_dotgraph(); + + /* Store this graph if it's up to date ... */ + mtx_lock_spin(&w_mtx); + if (w_generation != graph->wg_generation) { + mtx_unlock_spin(&w_mtx); + goto out; + } + + /* .. and there isn't one stored already. */ + if (w_last_full_graph != NULL) { + mtx_unlock_spin(&w_mtx); + goto out; + } + + w_last_full_graph = graph; + atomic_add_int(&graph->wg_refcount, 1); + mtx_unlock_spin(&w_mtx); + +out: + return graph; +} + +struct witness_dotgraph * +witness_cyclegraph(void) +{ + int i, n, mcount, fcount; + struct witness_dotgraph *graph = NULL; + struct witness_dotnode_list doomed = LIST_HEAD_INITIALIZER(doomed); + struct witness_dotnode_list gone = LIST_HEAD_INITIALIZER(gone); + struct witness_dotnode *node, *node2; + + mcount = fcount = 0; + mtx_lock_spin(&w_mtx); + if (w_last_cycle_graph != NULL) { + /* Return the last up-to-date cycle graph if there is one. 
*/ + if (w_generation == w_last_cycle_graph->wg_generation) { + graph = w_last_cycle_graph; + atomic_add_int(&graph->wg_refcount, 1); + mtx_unlock_spin(&w_mtx); + goto out; + } else { + graph = w_last_cycle_graph; + w_last_cycle_graph = NULL; + mcount = graph->wg_mcount; + fcount = graph->wg_fcount; + atomic_add_int(&graph->wg_refcount, -1); + mtx_unlock_spin(&w_mtx); + if (graph->wg_refcount == 0) + witness_dotgraph_destroy(graph, M_TEMP, + &fcount); + MPASS(mcount == fcount); + mcount = fcount = 0; + graph = NULL; + } + } else + mtx_unlock_spin(&w_mtx); + + /* Have to make a new graph. */ + graph = witness_make_dotgraph(); + + /* TODO: Reduce the graph to its' cycles */ + + /* + * Step 1: Build initial list of "doomed" nodes with in-degree or + * out-degree of 0. These are definitely not in the cycle graph. + */ + node = LIST_FIRST(&graph->wg_nodes); + while (node != NULL) { + while (node && node->wn_indegree != 0 && + node->wn_outdegree != 0) { + node = LIST_NEXT(node, wn_list); + } + /* There are none left */ + if (node == NULL) + break; + /* + * Found one w/ in-degree 0 and/or out-degree 0, + * move to doomed list. + */ + node2 = LIST_NEXT(node, wn_list); + LIST_REMOVE(node, wn_list); + LIST_INSERT_HEAD(&doomed, node, wn_list); + node = node2; + } + + /* + * Step 2: Remove nodes of in-degree or out-degree 0 until we no longer + * can. As we remove these doomed nodes, new nodes may become doomed if + * their in-degree or out-degree becomes 0. + */ + while ((node = LIST_FIRST(&doomed)) != NULL) { + struct witness_dotnode *to, *from; + int k; + + /* The out-degree and/or the in-degree better be 0 */ + MPASS(node->wn_outdegree == 0 || node->wn_indegree == 0); + /* Found a node that can be removed, change lists ... */ + LIST_REMOVE(node, wn_list); + LIST_INSERT_HEAD(&gone, node, wn_list); + graph->wg_nodecount--; + /* ... 
and remove the edges */ + if (node->wn_outdegree == 0) + goto next; + + MPASS(node->wn_indegree == 0); + for (i = 0, n = node->wn_outdegree; i < n; i++) { + to = node->wn_outlist[i]; + for (k = 0; k < to->wn_indegree; k++) { + if (to->wn_inlist[k] != node) + continue; + to->wn_inlist[k] = + to->wn_inlist[to->wn_indegree-1]; + to->wn_inlist[to->wn_indegree-1] = NULL; + } + to->wn_indegree--; + node->wn_outdegree--; + if (to->wn_indegree == 0) { + if (to->wn_inlist) { + graph->wg_fcount++; + free(to->wn_inlist, M_TEMP); + to->wn_inlist = NULL; + } + /* Move it to the doomed list */ + if (!(node->wn_flags & WITNESS_NODE_BAD)) { + LIST_REMOVE(to, wn_list); + LIST_INSERT_HEAD(&doomed, to, wn_list); + } + } + } + if (node->wn_outlist) { + graph->wg_fcount++; + free(node->wn_outlist, M_TEMP); + node->wn_outlist = NULL; + } + + next: + if (node->wn_indegree == 0) + continue; + + for (i = 0, n = node->wn_indegree; i < n; i++) { + from = node->wn_inlist[i]; + for (k = 0; k < from->wn_outdegree; k++) { + if (from->wn_outlist[k] != node) + continue; + from->wn_outlist[k] = + from->wn_outlist[from->wn_outdegree - 1]; + from->wn_outlist[from->wn_outdegree - 1] = NULL; + } + from->wn_outdegree--; + node->wn_indegree--; + if (from->wn_outdegree == 0) { + if (from->wn_outlist) { + graph->wg_fcount++; + free(from->wn_outlist, M_TEMP); + from->wn_outlist = NULL; + } + /* Move it to the doomed list */ + if (!(node->wn_flags & WITNESS_NODE_BAD)) { + LIST_REMOVE(from, wn_list); + LIST_INSERT_HEAD(&doomed, from, + wn_list); + } + } + } + if (node->wn_inlist) { + graph->wg_fcount++; + free(node->wn_inlist, M_TEMP); + node->wn_inlist = NULL; + } + } + + /* At this point, all nodes should be on the gone or graph lists */ + MPASS(LIST_EMPTY(&doomed)); + + /* + * Free all the nodes on the gone list, they are no longer a part + * of the graph. 
+ */ + node = LIST_FIRST(&gone); + while (node != NULL) { + MPASS(node->wn_indegree == 0 && node->wn_inlist == NULL); + MPASS(node->wn_outdegree == 0 && node->wn_outlist == NULL); + node2 = LIST_NEXT(node, wn_list); + graph->wg_fcount++; + free(node, M_TEMP); + node = node2; + } + + /* Store this graph if it's up to date ... */ + mtx_lock_spin(&w_mtx); + if (w_generation != graph->wg_generation) { + mtx_unlock_spin(&w_mtx); + goto out; + } + + /* .. and there isn't one stored already. */ + if (w_last_cycle_graph != NULL) { + mtx_unlock_spin(&w_mtx); + goto out; + } + + w_last_cycle_graph = graph; + atomic_add_int(&graph->wg_refcount, 1); + mtx_unlock_spin(&w_mtx); + +out: + return graph; +} + +static int +sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS) +{ + struct witness_dotgraph *wg; + struct sbuf *sb; + int error; + + if (witness_watch == 0) { + error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); + return (error); + } + if (witness_cold) { + error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); + return (error); + } + sb = sbuf_new(NULL, NULL, 8 * 1024, SBUF_AUTOEXTEND); + wg = witness_fullgraph(); + witness_dotgraph_print(sb, "WITNESS graph", wg); + + atomic_add_int(&wg->wg_refcount, -1); + if (wg->wg_refcount == 0) { + int mcount, fcount; + mcount = wg->wg_mcount; + fcount = wg->wg_fcount; + + witness_dotgraph_destroy(wg, M_TEMP, &fcount); + MPASS(mcount == fcount); + } + + sbuf_finish(sb); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + + return error; +} + + +static int +sysctl_debug_witness_cyclegraph(SYSCTL_HANDLER_ARGS) +{ + struct witness_dotgraph *wg; + struct sbuf *sb; + int error; + + if (witness_watch == 0) { + error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); + return (error); + } + if (witness_cold) { + error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); + return (error); + } + + sb = sbuf_new(NULL, NULL, 8 * 1024, SBUF_AUTOEXTEND); + wg = witness_cyclegraph(); + 
witness_dotgraph_print(sb, "WITNESS cycle graph", wg); + + atomic_add_int(&wg->wg_refcount, -1); + if (wg->wg_refcount == 0) { + int mcount, fcount; + mcount = wg->wg_mcount; + fcount = wg->wg_fcount; + + witness_dotgraph_destroy(wg, M_TEMP, &fcount); + MPASS(mcount == fcount); + } + + sbuf_finish(sb); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + + return error; +} + +/******************* + * HASH TABLE CODE * + *******************/ + +/* + * A simple hash function. Takes a key pointer and a key size. If size == 0, + * interprets the key as a string and reads until the null + * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit + * hash value computed from the key. + */ +static u_int32_t +witness_hash_djb2(const u_int8_t *key, u_int32_t size) +{ + unsigned int hash = 5381; + int i; + + /* hash = hash * 33 + key[i] */ + if (size) + for (i = 0; i < size; i++) + hash = ((hash << 5) + hash) + (unsigned int)key[i]; + else + for (i = 0; key[i] != 0; i++) + hash = ((hash << 5) + hash) + (unsigned int)key[i]; + + return hash; +} + + +/* + * Initializes the two witness hash tables. Called exactly once from + * witness_initialize(). + */ +static void +witness_init_hash_tables(void) +{ + int i; + + MPASS(witness_cold); + /* Init the hash tables. First the witness hash ... */ + for (i = 0; i < WITNESS_HASH_SIZE; i++) + w_hash.wh_array[i] = NULL; + + w_hash.wh_size = WITNESS_HASH_SIZE; + w_hash.wh_count = 0; + + /* ... 
then the lock order data hash */ + w_lofree = NULL; + for (i = 0; i < WITNESS_LOCK_ORDER_DATA_COUNT; i++) { + memset(&w_lodata[i], 0, sizeof(w_lodata[i])); + w_lodata[i].wlod_next = w_lofree; + w_lofree = &w_lodata[i]; + } + w_lohash.wloh_size = WITNESS_LOCK_ORDER_HASH_SIZE; + w_lohash.wloh_count = 0; + for (i = 0; i < WITNESS_LOCK_ORDER_HASH_SIZE; i++) + w_lohash.wloh_array[i] = NULL; +} + +static struct witness * +witness_hash_get(const char *key) +{ + struct witness *w; + u_int32_t hash; + + MPASS(mtx_owned(&w_mtx) || witness_cold); + MPASS(key != NULL); + + hash = witness_hash_djb2(key, 0) % w_hash.wh_size; + w = w_hash.wh_array[hash]; + while (w != NULL) { + if (strcmp(w->w_name, key) == 0) + goto out; + w = w->w_hash_next; + } + +out: + return w; +} + +static void +witness_hash_put(struct witness *w) +{ + u_int32_t hash; + + MPASS(mtx_owned(&w_mtx) || witness_cold); + MPASS(w != NULL); + MPASS(w->w_name != NULL); + KASSERT(witness_hash_get(w->w_name) == NULL, + ("%s: trying to add a hash entry that already exists!", __func__)); + KASSERT(w->w_hash_next == NULL, ("%s: w->w_hash_next != NULL", __func__)); + + hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size; + + w->w_hash_next = w_hash.wh_array[hash]; + w_hash.wh_array[hash] = w; + w_hash.wh_count++; +} + + +static struct witness_lock_order_data * +witness_lock_order_get(struct witness *parent, struct witness *child) +{ + struct witness_lock_order_data *data = NULL; + struct witness_lock_order_key key; + unsigned int hash; + + MPASS(parent != NULL && child != NULL); + key.from = parent->w_index; + key.to = child->w_index; + WITNESS_INDEX_ASSERT(key.from); + WITNESS_INDEX_ASSERT(key.to); + if (!(w_rmatrix[parent->w_index][child->w_index] + & WITNESS_LOCK_ORDER_KNOWN)) + goto out; + + hash = witness_hash_djb2((const char*)&key, + sizeof(key)) % w_lohash.wloh_size; + data = w_lohash.wloh_array[hash]; + while (data != NULL) { + if (witness_lock_order_key_equal(&data->wlod_key, &key)) + break; + data = 
data->wlod_next;
+	}
+
+out:
+	return data;
+}
+
+/*
+ * Verify that parent and child have a known relationship, are not the same,
+ * and child is actually a child of parent. This is done without w_mtx
+ * to avoid contention in the common case.
+ */
+static int
+witness_lock_order_check(struct witness *parent, struct witness *child)
+{
+
+	if (parent != child &&
+	    w_rmatrix[parent->w_index][child->w_index]
+	    & WITNESS_LOCK_ORDER_KNOWN &&
+	    isitmychild(parent, child))
+		return 1;
+
+	return (0);
+}
+
+static int
+witness_lock_order_add(struct witness *parent, struct witness *child)
+{
+	struct witness_lock_order_data *data = NULL;
+	struct witness_lock_order_key key;
+	unsigned int hash;
+
+	MPASS(parent != NULL && child != NULL);
+	key.from = parent->w_index;
+	key.to = child->w_index;
+	WITNESS_INDEX_ASSERT(key.from);
+	WITNESS_INDEX_ASSERT(key.to);
+	if (w_rmatrix[parent->w_index][child->w_index]
+	    & WITNESS_LOCK_ORDER_KNOWN)
+		return 1;
+
+	hash = witness_hash_djb2((const char*)&key,
+	    sizeof(key)) % w_lohash.wloh_size;
+	w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
+	data = w_lofree;
+	if (data == NULL)
+		return 0;
+	w_lofree = data->wlod_next;
+	data->wlod_next = w_lohash.wloh_array[hash];
+	data->wlod_key = key;
+	w_lohash.wloh_array[hash] = data;
+	w_lohash.wloh_count++;
+	stack_zero(&data->wlod_stack);
+	stack_save(&data->wlod_stack);
+	return 1;
+}
+
+#ifdef KDB
+static void
+witness_debugger(int cond, const char *msg)
+{
+	if (witness_trace && cond)
+		kdb_backtrace();
+	if (witness_kdb && cond)
+		kdb_enter(KDB_WHY_WITNESS, msg);
 }
 #endif
+
+/* Call this whenever the structure of the witness graph changes. */
+static void
+witness_increment_graph_generation(void)
+{
+	MPASS(mtx_owned(&w_mtx) || witness_cold);
+	w_generation++;
+}
+