/*
 * The fget_unlocked function below is used in the FreeBSD kernel, see
 * http://code.metager.de/source/xref/freebsd/sys/kern/kern_descrip.c#2326
 *
 * In the version present here, several compiler hints for branch prediction
 * were added.
 *
 * gcc5 takes these annotations and generates code without jumps for the
 * common case.
 *
 * clang generates a maze of jumps which is smaller than gcc5's code, but also
 * ~10% slower in my microbenchmarks.
 *
 * FreeBSD clang version 3.6.1 (tags/RELEASE_361/final 237755) 20150525
 * Target: x86_64-unknown-freebsd11.0
 * Thread model: posix
 *
 * gcc version 5.1.0 (FreeBSD Ports Collection)
 *
 * Compiled as:
 * clang -o fget-clang -O2 -std=gnu99 fget_unlocked.c fget_unlocked_funcs.c
 * gcc -o fget-gcc -O2 -std=gnu99 fget_unlocked.c fget_unlocked_funcs.c
 *
 * The fget_unlocked_funcs.c file is provided because clang kept optimizing
 * out the 2 functions to which calls have to be generated.
 */

#include <sys/types.h>

#include <signal.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define	__predict_true(exp)	__builtin_expect((exp), 1)
#define	__predict_false(exp)	__builtin_expect((exp), 0)

#define	__compiler_membar()	__asm __volatile(" " : : : "memory")
#define	cpu_spinwait()		__asm __volatile("pause")

#define	EBADF	9

struct file {
	volatile unsigned int f_count;
};

struct filedescent {
	struct file	*fde_file;
	unsigned int	 fde_seq;
};

struct fdescenttbl {
	int	fdt_nfiles;
	struct	filedescent fdt_ofiles[0];
};

struct filedesc {
	struct fdescenttbl *fd_files;
};

#define	fd_seq(fdt, fd)	(&(fdt)->fdt_ofiles[(fd)].fde_seq)

static inline unsigned int
atomic_load_acq_int(volatile unsigned int *vp)
{
	unsigned int v = *vp;

	__compiler_membar();
	return (v);
}

static __inline int
atomic_cmpset_acq_int(volatile u_int *dst, u_int expect, u_int src)
{
	u_char res;

	__asm __volatile(
	"	lock ;			"
	"	cmpxchgl %3,%1 ;	"
	"	sete	%0 ;		"
	"# atomic_cmpset_acq_int"
	: "=q" (res),			/* 0 */
	  "+m" (*dst),			/* 1 */
	  "+a" (expect)			/* 2 */
	: "r" (src)			/* 3 */
	: "memory", "cc");
	return (res);
}

static __inline u_int
atomic_fetchadd_int(volatile u_int *p, u_int v)
{

	__asm __volatile(
	"	lock ;			"
	"	xaddl	%0,%1 ;		"
	"# atomic_fetchadd_int"
	: "+r" (v),			/* 0 */
	  "+m" (*p)			/* 1 */
	: : "cc");
	return (v);
}

static __inline int
refcount_release(volatile u_int *count)
{
	u_int old;

	old = atomic_fetchadd_int(count, -1);
	return (old == 1);
}

int	cap_check(void *, void *);
int	_fdrop(void *, void *);

/*
 * clang kept optimizing these out, so the definitions actually used live in
 * fget_unlocked_funcs.c.
 */
#if 0
int __attribute__ ((noinline))
cap_check(void *ptr, void *ptr2)
{

	return (0);
}

static int __attribute__ ((noinline))
_fdrop(struct file *fp, void *td)
{

	return (0);
}
#endif

static __inline int
fdrop(struct file *fp, void *td)
{

	if (__predict_false(refcount_release(&(fp)->f_count)))
		return (_fdrop(fp, td));
	return (0);
}

static inline unsigned int
seq_read(unsigned int *seqp)
{
	unsigned int ret;

	for (;;) {
		ret = atomic_load_acq_int(seqp);
		if (__predict_false(ret & 1)) {
			cpu_spinwait();
			continue;
		}
		break;
	}

	return (ret);
}

static inline unsigned int
seq_consistent_nomb(unsigned int *seqp, unsigned int oldseq)
{

	return (__predict_true(*seqp == oldseq));
}

static __inline unsigned int
seq_consistent(unsigned int *seqp, unsigned int oldseq)
{

	__compiler_membar();
	return (seq_consistent_nomb(seqp, oldseq));
}
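
/*
 * Illustration only, not part of the original benchmark: a minimal sketch of
 * the writer side of the sequence counter that seq_read()/seq_consistent()
 * above inspect.  The function names here are invented for this example; in
 * the kernel the counter (fde_seq) would be bumped by whatever code installs
 * or replaces a descriptor.  The writer makes the counter odd before
 * modifying the entry and even again afterwards, which is why seq_read()
 * spins while the value is odd and why fget_unlocked() re-checks the counter
 * after taking a reference.  On x86 a compiler barrier suffices for a single
 * writer; other architectures would need real store barriers.
 */
#if 0
static inline void
example_seq_write_begin(unsigned int *seqp)
{

	(*seqp)++;		/* odd: modification in progress, readers spin */
	__compiler_membar();
}

static inline void
example_seq_write_end(unsigned int *seqp)
{

	__compiler_membar();
	(*seqp)++;		/* even again: readers may trust the entry */
}
#endif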

static int __attribute__ ((noinline))
fget_unlocked(struct filedesc *fdp, int fd, void *needrightsp,
    struct file **fpp, unsigned int *seqp)
{
	struct filedescent *fde;
	struct fdescenttbl *fdt;
	struct file *fp;
	unsigned int count;
	unsigned int seq;
	int error;

	fdt = fdp->fd_files;
	if (__predict_false((unsigned int)fd >= fdt->fdt_nfiles))
		return (EBADF);
	/*
	 * Fetch the descriptor locklessly. We avoid fdrop() races by
	 * never raising a refcount above 0. To accomplish this we have
	 * to use a cmpset loop rather than an atomic_add. The descriptor
	 * must be re-verified once we acquire a reference to be certain
	 * that the identity is still correct and we did not lose a race
	 * due to preemption.
	 */
	for (;;) {
		seq = seq_read(fd_seq(fdt, fd));
		fde = &fdt->fdt_ofiles[fd];
		fp = fde->fde_file;
		if (__predict_false(fp == NULL))
			return (EBADF);
		error = cap_check(fde, needrightsp);
		if (__predict_false(error != 0)) {
			if (seq_consistent(fd_seq(fdt, fd), seq))
				return (error);
			continue;
		}
	retry:
		count = fp->f_count;
		if (__predict_false(count == 0)) {
			/*
			 * Force a reload. Another thread could reallocate the
			 * table before this fd was closed, so it is possible
			 * that there is a stale fp pointer in the cached
			 * version.
			 */
			fdt = *(struct fdescenttbl * const volatile *)&(fdp->fd_files);
			continue;
		}
		/*
		 * Use an acquire barrier to force re-reading of fdt so it is
		 * refreshed for verification.
		 */
		if (__predict_false(atomic_cmpset_acq_int(&fp->f_count, count,
		    count + 1) == 0))
			goto retry;
		fdt = fdp->fd_files;
		if (seq_consistent_nomb(fd_seq(fdt, fd), seq))
			break;
		fdrop(fp, NULL);
	}
	*fpp = fp;
	if (seqp != NULL) {
		*seqp = seq;
	}
	return (0);
}

static volatile sig_atomic_t do_test = 1;

static void
sigalrm(int signo)
{

	do_test = 0;
}

static unsigned long
test(void)
{
	struct filedesc fdp;
	struct file *fp;
	int rights;
	unsigned long i;

	/*
	 * calloc instead of malloc so that every fde_file starts out NULL
	 * rather than as a garbage pointer read from uninitialized memory.
	 */
	fdp.fd_files = calloc(1, offsetof(struct fdescenttbl, fdt_ofiles) +
	    sizeof(struct filedescent) * 42);
	fdp.fd_files->fdt_nfiles = 42;

	signal(SIGALRM, sigalrm);
	alarm(10);

	for (i = 0; do_test; i++)
		fget_unlocked(&fdp, 0, &rights, &fp, NULL);
	return (i);
}

int
main(void)
{

	printf("%lu\n", test());
	return (0);
}
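
/*
 * Not reproduced above: a minimal sketch of what fget_unlocked_funcs.c
 * presumably contains, based on the #if 0 block near the top of this file.
 * The two functions are defined in a separate translation unit so that clang
 * cannot optimize the calls to them away.
 */
#if 0
int
cap_check(void *ptr, void *ptr2)
{

	return (0);
}

int
_fdrop(void *fp, void *td)
{

	return (0);
}
#endif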