Index: sys/systm.h =================================================================== RCS file: /usr/repo/src/sys/sys/systm.h,v retrieving revision 1.234 diff -u -p -r1.234 systm.h --- sys/systm.h 28 Apr 2005 03:19:50 -0000 1.234 +++ sys/systm.h 19 Aug 2005 21:40:07 -0000 @@ -330,4 +330,56 @@ int alloc_unr(struct unrhdr *uh); int alloc_unrl(struct unrhdr *uh); void free_unr(struct unrhdr *uh, u_int item); +/* + * This is about as magic as it gets. fortune(1) has got similar code + * for reversing bits in a word. Who thinks up this stuff?? + * + * Yes, it does appear to be consistently faster than: + * while (i = ffs(m)) { + * m >>= i; + * bits++; + * } + * and + * while (lsb = (m & -m)) { // This is magic too + * m &= ~lsb; // or: m ^= lsb + * bits++; + * } + * Both of these latter forms do some very strange things on gcc-3.1 with + * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2. + * There is probably an SSE or MMX popcnt instruction. + * + * I wonder if this should be in libkern? + * + * XXX Stop the presses! Another one: + * static __inline u_int32_t + * popcnt1(u_int32_t v) + * { + * v -= ((v >> 1) & 0x55555555); + * v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + * v = (v + (v >> 4)) & 0x0F0F0F0F; + * return (v * 0x01010101) >> 24; + * } + * The downside is that it has a multiply. With a pentium3 with + * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use + * an imull, and in that case it is faster. In most other cases + * it appears slightly slower. + * + * Another variant (also from fortune): + * #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255) + * #define BX_(x) ((x) - (((x)>>1)&0x77777777) \ + * - (((x)>>2)&0x33333333) \ + * - (((x)>>3)&0x11111111)) + */ +static __inline uint32_t +bitcount(uint32_t x) +{ + + x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1); + x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2); + x = (x & 0x0f0f0f0f) + ((x & 0xf0f0f0f0) >> 4); + x = (x & 0x00ff00ff) + ((x & 0xff00ff00) >> 8); + x = (x & 0x0000ffff) + ((x & 0xffff0000) >> 16); + return (x); +} + #endif /* !_SYS_SYSTM_H_ */ Index: i386/i386/mp_machdep.c =================================================================== RCS file: /usr/repo/src/sys/i386/i386/mp_machdep.c,v retrieving revision 1.252 diff -u -p -r1.252 mp_machdep.c --- i386/i386/mp_machdep.c 29 Jun 2005 23:23:16 -0000 1.252 +++ i386/i386/mp_machdep.c 19 Aug 2005 21:40:49 -0000 @@ -1008,58 +1008,6 @@ smp_tlb_shootdown(u_int vector, vm_offse ia32_pause(); } -/* - * This is about as magic as it gets. fortune(1) has got similar code - * for reversing bits in a word. Who thinks up this stuff?? - * - * Yes, it does appear to be consistently faster than: - * while (i = ffs(m)) { - * m >>= i; - * bits++; - * } - * and - * while (lsb = (m & -m)) { // This is magic too - * m &= ~lsb; // or: m ^= lsb - * bits++; - * } - * Both of these latter forms do some very strange things on gcc-3.1 with - * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2. - * There is probably an SSE or MMX popcnt instruction. - * - * I wonder if this should be in libkern? - * - * XXX Stop the presses! Another one: - * static __inline u_int32_t - * popcnt1(u_int32_t v) - * { - * v -= ((v >> 1) & 0x55555555); - * v = (v & 0x33333333) + ((v >> 2) & 0x33333333); - * v = (v + (v >> 4)) & 0x0F0F0F0F; - * return (v * 0x01010101) >> 24; - * } - * The downside is that it has a multiply. With a pentium3 with - * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use - * an imull, and in that case it is faster. In most other cases - * it appears slightly slower. - * - * Another variant (also from fortune): - * #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255) - * #define BX_(x) ((x) - (((x)>>1)&0x77777777) \ - * - (((x)>>2)&0x33333333) \ - * - (((x)>>3)&0x11111111)) - */ -static __inline u_int32_t -popcnt(u_int32_t m) -{ - - m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1); - m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2); - m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4); - m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8); - m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16); - return m; -} - static void smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) { @@ -1074,7 +1022,7 @@ smp_targeted_tlb_shootdown(u_int mask, u mask &= ~PCPU_GET(cpumask); if (mask == 0) return; - ncpu = popcnt(mask); + ncpu = bitcount(mask); if (ncpu > othercpus) { /* XXX this should be a panic offence */ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", Index: amd64/amd64/mp_machdep.c =================================================================== RCS file: /usr/repo/src/sys/amd64/amd64/mp_machdep.c,v retrieving revision 1.261 diff -u -p -r1.261 mp_machdep.c --- amd64/amd64/mp_machdep.c 21 Jul 2005 21:46:09 -0000 1.261 +++ amd64/amd64/mp_machdep.c 19 Aug 2005 21:41:24 -0000 @@ -812,58 +812,6 @@ smp_tlb_shootdown(u_int vector, vm_offse ia32_pause(); } -/* - * This is about as magic as it gets. fortune(1) has got similar code - * for reversing bits in a word. Who thinks up this stuff?? - * - * Yes, it does appear to be consistently faster than: - * while (i = ffs(m)) { - * m >>= i; - * bits++; - * } - * and - * while (lsb = (m & -m)) { // This is magic too - * m &= ~lsb; // or: m ^= lsb - * bits++; - * } - * Both of these latter forms do some very strange things on gcc-3.1 with - * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2. - * There is probably an SSE or MMX popcnt instruction. - * - * I wonder if this should be in libkern? - * - * XXX Stop the presses! Another one: - * static __inline u_int32_t - * popcnt1(u_int32_t v) - * { - * v -= ((v >> 1) & 0x55555555); - * v = (v & 0x33333333) + ((v >> 2) & 0x33333333); - * v = (v + (v >> 4)) & 0x0F0F0F0F; - * return (v * 0x01010101) >> 24; - * } - * The downside is that it has a multiply. With a pentium3 with - * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use - * an imull, and in that case it is faster. In most other cases - * it appears slightly slower. - * - * Another variant (also from fortune): - * #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255) - * #define BX_(x) ((x) - (((x)>>1)&0x77777777) \ - * - (((x)>>2)&0x33333333) \ - * - (((x)>>3)&0x11111111)) - */ -static __inline u_int32_t -popcnt(u_int32_t m) -{ - - m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1); - m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2); - m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4); - m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8); - m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16); - return m; -} - static void smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) { @@ -878,7 +826,7 @@ smp_targeted_tlb_shootdown(u_int mask, u mask &= ~PCPU_GET(cpumask); if (mask == 0) return; - ncpu = popcnt(mask); + ncpu = bitcount(mask); if (ncpu > othercpus) { /* XXX this should be a panic offence */ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", Index: geom/stripe/g_stripe.c =================================================================== RCS file: /usr/repo/src/sys/geom/stripe/g_stripe.c,v retrieving revision 1.26 diff -u -p -r1.26 g_stripe.c --- geom/stripe/g_stripe.c 17 Jul 2005 13:15:02 -0000 1.26 +++ geom/stripe/g_stripe.c 19 Aug 2005 21:41:42 -0000 @@ -792,7 +792,7 @@ g_stripe_create(struct g_class *mp, cons sc->sc_id = md->md_id; sc->sc_stripesize = md->md_stripesize; - sc->sc_stripebits = BITCOUNT(sc->sc_stripesize - 1); + sc->sc_stripebits = bitcount(sc->sc_stripesize - 1); sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks, M_STRIPE, M_WAITOK | M_ZERO); Index: geom/stripe/g_stripe.h =================================================================== RCS file: /usr/repo/src/sys/geom/stripe/g_stripe.h,v retrieving revision 1.7 diff -u -p -r1.7 g_stripe.h --- geom/stripe/g_stripe.h 27 Feb 2005 23:07:47 -0000 1.7 +++ geom/stripe/g_stripe.h 19 Aug 2005 21:41:52 -0000 @@ -120,10 +120,4 @@ stripe_metadata_decode(const u_char *dat md->md_provsize = le64dec(data + 64); } -#ifndef BITCOUNT -#define BITCOUNT(x) (((BX_(x) + (BX_(x) >> 4)) & 0x0F0F0F0F) % 255) -#define BX_(x) ((x) - (((x) >> 1) & 0x77777777) - \ - (((x) >> 2) & 0x33333333) - (((x) >> 3) & 0x11111111)) -#endif - #endif /* _G_STRIPE_H_ */