--- A fixed-point math based naive-bayes classifier implementation in the form --- of a DIFFUSE classifier kernel module. It classifies using discretised --- feature values, and has been empirically shown to perform classifications --- quickly and with high accuracy. --- --- Sponsored by: FreeBSD Foundation --- Reviewed by: bz --- MFC after: 1 month --- diff -r 867fd1ae3dca sys/modules/diffuse/Makefile --- a/sys/modules/diffuse/Makefile Sun Sep 25 17:36:42 2011 +1000 +++ b/sys/modules/diffuse/Makefile Sun Sep 25 17:45:15 2011 +1000 @@ -1,12 +1,13 @@ # $FreeBSD$ SUBDIR= diffuse \ diffuse_classifier_c45 \ + diffuse_classifier_nbayes \ diffuse_feature_iat \ diffuse_feature_iatbd \ diffuse_feature_pcnt \ diffuse_feature_plen \ diffuse_feature_plenbd \ diffuse_feature_skype .include diff -r 867fd1ae3dca sys/modules/diffuse/diffuse_classifier_nbayes/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/modules/diffuse/diffuse_classifier_nbayes/Makefile Sun Sep 25 17:45:15 2011 +1000 @@ -0,0 +1,9 @@ +# $FreeBSD$ + +.include + +.PATH: ${.CURDIR}/../../../netinet/ipfw +KMOD= diffuse_classifier_nbayes +SRCS= diffuse_classifier_nbayes.c + +.include diff -r 867fd1ae3dca sys/netinet/ipfw/diffuse_classifier_nbayes.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/netinet/ipfw/diffuse_classifier_nbayes.c Sun Sep 25 17:45:15 2011 +1000 @@ -0,0 +1,299 @@ +/*- + * Copyright (c) 2010-2011 + * Swinburne University of Technology, Melbourne, Australia. + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Sebastian Zander, made + * possible in part by a gift from The Cisco University Research Program Fund, a + * corporate advised fund of Silicon Valley Community Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * DIFFUSE Naive Bayes classifier
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+#else
+#include
+#include
+#endif /* _KERNEL */
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#ifdef _KERNEL
+#include
+#include
+#endif
+
+/*
+ * Naive Bayes classifier.
+ */
+
+#ifdef _KERNEL
+
+/*
+ * Computes the buffer size needed to store classifier: the size of the
+ * fixed di_classifier_nbayes_cnf_t part plus the c->fdist_len bytes of
+ * feature distribution data (the trailing fdist[] member). The same value is
+ * used to size the allocation in nbayes_init_instance() and is what
+ * nbayes_get_conf() reports as the configuration length.
+ */
+static int get_size(di_classifier_nbayes_cnf_t *c)
+{
+
+	return (sizeof(di_classifier_nbayes_cnf_t) + c->fdist_len);
+}
+
+/* Copy, assumes target memory is allocated. 
*/ +static void cpy_cnf(di_classifier_nbayes_cnf_t *f, di_classifier_nbayes_cnf_t *t) +{ + + t->oid = f->oid; + strcpy(t->model_name, f->model_name); + t->feature_cnt = f->feature_cnt; + t->class_cnt = f->class_cnt; + t->multi = f->multi; + t->fdist_len = f->fdist_len; + memcpy(t->fdist, f->fdist, f->fdist_len); + +} + +static int +nbayes_init_instance(struct cdata *cdata, struct di_oid *params) +{ + di_classifier_nbayes_cnf_t *c, *cnf; + + c = (di_classifier_nbayes_cnf_t *)params; + + DID("class cnt %d", c->class_cnt); + DID("attr cnt %d", c->feature_cnt); + DID("multi %d", (1 << c->multi)); + DID("fdist_len %d", c->fdist_len); + DID("want size %d", get_size(c)); + + cdata->conf = malloc(get_size(c), M_DIFFUSE, M_NOWAIT | M_ZERO); + cnf = (di_classifier_nbayes_cnf_t *)cdata->conf; + cpy_cnf(c, cnf); + + return (0); +} + +static int +nbayes_destroy_instance(struct cdata *cdata) +{ + + free(cdata->conf, M_DIFFUSE); + + return (0); +} + +static int +nbayes_get_conf(struct cdata *cdata, struct di_oid *cbuf, int size_only) +{ + int len; + + len = get_size((di_classifier_nbayes_cnf_t *)cdata->conf); + + if (!size_only) + cpy_cnf((di_classifier_nbayes_cnf_t *)cdata->conf, + (di_classifier_nbayes_cnf_t *)cbuf); + + return (len); +} + +#endif /* _KERNEL */ + +#define LINEAR_SEARCH_THRESHOLD 5 + +static inline nbayes_attr_disc_val_t * +find_val(nbayes_attr_disc_val_t *first, int val_cnt, int class_cnt, + int32_t fval) +{ + nbayes_attr_disc_val_t *val; + int h, l, j, m; + + val = NULL; + + if (val_cnt <= LINEAR_SEARCH_THRESHOLD) { + /* Linear search. */ + for (j = 0; j < val_cnt; j++) { + val = (nbayes_attr_disc_val_t *)(((char *)first) + + (sizeof(nbayes_attr_disc_val_t) + + class_cnt * sizeof(uint32_t)) * j); + if (fval <= val->high_val) + break; + } + + } else { + /* Binary search. 
*/ + l = 0; + h = val_cnt - 1; + while (l < h) { + m = l + ((h - l) / 2); + val = (nbayes_attr_disc_val_t *)(((char *)first) + + (sizeof(nbayes_attr_disc_val_t) + + class_cnt * sizeof(uint32_t)) * m); + + if (val->high_val < fval) + l = m + 1; + else + h = m; + } + + val = (nbayes_attr_disc_val_t *)(((char *)first) + + (sizeof(nbayes_attr_disc_val_t) + + class_cnt * sizeof(uint32_t)) * h); + } + + return (val); +} + +int +nbayes_classify(struct cdata *cdata, int32_t *features, int fcnt) +{ + di_classifier_nbayes_cnf_t *cnf = + (di_classifier_nbayes_cnf_t *)cdata->conf; + nbayes_attr_prior_t *ap; + nbayes_attr_disc_t *ad; + nbayes_attr_disc_val_t *val; + nbayes_attr_id_t *attr; + char *d; + uint64_t probs[cnf->class_cnt]; + uint64_t max_prob; + int divs[cnf->class_cnt]; + int best_class, divs_max, f, i, l, len; + + max_prob = 0; + best_class = divs_max = 0; + +#ifdef DIFFUSE_DEBUG2 + printf("diffuse: %-10s features ", __FUNCTION__); + for (i = 0; i < fcnt; i++) + printf("%u ", features[i]); + + printf("\n"); +#endif + + for (i = 0; i < cnf->class_cnt; i++) { + divs[i] = 0; + + for (l = cnf->fdist_len, d = (char *)cnf->fdist, f = -1; l > 0; + l -= len, d += len, f++) { + attr = (nbayes_attr_id_t *)d; + len = attr->len; + + DIND("type %d(%d)", attr->type, attr->len); + + if (f >= fcnt) + return (-1); /* Should never happen. */ + + if (attr->type == NBAYES_ATTR_PRIOR) { + ap = (nbayes_attr_prior_t *)attr; + probs[i] = ap->prior_p[i]; + } else if (attr->type == NBAYES_ATTR_DISC) { + ad = (nbayes_attr_disc_t *)attr; + + val = find_val(ad->val, ad->val_cnt, + cnf->class_cnt, features[f]); + probs[i] *= val->cond_p[i]; + + if (probs[i] > + ((uint64_t)1 << (64 - cnf->multi))) { + /* + * Divide by 2^(cnf->multi - 1) to avoid + * overflow. + */ + probs[i] = fixp_div(probs[i], cnf->multi); + divs[i]++; + if (divs[i] > divs_max) + divs_max = divs[i]; + } + } else if (attr->type == NBAYES_ATTR_NORM) { + return (-1); /* XXX: Not supported yet. 
*/ + } + } + } + + /* Make sure the divisor is the same for all probs and find max prob. */ + for (i = 0; i < cnf->class_cnt; i++) { + probs[i] = fixp_div(probs[i], cnf->multi * (divs_max - divs[i]) + - (divs_max - divs[i] - 1)); + + DID2("class %u prob %llu %i", i, probs[i], divs[i]); + + if (probs[i] > max_prob) { + max_prob = probs[i]; + best_class = i; + } + } + + return (best_class); +} + +#ifdef _KERNEL + +static int +nbayes_get_feature_cnt(struct cdata *cdata) +{ + + return (((di_classifier_nbayes_cnf_t *)cdata->conf)->feature_cnt); +} + +static int +nbayes_get_class_cnt(struct cdata *cdata) +{ + + return (((di_classifier_nbayes_cnf_t *)cdata->conf)->class_cnt); +} + +static struct diffuse_classifier_alg diffuse_nbayes_desc = { + _FI( .name = ) "nbayes", + _FI( .ref_count = ) 0, + + _FI( .init_instance = ) nbayes_init_instance, + _FI( .destroy_instance = ) nbayes_destroy_instance, + _FI( .get_conf = ) nbayes_get_conf, + _FI( .classify = ) nbayes_classify, + _FI( .get_feature_cnt = ) nbayes_get_feature_cnt, + _FI( .get_class_cnt = ) nbayes_get_class_cnt, +}; + +DECLARE_DIFFUSE_CLASSIFIER_MODULE(classifier_nbayes, &diffuse_nbayes_desc); + +#endif diff -r 867fd1ae3dca sys/netinet/ipfw/diffuse_classifier_nbayes.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/netinet/ipfw/diffuse_classifier_nbayes.h Sun Sep 25 17:45:15 2011 +1000 @@ -0,0 +1,118 @@ +/*- + * Copyright (c) 2010-2011 + * Swinburne University of Technology, Melbourne, Australia. + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Sebastian Zander, made + * possible in part by a gift from The Cisco University Research Program Fund, a + * corporate advised fund of Silicon Valley Community Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * DIFFUSE Naive Bayes classifier
+ */
+
+#ifndef _NETINET_IPFW_DIFFUSE_CLASSIFIER_NBAYES_H_
+#define _NETINET_IPFW_DIFFUSE_CLASSIFIER_NBAYES_H_
+
+enum nbayes_attr_type
+{
+	NBAYES_ATTR_PRIOR = 0,	/* Record is a nbayes_attr_prior_t. */
+	NBAYES_ATTR_DISC,	/* Record is a nbayes_attr_disc_t. */
+	NBAYES_ATTR_NORM	/* nbayes_classify() returns -1 for these. */
+};
+
+typedef struct nbayes_attr_id
+{
+	uint16_t type;	/* Compared against enum nbayes_attr_type values. */
+	uint16_t len;	/* Bytes nbayes_classify() skips to the next record. */
+} nbayes_attr_id_t;
+
+typedef struct nbayes_attr_disc_val
+{
+	int32_t high_val;	/* find_val() picks first with fval <= high_val. */
+	uint32_t cond_p[];	/* Indexed by class; class_cnt entries. */
+} nbayes_attr_disc_val_t;
+
+typedef struct nbayes_attr_disc
+{
+	nbayes_attr_id_t id;
+	/* Number of values/intervals. */
+	uint32_t val_cnt;
+	/* Interval values and conditional probs. */
+	nbayes_attr_disc_val_t val[];
+} nbayes_attr_disc_t;
+
+typedef struct nbayes_attr_norm_class
+{
+	int32_t mean;
+	uint32_t stddev;
+	int32_t wsum;
+	uint32_t prec;
+} nbayes_attr_norm_class_t;
+
+/* One per real attribute. 
*/ +typedef struct nbayes_attr_norm +{ + nbayes_attr_id_t id; + /* Class_cnt structs. */ + nbayes_attr_norm_class_t class[]; +} nbayes_attr_norm_t; + +/* Priors, not really an attribute. */ +typedef struct nbayes_attr_prior +{ + nbayes_attr_id_t id; + /* Cass_cnt prior probs. */ + uint32_t prior_p[]; +} nbayes_attr_prior_t; + +typedef struct di_classifier_nbayes_cnf +{ + struct di_oid oid; + + /* Model name. */ + char model_name[256]; /* XXX: #define */ + /* Number of features. */ + uint16_t feature_cnt; + /* Number of classes. */ + uint16_t class_cnt; + /* Precsion, multipler for double->int. */ + uint16_t multi; + /* Length of fdists. */ + uint16_t fdist_len; + /* Feature_cnt feature distributions. */ + nbayes_attr_id_t fdist[]; +} di_classifier_nbayes_cnf_t; + +struct classifier_module *nbayes_module(void); + +struct cdata; + +int nbayes_classify(struct cdata *cdata, int32_t *features, int fcnt); + +#endif /* _NETINET_IPFW_DIFFUSE_CLASSIFIER_NBAYES_H_ */ diff -r 867fd1ae3dca sys/netinet/ipfw/diffuse_common.h --- a/sys/netinet/ipfw/diffuse_common.h Sun Sep 25 17:36:42 2011 +1000 +++ b/sys/netinet/ipfw/diffuse_common.h Sun Sep 25 17:45:15 2011 +1000 @@ -46,20 +46,40 @@ struct cdata { void *conf; /* Instance configuration ptr. */ }; /* Flow data. */ struct fdata { void *data; /* Work data ptr. */ int32_t *stats; /* Stats ptr. */ }; +/* + * Fast fixed point division with rounding for dividing by a number of 2. + * a is the divident and b is the power of the divisor. + */ +static inline uint64_t +fixp_div(uint64_t a, int b) +{ + uint64_t q, r; + + if (b <= 0) + return (a); + + q = a >> b; + r = a & (b - 1); + if ((r << 1) >= ((uint64_t)1 << b)) + return (q + 1); + else + return (q); +} + static inline uint32_t fixp_sqrt(uint64_t x) { uint64_t rem_hi, rem_lo, test_div; uint32_t root; int count; rem_hi = 0; rem_lo = x; root = 0;