--- A fixed-point math based naive-bayes classifier implementation in the form --- of a DIFFUSE classifier kernel module. It classifies using discretised --- feature values, and has been empirically shown to perform classifications --- quickly and with high accuracy. --- --- Sponsored by: FreeBSD Foundation --- Reviewed by: bz --- MFC after: 1 month --- diff -r c8b77e80ef24 sys/modules/diffuse/Makefile --- a/sys/modules/diffuse/Makefile Tue Oct 04 21:33:24 2011 +1100 +++ b/sys/modules/diffuse/Makefile Tue Oct 04 22:31:20 2011 +1100 @@ -2,6 +2,7 @@ SUBDIR= diffuse \ diffuse_classifier_c45 \ + diffuse_classifier_nbayes \ diffuse_feature_iat \ diffuse_feature_iatbd \ diffuse_feature_pcnt \ diff -r c8b77e80ef24 sys/modules/diffuse/diffuse_classifier_nbayes/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/modules/diffuse/diffuse_classifier_nbayes/Makefile Tue Oct 04 22:31:20 2011 +1100 @@ -0,0 +1,9 @@ +# $FreeBSD$ + +.include + +.PATH: ${.CURDIR}/../../../netinet/ipfw +KMOD= diffuse_classifier_nbayes +SRCS= diffuse_classifier_nbayes.c + +.include diff -r c8b77e80ef24 sys/netinet/ipfw/diffuse_classifier_nbayes.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/netinet/ipfw/diffuse_classifier_nbayes.c Tue Oct 04 22:31:20 2011 +1100 @@ -0,0 +1,301 @@ +/*- + * Copyright (c) 2010-2011 + * Swinburne University of Technology, Melbourne, Australia. + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Sebastian Zander, made + * possible in part by a gift from The Cisco University Research Program Fund, a + * corporate advised fund of Silicon Valley Community Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * DIFFUSE Naive Bayes classifier. + */ + +#include +__FBSDID("$FreeBSD$"); + +#ifdef _KERNEL +#include +#include +#include +#include +#else +#include +#include +#endif /* _KERNEL */ +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#ifdef _KERNEL +#include +#include +#endif + +#ifdef _KERNEL + +/* Computes the buffer size needed to store classifier. */ +static int get_size(struct di_classifier_nbayes_config *c) +{ + + return (sizeof(struct di_classifier_nbayes_config) + c->fdist_len); +} + +/* Copy, assumes target memory is allocated. */ +static void +cpy_conf(struct di_classifier_nbayes_config *f, + struct di_classifier_nbayes_config *t) +{ + + t->oid = f->oid; + strcpy(t->model_name, f->model_name); + t->feature_cnt = f->feature_cnt; + t->class_cnt = f->class_cnt; + t->multi = f->multi; + t->fdist_len = f->fdist_len; + memcpy(t->fdist, f->fdist, f->fdist_len); +} + +static int +nbayes_init_instance(struct di_cdata *cdata, struct di_oid *params) +{ + struct di_classifier_nbayes_config *c, *conf; + + c = (struct di_classifier_nbayes_config *)params; + + DID("class cnt %d", c->class_cnt); + DID("attr cnt %d", c->feature_cnt); + DID("multi %d", (1 << c->multi)); + DID("fdist_len %d", c->fdist_len); + DID("want size %d", get_size(c)); + + cdata->conf = malloc(get_size(c), M_DIFFUSE, M_NOWAIT | M_ZERO); + if (cdata->conf == NULL) + return (ENOMEM); + + conf = (struct di_classifier_nbayes_config *)cdata->conf; + cpy_conf(c, conf); + + return (0); +} + +static int +nbayes_destroy_instance(struct di_cdata *cdata) +{ + + free(cdata->conf, M_DIFFUSE); + + return (0); +} + +static int +nbayes_get_conf(struct di_cdata *cdata, struct di_oid *cbuf, int size_only) +{ + int len; + + len = get_size((struct di_classifier_nbayes_config *)cdata->conf); + + if (!size_only) + cpy_conf((struct di_classifier_nbayes_config *)cdata->conf, + (struct di_classifier_nbayes_config *)cbuf); + + return (len); +} + +#endif /* _KERNEL */ + +#define LINEAR_SEARCH_THRESHOLD 5 + +static inline struct di_nbayes_attr_disc_val * +find_val(struct di_nbayes_attr_disc_val *first, int val_cnt, int class_cnt, + int32_t fval) +{ + struct di_nbayes_attr_disc_val *val; + int h, l, j, m; + + val = NULL; + + if (val_cnt <= LINEAR_SEARCH_THRESHOLD) { + /* Linear search. */ + for (j = 0; j < val_cnt; j++) { + val = (struct di_nbayes_attr_disc_val *) + (((char *)first) + + (sizeof(struct di_nbayes_attr_disc_val) + + class_cnt * sizeof(uint32_t)) * j); + if (fval <= val->high_val) + break; + } + + } else { + /* Binary search. */ + l = 0; + h = val_cnt - 1; + while (l < h) { + m = l + ((h - l) / 2); + val = (struct di_nbayes_attr_disc_val *) + (((char *)first) + + (sizeof(struct di_nbayes_attr_disc_val) + + class_cnt * sizeof(uint32_t)) * m); + + if (val->high_val < fval) + l = m + 1; + else + h = m; + } + + val = (struct di_nbayes_attr_disc_val *)(((char *)first) + + (sizeof(struct di_nbayes_attr_disc_val) + class_cnt * + sizeof(uint32_t)) * h); + } + + return (val); +} + +int +nbayes_classify(struct di_cdata *cdata, int32_t *features, int fcnt) +{ + struct di_classifier_nbayes_config *conf = + (struct di_classifier_nbayes_config *)cdata->conf; + struct di_nbayes_attr_prior *ap; + struct di_nbayes_attr_disc *ad; + struct di_nbayes_attr_disc_val *val; + struct di_nbayes_attr_id *attr; + char *d; + uint64_t probs[conf->class_cnt]; + uint64_t max_prob; + int divs[conf->class_cnt]; + int best_class, divs_max, f, i, l, len; + + max_prob = 0; + best_class = divs_max = 0; + +#ifdef DIFFUSE_DEBUG2 + printf("DIFFUSE: %-10s features ", __func__); + for (i = 0; i < fcnt; i++) + printf("%u ", features[i]); + printf("\n"); +#endif + + for (i = 0; i < conf->class_cnt; i++) { + divs[i] = 0; + + for (l = conf->fdist_len, d = (char *)conf->fdist, f = -1; + l > 0; l -= len, d += len, f++) { + attr = (struct di_nbayes_attr_id *)d; + len = attr->len; + + DID("type %d(%d)", attr->type, attr->len); + + if (f >= fcnt) + return (-1); /* Should never happen. */ + + if (attr->type == DI_NBAYES_ATTR_PRIOR) { + ap = (struct di_nbayes_attr_prior *)attr; + probs[i] = ap->prior_p[i]; + } else if (attr->type == DI_NBAYES_ATTR_DISC) { + ad = (struct di_nbayes_attr_disc *)attr; + + val = find_val(ad->val, ad->val_cnt, + conf->class_cnt, features[f]); + probs[i] *= val->cond_p[i]; + + if (probs[i] > + ((uint64_t)1 << (64 - conf->multi))) { + /* + * Divide by 2^(conf->multi - 1) to avoid + * overflow. + */ + probs[i] = fixp_div(probs[i], conf->multi); + divs[i]++; + if (divs[i] > divs_max) + divs_max = divs[i]; + } + } else if (attr->type == DI_NBAYES_ATTR_NORM) { + return (-1); /* XXX: Not supported yet. */ + } + } + } + + /* Make sure the divisor is the same for all probs and find max prob. */ + for (i = 0; i < conf->class_cnt; i++) { + probs[i] = fixp_div(probs[i], conf->multi * + (divs_max - divs[i]) - (divs_max - divs[i] - 1)); + + DID2("class %u prob %llu %i", i, probs[i], divs[i]); + + if (probs[i] > max_prob) { + max_prob = probs[i]; + best_class = i; + } + } + + return (best_class); +} + +#ifdef _KERNEL + +static int +nbayes_get_feature_cnt(struct di_cdata *cdata) +{ + + return (((struct di_classifier_nbayes_config *) + cdata->conf)->feature_cnt); +} + +static int +nbayes_get_class_cnt(struct di_cdata *cdata) +{ + + return (((struct di_classifier_nbayes_config *)cdata->conf)->class_cnt); +} + +static struct di_classifier_alg di_nbayes_desc = { + _FI( .name = ) "nbayes", + _FI( .ref_count = ) 0, + + _FI( .init_instance = ) nbayes_init_instance, + _FI( .destroy_instance = ) nbayes_destroy_instance, + _FI( .get_conf = ) nbayes_get_conf, + _FI( .classify = ) nbayes_classify, + _FI( .get_feature_cnt = ) nbayes_get_feature_cnt, + _FI( .get_class_cnt = ) nbayes_get_class_cnt, +}; + +DECLARE_DIFFUSE_CLASSIFIER_MODULE(nbayes, &di_nbayes_desc); + +#endif diff -r c8b77e80ef24 sys/netinet/ipfw/diffuse_classifier_nbayes.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/netinet/ipfw/diffuse_classifier_nbayes.h Tue Oct 04 22:31:20 2011 +1100 @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 2010-2011 + * Swinburne University of Technology, Melbourne, Australia. + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Sebastian Zander, made + * possible in part by a gift from The Cisco University Research Program Fund, a + * corporate advised fund of Silicon Valley Community Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * DIFFUSE Naive Bayes classifier. + */ + +#ifndef _NETINET_IPFW_DIFFUSE_CLASSIFIER_NBAYES_H_ +#define _NETINET_IPFW_DIFFUSE_CLASSIFIER_NBAYES_H_ + +#define DI_NBAYES_ATTR_PRIOR 0 +#define DI_NBAYES_ATTR_DISC 1 +#define DI_NBAYES_ATTR_NORM 2 + +struct di_nbayes_attr_id +{ + uint16_t type; + uint16_t len; +}; + +struct di_nbayes_attr_disc_val +{ + int32_t high_val; + uint32_t cond_p[]; +}; + +struct di_nbayes_attr_disc +{ + struct di_nbayes_attr_id id; + /* Number of values/intervals. */ + uint32_t val_cnt; + /* Interval values and conditional probs. */ + struct di_nbayes_attr_disc_val val[]; +}; + +struct di_nbayes_attr_norm_class +{ + int32_t mean; + uint32_t stddev; + int32_t wsum; + uint32_t prec; +}; + +/* One per real attribute. */ +struct di_nbayes_attr_norm +{ + struct di_nbayes_attr_id id; + /* Class_cnt structs. */ + struct di_nbayes_attr_norm_class class[]; +}; + +/* Priors, not really an attribute. */ +struct di_nbayes_attr_prior +{ + struct di_nbayes_attr_id id; + /* Cass_cnt prior probs. */ + uint32_t prior_p[]; +}; + +struct di_classifier_nbayes_config +{ + struct di_oid oid; + + /* Model name. */ + char model_name[DI_MAX_MODEL_STR_LEN]; + /* Number of features. */ + uint16_t feature_cnt; + /* Number of classes. */ + uint16_t class_cnt; + /* Precsion, multipler for double->int. */ + uint16_t multi; + /* Length of fdists. */ + uint16_t fdist_len; + /* Feature_cnt feature distributions. */ + struct di_nbayes_attr_id fdist[]; +}; + +struct di_classifier_module *nbayes_module(void); + +struct di_cdata; + +int nbayes_classify(struct di_cdata *cdata, int32_t *features, int fcnt); + +#endif /* _NETINET_IPFW_DIFFUSE_CLASSIFIER_NBAYES_H_ */