Property changes on: usr.bin/procstat
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/usr.bin/procstat:r207767-219808

Property changes on: usr.bin/csup
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/usr.bin/csup:r207767-219808

Index: usr.bin/netstat/inet.c
===================================================================
--- usr.bin/netstat/inet.c	(.../base)	(revision 219811)
+++ usr.bin/netstat/inet.c	(.../head)	(revision 219811)
@@ -85,11 +85,11 @@
 char *inetname(struct in_addr *);
 void inetprint(struct in_addr *, int, const char *, int);
 #ifdef INET6
-static int udp_done, tcp_done;
+static int udp_done, tcp_done, sdp_done;
 #endif /* INET6 */
 
 static int
-pcblist_sysctl(int proto, char **bufp, int istcp)
+pcblist_sysctl(int proto, const char *name, char **bufp, int istcp)
 {
 	const char *mibvar;
 	char *buf;
@@ -109,7 +109,8 @@
 		mibvar = "net.inet.raw.pcblist";
 		break;
 	}
-
+	if (strncmp(name, "sdp", 3) == 0)
+		mibvar = "net.inet.sdp.pcblist";
 	len = 0;
 	if (sysctlbyname(mibvar, 0, &len, 0, 0) < 0) {
 		if (errno != ENOENT)
@@ -315,10 +316,17 @@
 	switch (proto) {
 	case IPPROTO_TCP:
 #ifdef INET6
-		if (tcp_done != 0)
-			return;
-		else
-			tcp_done = 1;
+		if (strncmp(name, "sdp", 3) != 0) {
+			if (tcp_done != 0)
+				return;
+			else
+				tcp_done = 1;
+		} else {
+			if (sdp_done != 0)
+				return;
+			else
+				sdp_done = 1;
+		}
 #endif
 		istcp = 1;
 		break;
@@ -332,7 +340,7 @@
 		break;
 	}
 	if (live) {
-		if (!pcblist_sysctl(proto, &buf, istcp))
+		if (!pcblist_sysctl(proto, name, &buf, istcp))
 			return;
 	} else {
 		if (!pcblist_kvm(off, &buf, istcp))
Index: usr.bin/netstat/main.c
===================================================================
--- usr.bin/netstat/main.c	(.../base)	(revision 219811)
+++ usr.bin/netstat/main.c	(.../head)	(revision 219811)
@@ -208,6 +208,10 @@
 	{ -1,		N_SCTPSTAT,	1,	sctp_protopr,
 	  sctp_stats,	NULL,		"sctp",	1,	IPPROTO_SCTP },
 #endif
+#ifdef SDP
+	{ -1,		-1,		1,	protopr,
+	  NULL,		NULL,		"sdp",	1,	IPPROTO_TCP },
+#endif
 	{ N_DIVCBINFO,	-1,		1,	protopr,
 	  NULL,		NULL,		"divert", 1,	IPPROTO_DIVERT },
 	{ N_RIPCBINFO,	N_IPSTAT,	1,	protopr,
@@ -248,6 +252,10 @@
 	  ip6_stats,	ip6_ifstats,	"ip6",	1,	IPPROTO_RAW },
 	{ N_RIPCBINFO,	N_ICMP6STAT,	1,	protopr,
 	  icmp6_stats,	icmp6_ifstats,	"icmp6", 1,	IPPROTO_ICMPV6 },
+#ifdef SDP
+	{ -1,		-1,		1,	protopr,
+	  NULL,		NULL,		"sdp",	1,	IPPROTO_TCP },
+#endif
 #ifdef IPSEC
 	{ -1,		N_IPSEC6STAT,	1,	NULL,
 	  ipsec_stats,	NULL,		"ipsec6", 0,	0 },
Index: usr.bin/netstat/Makefile
===================================================================
--- usr.bin/netstat/Makefile	(.../base)	(revision 219811)
+++ usr.bin/netstat/Makefile	(.../head)	(revision 219811)
@@ -18,6 +18,10 @@
 CFLAGS+=-DINET6
 .endif
 
+.if ${MK_OFED} != "no"
+CFLAGS+=-DSDP
+.endif
+
 BINGRP=	kmem
 BINMODE=2555
 DPADD=	${LIBKVM} ${LIBMEMSTAT} ${LIBUTIL}
Property changes on: usr.bin/calendar
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/usr.bin/calendar:r207767-219808

Property changes on: crypto/openssh
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/crypto/openssh:r207767-219808

Property changes on: crypto/openssl
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/crypto/openssl:r207767-219808
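The netstat hunks above steer the "sdp" protocol name to the net.inet.sdp.pcblist MIB inside pcblist_sysctl(). A minimal sketch of the same two-step sysctl read pattern, shown here against the stock net.inet.tcp.pcblist MIB (net.inet.sdp.pcblist exists only with the OFED/SDP stack loaded):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <errno.h>
#include <stdlib.h>

/* Fetch a variable-length PCB list the way pcblist_sysctl() does:
 * first ask the kernel for the size, then allocate and read the data. */
static void *
fetch_pcblist(const char *mibvar, size_t *lenp)
{
	void *buf;
	size_t len = 0;

	if (sysctlbyname(mibvar, NULL, &len, NULL, 0) < 0) {
		if (errno != ENOENT)
			warn("sysctl: %s", mibvar);
		return (NULL);
	}
	if ((buf = malloc(len)) == NULL)
		return (NULL);
	if (sysctlbyname(mibvar, buf, &len, NULL, 0) < 0) {
		free(buf);
		return (NULL);
	}
	*lenp = len;
	return (buf);
}

netstat walks the returned buffer as a stream of xinpgen records; that parsing is omitted here.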
Property changes on: gnu/usr.bin/cc/cc_tools
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/gnu/usr.bin/cc/cc_tools:r207767-219808
   Merged /head/gnu/usr.bin/cc/cc_tools:r207766-209025

Property changes on: gnu/usr.bin/binutils
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/gnu/usr.bin/binutils:r207767-219808
   Merged /head/gnu/usr.bin/binutils:r207766-209025

Property changes on: gnu/usr.bin/gdb
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/gnu/usr.bin/gdb:r207767-219808
   Merged /head/gnu/usr.bin/gdb:r207766-209025

Property changes on: gnu/lib
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /head/gnu/lib:r207766-209025
   Merged /projects/ofed/base/gnu/lib:r207767-219808

Property changes on: sbin/ipfw
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/sbin/ipfw:r207767-219808

Property changes on: sbin
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/sbin:r207767-219808

Property changes on: contrib/expat
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/expat:r207767-219808

Property changes on: contrib/ncurses
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/ncurses:r207767-219808

Property changes on: contrib/tzdata
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/tzdata:r207767-219808

Property changes on: contrib/wpa
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/wpa:r207767-219808

Property changes on: contrib/openpam
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/openpam:r207767-219808

Property changes on: contrib/ntp
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/ntp:r207767-219808

Property changes on: contrib/tcsh
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/tcsh:r207767-219808

Property changes on: contrib/groff
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/groff:r207767-219808

Property changes on: contrib/openbsm
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/openbsm:r207767-219808

Property changes on: contrib/pf
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/pf:r207767-219808

Property changes on: contrib/less
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/less:r207767-219808

Property changes on: contrib/libpcap
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /projects/ofed/base/contrib/libpcap:r207767-219808
Index: contrib/ofed/dapl/test/dtest/dtest.c
===================================================================
--- contrib/ofed/dapl/test/dtest/dtest.c	(.../base)	(revision 219811)
+++ contrib/ofed/dapl/test/dtest/dtest.c	(.../head)	(revision 219811)
@@ -63,8 +63,8 @@
 
 #else // _WIN32 || _WIN64
 
-#include
-#include
+#include
+#include
 #include
 #include
 #include
Index: contrib/ofed/dapl/test/dtest/dtestcm.c
===================================================================
--- contrib/ofed/dapl/test/dtest/dtestcm.c	(.../base)	(revision 219811)
+++ contrib/ofed/dapl/test/dtest/dtestcm.c	(.../head)	(revision 219811)
@@ -64,8 +64,8 @@
 
 #else // _WIN32 || _WIN64
 
-#include
-#include
+#include
+#include
 #include
 #include
 #include
Index: contrib/ofed/dapl/dapl/openib_ucm/linux/openib_osd.h
===================================================================
--- contrib/ofed/dapl/dapl/openib_ucm/linux/openib_osd.h	(.../base)	(revision 219811)
+++ contrib/ofed/dapl/dapl/openib_ucm/linux/openib_osd.h	(.../head)	(revision 219811)
@@ -1,7 +1,9 @@
 #ifndef OPENIB_OSD_H
 #define OPENIB_OSD_H
 
-#include
+#include
+#include
+#include
 #include
 
 #if __BYTE_ORDER == __BIG_ENDIAN
Index: contrib/ofed/dapl/dapl/udapl/linux/dapl_osd.h
===================================================================
--- contrib/ofed/dapl/dapl/udapl/linux/dapl_osd.h	(.../base)	(revision 219811)
+++ contrib/ofed/dapl/dapl/udapl/linux/dapl_osd.h	(.../head)	(revision 219811)
@@ -45,9 +45,9 @@
  * This file is defined for Linux systems only, including it on any
  * other build will cause an error
  */
-#ifndef __linux__
+#if !defined(__linux__) && !defined(__FreeBSD__)
 #error UNDEFINED OS TYPE
-#endif /* __linux__ */
+#endif /* __linux__ || __FreeBSD__ */
 
 #if !defined (__i386__) && !defined (__ia64__) && !defined(__x86_64__) && !defined(__PPC__) && !defined(__PPC64__)
 #error UNDEFINED ARCH
@@ -67,7 +67,7 @@
 #include
 #include
 #include			/* for getaddrinfo */
-#include
+#include
 
 #include			/* for IOCTL's */
Index: contrib/ofed/dapl/dapl/openib_cma/linux/openib_osd.h
===================================================================
--- contrib/ofed/dapl/dapl/openib_cma/linux/openib_osd.h	(.../base)	(revision 219811)
+++ contrib/ofed/dapl/dapl/openib_cma/linux/openib_osd.h	(.../head)	(revision 219811)
@@ -1,7 +1,7 @@
 #ifndef OPENIB_OSD_H
 #define OPENIB_OSD_H
 
-#include
+#include
 #include
 
 #if __BYTE_ORDER == __BIG_ENDIAN
Index: contrib/ofed/dapl/dapl/openib_cma/device.c
===================================================================
--- contrib/ofed/dapl/dapl/openib_cma/device.c	(.../base)	(revision 219811)
+++ contrib/ofed/dapl/dapl/openib_cma/device.c	(.../head)	(revision 219811)
@@ -154,7 +154,9 @@
 
 	/* Fill in the structure */
 	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", name);
+#ifndef __FreeBSD__
 	ifr.ifr_hwaddr.sa_family = ARPHRD_INFINIBAND;
+#endif
 
 	/* Create a socket fd */
 	skfd = socket(PF_INET, SOCK_STREAM, 0);
@@ -661,8 +663,8 @@
 /* work thread for uAT, uCM, CQ, and async events */
 void dapli_thread(void *arg)
 {
-	struct pollfd ufds[__FD_SETSIZE];
-	struct _ib_hca_transport *uhca[__FD_SETSIZE] = { NULL };
+	struct pollfd ufds[FD_SETSIZE];
+	struct _ib_hca_transport *uhca[FD_SETSIZE] = { NULL };
 	struct _ib_hca_transport *hca;
 	int ret, idx, fds;
 	char rbuf[2];
Index: contrib/ofed/dapl/dapl/openib_scm/linux/openib_osd.h
===================================================================
--- contrib/ofed/dapl/dapl/openib_scm/linux/openib_osd.h	(.../base)	(revision 219811)
+++ contrib/ofed/dapl/dapl/openib_scm/linux/openib_osd.h	(.../head)	(revision 219811)
@@ -1,7 +1,7 @@
 #ifndef OPENIB_OSD_H
 #define OPENIB_OSD_H
 
-#include
+#include
 #include
 
 #if __BYTE_ORDER == __BIG_ENDIAN
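The device.c change above sizes the worker thread's pollfd arrays with the portable FD_SETSIZE constant instead of glibc's private __FD_SETSIZE. A minimal sketch of the same polling pattern (wait_for_events() is a hypothetical helper; the dispatch body is elided):

#include <sys/types.h>
#include <sys/select.h>	/* FD_SETSIZE */
#include <poll.h>

/* Poll up to FD_SETSIZE descriptors, as the dapli_thread() loop does. */
static int
wait_for_events(struct pollfd *ufds, int nfds)
{
	int i, ret;

	ret = poll(ufds, nfds, -1);	/* block until a descriptor is ready */
	if (ret <= 0)
		return (ret);
	for (i = 0; i < nfds; i++)
		if (ufds[i].revents & POLLIN)
			;	/* dispatch the CM/CQ/async event for slot i */
	return (ret);
}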
Index: contrib/ofed/dapl/dapl/openib_scm/device.c
===================================================================
--- contrib/ofed/dapl/dapl/openib_scm/device.c	(.../base)	(revision 219811)
+++ contrib/ofed/dapl/dapl/openib_scm/device.c	(.../head)	(revision 219811)
@@ -646,8 +646,8 @@
 /* work thread for uAT, uCM, CQ, and async events */
 void dapli_thread(void *arg)
 {
-	struct pollfd ufds[__FD_SETSIZE];
-	struct _ib_hca_transport *uhca[__FD_SETSIZE] = { NULL };
+	struct pollfd ufds[FD_SETSIZE];
+	struct _ib_hca_transport *uhca[FD_SETSIZE] = { NULL };
 	struct _ib_hca_transport *hca;
 	int ret, idx, fds;
 	char rbuf[2];
Index: contrib/ofed/dapl/dat/include/dat2/dat_platform_specific.h
===================================================================
--- contrib/ofed/dapl/dat/include/dat2/dat_platform_specific.h	(.../base)	(revision 219811)
+++ contrib/ofed/dapl/dat/include/dat2/dat_platform_specific.h	(.../head)	(revision 219811)
@@ -255,8 +255,46 @@
 
 #endif /* __KDAPL__ */
 /* Windoze ends */
 
+#elif defined(__FreeBSD__)
+#include
+#include
+#include
+#include
+typedef u_int32_t	DAT_UINT32;	/* unsigned host order, 32 bits */
+typedef u_int64_t	DAT_UINT64;	/* unsigned host order, 64 bits */
+typedef unsigned long long	DAT_UVERYLONG;	/* unsigned longest native to compiler */
+
+typedef void *		DAT_PVOID;
+typedef int		DAT_COUNT;
+typedef DAT_UINT64	DAT_PADDR;
+
+#ifndef UINT64_C
+#define UINT64_C(c)	c ## ULL
+#endif /* UINT64_C */
+
+#define DAT_IA_HANDLE_TO_UL(a)	(unsigned long)(a)
+#define DAT_UL_TO_IA_HANDLE(a)	(DAT_IA_HANDLE)(a)
+
+
+typedef struct dat_comm {
+	int	domain;
+	int	type;
+	int	protocol;
+} DAT_COMM;
+
+typedef int	DAT_FD;				/* DAT File Descriptor */
+
+typedef struct sockaddr		DAT_SOCK_ADDR;	/* Socket address header native to OS */
+typedef struct sockaddr_in6	DAT_SOCK_ADDR6;	/* Socket address header native to OS */
+#define DAT_AF_INET	AF_INET
+#define DAT_AF_INET6	AF_INET6
+
+#define DAT_API
+#define DAT_EXPORT	extern
+
+/* FreeBSD ends */
 #else
 #error dat_platform_specific.h : OS type not defined
 #endif
Index: contrib/ofed/dapl/dat/udat/linux/dat_osd.h
===================================================================
--- contrib/ofed/dapl/dat/udat/linux/dat_osd.h	(.../base)	(revision 219811)
+++ contrib/ofed/dapl/dat/udat/linux/dat_osd.h	(.../head)	(revision 219811)
@@ -48,9 +48,9 @@
  * This file is defined for Linux systems only, including it on any
  * other build will cause an error
  */
-#ifndef __linux__
+#if !defined(__linux__) && !defined(__FreeBSD__)
 #error "UNDEFINED OS TYPE"
-#endif /* __linux__ */
+#endif /* __linux__ || __FreeBSD__ */
 
 #include
Index: contrib/ofed/usr.bin/ibtracert/Makefile
===================================================================
--- contrib/ofed/usr.bin/ibtracert/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/ibtracert/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	ibtracert
+SRCS=	ibtracert.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad -losmcomp
+CFLAGS+= -pthread -I${DIAGPATH}/include
+MAN=	ibtracert.8
+
+WARNS?=	1
+
+.include
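Both dapl_osd.h and dat_osd.h above loosen the same compile-time OS guard so the headers build on FreeBSD as well as Linux. A minimal sketch of the corrected pattern (the guard must reject only unknown systems, not the supported ones):

/* Compile-time OS guard, as used in dapl_osd.h/dat_osd.h above:
 * refuse to build on anything that is neither Linux nor FreeBSD. */
#if !defined(__linux__) && !defined(__FreeBSD__)
#error "UNDEFINED OS TYPE"
#endif

#ifdef __FreeBSD__
#include <sys/types.h>	/* u_int32_t and friends for the DAT typedefs */
#endif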
Index: contrib/ofed/usr.bin/perfquery/Makefile
===================================================================
--- contrib/ofed/usr.bin/perfquery/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/perfquery/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	perfquery
+SRCS=	perfquery.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad
+CFLAGS+= -I${DIAGPATH}/include
+MAN=	perfquery.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/smpquery/Makefile
===================================================================
--- contrib/ofed/usr.bin/smpquery/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/smpquery/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	smpquery
+SRCS=	smpquery.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad -losmcomp
+CFLAGS+= -pthread -I${DIAGPATH}/include
+MAN=	smpquery.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/Makefile.inc
===================================================================
--- contrib/ofed/usr.bin/Makefile.inc	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/Makefile.inc	(.../head)	(revision 219811)
@@ -0,0 +1,4 @@
+DIAGPATH=	${.CURDIR}/../../management/infiniband-diags
+BINDIR?=	/usr/bin
+CFLAGS+=	-I${.CURDIR}/../../include/infiniband
+CFLAGS+=	-I${.CURDIR}/../../management/opensm/include/
Index: contrib/ofed/usr.bin/ibaddr/Makefile
===================================================================
--- contrib/ofed/usr.bin/ibaddr/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/ibaddr/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	ibaddr
+SRCS=	ibaddr.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad
+CFLAGS+= -I${DIAGPATH}/include
+MAN=	ibaddr.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/ibsysstat/Makefile
===================================================================
--- contrib/ofed/usr.bin/ibsysstat/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/ibsysstat/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	ibsysstat
+SRCS=	ibsysstat.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad
+CFLAGS+= -I${DIAGPATH}/include
+MAN=	ibsysstat.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/smpdump/Makefile
===================================================================
--- contrib/ofed/usr.bin/smpdump/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/smpdump/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	smpdump
+SRCS=	smpdump.c
+LDADD=	-libumad -libcommon -libmad
+CFLAGS+= -I${DIAGPATH}/include
+MAN=	smpdump.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/ibstat/Makefile
===================================================================
--- contrib/ofed/usr.bin/ibstat/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/ibstat/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	ibstat
+SRCS=	ibstat.c
+LDADD=	-libumad -libcommon
+CFLAGS+= -I${DIAGPATH}/include
+MAN=	ibstat.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/vendstat/Makefile
===================================================================
--- contrib/ofed/usr.bin/vendstat/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/vendstat/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	vendstat
+SRCS=	vendstat.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad
+CFLAGS+= -I${DIAGPATH}/include
+MAN=	vendstat.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/saquery/Makefile
===================================================================
--- contrib/ofed/usr.bin/saquery/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/saquery/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,16 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	saquery
+SRCS=	saquery.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad -losmcomp -losmvendor -lopensm
+CFLAGS+= -I${DIAGPATH}/include
+CFLAGS+= -DOSM_VENDOR_INTF_OPENIB -DVENDOR_RMPP_SUPPORT -DDUAL_SIDED_RMPP
+CFLAGS+= -pthread
+MAN=	saquery.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/ibsendtrap/Makefile
===================================================================
--- contrib/ofed/usr.bin/ibsendtrap/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/ibsendtrap/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	ibsendtrap
+SRCS=	ibsendtrap.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad
+CFLAGS+= -I${DIAGPATH}/include
+NO_MAN=	true
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/sminfo/Makefile
===================================================================
--- contrib/ofed/usr.bin/sminfo/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/sminfo/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	sminfo
+SRCS=	sminfo.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad
+CFLAGS+= -I${DIAGPATH}/include
+MAN=	sminfo.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/ibnetdiscover/Makefile
===================================================================
--- contrib/ofed/usr.bin/ibnetdiscover/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/ibnetdiscover/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	ibnetdiscover
+SRCS=	ibnetdiscover.c grouping.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad -losmcomp
+CFLAGS+= -pthread -I${DIAGPATH}/include
+MAN=	ibnetdiscover.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/ibportstate/Makefile
===================================================================
--- contrib/ofed/usr.bin/ibportstate/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/ibportstate/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	ibportstate
+SRCS=	ibportstate.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad
+CFLAGS+= -I${DIAGPATH}/include
+MAN=	ibportstate.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/opensm/Makefile
===================================================================
--- contrib/ofed/usr.bin/opensm/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/opensm/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,40 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+
+OPENSM = ${.CURDIR}/../../management/opensm
+.PATH: ${OPENSM}/opensm ${OPENSM}/man
+
+PROG=	opensm
+
+SRCS=	main.c osm_console_io.c osm_console.c osm_db_files.c
+SRCS+=	osm_db_pack.c osm_drop_mgr.c osm_inform.c osm_lid_mgr.c
+SRCS+=	osm_lin_fwd_rcv.c osm_link_mgr.c osm_mcast_fwd_rcv.c osm_mcast_mgr.c
+SRCS+=	osm_mcast_tbl.c osm_mcm_info.c osm_mcm_port.c osm_mtree.c
+SRCS+=	osm_multicast.c osm_node.c osm_node_desc_rcv.c osm_node_info_rcv.c
+SRCS+=	osm_opensm.c osm_pkey.c osm_pkey_mgr.c osm_pkey_rcv.c osm_port.c
+SRCS+=	osm_port_info_rcv.c osm_remote_sm.c osm_req.c osm_resp.c osm_sa.c
+SRCS+=	osm_sa_class_port_info.c osm_sa_informinfo.c osm_sa_lft_record.c
+SRCS+=	osm_sa_mft_record.c osm_sa_link_record.c osm_sa_mad_ctrl.c
+SRCS+=	osm_sa_mcmember_record.c osm_sa_node_record.c osm_sa_path_record.c
+SRCS+=	osm_sa_pkey_record.c osm_sa_portinfo_record.c osm_sa_guidinfo_record.c
+SRCS+=	osm_sa_multipath_record.c osm_sa_service_record.c osm_sa_slvl_record.c
+SRCS+=	osm_sa_sminfo_record.c osm_sa_vlarb_record.c osm_sa_sw_info_record.c
+SRCS+=	osm_service.c osm_slvl_map_rcv.c osm_sm.c osm_sminfo_rcv.c
+SRCS+=	osm_sm_mad_ctrl.c osm_sm_state_mgr.c osm_state_mgr.c osm_subnet.c
+SRCS+=	osm_sw_info_rcv.c osm_switch.c osm_prtn.c osm_prtn_config.c osm_qos.c
+SRCS+=	osm_router.c osm_trap_rcv.c osm_ucast_mgr.c osm_ucast_updn.c
+SRCS+=	osm_ucast_lash.c osm_ucast_file.c osm_ucast_ftree.c osm_vl15intf.c
+SRCS+=	osm_vl_arb_rcv.c st.c osm_perfmgr.c osm_perfmgr_db.c osm_event_plugin.c
+SRCS+=	osm_dump.c osm_ucast_cache.c osm_qos_parser_y.y osm_qos_parser_l.l
+SRCS+=	osm_qos_policy.c
+
+LDADD=	-lopensm -losmvendor -losmcomp -libmad -libumad -libcommon
+CFLAGS+= -pthread
+CFLAGS+= -DVENDOR_RMPP_SUPPORT -DDUAL_SIDED_RMPP
+
+MAN=	opensm.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/ibping/Makefile
===================================================================
--- contrib/ofed/usr.bin/ibping/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/ibping/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	ibping
+SRCS=	ibping.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad
+CFLAGS+= -I${DIAGPATH}/include
+MAN=	ibping.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/ibroute/Makefile
===================================================================
--- contrib/ofed/usr.bin/ibroute/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/ibroute/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+.include "../Makefile.inc"
+.PATH: ${DIAGPATH}/src ${DIAGPATH}/man
+
+PROG=	ibroute
+SRCS=	ibroute.c ibdiag_common.c
+LDADD=	-libumad -libcommon -libmad -losmcomp
+CFLAGS+= -pthread -I${DIAGPATH}/include
+MAN=	ibroute.8
+
+WARNS?=	1
+
+.include
Index: contrib/ofed/usr.bin/Makefile
===================================================================
--- contrib/ofed/usr.bin/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/usr.bin/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,7 @@
+.include
+
+SUBDIR = ibaddr ibnetdiscover ibping ibportstate ibroute ibsendtrap ibstat
+SUBDIR += ibsysstat ibtracert opensm perfquery saquery
+SUBDIR += sminfo smpdump smpquery vendstat
+
+.include
Index: contrib/ofed/libibcm/include/infiniband/cm_abi.h
===================================================================
--- contrib/ofed/libibcm/include/infiniband/cm_abi.h	(.../base)	(revision 219811)
+++ contrib/ofed/libibcm/include/infiniband/cm_abi.h	(.../head)	(revision 219811)
@@ -36,7 +36,7 @@
 #ifndef CM_ABI_H
 #define CM_ABI_H
 
-#include
+#include
 #include
 #include
__cplusplus extern "C" { Index: contrib/ofed/libibcm/src/cm.c =================================================================== --- contrib/ofed/libibcm/src/cm.c (.../base) (revision 219811) +++ contrib/ofed/libibcm/src/cm.c (.../head) (revision 219811) @@ -300,7 +300,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : -1; + return (result >= 0) ? ERR(ECONNREFUSED) : -1; VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); @@ -331,7 +331,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : -1; + return (result >= 0) ? ERR(ECONNREFUSED) : -1; VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); @@ -361,7 +361,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : result; + return (result >= 0) ? ERR(ECONNREFUSED) : result; VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); @@ -387,7 +387,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : -1; + return (result >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -446,7 +446,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : -1; + return (result >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -481,7 +481,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : -1; + return (result >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -506,7 +506,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : -1; + return (result >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -547,7 +547,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : -1; + return (result >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -572,7 +572,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : -1; + return (result >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -606,7 +606,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : -1; + return (result >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -656,7 +656,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : result; + return (result >= 0) ? ERR(ECONNREFUSED) : result; return 0; } @@ -691,7 +691,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : -1; + return (result >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -731,7 +731,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : result; + return (result >= 0) ? ERR(ECONNREFUSED) : result; return 0; } @@ -765,7 +765,7 @@ result = write(cm_id->device->fd, msg, size); if (result != size) - return (result >= 0) ? ERR(ENODATA) : -1; + return (result >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -875,7 +875,7 @@ result = write(device->fd, msg, size); if (result != size) { - result = (result >= 0) ? ERR(ENODATA) : -1; + result = (result >= 0) ? 
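The cm.c hunks above all make the same change: when a command write to the kernel comes back short, the reported error becomes ECONNREFUSED instead of ENODATA. A minimal sketch of that write-and-check pattern (write_cm_cmd() is a hypothetical helper; in the library, ERR() is an internal errno-setting macro):

#include <errno.h>
#include <unistd.h>

/* Mirror the libibcm pattern: issue an ABI command and map a short
 * write to ECONNREFUSED (the kernel rejected the command). */
static int
write_cm_cmd(int fd, const void *msg, size_t size)
{
	ssize_t result;

	result = write(fd, msg, size);
	if (result == (ssize_t)size)
		return (0);
	if (result >= 0) {
		errno = ECONNREFUSED;	/* short write: command refused */
		return (-1);
	}
	return (-1);			/* write(2) already set errno */
}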
Index: contrib/ofed/include/infiniband/byteswap.h
===================================================================
--- contrib/ofed/include/infiniband/byteswap.h	(.../base)	(revision 0)
+++ contrib/ofed/include/infiniband/byteswap.h	(.../head)	(revision 219811)
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _INFINIBAND_BYTESWAP_H_
+#define _INFINIBAND_BYTESWAP_H_
+/*
+ * This file is included for compatibility with the userland libraries
+ * accompanying the infiniband stack.
+ */
+#include
+#include
+
+#define bswap_16	bswap16
+#define bswap_32	bswap32
+#define bswap_64	bswap64
+
+#endif /* _INFINIBAND_BYTESWAP_H_ */
Index: contrib/ofed/include/infiniband/endian.h
===================================================================
--- contrib/ofed/include/infiniband/endian.h	(.../base)	(revision 0)
+++ contrib/ofed/include/infiniband/endian.h	(.../head)	(revision 219811)
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * This file is included for compatibility with the userland libraries
+ * accompanying the infiniband stack.
+ */
+
+#ifndef _INFINIBAND_ENDIAN_H_
+#define _INFINIBAND_ENDIAN_H_
+
+#include
+#include
+#define __LITTLE_ENDIAN	_LITTLE_ENDIAN
+#define __BIG_ENDIAN	_BIG_ENDIAN
+#define __BYTE_ORDER	_BYTE_ORDER
+
+#endif /* _INFINIBAND_ENDIAN_H_ */
Index: contrib/ofed/include/infiniband/byteorder.h
===================================================================
--- contrib/ofed/include/infiniband/byteorder.h	(.../base)	(revision 0)
+++ contrib/ofed/include/infiniband/byteorder.h	(.../head)	(revision 219811)
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _INFINIBAND_BYTEORDER_H_
+#define _INFINIBAND_BYTEORDER_H_
+
+#include
+#include
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define __LITTLE_ENDIAN
+#else
+#define __BIG_ENDIAN
+#endif
+
+#define cpu_to_le64	htole64
+#define le64_to_cpu	le64toh
+#define cpu_to_le32	htole32
+#define le32_to_cpu	le32toh
+#define cpu_to_le16	htole16
+#define le16_to_cpu	le16toh
+#define cpu_to_be64	htobe64
+#define be64_to_cpu	be64toh
+#define cpu_to_be32	htobe32
+#define be32_to_cpu	be32toh
+#define cpu_to_be16	htobe16
+#define be16_to_cpu	be16toh
+#define __be16_to_cpu	be16toh
+
+#define cpu_to_le64p(x)	htole64(*((uint64_t *)x))
+#define le64_to_cpup(x)	le64toh(*((uint64_t *)x))
+#define cpu_to_le32p(x)	htole32(*((uint32_t *)x))
+#define le32_to_cpup(x)	le32toh(*((uint32_t *)x))
+#define cpu_to_le16p(x)	htole16(*((uint16_t *)x))
+#define le16_to_cpup(x)	le16toh(*((uint16_t *)x))
+#define cpu_to_be64p(x)	htobe64(*((uint64_t *)x))
+#define be64_to_cpup(x)	be64toh(*((uint64_t *)x))
+#define cpu_to_be32p(x)	htobe32(*((uint32_t *)x))
+#define be32_to_cpup(x)	be32toh(*((uint32_t *)x))
+#define cpu_to_be16p(x)	htobe16(*((uint16_t *)x))
+#define be16_to_cpup(x)	be16toh(*((uint16_t *)x))
+
+#define cpu_to_le64s(x)	do { *((uint64_t *)x) = cpu_to_le64p((x)) } while (0)
+#define le64_to_cpus(x)	do { *((uint64_t *)x) = le64_to_cpup((x)) } while (0)
+#define cpu_to_le32s(x)	do { *((uint32_t *)x) = cpu_to_le32p((x)) } while (0)
+#define le32_to_cpus(x)	do { *((uint32_t *)x) = le32_to_cpup((x)) } while (0)
+#define cpu_to_le16s(x)	do { *((uint16_t *)x) = cpu_to_le16p((x)) } while (0)
+#define le16_to_cpus(x)	do { *((uint16_t *)x) = le16_to_cpup((x)) } while (0)
+#define cpu_to_be64s(x)	do { *((uint64_t *)x) = cpu_to_be64p((x)) } while (0)
+#define be64_to_cpus(x)	do { *((uint64_t *)x) = be64_to_cpup((x)) } while (0)
+#define cpu_to_be32s(x)	do { *((uint32_t *)x) = cpu_to_be32p((x)) } while (0)
+#define be32_to_cpus(x)	do { *((uint32_t *)x) = be32_to_cpup((x)) } while (0)
+#define cpu_to_be16s(x)	do { *((uint16_t *)x) = cpu_to_be16p((x)) } while (0)
+#define be16_to_cpus(x)	do { *((uint16_t *)x) = be16_to_cpup((x)) } while (0)
+
+#define swab16	bswap16
+#define swab32	bswap32
+#define swab64	bswap64
+
+#endif /* _INFINIBAND_BYTEORDER_H_ */
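byteorder.h above maps the Linux-style cpu_to_*/\*_to_cpu names onto FreeBSD's native conversion routines from <sys/endian.h>. A minimal sketch using the compat names:

#include <stdint.h>
#include <stdio.h>
#include <infiniband/byteorder.h>

int
main(void)
{
	uint32_t host = 0x12345678;
	uint32_t wire = cpu_to_be32(host);	/* htobe32() underneath */

	printf("host %#x -> wire %#x -> host %#x\n",
	    host, wire, be32_to_cpu(wire));
	return (0);
}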
Index: contrib/ofed/include/infiniband/types.h
===================================================================
--- contrib/ofed/include/infiniband/types.h	(.../base)	(revision 0)
+++ contrib/ofed/include/infiniband/types.h	(.../head)	(revision 219811)
@@ -0,0 +1,63 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _INFINIBAND_TYPES_H_
+#define _INFINIBAND_TYPES_H_
+
+#include
+#include
+
+typedef int8_t		s8;
+typedef uint8_t		u8;
+typedef int8_t		__s8;
+typedef uint8_t		__u8;
+
+typedef int16_t		s16;
+typedef uint16_t	u16;
+typedef int16_t		__s16;
+typedef uint16_t	__u16;
+
+typedef int32_t		s32;
+typedef uint32_t	u32;
+typedef int32_t		__s32;
+typedef uint32_t	__u32;
+
+typedef int64_t		s64;
+typedef uint64_t	u64;
+typedef int64_t		__s64;
+typedef uint64_t	__u64;
+
+typedef uint16_t	__le16;
+typedef uint16_t	__be16;
+typedef uint32_t	__le32;
+typedef uint32_t	__be32;
+typedef uint64_t	__le64;
+typedef uint64_t	__be64;
+
+typedef unsigned int	uint;
+
+#endif /* _INFINIBAND_TYPES_H_ */
Index: contrib/ofed/include/infiniband/Makefile
===================================================================
--- contrib/ofed/include/infiniband/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/include/infiniband/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,107 @@
+.include
+
+INCS=
+IBINCS=	byteorder.h byteswap.h endian.h types.h
+IBINCSDIR=	${INCLUDEDIR}/infiniband
+
+IBVERBS=	${.CURDIR}/../../libibverbs/include/infiniband
+VERBINCS=	${IBVERBS}/arch.h ${IBVERBS}/driver.h ${IBVERBS}/kern-abi.h
+VERBINCS+=	${IBVERBS}/marshall.h ${IBVERBS}/opcode.h
+VERBINCS+=	${IBVERBS}/sa-kern-abi.h ${IBVERBS}/sa.h ${IBVERBS}/verbs.h
+VERBINCSDIR=	${INCLUDEDIR}/infiniband
+
+IBCOMMON=	${.CURDIR}/../../management/libibcommon/include/infiniband
+COMMONINCS=	${IBCOMMON}/common.h
+COMMONINCSDIR=	${INCLUDEDIR}/infiniband
+
+IBMAD=	${.CURDIR}/../../management/libibmad/include/infiniband
+MADINCS=	${IBMAD}/mad.h
+MADINCSDIR=	${INCLUDEDIR}/infiniband
+
+IBUMAD=	${.CURDIR}/../../management/libibumad/include/infiniband
+UMADINCS=	${IBUMAD}/umad.h
+UMADINCSDIR=	${INCLUDEDIR}/infiniband
+
+COMPLIB=	${.CURDIR}/../../management/opensm/include/complib
+COMPLIBINCS=	${COMPLIB}/cl_atomic.h ${COMPLIB}/cl_atomic_osd.h
+COMPLIBINCS+=	${COMPLIB}/cl_byteswap.h ${COMPLIB}/cl_byteswap_osd.h
+COMPLIBINCS+=	${COMPLIB}/cl_comppool.h ${COMPLIB}/cl_debug.h
+COMPLIBINCS+=	${COMPLIB}/cl_debug_osd.h ${COMPLIB}/cl_dispatcher.h
+COMPLIBINCS+=	${COMPLIB}/cl_event.h ${COMPLIB}/cl_event_osd.h
+COMPLIBINCS+=	${COMPLIB}/cl_event_wheel.h ${COMPLIB}/cl_fleximap.h
+COMPLIBINCS+=	${COMPLIB}/cl_list.h ${COMPLIB}/cl_log.h
+COMPLIBINCS+=	${COMPLIB}/cl_map.h ${COMPLIB}/cl_math.h
+COMPLIBINCS+=	${COMPLIB}/cl_nodenamemap.h ${COMPLIB}/cl_packoff.h
+COMPLIBINCS+=	${COMPLIB}/cl_packon.h ${COMPLIB}/cl_passivelock.h
+COMPLIBINCS+=	${COMPLIB}/cl_pool.h ${COMPLIB}/cl_ptr_vector.h
+COMPLIBINCS+=	${COMPLIB}/cl_qcomppool.h ${COMPLIB}/cl_qlist.h
+COMPLIBINCS+=	${COMPLIB}/cl_qmap.h ${COMPLIB}/cl_qpool.h
+COMPLIBINCS+=	${COMPLIB}/cl_spinlock.h ${COMPLIB}/cl_spinlock_osd.h
+COMPLIBINCS+=	${COMPLIB}/cl_thread.h ${COMPLIB}/cl_thread_osd.h
+COMPLIBINCS+=	${COMPLIB}/cl_threadpool.h ${COMPLIB}/cl_timer.h
+COMPLIBINCS+=	${COMPLIB}/cl_timer_osd.h ${COMPLIB}/cl_types.h
+COMPLIBINCS+=	${COMPLIB}/cl_types_osd.h ${COMPLIB}/cl_vector.h
+COMPLIBINCSDIR=	${INCLUDEDIR}/infiniband/complib
+
+IBADIR=	${.CURDIR}/../../management/opensm/include/iba
+IBAINCS=	${IBADIR}/ib_cm_types.h ${IBADIR}/ib_types.h
+IBAINCSDIR=	${INCLUDEDIR}/infiniband/iba
+
+OPENSM=	${.CURDIR}/../../management/opensm/include/opensm
+
+OPENSMINCS=	${OPENSM}/osm_attrib_req.h ${OPENSM}/osm_base.h
+OPENSMINCS+=	${OPENSM}/osm_config.h ${OPENSM}/osm_console.h
+OPENSMINCS+=	${OPENSM}/osm_console_io.h ${OPENSM}/osm_db.h
+OPENSMINCS+=	${OPENSM}/osm_db_pack.h ${OPENSM}/osm_errors.h
+OPENSMINCS+=	${OPENSM}/osm_event_plugin.h ${OPENSM}/osm_helper.h
+OPENSMINCS+=	${OPENSM}/osm_inform.h ${OPENSM}/osm_lid_mgr.h
+OPENSMINCS+=	${OPENSM}/osm_log.h ${OPENSM}/osm_mad_pool.h
+OPENSMINCS+=	${OPENSM}/osm_madw.h ${OPENSM}/osm_mcast_tbl.h
+OPENSMINCS+=	${OPENSM}/osm_mcm_info.h ${OPENSM}/osm_mcm_port.h
+OPENSMINCS+=	${OPENSM}/osm_msgdef.h ${OPENSM}/osm_mtree.h
+OPENSMINCS+=	${OPENSM}/osm_multicast.h ${OPENSM}/osm_node.h
+OPENSMINCS+=	${OPENSM}/osm_opensm.h ${OPENSM}/osm_partition.h
+OPENSMINCS+=	${OPENSM}/osm_path.h ${OPENSM}/osm_perfmgr.h
+OPENSMINCS+=	${OPENSM}/osm_perfmgr_db.h ${OPENSM}/osm_pkey.h
+OPENSMINCS+=	${OPENSM}/osm_pkey_mgr.h ${OPENSM}/osm_port.h
+OPENSMINCS+=	${OPENSM}/osm_port_profile.h ${OPENSM}/osm_prefix_route.h
+OPENSMINCS+=	${OPENSM}/osm_qos_policy.h ${OPENSM}/osm_remote_sm.h
+OPENSMINCS+=	${OPENSM}/osm_router.h ${OPENSM}/osm_sa.h
+OPENSMINCS+=	${OPENSM}/osm_sa_mad_ctrl.h ${OPENSM}/osm_service.h
+OPENSMINCS+=	${OPENSM}/osm_sm.h ${OPENSM}/osm_sm_mad_ctrl.h
+OPENSMINCS+=	${OPENSM}/osm_stats.h ${OPENSM}/osm_subnet.h
+OPENSMINCS+=	${OPENSM}/osm_switch.h ${OPENSM}/osm_ucast_cache.h
+OPENSMINCS+=	${OPENSM}/osm_ucast_mgr.h ${OPENSM}/osm_version.h
+OPENSMINCS+=	${OPENSM}/osm_vl15intf.h ${OPENSM}/st.h
+OPENSMINCSDIR=	${INCLUDEDIR}/infiniband/opensm
+
+VENDOR=	${.CURDIR}/../../management/opensm/include/vendor
+VENDORINCS=	${VENDOR}/osm_mtl_bind.h ${VENDOR}/osm_pkt_randomizer.h
+VENDORINCS+=	${VENDOR}/osm_ts_useraccess.h ${VENDOR}/osm_umadt.h
+VENDORINCS+=	${VENDOR}/osm_vendor.h ${VENDOR}/osm_vendor_al.h
+VENDORINCS+=	${VENDOR}/osm_vendor_api.h ${VENDOR}/osm_vendor_ibumad.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mlx.h ${VENDOR}/osm_vendor_mlx_defs.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mlx_dispatcher.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mlx_hca.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mlx_inout.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mlx_rmpp_ctx.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mlx_sar.h ${VENDOR}/osm_vendor_mlx_sender.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mlx_svc.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mlx_transport.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mlx_transport_anafa.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mlx_txn.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mtl.h ${VENDOR}/osm_vendor_mtl_hca_guid.h
+VENDORINCS+=	${VENDOR}/osm_vendor_mtl_transaction_mgr.h
+VENDORINCS+=	${VENDOR}/osm_vendor_sa_api.h
+VENDORINCS+=	${VENDOR}/osm_vendor_test.h ${VENDOR}/osm_vendor_ts.h
+VENDORINCS+=	${VENDOR}/osm_vendor_umadt.h
+VENDORINCSDIR=	${INCLUDEDIR}/infiniband/vendor
+
+IBCM=	${.CURDIR}/../../libibcm/include/infiniband
+IBCMINCS=	${IBCM}/cm.h ${IBCM}/cm_abi.h
+IBCMINCSDIR=	${INCLUDEDIR}/infiniband
+
+INCSGROUPS=	INCS VERBINCS COMMONINCS MADINCS UMADINCS COMPLIBINCS IBAINCS
+INCSGROUPS+=	OPENSMINCS VENDORINCS IBCMINCS IBINCS
+
+.include
Index: contrib/ofed/include/rdma/Makefile
===================================================================
--- contrib/ofed/include/rdma/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/include/rdma/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,10 @@
+.include
+
+INCS=
+RDMACM=		${.CURDIR}/../../librdmacm/include/rdma
+RDMACMINCS=	${RDMACM}/rdma_cma.h ${RDMACM}/rdma_cma_abi.h
+RDMACMINCSDIR=	${INCLUDEDIR}/rdma
+
+INCSGROUPS=	RDMACMINCS
+
+.include
Index: contrib/ofed/include/Makefile
===================================================================
--- contrib/ofed/include/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/include/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,5 @@
+.include
+
+SUBDIR = infiniband rdma
+
+.include
Index: contrib/ofed/libibverbs/include/infiniband/sa-kern-abi.h
===================================================================
--- contrib/ofed/libibverbs/include/infiniband/sa-kern-abi.h	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/include/infiniband/sa-kern-abi.h	(.../head)	(revision 219811)
@@ -33,7 +33,7 @@
 #ifndef INFINIBAND_SA_KERN_ABI_H
 #define INFINIBAND_SA_KERN_ABI_H
 
-#include
+#include
 
 /*
  * Obsolete, deprecated names.  Will be removed in libibverbs 1.1.
Index: contrib/ofed/libibverbs/include/infiniband/arch.h
===================================================================
--- contrib/ofed/libibverbs/include/infiniband/arch.h	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/include/infiniband/arch.h	(.../head)	(revision 219811)
@@ -34,8 +34,8 @@
 #define INFINIBAND_ARCH_H
 
 #include
-#include
-#include
+#include
+#include
 
 #if __BYTE_ORDER == __LITTLE_ENDIAN
 static inline uint64_t htonll(uint64_t x) { return bswap_64(x); }
Index: contrib/ofed/libibverbs/include/infiniband/kern-abi.h
===================================================================
--- contrib/ofed/libibverbs/include/infiniband/kern-abi.h	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/include/infiniband/kern-abi.h	(.../head)	(revision 219811)
@@ -35,7 +35,7 @@
 #ifndef KERN_ABI_H
 #define KERN_ABI_H
 
-#include
+#include
 
 /*
  * This file must be kept in sync with the kernel's version of
Index: contrib/ofed/libibverbs/src/libibverbs.map
===================================================================
--- contrib/ofed/libibverbs/src/libibverbs.map	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/src/libibverbs.map	(.../head)	(revision 219811)
@@ -83,6 +83,7 @@
 		ibv_get_device_guid;
 		ibv_open_device;
 		ibv_close_device;
+		ibv_resolve_eth_gid;
 
 		ibv_init_ah_from_wc;
 		ibv_create_ah_from_wc;
Index: contrib/ofed/libibverbs/src/device.c
===================================================================
--- contrib/ofed/libibverbs/src/device.c	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/src/device.c	(.../head)	(revision 219811)
@@ -128,7 +128,7 @@
 	int cmd_fd;
 	struct ibv_context *context;
 
-	if (asprintf(&devpath, "/dev/infiniband/%s", device->dev_name) < 0)
+	if (asprintf(&devpath, "/dev/%s", device->dev_name) < 0)
 		return NULL;
 
 	/*
Index: contrib/ofed/libibverbs/src/verbs.c
===================================================================
--- contrib/ofed/libibverbs/src/verbs.c	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/src/verbs.c	(.../head)	(revision 219811)
@@ -717,8 +717,8 @@
 	return tag < 0x1000;
 }
 
-int __ibv_resolve_eth_gid(struct ibv_pd *pd, uint8_t port_num,
-			  const union ibv_gid *dgid, uint8_t sgid_index,
+int __ibv_resolve_eth_gid(const struct ibv_pd *pd, uint8_t port_num,
+			  union ibv_gid *dgid, uint8_t sgid_index,
 			  uint8_t mac[], uint16_t *vlan, uint8_t *tagged,
 			  uint8_t *is_mcast)
 {
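device.c above switches the uverbs device node from /dev/infiniband/%s to plain /dev/%s, matching where FreeBSD creates it. A minimal sketch of opening a device through the public verbs API, which ends up in that asprintf()/open() path (error handling trimmed):

#include <stdio.h>
#include <infiniband/verbs.h>

int
main(void)
{
	struct ibv_device **devs;
	struct ibv_context *ctx;
	int num;

	devs = ibv_get_device_list(&num);	/* enumerated via sysfs/sysctl */
	if (devs == NULL || num == 0)
		return (1);
	ctx = ibv_open_device(devs[0]);		/* open()s /dev/<dev_name> */
	if (ctx != NULL) {
		printf("opened %s\n", ibv_get_device_name(devs[0]));
		ibv_close_device(ctx);
	}
	ibv_free_device_list(devs);
	return (0);
}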
Index: contrib/ofed/libibverbs/src/init.c
===================================================================
--- contrib/ofed/libibverbs/src/init.c	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/src/init.c	(.../head)	(revision 219811)
@@ -79,6 +79,7 @@
 
 static int find_sysfs_devs(void)
 {
+#ifdef __linux__
 	char class_path[IBV_SYSFS_PATH_MAX];
 	DIR *class_dir;
 	struct dirent *dent;
@@ -151,6 +152,58 @@
 
 	closedir(class_dir);
 	return ret;
+#else
+	char class_path[IBV_SYSFS_PATH_MAX];
+	struct ibv_sysfs_dev *sysfs_dev = NULL;
+	char value[8];
+	int ret = 0;
+	int i;
+
+	snprintf(class_path, sizeof class_path, "%s/class/infiniband_verbs",
+		 ibv_get_sysfs_path());
+
+	for (i = 0; i < 256; i++) {
+		if (!sysfs_dev)
+			sysfs_dev = malloc(sizeof *sysfs_dev);
+		if (!sysfs_dev) {
+			ret = ENOMEM;
+			goto out;
+		}
+
+		snprintf(sysfs_dev->sysfs_path, sizeof sysfs_dev->sysfs_path,
+			 "%s/uverbs%d", class_path, i);
+
+		snprintf(sysfs_dev->sysfs_name, sizeof sysfs_dev->sysfs_name,
+			 "uverbs%d", i);
+
+		if (ibv_read_sysfs_file(sysfs_dev->sysfs_path, "ibdev",
+					sysfs_dev->ibdev_name,
+					sizeof sysfs_dev->ibdev_name) < 0)
+			continue;
+
+		snprintf(sysfs_dev->ibdev_path, sizeof sysfs_dev->ibdev_path,
+			 "%s/class/infiniband/%s", ibv_get_sysfs_path(),
+			 sysfs_dev->ibdev_name);
+
+		sysfs_dev->next = sysfs_dev_list;
+		sysfs_dev->have_driver = 0;
+		if (ibv_read_sysfs_file(sysfs_dev->sysfs_path, "abi_version",
+					value, sizeof value) > 0)
+			sysfs_dev->abi_ver = strtol(value, NULL, 10);
+		else
+			sysfs_dev->abi_ver = 0;
+
+		sysfs_dev_list = sysfs_dev;
+		sysfs_dev = NULL;
+	}
+
+ out:
+	if (sysfs_dev)
+		free(sysfs_dev);
+
+	return ret;
+
+#endif
 }
 
 void ibv_register_driver(const char *name, ibv_driver_init_func init_func)
@@ -348,6 +401,7 @@
 		if (dev->node_type < IBV_NODE_CA || dev->node_type > IBV_NODE_RNIC)
 			dev->node_type = IBV_NODE_UNKNOWN;
 	}
+out:
 
 	switch (dev->node_type) {
 	case IBV_NODE_CA:
Index: contrib/ofed/libibverbs/src/sysfs.c
===================================================================
--- contrib/ofed/libibverbs/src/sysfs.c	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/src/sysfs.c	(.../head)	(revision 219811)
@@ -42,6 +42,8 @@
 #include
 #include
 
+#include
+
 #include "ibverbs.h"
 
 static char *sysfs_path;
@@ -78,22 +80,21 @@
 int ibv_read_sysfs_file(const char *dir, const char *file,
 			char *buf, size_t size)
 {
-	char *path;
+	char *path, *s;
 	int fd;
-	int len;
+	size_t len;
 
 	if (asprintf(&path, "%s/%s", dir, file) < 0)
 		return -1;
 
-	fd = open(path, O_RDONLY);
-	if (fd < 0) {
-		free(path);
+	for (s = &path[0]; *s != '\0'; s++)
+		if (*s == '/')
+			*s = '.';
+
+	len = size;
+	if (sysctlbyname(&path[1], buf, &len, NULL, 0) == -1)
 		return -1;
-	}
 
-	len = read(fd, buf, size);
-
-	close(fd);
 	free(path);
 
 	if (len > 0 && buf[len - 1] == '\n')
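sysfs.c above emulates Linux sysfs reads on FreeBSD by rewriting the path's slashes to dots and querying the result as a sysctl. A minimal sketch of that translation (the node name is hypothetical and exists only when the OFED stack is loaded):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	/* "/sys/class/infiniband_verbs/uverbs0/ibdev" becomes the sysctl
	 * name "sys.class.infiniband_verbs.uverbs0.ibdev". */
	char name[] = "sys/class/infiniband_verbs/uverbs0/ibdev";
	char buf[64];
	size_t len = sizeof(buf);
	char *s;

	for (s = name; *s != '\0'; s++)
		if (*s == '/')
			*s = '.';
	if (sysctlbyname(name, buf, &len, NULL, 0) == 0)
		printf("ibdev: %.*s\n", (int)len, buf);
	return (0);
}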
Index: contrib/ofed/libibverbs/src/memory.c
===================================================================
--- contrib/ofed/libibverbs/src/memory.c	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/src/memory.c	(.../head)	(revision 219811)
@@ -46,6 +46,7 @@
 /*
  * Most distro's headers don't have these yet.
  */
+#ifdef __linux__
 #ifndef MADV_DONTFORK
 #define MADV_DONTFORK	10
 #endif
@@ -53,6 +54,10 @@
 #ifndef MADV_DOFORK
 #define MADV_DOFORK	11
 #endif
+#else
+#define MADV_DONTFORK	INHERIT_NONE
+#define MADV_DOFORK	INHERIT_SHARE
+#endif
 
 struct ibv_mem_node {
 	enum {
@@ -72,8 +77,10 @@
 
 int ibv_fork_init(void)
 {
+#ifdef __linux__
 	void *tmp;
 	int ret;
+#endif
 
 	if (mm_root)
 		return 0;
@@ -85,6 +92,7 @@
 	if (page_size < 0)
 		return errno;
 
+#ifdef __linux__
 	if (posix_memalign(&tmp, page_size, page_size))
 		return ENOMEM;
 
@@ -95,6 +103,7 @@
 	if (ret)
 		return ENOSYS;
 
+#endif
 	mm_root = malloc(sizeof *mm_root);
 	if (!mm_root)
@@ -569,10 +578,10 @@
 	 * and that may lead to a spurious failure.
 	 */
 	if (start > node->start)
-		ret = madvise((void *) start, node->end - start + 1,
+		ret = minherit((void *) start, node->end - start + 1,
 			      advice);
 	else
-		ret = madvise((void *) node->start,
+		ret = minherit((void *) node->start,
 			      node->end - node->start + 1, advice);
 
 	if (ret) {
Index: contrib/ofed/libibverbs/examples/rc_pingpong.c
===================================================================
--- contrib/ofed/libibverbs/examples/rc_pingpong.c	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/examples/rc_pingpong.c	(.../head)	(revision 219811)
@@ -42,7 +42,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -140,7 +140,7 @@
 {
 	struct addrinfo *res, *t;
 	struct addrinfo hints = {
-		.ai_family   = AF_UNSPEC,
+		.ai_family   = AF_INET,
 		.ai_socktype = SOCK_STREAM
 	};
 	char *service;
@@ -215,7 +215,7 @@
 	struct addrinfo *res, *t;
 	struct addrinfo hints = {
 		.ai_flags    = AI_PASSIVE,
-		.ai_family   = AF_UNSPEC,
+		.ai_family   = AF_INET,
 		.ai_socktype = SOCK_STREAM
 	};
 	char *service;
@@ -304,6 +304,8 @@
 	return rem_dest;
 }
 
+#include
+
 static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
 					    int rx_depth, int port,
 					    int use_event, int is_server)
@@ -317,7 +319,7 @@
 	ctx->size     = size;
 	ctx->rx_depth = rx_depth;
 
-	ctx->buf = memalign(page_size, size);
+	ctx->buf = malloc(roundup(size, page_size));
 	if (!ctx->buf) {
 		fprintf(stderr, "Couldn't allocate work buf.\n");
 		return NULL;
@@ -558,7 +560,7 @@
 			break;
 
 		case 'd':
-			ib_devname = strdupa(optarg);
+			ib_devname = strdup(optarg);
 			break;
 
 		case 'i':
@@ -608,7 +610,7 @@
 	}
 
 	if (optind == argc - 1)
-		servername = strdupa(argv[optind]);
+		servername = strdup(argv[optind]);
 	else if (optind < argc) {
 		usage(argv[0]);
 		return 1;
Index: contrib/ofed/libibverbs/examples/pingpong.h
===================================================================
--- contrib/ofed/libibverbs/examples/pingpong.h	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/examples/pingpong.h	(.../head)	(revision 219811)
@@ -33,6 +33,8 @@
 #ifndef IBV_PINGPONG_H
 #define IBV_PINGPONG_H
 
+#include
+
 #include
 
 enum ibv_mtu pp_mtu_to_enum(int mtu);
Index: contrib/ofed/libibverbs/examples/srq_pingpong.c
===================================================================
--- contrib/ofed/libibverbs/examples/srq_pingpong.c	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/examples/srq_pingpong.c	(.../head)	(revision 219811)
@@ -42,7 +42,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -148,7 +147,7 @@
 {
 	struct addrinfo *res, *t;
 	struct addrinfo hints = {
-		.ai_family   = AF_UNSPEC,
+		.ai_family   = AF_INET,
 		.ai_socktype = SOCK_STREAM
 	};
 	char *service;
@@ -236,7 +235,7 @@
 	struct addrinfo *res, *t;
 	struct addrinfo hints = {
 		.ai_flags    = AI_PASSIVE,
-		.ai_family   = AF_UNSPEC,
+		.ai_family   = AF_INET,
 		.ai_socktype = SOCK_STREAM
 	};
 	char *service;
@@ -351,7 +350,7 @@
 	ctx->num_qp   = num_qp;
 	ctx->rx_depth = rx_depth;
 
-	ctx->buf = memalign(page_size, size);
+	ctx->buf = malloc(roundup(size, page_size));
 	if (!ctx->buf) {
 		fprintf(stderr, "Couldn't allocate work buf.\n");
 		return NULL;
@@ -632,7 +631,7 @@
 			break;
 
 		case 'd':
-			ib_devname = strdupa(optarg);
+			ib_devname = strdup(optarg);
 			break;
 
 		case 'i':
@@ -686,7 +685,7 @@
 	}
 
 	if (optind == argc - 1)
-		servername = strdupa(argv[optind]);
+		servername = strdup(argv[optind]);
 	else if (optind < argc) {
 		usage(argv[0]);
 		return 1;
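memory.c above maps Linux's madvise(MADV_DONTFORK/MADV_DOFORK) onto FreeBSD's minherit(INHERIT_NONE/INHERIT_SHARE), so registered memory keeps working across fork(). A minimal sketch of the FreeBSD call (page-aligned region assumed, as the library's range code guarantees):

#include <sys/types.h>
#include <sys/mman.h>

/* Mark a page-aligned region as not inherited by child processes,
 * the FreeBSD analogue of madvise(..., MADV_DONTFORK). */
static int
dontfork_range(void *base, size_t len)
{
	return (minherit(base, len, INHERIT_NONE));
}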
Index: contrib/ofed/libibverbs/examples/uc_pingpong.c
===================================================================
--- contrib/ofed/libibverbs/examples/uc_pingpong.c	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/examples/uc_pingpong.c	(.../head)	(revision 219811)
@@ -42,7 +42,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -129,7 +128,7 @@
 {
 	struct addrinfo *res, *t;
 	struct addrinfo hints = {
-		.ai_family   = AF_UNSPEC,
+		.ai_family   = AF_INET,
 		.ai_socktype = SOCK_STREAM
 	};
 	char *service;
@@ -204,7 +203,7 @@
 	struct addrinfo *res, *t;
 	struct addrinfo hints = {
 		.ai_flags    = AI_PASSIVE,
-		.ai_family   = AF_UNSPEC,
+		.ai_family   = AF_INET,
 		.ai_socktype = SOCK_STREAM
 	};
 	char *service;
@@ -305,7 +304,7 @@
 	ctx->size     = size;
 	ctx->rx_depth = rx_depth;
 
-	ctx->buf = memalign(page_size, size);
+	ctx->buf = malloc(roundup(size, page_size));
 	if (!ctx->buf) {
 		fprintf(stderr, "Couldn't allocate work buf.\n");
 		return NULL;
@@ -546,7 +545,7 @@
 			break;
 
 		case 'd':
-			ib_devname = strdupa(optarg);
+			ib_devname = strdup(optarg);
 			break;
 
 		case 'i':
@@ -596,7 +595,7 @@
 	}
 
 	if (optind == argc - 1)
-		servername = strdupa(argv[optind]);
+		servername = strdup(argv[optind]);
 	else if (optind < argc) {
 		usage(argv[0]);
 		return 1;
Index: contrib/ofed/libibverbs/examples/ud_pingpong.c
===================================================================
--- contrib/ofed/libibverbs/examples/ud_pingpong.c	(.../base)	(revision 219811)
+++ contrib/ofed/libibverbs/examples/ud_pingpong.c	(.../head)	(revision 219811)
@@ -42,7 +42,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -128,7 +127,7 @@
 {
	struct addrinfo *res, *t;
 	struct addrinfo hints = {
-		.ai_family   = AF_UNSPEC,
+		.ai_family   = AF_INET,
 		.ai_socktype = SOCK_STREAM
 	};
 	char *service;
@@ -202,7 +201,7 @@
 	struct addrinfo *res, *t;
 	struct addrinfo hints = {
 		.ai_flags    = AI_PASSIVE,
-		.ai_family   = AF_UNSPEC,
+		.ai_family   = AF_INET,
 		.ai_socktype = SOCK_STREAM
 	};
 	char *service;
@@ -303,7 +302,7 @@
 	ctx->size     = size;
 	ctx->rx_depth = rx_depth;
 
-	ctx->buf = memalign(page_size, size + 40);
+	ctx->buf = malloc(roundup(size + 40, page_size));
 	if (!ctx->buf) {
 		fprintf(stderr, "Couldn't allocate work buf.\n");
 		return NULL;
@@ -552,7 +551,7 @@
 			break;
 
 		case 'd':
-			ib_devname = strdupa(optarg);
+			ib_devname = strdup(optarg);
 			break;
 
 		case 'i':
@@ -594,7 +593,7 @@
 	}
 
 	if (optind == argc - 1)
-		servername = strdupa(argv[optind]);
+		servername = strdup(argv[optind]);
 	else if (optind < argc) {
 		usage(argv[0]);
 		return 1;
Index: contrib/ofed/libibverbs/examples/Makefile
===================================================================
--- contrib/ofed/libibverbs/examples/Makefile	(.../base)	(revision 0)
+++ contrib/ofed/libibverbs/examples/Makefile	(.../head)	(revision 219811)
@@ -0,0 +1,28 @@
+CFLAGS= -I../../../../sys/ofed/include -libverbs -lmlx4 -lmthca -pthread
+
+all: asyncwatch devinfo device_list rc_pingpong srq_pingpong uc_pingpong ud_pingpong
+
+clean:
+	rm asyncwatch devinfo device_list rc_pingpong srq_pingpong uc_pingpong ud_pingpong
+
+asyncwatch:
+	gcc -o asyncwatch asyncwatch.c ${CFLAGS}
+
+devinfo:
+	gcc -o devinfo devinfo.c ${CFLAGS}
+
+device_list:
+	gcc -o device_list device_list.c ${CFLAGS}
+
+rc_pingpong:
+	gcc -o rc_pingpong rc_pingpong.c pingpong.c ${CFLAGS}
+
+srq_pingpong:
+	gcc -o srq_pingpong srq_pingpong.c pingpong.c ${CFLAGS}
+
+uc_pingpong:
+	gcc -o uc_pingpong uc_pingpong.c pingpong.c ${CFLAGS}
+
+ud_pingpong:
+	gcc -o ud_pingpong ud_pingpong.c pingpong.c ${CFLAGS}
+
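The pingpong examples above drop the nonportable memalign()/strdupa() calls; work buffers are now sized up to a whole number of pages with roundup() and allocated with plain malloc(). A minimal sketch of that pattern (roundup() comes from <sys/param.h>; note malloc() does not guarantee page alignment, which these examples do not require):

#include <sys/param.h>	/* roundup() */
#include <stdlib.h>
#include <unistd.h>

static void *
alloc_work_buf(size_t size)
{
	long page_size = sysconf(_SC_PAGESIZE);

	if (page_size <= 0)
		return (NULL);
	/* Round the request up to a multiple of the page size. */
	return (malloc(roundup(size, (size_t)page_size)));
}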
@@ -46,8 +46,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -317,9 +317,9 @@ if (!channel) return NULL; - channel->fd = open("/dev/infiniband/rdma_cm", O_RDWR); + channel->fd = open("/dev/rdma_cm", O_RDWR); if (channel->fd < 0) { - printf("CMA: unable to open /dev/infiniband/rdma_cm\n"); + printf("CMA: unable to open /dev/rdma_cm\n"); goto err; } return channel; @@ -432,7 +432,7 @@ ret = write(fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); @@ -487,7 +487,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); @@ -541,7 +541,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; return ucma_query_route(id); } @@ -568,7 +568,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; memcpy(&id->route.addr.dst_addr, dst_addr, daddrlen); return 0; @@ -588,7 +588,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -614,7 +614,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); @@ -889,7 +889,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -908,7 +908,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; return ucma_query_route(id); } @@ -946,7 +946,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) { ucma_modify_qp_err(id); - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; } return 0; @@ -972,7 +972,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -991,7 +991,7 @@ cmd->event = event; ret = write(id->channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -1022,7 +1022,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -1067,7 +1067,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) { - ret = (ret >= 0) ? ERR(ENODATA) : -1; + ret = (ret >= 0) ? ERR(ECONNREFUSED) : -1; goto err2; } @@ -1120,7 +1120,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) { - ret = (ret >= 0) ? ERR(ENODATA) : -1; + ret = (ret >= 0) ? ERR(ECONNREFUSED) : -1; goto free; } @@ -1221,7 +1221,7 @@ ret = write(id_priv->id.channel->fd, msg, size); if (ret != size) { - ret = (ret >= 0) ? ERR(ENODATA) : -1; + ret = (ret >= 0) ? ERR(ECONNREFUSED) : -1; goto err; } @@ -1318,7 +1318,7 @@ ret = write(channel->fd, msg, size); if (ret != size) { free(evt); - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? 
ERR(ECONNREFUSED) : -1; } VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); @@ -1477,7 +1477,7 @@ ret = write(id->channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; return 0; } @@ -1497,7 +1497,7 @@ ret = write(channel->fd, msg, size); if (ret != size) - return (ret >= 0) ? ERR(ENODATA) : -1; + return (ret >= 0) ? ERR(ECONNREFUSED) : -1; VALGRIND_MAKE_MEM_DEFINED(resp, sizeof *resp); Index: contrib/ofed/librdmacm/examples/rping.c =================================================================== --- contrib/ofed/librdmacm/examples/rping.c (.../base) (revision 219811) +++ contrib/ofed/librdmacm/examples/rping.c (.../head) (revision 219811) @@ -1088,9 +1088,9 @@ static void usage(char *name) { printf("%s -s [-vVd] [-S size] [-C count] [-a addr] [-p port]\n", - basename(name)); + name); printf("%s -c [-vVd] [-S size] [-C count] -a addr [-p port]\n", - basename(name)); + name); printf("\t-c\t\tclient side\n"); printf("\t-s\t\tserver side. To bind to any address with IPv6 use -a ::0\n"); printf("\t-v\t\tdisplay ping data to stdout\n"); Index: contrib/ofed/libmlx4/src/doorbell.h =================================================================== --- contrib/ofed/libmlx4/src/doorbell.h (.../base) (revision 219811) +++ contrib/ofed/libmlx4/src/doorbell.h (.../head) (revision 219811) @@ -33,8 +33,7 @@ #ifndef DOORBELL_H #define DOORBELL_H -#if SIZEOF_LONG == 8 - +#ifdef __LP64__ #if __BYTE_ORDER == __LITTLE_ENDIAN # define MLX4_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0]) #elif __BYTE_ORDER == __BIG_ENDIAN Index: contrib/ofed/libmlx4/src/buf.c =================================================================== --- contrib/ofed/libmlx4/src/buf.c (.../base) (revision 219811) +++ contrib/ofed/libmlx4/src/buf.c (.../head) (revision 219811) @@ -65,7 +65,7 @@ buf->length = align(size, page_size); buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + MAP_PRIVATE | MAP_ANON, -1, 0); if (buf->buf == MAP_FAILED) return errno; Index: contrib/ofed/management/infiniband-diags/include/ibdiag_version.h =================================================================== --- contrib/ofed/management/infiniband-diags/include/ibdiag_version.h (.../base) (revision 0) +++ contrib/ofed/management/infiniband-diags/include/ibdiag_version.h (.../head) (revision 219811) @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2008 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _IBDIAG_VERSION_H_ +#define _IBDIAG_VERSION_H_ + +#define IBDIAG_VERSION "1.4.4" + +#endif /* _IBDIAG_VERSION_H_ */ Index: contrib/ofed/management/infiniband-diags/src/ibaddr.c =================================================================== --- contrib/ofed/management/infiniband-diags/src/ibaddr.c (.../base) (revision 219811) +++ contrib/ofed/management/infiniband-diags/src/ibaddr.c (.../head) (revision 219811) @@ -46,6 +46,8 @@ #include #include +#include + #include "ibdiag_common.h" char *argv0 = "ibaddr"; Index: contrib/ofed/management/infiniband-diags/src/smpdump.c =================================================================== --- contrib/ofed/management/infiniband-diags/src/smpdump.c (.../base) (revision 219811) +++ contrib/ofed/management/infiniband-diags/src/smpdump.c (.../head) (revision 219811) @@ -295,7 +295,7 @@ usage(); if (mgmt_class == CLASS_SUBN_DIRECTED_ROUTE && - str2DRPath(strdupa(argv[0]), &path) < 0) + str2DRPath(strdup(argv[0]), &path) < 0) IBPANIC("bad path str '%s'", argv[0]); if (mgmt_class == CLASS_SUBN_LID_ROUTE) Index: contrib/ofed/management/infiniband-diags/src/ibdiag_common.c =================================================================== --- contrib/ofed/management/infiniband-diags/src/ibdiag_common.c (.../base) (revision 219811) +++ contrib/ofed/management/infiniband-diags/src/ibdiag_common.c (.../head) (revision 219811) @@ -45,7 +45,9 @@ #include #include #include +#ifdef HAVE_CONFIG_H #include +#endif #include "ibdiag_common.h" Index: contrib/ofed/management/infiniband-diags/src/saquery.c =================================================================== --- contrib/ofed/management/infiniband-diags/src/saquery.c (.../base) (revision 219811) +++ contrib/ofed/management/infiniband-diags/src/saquery.c (.../head) (revision 219811) @@ -57,6 +57,8 @@ #include #include +#include + #include "ibdiag_common.h" struct query_cmd { Index: contrib/ofed/management/libibumad/include/infiniband/umad.h =================================================================== --- contrib/ofed/management/libibumad/include/infiniband/umad.h (.../base) (revision 219811) +++ contrib/ofed/management/libibumad/include/infiniband/umad.h (.../head) (revision 219811) @@ -81,9 +81,8 @@ #define IB_IOCTL_MAGIC 0x1b -#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \ - struct ib_user_mad_reg_req) -#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, uint32_t) +#define IB_USER_MAD_REGISTER_AGENT _IO(IB_IOCTL_MAGIC, 1) +#define IB_USER_MAD_UNREGISTER_AGENT _IO(IB_IOCTL_MAGIC, 2) #define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3) #define UMAD_CA_NAME_LEN 20 @@ -98,7 +97,7 @@ #define UMAD_MAX_PORTS 64 -#define UMAD_DEV_DIR "/dev/infiniband" +#define UMAD_DEV_DIR "/dev" #define SYS_CA_PORTS_DIR "ports" Index: contrib/ofed/management/libibumad/src/umad.c =================================================================== --- contrib/ofed/management/libibumad/src/umad.c (.../base) (revision 219811) +++ contrib/ofed/management/libibumad/src/umad.c (.../head) (revision 219811) @@ -163,7 +163,7 @@ memcpy(&port->port_guid, gid + 8, sizeof port->port_guid); snprintf(port_dir + len, sizeof(port_dir) - len, "/pkeys"); - ret = scandir(port_dir, &namelist, check_for_digit_name, 
NULL); + ret = sys_scandir(port_dir, &namelist, check_for_digit_name, NULL); if (ret <= 0) { IBWARN("no pkeys found for %s:%u (at dir %s)...", port->ca_name, port->portnum, port_dir); @@ -346,7 +346,9 @@ static int get_ca(char *ca_name, umad_ca_t *ca) { +#ifdef __linux__ DIR *dir; +#endif char dir_name[256]; struct dirent **namelist; int r, i, ret; @@ -376,10 +378,12 @@ snprintf(dir_name, sizeof(dir_name), "%s/%s/%s", SYS_INFINIBAND, ca->ca_name, SYS_CA_PORTS_DIR); +#ifdef __linux__ if (!(dir = opendir(dir_name))) return -ENOENT; +#endif - if ((r = scandir(dir_name, &namelist, 0, alphasort)) < 0) { + if ((r = sys_scandir(dir_name, &namelist, 0, alphasort)) < 0) { ret = errno < 0 ? errno : -EIO; goto error; } @@ -416,7 +420,9 @@ free(namelist[i]); free(namelist); +#ifdef __linux__ closedir(dir); +#endif put_ca(ca); return 0; @@ -425,7 +431,9 @@ free(namelist[i]); free(namelist); error: +#ifdef __linux__ closedir(dir); +#endif release_ca(ca); return ret; @@ -437,7 +445,7 @@ char path[256]; int r; - snprintf(path, sizeof(path), SYS_INFINIBAND_MAD "/umad%d/", umad_id); + snprintf(path, sizeof(path), SYS_INFINIBAND_MAD "/umad%d", umad_id); if ((r = sys_read_string(path, SYS_IB_MAD_DEV, dev, UMAD_CA_NAME_LEN)) < 0) return r; @@ -520,7 +528,7 @@ TRACE("max %d", max); - n = scandir(SYS_INFINIBAND, &namelist, 0, alphasort); + n = sys_scandir(SYS_INFINIBAND, &namelist, NULL, alphasort); if (n > 0) { for (i = 0; i < n; i++) { if (strcmp(namelist[i]->d_name, ".") && Index: contrib/ofed/management/libibcommon/include/infiniband/common.h =================================================================== --- contrib/ofed/management/libibcommon/include/infiniband/common.h (.../base) (revision 219811) +++ contrib/ofed/management/libibcommon/include/infiniband/common.h (.../head) (revision 219811) @@ -36,8 +36,9 @@ #include #include #include +#include #include -#include +#include #ifdef __cplusplus # define BEGIN_C_DECLS extern "C" { @@ -79,10 +80,12 @@ /* Misc. macros: */ /** align value \a l to \a size (ceil) */ +#ifndef ALIGN #define ALIGN(l, size) (((l) + ((size) - 1)) / (size) * (size)) /** align value \a l to \a sizeof 32 bit int (ceil) */ #define ALIGN32(l) (ALIGN((l), sizeof(uint32))) +#endif /** printf style debugging MACRO, conmmon header includes name of function */ #define IBWARN(fmt, args...) 
ibwarn(__FUNCTION__, fmt, ## args) @@ -127,6 +130,9 @@ int sys_read_gid(char *dir_name, char *file_name, uint8_t *gid); int sys_read_uint64(char *dir_name, char *file_name, uint64_t *u); int sys_read_uint(char *dir_name, char *file_name, unsigned *u); +int sys_scandir(const char *dirname, struct dirent ***namelist, + int (*select)(const struct dirent *), + int (*compar)(const struct dirent **, const struct dirent **)); /* stack.c */ void stack_dump(void); Index: contrib/ofed/management/libibcommon/src/libibcommon.map =================================================================== --- contrib/ofed/management/libibcommon/src/libibcommon.map (.../base) (revision 219811) +++ contrib/ofed/management/libibcommon/src/libibcommon.map (.../head) (revision 219811) @@ -7,6 +7,7 @@ sys_read_string; sys_read_uint; sys_read_uint64; + sys_scandir; getcurrenttime; fhash; logmsg; Index: contrib/ofed/management/libibcommon/src/sysfs.c =================================================================== --- contrib/ofed/management/libibcommon/src/sysfs.c (.../base) (revision 219811) +++ contrib/ofed/management/libibcommon/src/sysfs.c (.../head) (revision 219811) @@ -57,6 +57,9 @@ #include #include +#include +#include + #include "common.h" static int @@ -73,26 +76,23 @@ sys_read_string(char *dir_name, char *file_name, char *str, int max_len) { char path[256], *s; - int fd, r; + size_t len; snprintf(path, sizeof(path), "%s/%s", dir_name, file_name); - if ((fd = open(path, O_RDONLY)) < 0) - return ret_code(); + for (s = &path[0]; *s != '\0'; s++) + if (*s == '/') + *s = '.'; - if ((r = read(fd, str, max_len)) < 0) { - int e = errno; - close(fd); - errno = e; + len = max_len; + if (sysctlbyname(&path[1], str, &len, NULL, 0) == -1) return ret_code(); - } - str[(r < max_len) ? r : max_len - 1] = 0; + str[(len < max_len) ? len : max_len - 1] = 0; if ((s = strrchr(str, '\n'))) *s = 0; - close(fd); return 0; } @@ -130,7 +130,7 @@ return r; for (s = buf, i = 0 ; i < 8; i++) { - if (!(str = strsep(&s, ": \t\n"))) + if (!(str = strsep(&s, ": \t\n"))) return -EINVAL; ugid[i] = htons(strtoul(str, 0, 16) & 0xffff); } @@ -165,3 +165,130 @@ return 0; } + +#define DIRECTSIZ(namlen) \ + (((uintptr_t)&((struct dirent *)0)->d_name + \ + ((namlen)+1)*sizeof(((struct dirent *)0)->d_name[0]) + 3) & ~3) + +int +sys_scandir(const char *dirname, struct dirent ***namelist, + int (*select)(const struct dirent *), + int (*compar)(const struct dirent **, const struct dirent **)) +{ + struct dirent **names; + struct dirent **names2; + struct dirent *dp; + char name[1024]; + int lsname[22]; + int chname[22]; + int name2[22]; + int oid[22]; + char *s; + size_t n1, n2; + size_t len, oidlen, namlen; + int cnt, max; + int err; + int i; + + *namelist = NULL; + /* Skip the leading / */ + strncpy(name, &dirname[1], sizeof(name)); + for (s = &name[0]; *s != '\0'; s++) + if (*s == '/') + *s = '.'; + /* + * Resolve the path. + */ + len = sizeof(oid) / sizeof(int); + namlen = strlen(name) + 1; + if (sysctlnametomib(name, oid, &len) != 0) + return (-errno); + lsname[0] = 0; /* Root */ + lsname[1] = 2; /* Get next */ + memcpy(lsname+2, oid, len * sizeof(int)); + n1 = 2 + len; + oidlen = len; + /* + * Setup the return list of dirents. 
+ */ + cnt = 0; + max = 64; + names = malloc(max * sizeof(void *)); + if (names == NULL) + return (-ENOMEM); + + for (;;) { + n2 = sizeof(name2); + if (sysctl(lsname, n1, name2, &n2, 0, 0) < 0) { + if (errno == ENOENT) + break; + goto errout; + } + n2 /= sizeof(int); + if (n2 < oidlen) + break; + for (i = 0; i < oidlen; i++) + if (name2[i] != oid[i]) + goto out; + chname[0] = 0; /* root */ + chname[1] = 1; /* oid name */ + memcpy(chname + 2, name2, n2 * sizeof(int)); + memcpy(lsname + 2, name2, n2 * sizeof(int)); + n1 = 2 + n2; + /* + * scandir() is not supposed to go deeper than the requested + * directory but sysctl also doesn't return a node for + * 'subdirectories' so we have to find a file in the subdir + * and then truncate the name to report it. + */ + if (n2 > oidlen + 1) { + /* Skip to the next name after this one. */ + n1 = 2 + oidlen + 1; + lsname[n1 - 1]++; + } + len = sizeof(name); + if (sysctl(chname, n2 + 2, name, &len, 0, 0) < 0) + goto errout; + if (len <= 0 || len < namlen) + goto out; + s = name + namlen; + /* Just keep the first level name. */ + if (strchr(s, '.')) + *strchr(s, '.') = '\0'; + len = strlen(s) + 1; + dp = malloc(DIRECTSIZ(len)); + dp->d_reclen = DIRECTSIZ(len); + dp->d_namlen = len; + memcpy(&dp->d_name, s, len); + if (select && !select(dp)) { + free(dp); + continue; + } + if (cnt == max) { + max *= 2; + names2 = realloc(names, max * sizeof(void *)); + if (names2 == NULL) { + errno = ENOMEM; + free(dp); + goto errout; + } + names = names2; + } + names[cnt++] = dp; + } +out: + if (cnt && compar) + qsort(names, cnt, sizeof(struct dirent *), + (int (*)(const void *, const void *))compar); + + *namelist = names; + + return (cnt); + +errout: + err = errno; + for (i = 0; i < cnt; i++) + free(names[i]); + free(names); + return (-err); +} Index: contrib/ofed/management/opensm/include/opensm/osm_version.h =================================================================== --- contrib/ofed/management/opensm/include/opensm/osm_version.h (.../base) (revision 0) +++ contrib/ofed/management/opensm/include/opensm/osm_version.h (.../head) (revision 219811) @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved. + * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _OSM_VERSION_H_ +#define _OSM_VERSION_H_ + +/****s* OpenSM: Base/OSM_VERSION +* NAME +* OSM_VERSION +* +* DESCRIPTION +* The version string for OpenSM +* +* SYNOPSIS +*/ +#define OSM_VERSION "OpenSM 3.3.1" +/********/ + +#endif /* _OSM_VERSION_H_ */ Index: contrib/ofed/management/opensm/include/opensm/osm_config.h =================================================================== --- contrib/ofed/management/opensm/include/opensm/osm_config.h (.../base) (revision 0) +++ contrib/ofed/management/opensm/include/opensm/osm_config.h (.../head) (revision 219811) @@ -0,0 +1,65 @@ +/* include/opensm/osm_config.h. Generated from osm_config.h.in by configure. */ +/* include/osm_config.h.in + * + * Defines various OpenSM configuration parameters to be used by various + * plugins and third party tools. + * + * NOTE: Defines used in header files MUST be included here to ensure plugin + * compatibility. + */ + +#ifndef _OSM_CONFIG_H_ +#define _OSM_CONFIG_H_ 1 + +/* define 1 if OpenSM build is in a debug mode */ +/* #undef OSM_DEBUG */ + +/* Define as 1 if you want Dual Sided RMPP Support */ +#define DUAL_SIDED_RMPP 1 + +/* Define as 1 if you want to enable a console on a socket connection */ +/* #undef ENABLE_OSM_CONSOLE_SOCKET */ + +/* Define as 1 if you want to enable the event plugin */ +/* #undef ENABLE_OSM_DEFAULT_EVENT_PLUGIN */ + +/* Define as 1 if you want to enable the performance manager */ +/* #undef ENABLE_OSM_PERF_MGR */ + +/* Define as 1 if you want to enable the performance manager profiling code */ +/* #undef ENABLE_OSM_PERF_MGR_PROFILE */ + +/* Define a default node name map file */ +#define HAVE_DEFAULT_NODENAME_MAP "/usr/local/etc/opensm/ib-node-name-map" + +/* Define a default OpenSM config file */ +#define HAVE_DEFAULT_OPENSM_CONFIG_FILE "/usr/local/etc/opensm/opensm.conf" + +/* Define a Partition config file */ +#define HAVE_DEFAULT_PARTITION_CONFIG_FILE "/usr/local/etc/opensm/partitions.conf" + +/* Define a Prefix Routes config file */ +#define HAVE_DEFAULT_PREFIX_ROUTES_FILE "/usr/local/etc/opensm/prefix-routes.conf" + +/* Define a QOS policy config file */ +#define HAVE_DEFAULT_QOS_POLICY_FILE "/usr/local/etc/opensm/qos-policy.conf" + +/* Define OpenSM config directory */ +#define OPENSM_CONFIG_DIR "/usr/local/etc/opensm" + +/* Define as 1 for vapi vendor */ +/* #undef OSM_VENDOR_INTF_MTL */ + +/* Define as 1 for OpenIB vendor */ +#define OSM_VENDOR_INTF_OPENIB 1 + +/* Define as 1 for sim vendor */ +/* #undef OSM_VENDOR_INTF_SIM */ + +/* Define as 1 for ts vendor */ +/* #undef OSM_VENDOR_INTF_TS */ + +/* Define as 1 if you want Vendor RMPP Support */ +#define VENDOR_RMPP_SUPPORT 1 + +#endif /* _OSM_CONFIG_H_ */ Index: contrib/ofed/management/opensm/include/complib/cl_debug_osd.h =================================================================== --- contrib/ofed/management/opensm/include/complib/cl_debug_osd.h (.../base) (revision 219811) +++ contrib/ofed/management/opensm/include/complib/cl_debug_osd.h (.../head) (revision 219811) @@ -42,7 +42,6 @@ #define _CL_DEBUG_OSD_H_ #include -#include #ifdef __cplusplus # define BEGIN_C_DECLS extern "C" { Index: contrib/ofed/management/opensm/include/complib/cl_byteswap.h =================================================================== --- 
contrib/ofed/management/opensm/include/complib/cl_byteswap.h (.../base) (revision 219811) +++ contrib/ofed/management/opensm/include/complib/cl_byteswap.h (.../head) (revision 219811) @@ -86,6 +86,7 @@ * ntoh32, hton32 * ntoh64, hton64 */ + #ifndef __BYTE_ORDER #error "__BYTE_ORDER macro undefined. Missing in endian.h?" #endif Index: contrib/ofed/management/opensm/include/complib/cl_types.h =================================================================== --- contrib/ofed/management/opensm/include/complib/cl_types.h (.../base) (revision 219811) +++ contrib/ofed/management/opensm/include/complib/cl_types.h (.../head) (revision 219811) @@ -60,6 +60,14 @@ typedef uint32_t net32_t; typedef uint64_t net64_t; +#ifndef __WORDSIZE +#ifdef __LP64__ +#define __WORDSIZE 64 +#else +#define __WORDSIZE 32 +#endif +#endif + /* explicit cast of void* to uint32_t */ #ifndef ASSERT_VOIDP2UINTN #if __WORDSIZE == 64 Index: contrib/ofed/management/opensm/complib/cl_nodenamemap.c =================================================================== --- contrib/ofed/management/opensm/complib/cl_nodenamemap.c (.../base) (revision 219811) +++ contrib/ofed/management/opensm/complib/cl_nodenamemap.c (.../head) (revision 219811) @@ -42,7 +42,9 @@ #include #include #include +#ifdef HAVE_CONFIG_H #include +#endif #include Index: contrib/ofed/management/opensm/complib/cl_thread.c =================================================================== --- contrib/ofed/management/opensm/complib/cl_thread.c (.../base) (revision 219811) +++ contrib/ofed/management/opensm/complib/cl_thread.c (.../head) (revision 219811) @@ -39,7 +39,6 @@ #include #include -#include #include /* @@ -122,7 +121,7 @@ { uint32_t ret; - ret = get_nprocs(); + ret = sysconf(_SC_NPROCESSORS_ONLN); if (!ret) return 1; /* Workaround for PPC where get_nprocs() returns 0 */ Index: contrib/ofed/management/opensm/opensm/osm_prtn_config.c =================================================================== --- contrib/ofed/management/opensm/opensm/osm_prtn_config.c (.../base) (revision 219811) +++ contrib/ofed/management/opensm/opensm/osm_prtn_config.c (.../head) (revision 219811) @@ -52,6 +52,8 @@ #include #include +#include + struct part_conf { osm_log_t *p_log; osm_subn_t *p_subn; Index: contrib/ofed/management/opensm/opensm/osm_helper.c =================================================================== --- contrib/ofed/management/opensm/opensm/osm_helper.c (.../base) (revision 219811) +++ contrib/ofed/management/opensm/opensm/osm_helper.c (.../head) (revision 219811) @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include Index: contrib/ofed/management/opensm/opensm/osm_inform.c =================================================================== --- contrib/ofed/management/opensm/opensm/osm_inform.c (.../base) (revision 219811) +++ contrib/ofed/management/opensm/opensm/osm_inform.c (.../head) (revision 219811) @@ -51,6 +51,7 @@ #include #include #include +#include typedef struct osm_infr_match_ctxt { cl_list_t *p_remove_infr_list; Index: contrib/ofed/management/opensm/opensm/osm_sa_informinfo.c =================================================================== --- contrib/ofed/management/opensm/opensm/osm_sa_informinfo.c (.../base) (revision 219811) +++ contrib/ofed/management/opensm/opensm/osm_sa_informinfo.c (.../head) (revision 219811) @@ -60,6 +60,8 @@ #include #include +#include + typedef struct osm_iir_item { cl_list_item_t list_item; ib_inform_info_record_t rec; Index: 
contrib/ofed/management/opensm/opensm/osm_sa_mcmember_record.c =================================================================== --- contrib/ofed/management/opensm/opensm/osm_sa_mcmember_record.c (.../base) (revision 219811) +++ contrib/ofed/management/opensm/opensm/osm_sa_mcmember_record.c (.../head) (revision 219811) @@ -64,6 +64,8 @@ #include #include +#include + #define JOIN_MC_COMP_MASK (IB_MCR_COMPMASK_MGID | \ IB_MCR_COMPMASK_PORT_GID | \ IB_MCR_COMPMASK_JOIN_STATE) Index: contrib/ofed/management/opensm/opensm/osm_sa_path_record.c =================================================================== --- contrib/ofed/management/opensm/opensm/osm_sa_path_record.c (.../base) (revision 219811) +++ contrib/ofed/management/opensm/opensm/osm_sa_path_record.c (.../head) (revision 219811) @@ -67,6 +67,8 @@ #include #include +#include + extern uint8_t osm_get_lash_sl(osm_opensm_t * p_osm, const osm_port_t * p_src_port, const osm_port_t * p_dst_port); Index: contrib/ofed/management/opensm/man/opensm.8 =================================================================== --- contrib/ofed/management/opensm/man/opensm.8 (.../base) (revision 0) +++ contrib/ofed/management/opensm/man/opensm.8 (.../head) (revision 219811) @@ -0,0 +1,1012 @@ +.TH OPENSM 8 "June 13, 2008" "OpenIB" "OpenIB Management" + +.SH NAME +opensm \- InfiniBand subnet manager and administration (SM/SA) + +.SH SYNOPSIS +.B opensm +[\-\-version] +[\-F | \-\-config ] +[\-c(reate-config) ] +[\-g(uid) ] +[\-l(mc) ] +[\-p(riority) ] +[\-smkey ] +[\-r(eassign_lids)] +[\-R | \-\-routing_engine ] +[\-A | \-\-ucast_cache] +[\-z | \-\-connect_roots] +[\-M | \-\-lid_matrix_file ] +[\-U | \-\-lfts_file ] +[\-S | \-\-sadb_file ] +[\-a | \-\-root_guid_file ] +[\-u | \-\-cn_guid_file ] +[\-X | \-\-guid_routing_order_file ] +[\-m | \-\-ids_guid_file ] +[\-o(nce)] +[\-s(weep) ] +[\-t(imeout) ] +[\-maxsmps ] +[\-console [off | local | socket | loopback]] +[\-console-port ] +[\-i(gnore-guids) ] +[\-f | \-\-log_file ] +[\-L | \-\-log_limit ] [\-e(rase_log_file)] +[\-P(config) ] +[\-N | \-\-no_part_enforce] +[\-Q | \-\-qos [\-Y | \-\-qos_policy_file ]] +[\-y | \-\-stay_on_fatal] +[\-B | \-\-daemon] +[\-I | \-\-inactive] +[\-\-perfmgr] +[\-\-perfmgr_sweep_time_s ] +[\-\-prefix_routes_file ] +[\-\-consolidate_ipv6_snm_req] +[\-v(erbose)] [\-V] [\-D ] [\-d(ebug) ] +[\-h(elp)] [\-?] + +.SH DESCRIPTION +.PP +opensm is an InfiniBand compliant Subnet Manager and Administration, +and runs on top of OpenIB. + +opensm provides an implementation of an InfiniBand Subnet Manager and +Administration. Such a software entity is required to run in order +to initialize the InfiniBand hardware (at least one per +InfiniBand subnet). + +opensm now also contains an experimental version of a performance +manager. + +opensm defaults were designed to meet the common case usage on clusters with up to a few hundred nodes. Thus, in this default mode, opensm will scan the IB +fabric, initialize it, and sweep occasionally for changes. + +opensm attaches to a specific IB port on the local machine and configures only +the fabric connected to it. (If the local machine has other IB ports, +opensm will ignore the fabrics connected to those other ports). If no port is +specified, it will select the first "best" available port. + +opensm can present the available ports and prompt for a port number to +attach to. + +By default, the run is logged to two files: /var/log/messages and /var/log/opensm.log. +The first file will register only general major events, whereas the second +will include details of reported errors. All errors reported in this second +file should be treated as indicators of IB fabric health issues. +(Note that when a fatal and non-recoverable error occurs, opensm will exit.) +Both log files should include the message "SUBNET UP" if opensm was able to +set up the subnet correctly.
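+.PP
+For example (the port GUID below is illustrative only), to run opensm in the
+background as a daemon, bound to a specific local port and using the fat-tree
+routing engine:
+
+ opensm -B -g 0x0008f10400411a08 -R ftree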
+ +.SH OPTIONS + +.PP +.TP +\fB\-\-version\fR +Prints OpenSM version and exits. +.TP +\fB\-F\fR, \fB\-\-config\fR +The name of the OpenSM config file. When not specified +\fB\% @OPENSM_CONFIG_DIR@/@OPENSM_CONFIG_FILE@\fP will be used (if it exists). +.TP +\fB\-c\fR, \fB\-\-create-config\fR +OpenSM will dump its configuration to the specified file and exit. +This is a way to generate an OpenSM configuration file template. +.TP +\fB\-g\fR, \fB\-\-guid\fR +This option specifies the local port GUID value +with which OpenSM should bind. OpenSM may be +bound to one port at a time. +If the given GUID is 0, OpenSM displays a list +of possible port GUIDs and waits for user input. +Without -g, OpenSM tries to use the default port. +.TP +\fB\-l\fR, \fB\-\-lmc\fR +This option specifies the subnet's LMC value. +The number of LIDs assigned to each port is 2^LMC. +The LMC value must be in the range 0-7. +LMC values > 0 allow multiple paths between ports. +LMC values > 0 should only be used if the subnet +topology actually provides multiple paths between +ports, i.e. multiple interconnects between switches. +Without -l, OpenSM defaults to LMC = 0, which allows +one path between any two ports. +.TP +\fB\-p\fR, \fB\-\-priority\fR +This option specifies the SM\'s PRIORITY. +This will affect the handover cases, where the master +is chosen by priority and GUID. Range goes from 0 +(default and lowest priority) to 15 (highest). +.TP +\fB\-smkey\fR +This option specifies the SM\'s SM_Key (64 bits). +This will affect SM authentication. +Note that OpenSM version 3.2.1 and below used the default value '1' +in host byte order; this is fixed now, but you may need this option to +interoperate with an old OpenSM running on a little endian machine. +.TP +\fB\-r\fR, \fB\-\-reassign_lids\fR +This option causes OpenSM to reassign LIDs to all +end nodes. Specifying -r on a running subnet +may disrupt subnet traffic. +Without -r, OpenSM attempts to preserve existing +LID assignments, resolving multiple uses of the same LID. +.TP +\fB\-R\fR, \fB\-\-routing_engine\fR +This option chooses the routing engine(s) to use instead of the default +Min Hop algorithm. Multiple routing engines can be specified +separated by commas so that specific ordering of routing algorithms +will be tried if earlier routing engines fail. +Supported engines: minhop, updn, file, ftree, lash, dor +.TP +\fB\-A\fR, \fB\-\-ucast_cache\fR +This option enables the unicast routing cache and prevents routing +recalculation (which is a heavy task in a large cluster) when +there was no topology change detected during the heavy sweep, or +when the topology change does not require new routing calculation, +e.g. when one or more CAs/RTRs/leaf switches go down, or one or +more of these nodes come back after being down. +A very common case that is handled by the unicast routing cache +is host reboot, which otherwise would cause two full routing +recalculations: one when the host goes down, and the other when +the host comes back online.
+.TP +\fB\-z\fR, \fB\-\-connect_roots\fR +This option enforces a routing engine (currently up/down +only) to make connectivity between root switches, and in +this way to be fully IBA compliant. In many cases this can +violate a "pure" deadlock-free algorithm, so use it carefully. +.TP +\fB\-M\fR, \fB\-\-lid_matrix_file\fR +This option specifies the name of the lid matrix dump file +from where switch lid matrices (min hops tables) will be +loaded. +.TP +\fB\-U\fR, \fB\-\-lfts_file\fR +This option specifies the name of the LFTs file +from where switch forwarding tables will be loaded. +.TP +\fB\-S\fR, \fB\-\-sadb_file\fR +This option specifies the name of the SA DB dump file +from where the SA database will be loaded. +.TP +\fB\-a\fR, \fB\-\-root_guid_file\fR +Set the root nodes for the Up/Down or Fat-Tree routing +algorithm to the guids provided in the given file (one to a line). +.TP +\fB\-u\fR, \fB\-\-cn_guid_file\fR +Set the compute nodes for the Fat-Tree routing algorithm +to the guids provided in the given file (one to a line). +.TP +\fB\-m\fR, \fB\-\-ids_guid_file\fR +Name of the map file with a set of the IDs which will be used +by the Up/Down routing algorithm instead of node GUIDs +(format: <guid> <id> per line). +.TP +\fB\-X\fR, \fB\-\-guid_routing_order_file\fR +Set the order in which port guids will be routed for the MinHop +and Up/Down routing algorithms to the guids provided in the +given file (one to a line). +.TP +\fB\-o\fR, \fB\-\-once\fR +This option causes OpenSM to configure the subnet +once, then exit. Ports remain in the ACTIVE state. +.TP +\fB\-s\fR, \fB\-\-sweep\fR +This option specifies the number of seconds between +subnet sweeps. Specifying -s 0 disables sweeping. +Without -s, OpenSM defaults to a sweep interval of +10 seconds. +.TP +\fB\-t\fR, \fB\-\-timeout\fR +This option specifies the time in milliseconds +used for transaction timeouts. +Specifying -t 0 disables timeouts. +Without -t, OpenSM defaults to a timeout value of +200 milliseconds. +.TP +\fB\-maxsmps\fR +This option specifies the number of VL15 SMP MADs +allowed on the wire at any one time. +Specifying -maxsmps 0 allows unlimited outstanding +SMPs. +Without -maxsmps, OpenSM defaults to a maximum of +4 outstanding SMPs. +.TP +\fB\-console [off | local | socket | loopback]\fR +This option brings up the OpenSM console (default off). +Note that the socket and loopback options will only be available +if OpenSM was built with --enable-console-socket. +.TP +\fB\-console-port\fR +Specify an alternate telnet port for the socket console (default 10000). +Note that this option only appears if OpenSM was built with +--enable-console-socket. +.TP +\fB\-i\fR, \fB\-ignore-guids\fR +This option provides the means to define a set of ports +(by node guid and port number) that will be ignored by the link load +equalization algorithm. +.TP +\fB\-x\fR, \fB\-\-honor_guid2lid\fR +This option forces OpenSM to honor the guid2lid file +when it comes out of Standby state, if such a file exists +under OSM_CACHE_DIR, and is valid. +By default, this is FALSE. +.TP +\fB\-f\fR, \fB\-\-log_file\fR +This option defines the log to be the given file. +By default, the log goes to /var/log/opensm.log. +For the log to go to standard output use -f stdout. +.TP +\fB\-L\fR, \fB\-\-log_limit\fR +This option defines the maximal log file size in MB. When +specified, the log file will be truncated upon reaching +this limit. +.TP +\fB\-e\fR, \fB\-\-erase_log_file\fR +This option will cause deletion of the log file +(if it previously exists). By default, the log file +is accumulative.
+.TP +\fB\-P\fR, \fB\-\-Pconfig\fR +This option defines the optional partition configuration file. +The default name is \fB\%@OPENSM_CONFIG_DIR@/@PARTITION_CONFIG_FILE@\fP. +.TP +\fB\-\-prefix_routes_file\fR +Prefix routes control how the SA responds to path record queries for +off-subnet DGIDs. By default, the SA fails such queries. The +.B PREFIX ROUTES +section below describes the format of the configuration file. +The default path is \fB\%@OPENSM_CONFIG_DIR@/prefix\-routes.conf\fP. +.TP +\fB\-Q\fR, \fB\-\-qos\fR +This option enables QoS setup. It is disabled by default. +.TP +\fB\-Y\fR, \fB\-\-qos_policy_file\fR +This option defines the optional QoS policy file. The default +name is \fB\%@OPENSM_CONFIG_DIR@/@QOS_POLICY_FILE@\fP. +.TP +\fB\-N\fR, \fB\-\-no_part_enforce\fR +This option disables partition enforcement on switch external ports. +.TP +\fB\-y\fR, \fB\-\-stay_on_fatal\fR +This option will cause SM not to exit on fatal initialization +issues: if SM discovers duplicated guids or a 12x link with +lane reversal badly configured. +By default, the SM will exit on these errors. +.TP +\fB\-B\fR, \fB\-\-daemon\fR +Run in daemon mode - OpenSM will run in the background. +.TP +\fB\-I\fR, \fB\-\-inactive\fR +Start SM in inactive rather than init SM state. This +option can be used in conjunction with the perfmgr so as to +run a standalone performance manager without SM/SA. However, +this is NOT currently implemented in the performance manager. +.TP +\fB\-perfmgr\fR +Enable the perfmgr. Only takes effect if --enable-perfmgr was specified at +configure time. +.TP +\fB\-perfmgr_sweep_time_s\fR +Specify the sweep time for the performance manager in seconds +(default is 180 seconds). Only takes +effect if --enable-perfmgr was specified at configure time. +.TP +.BI --consolidate_ipv6_snm_req +Consolidate IPv6 Solicited Node Multicast group join requests into one +multicast group per MGID PKey. +.TP +\fB\-v\fR, \fB\-\-verbose\fR +This option increases the log verbosity level. +The -v option may be specified multiple times +to further increase the verbosity level. +See the -D option for more information about +log verbosity. +.TP +\fB\-V\fR +This option sets the maximum verbosity level and +forces log flushing. +The -V option is equivalent to \'-D 0xFF -d 2\'. +See the -D option for more information about +log verbosity. +.TP +\fB\-D\fR +This option sets the log verbosity level. +A flags field must follow the -D option. +A bit set/clear in the flags enables/disables a +specific log level as follows: + + BIT LOG LEVEL ENABLED + ---- ----------------- + 0x01 - ERROR (error messages) + 0x02 - INFO (basic messages, low volume) + 0x04 - VERBOSE (interesting stuff, moderate volume) + 0x08 - DEBUG (diagnostic, high volume) + 0x10 - FUNCS (function entry/exit, very high volume) + 0x20 - FRAMES (dumps all SMP and GMP frames) + 0x40 - ROUTING (dump FDB routing information) + 0x80 - currently unused. + +Without -D, OpenSM defaults to ERROR + INFO (0x3). +Specifying -D 0 disables all messages. +Specifying -D 0xFF enables all messages (see -V). +High verbosity levels may require increasing +the transaction timeout with the -t option. +.TP +\fB\-d\fR, \fB\-\-debug\fR +This option specifies a debug option. +These options are not normally needed. 
+The number following -d selects the debug +option to enable as follows: + + OPT Description + --- ----------------- + -d0 - Ignore other SM nodes + -d1 - Force single threaded dispatching + -d2 - Force log flushing after each log message + -d3 - Disable multicast support +.TP +\fB\-h\fR, \fB\-\-help\fR +Display this usage info then exit. +.TP +\fB\-?\fR +Display this usage info then exit. + +.SH ENVIRONMENT VARIABLES +.PP +The following environment variables control opensm behavior: + +OSM_TMP_DIR - controls the directory in which the temporary files generated by +opensm are created. These files are: opensm-subnet.lst, opensm.fdbs, and +opensm.mcfdbs. By default, this directory is /var/log. + +OSM_CACHE_DIR - opensm stores certain data to the disk such that subsequent +runs are consistent. The default directory used is /var/cache/opensm. +The following file is included in it: + + guid2lid - stores the LID range assigned to each GUID + +.SH NOTES +.PP +When opensm receives a HUP signal, it starts a new heavy sweep as if a trap was received or a topology change was found. +.PP +Also, SIGUSR1 can be used to trigger a reopen of /var/log/opensm.log for +logrotate purposes. + +.SH PARTITION CONFIGURATION +.PP +The default name of the OpenSM partitions configuration file is +\fB\%@OPENSM_CONFIG_DIR@/@PARTITION_CONFIG_FILE@\fP. The default may be changed by using the +--Pconfig (-P) option with OpenSM. + +The default partition will be created by OpenSM unconditionally, even +when the partition configuration file does not exist or cannot be accessed. + +The default partition has the P_Key value 0x7fff. OpenSM\'s port will have +full membership in the default partition. All other end ports will have +partial membership. + +File Format + +Comments: + +Line content following a \'#\' character is a comment and is ignored by the +parser. + +General file format: + +<Partition Definition>:<PortGUIDs list> ; + +Partition Definition: + +[PartitionName][=PKey][,flag[=value]][,defmember=full|limited] + + PartitionName - string, will be used with logging. When omitted, + an empty string will be used. + PKey - P_Key value for this partition. Only the low 15 bits will + be used. When omitted, it will be autogenerated. + flag - used to indicate the IPoIB capability of this partition. + defmember=full|limited - specifies the default membership for the port guid + list. Default is limited. + +Currently recognized flags are: + + ipoib - indicates that this partition may be used for IPoIB; as + a result an IPoIB capable MC group will be created. + rate=<val> - specifies the rate for this IPoIB MC group + (default is 3 (10GBps)) + mtu=<val> - specifies the MTU for this IPoIB MC group + (default is 4 (2048)) + sl=<val> - specifies the SL for this IPoIB MC group + (default is 0) + scope=<val> - specifies the scope for this IPoIB MC group + (default is 2 (link local)). Multiple scope settings + are permitted for a partition. + +Note that values for rate, mtu, and scope should be specified as +defined in the IBTA specification (for example, mtu=4 for 2048). + +PortGUIDs list: + + PortGUID - GUID of a partition member EndPort. Hexadecimal + numbers should start with 0x; decimal numbers + are accepted too. + full or limited - indicates full or limited membership for this + port. When omitted (or unrecognized) limited + membership is assumed. + +There are two useful keywords for PortGUID definition: + + - 'ALL' means all end ports in this subnet. + - 'SELF' means the subnet manager's port. + +An empty list means no ports in this partition. + +Notes: + +White space is permitted between delimiters ('=', ',',':',';').
+ +A line can be wrapped after the ':' that follows the Partition Definition, and +between PortGUID entries. + +PartitionName does not need to be unique; PKey does need to be unique. +If a PKey is repeated then those partition configurations will be merged +and the first PartitionName will be used (see also the next note). + +It is possible to split a partition configuration across more than one +definition, but then the PKey should be explicitly specified (otherwise +different PKey values will be generated for those definitions). + +Examples: + + Default=0x7fff : ALL, SELF=full ; + + NewPartition , ipoib : 0x123456=full, 0x3456789034=limi, 0x2134af2306 ; + + YetAnotherOne = 0x300 : SELF=full ; + YetAnotherOne = 0x300 : ALL=limited ; + + ShareIO = 0x80 , defmember=full : 0x123451, 0x123452; + # 0x123453, 0x123454 will be limited + ShareIO = 0x80 : 0x123453, 0x123454, 0x123455=full; + # 0x123456, 0x123457 will be limited + ShareIO = 0x80 : defmember=limited : 0x123456, 0x123457, 0x123458=full; + ShareIO = 0x80 , defmember=full : 0x123459, 0x12345a; + ShareIO = 0x80 , defmember=full : 0x12345b, 0x12345c=limited, 0x12345d; + + +Note: + +The following rule is equivalent to how OpenSM used to run prior to the +partition manager: + + Default=0x7fff,ipoib:ALL=full; + +.SH QOS CONFIGURATION +.PP +There is a set of QoS-related low-level configuration parameters. +All these parameter names are prefixed with the "qos_" string. Here is a full +list of these parameters: + + qos_max_vls - The maximum number of VLs that will be on the subnet + qos_high_limit - The limit of the High Priority component of the VL + Arbitration table (IBA 7.6.9) + qos_vlarb_low - Low priority VL Arbitration table (IBA 7.6.9) + template + qos_vlarb_high - High priority VL Arbitration table (IBA 7.6.9) + template + Both VL arbitration templates are pairs of + VL and weight + qos_sl2vl - SL2VL Mapping table (IBA 7.6.6) template. It is + a list of VLs corresponding to SLs 0-15 (Note + that VL15 used here means drop this SL) + +Typical default values (hard-coded in OpenSM initialization) are: + + qos_max_vls 15 + qos_high_limit 0 + qos_vlarb_low 0:0,1:4,2:4,3:4,4:4,5:4,6:4,7:4,8:4,9:4,10:4,11:4,12:4,13:4,14:4 + qos_vlarb_high 0:4,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0,9:0,10:0,11:0,12:0,13:0,14:0 + qos_sl2vl 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,7 + +The syntax is compatible with the rest of the OpenSM configuration options, and +values may be stored in the OpenSM config file (cached options file). + +In addition to the above, we may define separate QoS configuration +parameter sets for various target types. As targets, we currently support +CAs, routers, switch external ports, and the switch's enhanced port 0. The +names of such specialized parameters are prefixed with the "qos_<type>_" +string. Here is a full list of the currently supported sets: + + qos_ca_ - QoS configuration parameters set for CAs. + qos_rtr_ - parameters set for routers. + qos_sw0_ - parameters set for switches' port 0. + qos_swe_ - parameters set for switches' external ports. + +Examples: + qos_sw0_max_vls=2 + qos_ca_sl2vl=0,1,2,3,5,5,5,12,12,0, + qos_swe_high_limit=0 + +.SH PREFIX ROUTES +.PP +Prefix routes control how the SA responds to path record queries for +off-subnet DGIDs. By default, the SA fails such queries. +Note that IBA does not specify how the SA should obtain off-subnet path +record information. +The prefix routes configuration is meant as a stop-gap until the +specification is completed. +.PP +Each line in the configuration file is a 64-bit prefix followed by a +64-bit GUID, separated by white space. +The GUID specifies the router port on the local subnet that will +handle the prefix. +Blank lines are ignored, as is anything between a \fB#\fP character +and the end of the line. +The prefix and GUID are both in hex; the leading 0x is optional. +Either, or both, can be wild-carded by specifying an +asterisk instead of an explicit prefix or GUID. +.PP +When responding to a path record query for an off-subnet DGID, +opensm searches for the first prefix match in the configuration file. +Therefore, the order of the lines in the configuration file is important: +a wild-carded prefix at the beginning of the configuration file renders +all subsequent lines useless. +If there is no match, then opensm fails the query. +It is legal to repeat prefixes in the configuration file; +opensm will return the path to the first available matching router. +A configuration file with a single line where both prefix and GUID +are wild-carded means that a path record query specifying any +off-subnet DGID should return a path to the first available router. +This configuration yields the same behaviour formerly achieved by +compiling opensm with -DROUTER_EXP.
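+.PP
+As an illustration only (the prefix and router GUID below are made-up
+values), a configuration file that sends one prefix to a specific router
+and lets every other off-subnet DGID fall through to any available router
+could look like:
+
+ # prefix              router port GUID
+ 0xfec0000000001234    0x0002c90300a1b2c3
+ *                     *    # catch-all; must come last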
+ +.SH ROUTING +.PP +OpenSM now offers five routing engines: + +1. Min Hop Algorithm - based on the minimum hops to each node where the +path length is optimized. + +2. UPDN Unicast routing algorithm - also based on the minimum hops to each +node, but it is constrained to ranking rules. This algorithm should be chosen +if the subnet is not a pure Fat Tree, and deadlock may occur due to a +loop in the subnet. + +3. Fat Tree Unicast routing algorithm - this algorithm optimizes routing +for the congestion-free "shift" communication pattern. +It should be chosen if a subnet is a symmetrical or almost symmetrical +fat-tree of various types, not just K-ary-N-Trees: non-constant K, not +fully staffed, any Constant Bisectional Bandwidth (CBB) ratio. +Similar to UPDN, Fat Tree routing is constrained to ranking rules. + +4. LASH unicast routing algorithm - uses Infiniband virtual layers +(SL) to provide deadlock-free shortest-path routing while also +distributing the paths between layers. LASH is an alternative +deadlock-free topology-agnostic routing algorithm to the non-minimal +UPDN algorithm, avoiding the use of a potentially congested root node. + +5. DOR Unicast routing algorithm - based on the Min Hop algorithm, but +avoids port equalization except for redundant links between the same +two switches. This provides deadlock-free routes for hypercubes when +the fabric is cabled as a hypercube and for meshes when cabled as a +mesh (see details below). + +OpenSM also supports a file method which +can load routes from a table. See \'Modular Routing Engine\' for more +information on this. + +The basic routing algorithm consists of two stages: + +1. MinHop matrix calculation + How many hops are required to get from each port to each LID? + The algorithm to fill these tables is different if you run standard +(min hop) or Up/Down. + For standard routing, a "relaxation" algorithm is used to propagate +min hop from every destination LID through neighbor switches. + For Up/Down routing, a BFS from every target is used. The BFS tracks link +direction (up or down) and avoids steps that would go up after a down +step was used. + +2. Once MinHop matrices exist, each switch is visited and for each target LID a +decision is made as to what port should be used to get to that LID. + This step is common to standard and Up/Down routing. Each port has a +counter counting the number of target LIDs going through it. + When there are multiple alternative ports with the same MinHop to a LID, +the one with the fewest previously assigned LIDs is selected. + If LMC > 0, more checks are added: Within each group of LIDs assigned to +the same target port, + a. use only ports which have the same MinHop + b. first prefer the ones that go to a different systemImageGuid (than +the previous LID of the same LMC group) + c. if none - prefer those which go through another NodeGuid + d. fall back to the number of paths method (if all go to the same node).
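+
+The port-selection rule above can be sketched in C (illustrative only, with
+invented names; this is not opensm's actual code): among ports on a shortest
+path to the target LID, pick the one with the fewest LIDs already assigned.
+
+ /* Among ports on a shortest path to the LID, prefer the port with the
+  * fewest target LIDs already routed through it. */
+ struct out_port { int min_hops; int assigned_lids; };
+
+ static int choose_port(const struct out_port *p, int n, int best_hops)
+ {
+     int best = -1;
+     for (int i = 0; i < n; i++) {
+         if (p[i].min_hops != best_hops)
+             continue;       /* not on a shortest path, skip */
+         if (best < 0 || p[i].assigned_lids < p[best].assigned_lids)
+             best = i;       /* less subscribed, prefer it */
+     }
+     return best;            /* -1 means the LID is unreachable */
+ }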
+ +Effect of Topology Changes + +OpenSM will preserve existing routing in any case where there is no change in +the fabric switches, unless the -r (--reassign_lids) option is specified. + +-r +.br +--reassign_lids + This option causes OpenSM to reassign LIDs to all + end nodes. Specifying -r on a running subnet + may disrupt subnet traffic. + Without -r, OpenSM attempts to preserve existing + LID assignments, resolving multiple uses of the same LID. + +If a link is added or removed, OpenSM does not recalculate +the routes that do not have to change. A route has to change +if the port is no longer UP or no longer the MinHop. When routing changes +are performed, the same algorithm for balancing the routes is invoked. + +In the case of using file-based routing, any topology changes are +currently ignored. The 'file' routing engine just loads the LFTs from the file +specified, with no reaction to the real topology. Obviously, this will not be able +to recheck LIDs (by GUID) for disconnected nodes, and LFTs for non-existent +switches will be skipped. Multicast is not affected by the 'file' routing engine +(this uses min hop tables). + + +Min Hop Algorithm + +The Min Hop algorithm is invoked by default if no routing algorithm is +specified. It can also be invoked by specifying '-R minhop'. + +The Min Hop algorithm is divided into two stages: computation of +min-hop tables on every switch and LFT output port assignment. Link +subscription is also equalized, with the ability to override based on +port GUID. The latter is supplied by: + +-i +.br +-ignore-guids + This option provides the means to define a set of ports + (by guid) that will be ignored by the link load + equalization algorithm. Note that only endports (CA, + switch port 0, and router ports) and not switch external + ports are supported. + +LMC awareness routes on a (remote) system or switch basis. + + +Purpose of UPDN Algorithm + +The UPDN algorithm is designed to prevent deadlocks from occurring in loops +of the subnet. A loop-deadlock is a situation in which it is no longer +possible to send data between any two hosts connected through the loop. As +such, the UPDN routing algorithm should be used if the subnet is not a pure +Fat Tree, and one of its loops may experience a deadlock (due, for example, +to high pressure). + +The UPDN algorithm is based on the following main stages: + +1. Auto-detect root nodes - based on the CA hop length from any switch in +the subnet, a statistical histogram is built for each switch (hop num vs +number of occurrences). If the histogram reflects a specific column (higher +than others) for a certain node, then it is marked as a root node. Since +the algorithm is statistical, it may not find any root nodes. The list of +the root nodes found by this auto-detect stage is used by the ranking +process stage. + + Note 1: The user can override the node list manually.
+ Note 2: If this stage cannot find any root nodes, and the user did + not specify a guid list file, OpenSM defaults back to the + Min Hop routing algorithm. + +2. Ranking process - All root switch nodes (found in stage 1) are assigned +a rank of 0. Using the BFS algorithm, the rest of the switch nodes in the +subnet are ranked incrementally. This ranking aids in the process of enforcing +rules that ensure loop-free paths. + +3. Min Hop Table setting - after ranking is done, a BFS algorithm is run from +each (CA or switch) node in the subnet. During the BFS process, the FDB table +of each switch node traversed by BFS is updated, in reference to the starting +node, based on the ranking rules and guid values. + +At the end of the process, the updated FDB tables ensure loop-free paths +through the subnet. + +Note: Up/Down routing does not allow LID routing communication between +switches that are located inside spine "switch systems". +The reason is that there is no way to allow a LID route between them +that does not break the Up/Down rule. +One ramification of this is that you cannot run SM on switches other +than the leaf switches of the fabric. + + +UPDN Algorithm Usage + +Activation through OpenSM + +Use the '-R updn' option (instead of the old '-u') to activate the UPDN +algorithm. +Use '-a <root_guid_file>' to add a UPDN guid file that contains the +root nodes for ranking. +If the `-a' option is not used, OpenSM uses its auto-detect root nodes +algorithm. + +Notes on the guid list file: + +1. A valid guid file specifies one guid in each line. Lines with an invalid +format will be discarded. +.br +2. The user should specify the root switch guids. However, it is also +possible to specify CA guids; OpenSM will use the guid of the switch (if +it exists) that connects the CA to the subnet as a root node.
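+
+For illustration (the values below are made up), a guid list file is simply
+one guid per line:
+
+ 0x0008f10400411a08
+ 0x0008f10400411a10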
+
+Topologies that do not comply cause a fallback to Min Hop routing.
+Note that this can also occur on link failures that cause the topology
+to no longer be a "pure" fat-tree.
+
+Note that although the fat-tree algorithm supports trees with a non-integer
+CBB ratio, the routing will not be as balanced as in the case of an integer
+CBB ratio. In addition, although the algorithm allows leaf switches to have
+any number of CAs, the closer the tree is to being fully populated, the more
+effective the "shift" communication pattern will be.
+In general, even if the root list is provided, the closer the topology is to
+a pure and symmetrical fat-tree, the more optimal the routing will be.
+
+The algorithm also dumps a compute node ordering file
+(opensm-ftree-ca-order.dump) in the same directory where the OpenSM log
+resides. This ordering file provides the CN order that may be used to create
+an efficient communication pattern that matches the routing tables.
+
+Activation through OpenSM
+
+Use the '-R ftree' option to activate the fat-tree algorithm.
+Use the '-a' option to provide the root nodes for ranking. If '-a'
+is not used, the routing algorithm will detect the roots automatically.
+Use the '-u' option to provide the list of compute nodes. If '-u'
+is not used, all the CAs are considered compute nodes.
+
+Note: LMC > 0 is not supported by fat-tree routing. If it is
+specified, the default routing algorithm is invoked instead.
+
+
+LASH Routing Algorithm
+
+LASH is an acronym for LAyered SHortest Path Routing. It is a
+deterministic shortest path routing algorithm that enables topology
+agnostic deadlock-free routing within communication networks.
+
+When computing the routing function, LASH analyzes the network
+topology for the shortest-path routes between all pairs of sources /
+destinations and groups these paths into virtual layers in such a way
+as to avoid deadlock.
+
+Note that LASH analyzes routes and ensures deadlock freedom between switch
+pairs. The link between an HCA and a switch does not need virtual
+layers, as deadlock will not arise between a switch and an HCA.
+
+In more detail, the algorithm works as follows:
+
+1) LASH determines the shortest path between all pairs of source /
+destination switches. Note that LASH ensures the same SL is used for each
+SRC/DST - DST/SRC pair, and there is no guarantee that the return
+path for a given DST/SRC will be the reverse of the SRC/DST route.
+
+2) LASH then begins an SL assignment process where a route is assigned
+to a layer (SL) if the addition of that route does not cause deadlock
+within that layer. This is achieved by maintaining and analyzing a
+channel dependency graph for each layer. When the potential addition
+of a path could lead to deadlock, LASH opens a new layer and continues
+the process.
+
+3) Once this stage has been completed, it is highly likely that the
+first layers processed will contain more paths than the later ones.
+To better balance the use of layers, LASH moves paths from one layer
+to another so that the number of paths in each layer averages out.
+
+Note that the implementation of LASH in opensm attempts to use as few layers
+as possible. This number can be less than the number of actual layers
+available.
+
+In general, LASH is a very flexible algorithm: it can, for example,
+reduce to Dimension Order Routing in certain topologies, it is topology
+agnostic, and it fares well in the face of faults.
+
+It has been shown that for both regular and irregular topologies, LASH
+outperforms Up/Down.
The reason for this is that LASH distributes the
+traffic more evenly through the network, avoiding the bottleneck issues
+related to a root node, and always routes along shortest paths.
+
+The algorithm was developed by Simula Research Laboratory.
+
+
+Use the '-R lash -Q' option to activate the LASH algorithm.
+
+Note: QoS support has to be turned on in order for the SL/VL mappings to be
+used.
+
+Note: LMC > 0 is not supported by LASH routing. If it is
+specified, the default routing algorithm is invoked instead.
+
+
+DOR Routing Algorithm
+
+The Dimension Order Routing algorithm is based on the Min Hop
+algorithm and so uses shortest paths. Instead of spreading traffic
+out across different paths with the same shortest distance, it chooses
+among the available shortest paths based on an ordering of dimensions.
+Each port must be consistently cabled to represent a hypercube
+dimension or a mesh dimension. Paths are grown from a destination
+back to a source using the lowest dimension (port) of available paths
+at each step. This provides the ordering necessary to avoid deadlock.
+When there are multiple links between any two switches, they still
+represent only one dimension, and traffic is balanced across them
+unless port equalization is turned off. In the case of hypercubes,
+the same port must be used throughout the fabric to represent the
+hypercube dimension and match on both ends of the cable. In the case
+of meshes, the dimension should consistently use the same pair of
+ports, one port on one end of the cable, and the other port on the
+other end, continuing along the mesh dimension.
+
+Use the '-R dor' option to activate the DOR algorithm.
+
+
+Routing References
+
+To learn more about deadlock-free routing, see the article
+"Deadlock Free Message Routing in Multiprocessor Interconnection Networks"
+by William J. Dally and Charles L. Seitz (1985).
+
+To learn more about the up/down algorithm, see the article
+"Effective Strategy to Compute Forwarding Tables for InfiniBand Networks"
+by Jose Carlos Sancho, Antonio Robles, and Jose Duato at the
+Universidad Politecnica de Valencia.
+
+To learn more about LASH and the flexibility behind it, the requirement
+for layers, and performance comparisons to other algorithms, see the
+following articles:
+
+"Layered Routing in Irregular Networks", Lysne et al., IEEE
+Transactions on Parallel and Distributed Systems, Vol. 16, No. 12,
+December 2005.
+
+"Routing for the ASI Fabric Manager", Solheim et al., IEEE
+Communications Magazine, Vol. 44, No. 7, July 2006.
+
+"Layered Shortest Path (LASH) Routing in Irregular System Area
+Networks", Skeie et al., IEEE Computer Society Communication
+Architecture for Clusters, 2002.
+
+
+Modular Routing Engine
+
+The modular routing engine structure allows new routing modules to be
+"plugged in" easily.
+
+Currently, only unicast callbacks are supported. Multicast
+can be added later.
+
+One existing routing module is up-down ("updn"), which may be
+activated with the '-R updn' option (instead of the old '-u').
+
+General usage is:
+$ opensm -R 'module-name'
+
+There is also a trivial routing module which is able
+to load LFT tables from a file.
+
+Main features:
+
+ - it will load switch LFTs and/or LID matrices (min hops tables)
+ - it will load switch LFTs according to the path entries introduced
+   in the file
+ - no additional checks will be performed (such as "is the port connected",
+   etc.)
+ - in case the fabric LIDs were changed, it will try to reconstruct
+   the LFTs correctly if endport GUIDs are represented in the file
+   (to disable this, GUIDs may be removed from the file
+   or zeroed)
+
+The file format is compatible with the output of the 'ibroute' utility, and
+a dump for the whole fabric can be generated with the dump_lfts.sh script.
+
+To activate the file based routing module, use:
+
+  opensm -R file -U /path/to/lfts_file
+
+If the lfts_file is not found or is in error, the default routing
+algorithm is utilized.
+
+The ability to dump switch lid matrices (aka min hops tables) to a file and
+to load them later is also supported.
+
+The usage is similar to loading unicast forwarding tables from an lfts
+file (introduced by the 'file' routing engine), but the lid matrix file
+name should be specified by the -M or --lid_matrix_file option. For example:
+
+  opensm -R file -M ./opensm-lid-matrix.dump
+
+The dump file is named \'opensm-lid-matrix.dump\' and will be generated
+in the standard opensm dump directory (/var/log by default) when the
+OSM_LOG_ROUTING logging flag is set.
+
+When the 'file' routing engine is activated but the lfts file is not
+specified or cannot be opened, the default lid matrix algorithm will be used.
+
+There is also a switch forwarding tables dumper which generates
+a file compatible with dump_lfts.sh output. This file can be used
+as input for forwarding table loading by the 'file' routing engine.
+One or both of the -U and -M options can be specified together with
+\'-R file\'.
+
+.SH FILES
+.TP
+.B @OPENSM_CONFIG_DIR@/@OPENSM_CONFIG_FILE@
+default OpenSM config file.
+
+.TP
+.B @OPENSM_CONFIG_DIR@/@NODENAMEMAPFILE@
+default node name map file. See ibnetdiscover for more information on format.
+
+.TP
+.B @OPENSM_CONFIG_DIR@/@PARTITION_CONFIG_FILE@
+default partition config file.
+
+.TP
+.B @OPENSM_CONFIG_DIR@/@QOS_POLICY_FILE@
+default QOS policy config file.
+
+.TP
+.B @OPENSM_CONFIG_DIR@/@PREFIX_ROUTES_FILE@
+default prefix routes file.
+
+.SH AUTHORS
+.TP
+Hal Rosenstock
+.RI < hal.rosenstock@gmail.com >
+.TP
+Sasha Khapyorsky
+.RI < sashak@voltaire.com >
+.TP
+Eitan Zahavi
+.RI < eitan@mellanox.co.il >
+.TP
+Yevgeny Kliteynik
+.RI < kliteyn@mellanox.co.il >
+.TP
+Thomas Sodring
+.RI < tsodring@simula.no >
+.TP
+Ira Weiny
+.RI < weiny2@llnl.gov >
Index: contrib/ofed/management/libibmad/src/rpc.c
===================================================================
--- contrib/ofed/management/libibmad/src/rpc.c (.../base) (revision 219811)
+++ contrib/ofed/management/libibmad/src/rpc.c (.../head) (revision 219811)
@@ -140,7 +140,7 @@
 	length = len;
 	if (umad_send(port_id, agentid, sndbuf, length, timeout, 0) < 0) {
-		IBWARN("send failed; %m");
+		IBWARN("send failed; %s", strerror(errno));
 		return -1;
 	}
@@ -148,7 +148,7 @@
 	/* send packet is lost somewhere.
*/ do { if (umad_recv(port_id, rcvbuf, &length, timeout) < 0) { - IBWARN("recv failed: %m"); + IBWARN("recv failed: %s", strerror(errno)); return -1; } Index: contrib/ofed/Makefile =================================================================== --- contrib/ofed/Makefile (.../base) (revision 0) +++ contrib/ofed/Makefile (.../head) (revision 219811) @@ -0,0 +1,5 @@ +.include + +SUBDIR = include usr.lib usr.bin + +.include Index: contrib/ofed/libsdp/src/config_parser.y =================================================================== --- contrib/ofed/libsdp/src/config_parser.y (.../base) (revision 219811) +++ contrib/ofed/libsdp/src/config_parser.y (.../head) (revision 219811) @@ -1,333 +0,0 @@ -/* - * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * $Id: config_parser.y 1.5 2005/06/29 11:39:27 eitan Exp $ - */ - - -/* - -*/ -%{ - -/* header section */ -#include -#include -#include -#include "libsdp.h" -#include -#include -#include - -#define YYERROR_VERBOSE 1 - -extern int yyerror(char *msg); -extern int yylex(void); -static int parse_err = 0; - -struct use_family_rule *__sdp_clients_family_rules_head = NULL; -struct use_family_rule *__sdp_clients_family_rules_tail = NULL; -struct use_family_rule *__sdp_servers_family_rules_head = NULL; -struct use_family_rule *__sdp_servers_family_rules_tail = NULL; - -/* some globals to store intermidiate parser state */ -static struct use_family_rule __sdp_rule; -static int current_role = 0; - -int __sdp_config_empty( - void - ) -{ - return ( (__sdp_clients_family_rules_head == NULL) && - (__sdp_servers_family_rules_head == NULL) ); -} - -/* define the address by 4 integers */ -static void __sdp_set_ipv4_addr(short a0, short a1, short a2, short a3) -{ - char buf[16]; - sprintf(buf,"%d.%d.%d.%d", a0, a1, a2, a3); - if (!inet_aton(buf, &( __sdp_rule.ipv4 ))) - { - parse_err = 1; - yyerror("provided address is not legal"); - } -} - -static void __sdp_set_prog_name_expr(char *prog_name_expr) -{ - __sdp_rule.prog_name_expr = strdup(prog_name_expr); - if (!__sdp_rule.prog_name_expr) { - yyerror("fail to allocate program name expression"); - } -} - -static char *__sdp_get_role_str(int role) -{ - if (role == 1) return("server"); - if (role == 2) return("client"); - return("unknown role"); -} - -extern int __sdp_min_level; - -/* dump the current state in readable format */ -static void __sdp_dump_config_state() { - char buf[1024]; - sprintf(buf, "CONFIG: use %s %s %s", - __sdp_get_family_str(__sdp_rule.target_family), - __sdp_get_role_str( current_role ), - __sdp_rule.prog_name_expr); - if (__sdp_rule.match_by_addr) { - if ( __sdp_rule.prefixlen != 32 ) - sprintf(buf+strlen(buf), " %s/%d", - inet_ntoa( __sdp_rule.ipv4 ), __sdp_rule.prefixlen); - else - sprintf(buf+strlen(buf), " %s", inet_ntoa( __sdp_rule.ipv4 )); - } else { - sprintf(buf+strlen(buf), " *"); - } - if (__sdp_rule.match_by_port) { - sprintf(buf+strlen(buf), ":%d",__sdp_rule.sport); - if (__sdp_rule.eport > __sdp_rule.sport) - sprintf(buf+strlen(buf), "-%d",__sdp_rule.eport); - } - else - sprintf(buf+strlen(buf), ":*"); - sprintf(buf+strlen(buf), "\n"); - __sdp_log(1, buf); -} - -/* use the above state for making a new rule */ -static void __sdp_add_rule() { - struct use_family_rule **p_tail, **p_head, *rule; - - if (__sdp_min_level <= 1) __sdp_dump_config_state(); - if ( current_role == 1 ) { - p_tail = &__sdp_servers_family_rules_tail; - p_head = &__sdp_servers_family_rules_head; - } else if ( current_role == 2 ) { - p_tail = &__sdp_clients_family_rules_tail; - p_head = &__sdp_clients_family_rules_head; - } else { - yyerror("ignoring unknown role"); - parse_err = 1; - return; - } - - rule = (struct use_family_rule *)malloc(sizeof(*rule)); - if (!rule) { - yyerror("fail to allocate new rule"); - parse_err = 1; - return; - } - - memset(rule, 0, sizeof(*rule)); - *rule = __sdp_rule; - rule->prev = *p_tail; - if (!(*p_head)) { - *p_head = rule; - } else { - (*p_tail)->next = rule; - } /* if */ - *p_tail = rule; -} - -%} - - -%union { - int ival; - char *sval; -} -%token USE "use" -%token CLIENT "client or connect" -%token SERVER "server or listen" -%token TCP "tcp" -%token SDP "sdp" -%token BOTH "both" -%token INT "integer value" -%token LOG "log statement" -%token DEST "destination" -%token STDERR "stderr" -%token SYSLOG "syslog" 
-%token FILENAME "file" -%token NAME "a name" -%token LEVEL "min-level" -%token LINE "new line" -%type NAME -%type INT LOG DEST STDERR SYSLOG FILENAME USE TCP SDP BOTH CLIENT SERVER LEVEL LINE -%debug -%error-verbose -%start config - -%{ - long __sdp_config_line_num; -%} -%% - -NL: - LINE - | NL LINE; - -ONL: - | NL; - -config: - ONL statements - ; - -statements: - | statements statement - ; - -statement: - log_statement - | socket_statement - ; - -log_statement: - LOG log_opts NL - ; - -log_opts: - | log_opts log_dest - | log_opts verbosity - ; - -log_dest: - DEST STDERR { __sdp_log_set_log_stderr(); } - | DEST SYSLOG { __sdp_log_set_log_syslog(); } - | DEST FILENAME NAME { __sdp_log_set_log_file($3); } - ; - -verbosity: - LEVEL INT { __sdp_log_set_min_level($2); } - ; - -socket_statement: - USE family role program address ':' ports NL { __sdp_add_rule(); } - ; - -family: - TCP { __sdp_rule.target_family = USE_TCP; } - | SDP { __sdp_rule.target_family = USE_SDP; } - | BOTH { __sdp_rule.target_family = USE_BOTH; } - ; - -role: - SERVER { current_role = 1; } - | CLIENT { current_role = 2; } - ; - -program: - NAME { __sdp_set_prog_name_expr($1); } - | '*' { __sdp_set_prog_name_expr("*"); } - ; - -address: - ipv4 { __sdp_rule.match_by_addr = 1; __sdp_rule.prefixlen = 32; } - | ipv4 '/' INT { __sdp_rule.match_by_addr = 1; __sdp_rule.prefixlen = $3; } - | '*' { __sdp_rule.match_by_addr = 0; __sdp_rule.prefixlen = 32; } - ; - -ipv4: - INT '.' INT '.' INT '.' INT { __sdp_set_ipv4_addr($1,$3,$5,$7); } - ; - -ports: - INT { __sdp_rule.match_by_port = 1; __sdp_rule.sport= $1; __sdp_rule.eport= $1; } - | INT '-' INT { __sdp_rule.match_by_port = 1; __sdp_rule.sport= $1; __sdp_rule.eport= $3; } - | '*' { __sdp_rule.match_by_port = 0; __sdp_rule.sport= 0 ; __sdp_rule.eport= 0; } - ; - -%% - -int yyerror(char *msg) -{ - /* replace the $undefined and $end if exists */ - char *orig_msg = (char*)malloc(strlen(msg)+25); - char *final_msg = (char*)malloc(strlen(msg)+25); - - strcpy(orig_msg, msg); - - char *word = strtok(orig_msg, " "); - final_msg[0] = '\0'; - while (word != NULL) { - if (!strncmp(word, "$undefined", 10)) { - strcat(final_msg, "unrecognized-token "); - } else if (!strncmp(word, "$end",4)) { - strcat(final_msg, "end-of-file "); - } else { - strcat(final_msg, word); - strcat(final_msg, " "); - } - word = strtok(NULL, " "); - } - - __sdp_log(9, "Error (line:%ld) : %s\n", __sdp_config_line_num, final_msg); - parse_err = 1; - - free(orig_msg); - free(final_msg); - return 1; -} - -#include -#include - -/* parse apollo route dump file */ -int __sdp_parse_config (const char *fileName) { - extern FILE * libsdp_yyin; - - /* open the file */ - if (access(fileName, R_OK)) { - printf("libsdp Error: No access to open File:%s %s\n", - fileName, strerror(errno)); - return(1); - } - - libsdp_yyin = fopen(fileName,"r"); - if (!libsdp_yyin) { - printf("libsdp Error: Fail to open File:%s\n", fileName); - return(1); - } - parse_err = 0; - __sdp_config_line_num = 1; - - /* parse it */ - yyparse(); - - fclose(libsdp_yyin); - return(parse_err); -} - - Index: contrib/ofed/libsdp/src/config_scanner.l =================================================================== --- contrib/ofed/libsdp/src/config_scanner.l (.../base) (revision 219811) +++ contrib/ofed/libsdp/src/config_scanner.l (.../head) (revision 219811) @@ -1,204 +0,0 @@ -/* - * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * $Id: ibnl_scanner.ll,v 1.4 2005/02/23 21:08:37 eitan Exp $ - */ - -%{ - -/* #define DEBUG 1 */ - -#define yyparse libsdp_yyparse -#define yylex libsdp_yylex -#define yyerror libsdp_yyerror -#define yylval libsdp_yylval -#define yychar libsdp_yychar -#define yydebug libsdp_yydebug -#define yynerrs libsdp_yynerrs - -#define yywrap libsdp_yywrap - -#include -#include -#include "config_parser.h" -extern long __sdp_config_line_num; -%} -%s CANNAME -%% - -^[ \t]*#.* {} - -([1-9][0-9]*|0) { - yylval.ival = atoi(yytext); -#ifdef DEBUG - printf("INT:%d\n",yylval.ival); -#endif - return INT; -} - -log { - yylval.ival = LOG; -#ifdef DEBUG - printf("LOG\n"); -#endif - return LOG; -} - -destination { - yylval.ival = DEST; -#ifdef DEBUG - printf("DEST\n"); -#endif - return DEST; -} - -min-level { - yylval.ival = LEVEL; -#ifdef DEBUG - printf("LEVEL\n"); -#endif - return LEVEL; -} - -stderr { - yylval.ival = STDERR; -#ifdef DEBUG - printf("STDERR\n"); -#endif - return STDERR; -} - -syslog { - yylval.ival = SYSLOG; -#ifdef DEBUG - printf("SYSLOG\n"); -#endif - return SYSLOG; -} - -file { - yylval.ival = FILENAME; -#ifdef DEBUG - printf("FILENAME\n"); -#endif - BEGIN(CANNAME); - return FILENAME; -} - -use { - yylval.ival = USE; -#ifdef DEBUG - printf("USE\n"); -#endif - return USE; -} - -tcp { - yylval.ival = TCP; -#ifdef DEBUG - printf("TCP\n"); -#endif - return TCP; -} - -sdp { - yylval.ival = SDP; -#ifdef DEBUG - printf("SDP\n"); -#endif - return SDP; -} - -both { - yylval.ival = BOTH; -#ifdef DEBUG - printf("BOTH\n"); -#endif - return BOTH; -} - -client|connect { - yylval.ival = CLIENT; -#ifdef DEBUG - printf("CLIENT\n"); -#endif - BEGIN(CANNAME); - return CLIENT; -} - -server|listen { - yylval.ival = SERVER; -#ifdef DEBUG - printf("SERVER\n"); -#endif - BEGIN(CANNAME); - return SERVER; -} - -[^ \t\n]+ { - yylval.sval = (char *)malloc(strlen(yytext) + 1); - strcpy(yylval.sval, yytext); -#ifdef DEBUG - printf("NAME:%s\n",yylval.sval); -#endif - BEGIN(0); - return (NAME); -} - -\n { - __sdp_config_line_num++; -#ifdef DEBUG - printf("LINE\n"); -#endif - yylval.ival = LINE; - return(LINE); -} - -[#][^\n]* { - __sdp_config_line_num++; -} - -[ \t]+ {} - -. 
{ -#ifdef DEBUG - printf("CHAR:%c\n",yytext[0]); -#endif - return(yytext[0]); -} - -%% - -int yywrap () -{ - return (1); -} - Index: contrib/ofed/libsdp/src/port.c =================================================================== --- contrib/ofed/libsdp/src/port.c (.../base) (revision 219811) +++ contrib/ofed/libsdp/src/port.c (.../head) (revision 219811) @@ -60,7 +60,9 @@ #include #include #include +#ifdef __linux__ #include +#endif #ifdef SOLARIS_BUILD /* We're done protecting ourselves from the header prototypes */ @@ -152,6 +154,7 @@ typedef int (*poll_func_t) (struct pollfd * ufds, unsigned long int nfds, int timeout); +#ifdef __linux__ typedef int (*epoll_create_func_t) (int size); typedef int (*epoll_ctl_func_t) (int epfd, @@ -165,6 +168,7 @@ struct epoll_event * events, int maxevents, int timeout, const sigset_t * sigmask); +#endif struct socket_lib_funcs { @@ -184,10 +188,12 @@ select_func_t select; pselect_func_t pselect; poll_func_t poll; +#ifdef __linux__ epoll_create_func_t epoll_create; epoll_ctl_func_t epoll_ctl; epoll_wait_func_t epoll_wait; epoll_pwait_func_t epoll_pwait; +#endif }; /* socket_lib_funcs */ #ifdef SOLARIS_BUILD @@ -910,6 +916,7 @@ goto close_and_mark; } else { int err; +#ifdef __linux__ socklen_t len = sizeof(int); ret = getsockopt(tmp_sd[1 - tmp_turn], SOL_TCP, @@ -919,6 +926,9 @@ __func__, strerror(errno)); goto close_and_mark; } +#else + err = -errno; +#endif if (-ENOENT == err || -EADDRINUSE != err) { /* bind() failed due to either: * 1. IP is ETH, not IB, so can't bind() to sdp socket. @@ -1028,11 +1038,15 @@ if (EADDRINUSE != errno) goto done; +#ifdef __linux__ if (-1 == getsockopt(*sdp_sd, SOL_TCP, SDP_LAST_BIND_ERR, &err, &len)) { __sdp_log(9, "Error check_legal_bind:getsockopt: %s\n", strerror(errno)); goto done; } +#else + err = -errno; +#endif if (-ENOENT != err) { /* bind() failed due to real error. Can't continue */ __sdp_log(9, "Error check_legal_bind: " @@ -2164,7 +2178,7 @@ if we have shadow we must poll on it too - which requires a hack back and forth */ -int poll(struct pollfd *ufds, unsigned long int nfds, int timeout) +int poll(struct pollfd *ufds, nfds_t nfds, int timeout) { int ret; int shadow_fd; @@ -2255,6 +2269,7 @@ return ret; } /* poll */ +#ifdef __linux__ /* ========================================================================= */ /*..epoll_create -- replacement socket call. 
*/ /* @@ -2379,6 +2394,7 @@ program_invocation_short_name, epfd, ret); return ret; } /* epoll_pwait */ +#endif /* ========================================================================= */ @@ -2516,6 +2532,7 @@ fprintf(stderr, "%s\n", error_str); } +#ifdef __linux__ _socket_funcs.epoll_create = dlsym(__libc_dl_handle, "epoll_create"); if (NULL != (error_str = dlerror())) { fprintf(stderr, "%s\n", error_str); @@ -2535,6 +2552,7 @@ if (NULL != (error_str = dlerror())) { fprintf(stderr, "%s\n", error_str); } +#endif #ifdef SOLARIS_BUILD _socket_xnet_funcs.socket = dlsym(__libc_dl_handle, "__xnet_socket"); if (NULL != (error_str = dlerror())) { Index: contrib/ofed/usr.lib/libibcm/Makefile =================================================================== --- contrib/ofed/usr.lib/libibcm/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/libibcm/Makefile (.../head) (revision 219811) @@ -0,0 +1,23 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include +.include "../Makefile.inc" + +IBCMDIR= ${.CURDIR}/../../libibcm +IBSRCDIR= ${IBCMDIR}/src + +.PATH: ${IBSRCDIR} + +LIB= ibcm +SHLIB_MAJOR= 1 +NO_PROFILE= + +SRCS= cm.c +CFLAGS+= -I ${IBCMDIR}/include + +MAN= +VERSION_MAP= ${IBSRCDIR}/libibcm.map + +.include Index: contrib/ofed/usr.lib/libibmad/config.h =================================================================== --- contrib/ofed/usr.lib/libibmad/config.h (.../base) (revision 0) +++ contrib/ofed/usr.lib/libibmad/config.h (.../head) (revision 219811) @@ -0,0 +1 @@ +#include Index: contrib/ofed/usr.lib/libibmad/Makefile =================================================================== --- contrib/ofed/usr.lib/libibmad/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/libibmad/Makefile (.../head) (revision 219811) @@ -0,0 +1,23 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include +.include "../Makefile.inc" + +IBSRCDIR= ${IBMADDIR}/src + +.PATH: ${IBSRCDIR} + +LIB= ibmad +SHLIB_MAJOR= 1 +NO_PROFILE= + +SRCS= dump.c fields.c gs.c mad.c portid.c register.c resolve.c rpc.c sa.c \ + serv.c smp.c vendor.c + +CFLAGS+= -DHAVE_CONFIG_H + +VERSION_MAP= ${IBSRCDIR}/libibmad.map + +.include Index: contrib/ofed/usr.lib/libibverbs/alloca.h =================================================================== --- contrib/ofed/usr.lib/libibverbs/alloca.h (.../base) (revision 0) +++ contrib/ofed/usr.lib/libibverbs/alloca.h (.../head) (revision 219811) @@ -0,0 +1,18 @@ +#ifndef _LIBIBVERBS_ALLOCA_H_ +#define _LIBIBVERBS_ALLOCA_H_ +#include +#include +#include + +#define strdupa(_s) \ +({ \ + char *_d; \ + int _len; \ + \ + _len = strlen(_s) + 1; \ + _d = alloca(_len); \ + if (_d) \ + memcpy(_d, _s, _len); \ + _d; \ +}) +#endif /* _LIBIBVERBS_ALLOCA_H_ */ Index: contrib/ofed/usr.lib/libibverbs/config.h =================================================================== --- contrib/ofed/usr.lib/libibverbs/config.h (.../base) (revision 0) +++ contrib/ofed/usr.lib/libibverbs/config.h (.../head) (revision 219811) @@ -0,0 +1,2 @@ +#define _WITH_GETLINE +#include Index: contrib/ofed/usr.lib/libibverbs/Makefile =================================================================== --- contrib/ofed/usr.lib/libibverbs/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/libibverbs/Makefile (.../head) (revision 219811) @@ -0,0 +1,39 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include +.include "../Makefile.inc" + +IBSRCDIR= ${IBVERBSDIR}/src +IBMANDIR= ${IBVERBSDIR}/man + +.PATH: ${IBSRCDIR} ${IBMANDIR} + +LIB= ibverbs +SHLIB_MAJOR= 1 +NO_PROFILE= + +SRCS= device.c init.c marshall.c verbs.c cmd.c enum_strs.c 
kern_abi.h \ + memory.c compat-1_0.c sysfs.c + +MAN= ibv_alloc_pd.3 ibv_asyncwatch.1 ibv_attach_mcast.3 ibv_create_ah.3 \ + ibv_create_ah_from_wc.3 ibv_create_comp_channel.3 ibv_create_cq.3 \ + ibv_create_qp.3 ibv_create_srq.3 ibv_devices.1 ibv_devinfo.1 \ + ibv_event_type_str.3 ibv_fork_init.3 ibv_get_async_event.3 \ + ibv_get_cq_event.3 ibv_get_device_guid.3 ibv_get_device_list.3 \ + ibv_get_device_name.3 ibv_modify_qp.3 ibv_modify_srq.3 \ + ibv_open_device.3 ibv_poll_cq.3 ibv_post_recv.3 ibv_post_send.3 \ + ibv_post_srq_recv.3 ibv_query_device.3 ibv_query_gid.3 \ + ibv_query_pkey.3 ibv_query_port.3 ibv_query_qp.3 ibv_query_srq.3 \ + ibv_rate_to_mult.3 ibv_rc_pingpong.1 ibv_reg_mr.3 ibv_req_notify_cq.3 \ + ibv_resize_cq.3 ibv_srq_pingpong.1 ibv_uc_pingpong.1 ibv_ud_pingpong.1 \ + ibv_query_xrc_rcv_qp.3 ibv_reg_xrc_rcv_qp.3 ibv_modify_xrc_rcv_qp.3 \ + verbs.7 ibv_create_xrc_rcv_qp.3 ibv_open_xrc_domain.3 + + +CFLAGS+= -DHAVE_CONFIG_H -DIBV_CONFIG_DIR=\"/etc/ibverbs/\" + +VERSION_MAP= ${IBSRCDIR}/libibverbs.map + +.include Index: contrib/ofed/usr.lib/libmthca/config.h =================================================================== --- contrib/ofed/usr.lib/libmthca/config.h (.../base) (revision 0) +++ contrib/ofed/usr.lib/libmthca/config.h (.../head) (revision 219811) @@ -0,0 +1,9 @@ +#define HAVE_IBV_DONTFORK_RANGE +#define HAVE_IBV_DOFORK_RANGE +#define HAVE_IBV_REGISTER_DRIVER +#define HAVE_IBV_READ_SYSFS_FILE +#ifdef __LP64__ +#define SIZEOF_LONG 8 +#else +#define SIZEOF_LONG 4 +#endif Index: contrib/ofed/usr.lib/libmthca/Makefile =================================================================== --- contrib/ofed/usr.lib/libmthca/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/libmthca/Makefile (.../head) (revision 219811) @@ -0,0 +1,25 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include + +MTHCADIR= ${.CURDIR}/../../libmthca +IBVERBSDIR= ${.CURDIR}/../../libibverbs +MTHCASRCDIR= ${MTHCADIR}/src + +.PATH: ${MTHCASRCDIR} + +LIB= mthca +SHLIB_MAJOR= 1 +NO_PROFILE= + +SRCS= ah.c buf.c cq.c memfree.c mthca.c qp.c srq.c verbs.c + + +CFLAGS+= -DHAVE_CONFIG_H +CFLAGS+= -I${.CURDIR} -I${MTHCASRCDIR} -I${IBVERBSDIR}/include + +VERSION_MAP= ${MTHCASRCDIR}/mthca.map + +.include Index: contrib/ofed/usr.lib/libosmvendor/Makefile =================================================================== --- contrib/ofed/usr.lib/libosmvendor/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/libosmvendor/Makefile (.../head) (revision 219811) @@ -0,0 +1,20 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include +.include "../Makefile.inc" + +.PATH: ${VENDORLIBDIR} + +LIB= osmvendor +SHLIB_MAJOR= 1 +NO_PROFILE= + +SRCS= osm_vendor_ibumad.c osm_vendor_ibumad_sa.c + +CFLAGS+= -DOSM_VENDOR_INTF_OPENIB + +VERSION_MAP= ${VENDORLIBDIR}/libosmvendor.map + +.include Index: contrib/ofed/usr.lib/libosmcomp/Makefile =================================================================== --- contrib/ofed/usr.lib/libosmcomp/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/libosmcomp/Makefile (.../head) (revision 219811) @@ -0,0 +1,21 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include +.include "../Makefile.inc" + +.PATH: ${COMPLIBDIR} + +LIB= osmcomp +SHLIB_MAJOR= 1 +NO_PROFILE= + +SRCS= cl_complib.c cl_dispatcher.c cl_event.c cl_event_wheel.c cl_list.c +SRCS+= cl_log.c cl_map.c cl_pool.c cl_ptr_vector.c cl_spinlock.c +SRCS+= cl_statustext.c cl_thread.c cl_threadpool.c cl_timer.c cl_vector.c +SRCS+= ib_statustext.c cl_nodenamemap.c + +VERSION_MAP= ${COMPLIBDIR}/libosmcomp.map + +.include Index: 
contrib/ofed/usr.lib/Makefile.inc =================================================================== --- contrib/ofed/usr.lib/Makefile.inc (.../base) (revision 0) +++ contrib/ofed/usr.lib/Makefile.inc (.../head) (revision 219811) @@ -0,0 +1,17 @@ +IBMGMT= ${.CURDIR}/../../management +IBCOMMONDIR= ${IBMGMT}/libibcommon +IBMADDIR= ${IBMGMT}/libibmad +UMADDIR= ${IBMGMT}/libibumad +OPENSMDIR= ${IBMGMT}/opensm +COMPLIBDIR= ${OPENSMDIR}/complib +VENDORLIBDIR= ${OPENSMDIR}/libvendor +IBVERBSDIR= ${.CURDIR}/../../libibverbs +IBINC= ${.CURDIR}/../../include + +CFLAGS+= -I${.CURDIR} -I${IBINC}/infiniband +CFLAGS+= -I${IBCOMMONDIR}/include/infiniband +CFLAGS+= -I${IBMADDIR}/include/infiniband +CFLAGS+= -I${UMADDIR}/include/infiniband +CFLAGS+= -I${OPENSMDIR}/include +# CFLAGS+= -I${UMADDIR}/include +# CFLAGS+= -I${IBVERBSDIR}/include Index: contrib/ofed/usr.lib/libibumad/config.h =================================================================== --- contrib/ofed/usr.lib/libibumad/config.h (.../base) (revision 0) +++ contrib/ofed/usr.lib/libibumad/config.h (.../head) (revision 219811) @@ -0,0 +1 @@ +#include Index: contrib/ofed/usr.lib/libibumad/Makefile =================================================================== --- contrib/ofed/usr.lib/libibumad/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/libibumad/Makefile (.../head) (revision 219811) @@ -0,0 +1,22 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include +.include "../Makefile.inc" + +IBSRCDIR= ${UMADDIR}/src + +.PATH: ${IBSRCDIR} + +LIB= ibumad +SHLIB_MAJOR= 1 +NO_PROFILE= + +SRCS= umad.c + +CFLAGS+= -DHAVE_CONFIG_H + +VERSION_MAP= ${IBSRCDIR}/libibumad.map + +.include Index: contrib/ofed/usr.lib/libopensm/Makefile =================================================================== --- contrib/ofed/usr.lib/libopensm/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/libopensm/Makefile (.../head) (revision 219811) @@ -0,0 +1,18 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include +.include "../Makefile.inc" + +.PATH: ${OPENSMDIR}/opensm + +LIB= opensm +SHLIB_MAJOR= 1 +NO_PROFILE= + +SRCS= osm_log.c osm_mad_pool.c osm_helper.c + +VERSION_MAP= ${OPENSMDIR}/opensm/libopensm.map + +.include Index: contrib/ofed/usr.lib/libibcommon/Makefile =================================================================== --- contrib/ofed/usr.lib/libibcommon/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/libibcommon/Makefile (.../head) (revision 219811) @@ -0,0 +1,20 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include +.include "../Makefile.inc" + +IBSRCDIR= ${IBCOMMONDIR}/src + +.PATH: ${IBSRCDIR} + +LIB= ibcommon +SHLIB_MAJOR= 1 +NO_PROFILE= + +SRCS= sysfs.c util.c hash.c stack.c time.c + +VERSION_MAP= ${IBSRCDIR}/libibcommon.map + +.include Index: contrib/ofed/usr.lib/librdmacm/Makefile =================================================================== --- contrib/ofed/usr.lib/librdmacm/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/librdmacm/Makefile (.../head) (revision 219811) @@ -0,0 +1,33 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include + +RDMACMDIR= ${.CURDIR}/../../librdmacm +RDMASRCDIR= ${RDMACMDIR}/src +RDMAMANDIR= ${RDMACMDIR}/man + +.PATH: ${RDMASRCDIR} ${RDMAMANDIR} + +LIB= rdmacm +SHLIB_MAJOR= 1 +NO_PROFILE= + +SRCS= cma.c + +MAN= rdma_get_devices.3 mckey.1 rdma_get_dst_port.3 rdma_accept.3 +MAN+= rdma_get_local_addr.3 rdma_ack_cm_event.3 rdma_get_peer_addr.3 +MAN+= rdma_bind_addr.3 rdma_get_src_port.3 rdma_cm.7 rdma_join_multicast.3 +MAN+= rdma_connect.3 rdma_leave_multicast.3 rdma_create_event_channel.3 
+MAN+= rdma_listen.3 rdma_create_id.3 rdma_migrate_id.3 rdma_create_qp.3 +MAN+= rdma_notify.3 rdma_destroy_event_channel.3 rdma_reject.3 +MAN+= rdma_destroy_id.3 rdma_resolve_addr.3 rdma_destroy_qp.3 +MAN+= rdma_resolve_route.3 rdma_disconnect.3 rdma_set_option.3 +MAN+= rdma_event_str.3 rping.1 rdma_free_devices.3 ucmatose.1 +MAN+= rdma_get_cm_event.3 udaddy.1 + + +VERSION_MAP= ${RDMASRCDIR}/librdmacm.map + +.include Index: contrib/ofed/usr.lib/libmlx4/config.h =================================================================== --- contrib/ofed/usr.lib/libmlx4/config.h (.../base) (revision 0) +++ contrib/ofed/usr.lib/libmlx4/config.h (.../head) (revision 219811) @@ -0,0 +1,4 @@ +#define HAVE_IBV_DONTFORK_RANGE +#define HAVE_IBV_DOFORK_RANGE +#define HAVE_IBV_REGISTER_DRIVER +#define HAVE_IBV_READ_SYSFS_FILE Index: contrib/ofed/usr.lib/libmlx4/Makefile =================================================================== --- contrib/ofed/usr.lib/libmlx4/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/libmlx4/Makefile (.../head) (revision 219811) @@ -0,0 +1,24 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include + +MLX4DIR= ${.CURDIR}/../../libmlx4 +IBVERBSDIR= ${.CURDIR}/../../libibverbs +MLXSRCDIR= ${MLX4DIR}/src + +.PATH: ${MLXSRCDIR} + +LIB= mlx4 +SHLIB_MAJOR= 1 +NO_PROFILE= + +SRCS= buf.c cq.c dbrec.c mlx4.c qp.c srq.c verbs.c + +CFLAGS+= -DHAVE_CONFIG_H +CFLAGS+= -I${.CURDIR} -I${MLXSRCDIR} -I${IBVERBSDIR}/include + +VERSION_MAP= ${MLXSRCDIR}/mlx4.map + +.include Index: contrib/ofed/usr.lib/Makefile =================================================================== --- contrib/ofed/usr.lib/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/Makefile (.../head) (revision 219811) @@ -0,0 +1,6 @@ +.include + +SUBDIR = libibcommon libibmad libibumad libibverbs libmlx4 libmthca +SUBDIR += libopensm libosmcomp libosmvendor libibcm librdmacm libsdp + +.include Index: contrib/ofed/usr.lib/libsdp/Makefile =================================================================== --- contrib/ofed/usr.lib/libsdp/Makefile (.../base) (revision 0) +++ contrib/ofed/usr.lib/libsdp/Makefile (.../head) (revision 219811) @@ -0,0 +1,21 @@ +# $FreeBSD$ + +SHLIBDIR?= /usr/lib + +.include + +SDPDIR= ${.CURDIR}/../../libsdp/src + +.PATH: ${SDPDIR} + +LIB= ibsdp +SHLIB_MAJOR= 1 +NO_PROFILE= +NO_MAN= + +SRCS= log.c match.c port.c config_parser.c config_scanner.c + +CFLAGS+= -DSYSCONFDIR=\"/etc\" +CFLAGS+= -I${OFEDSYS}/include + +.include Property changes on: contrib/ofed ___________________________________________________________________ Added: svn:mergeinfo Merged /projects/quota64/contrib/ofed:r184125-207707 Merged /projects/ofed/base/contrib/ofed:r209517-219808 Merged /vendor/resolver/dist/contrib/ofed:r1540-186085 Property changes on: contrib/netcat ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/netcat:r207767-219808 Property changes on: contrib/gnu-sort ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/gnu-sort:r207767-219808 Property changes on: contrib/file ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/file:r207767-219808 Property changes on: contrib/bzip2 ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/bzip2:r207767-219808 Property changes on: contrib/tzcode/zic 
___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/tzcode/zic:r207767-219808 Property changes on: contrib/tzcode/stdtime ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/tzcode/stdtime:r207767-219808 Property changes on: contrib/binutils ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/binutils:r207767-219808 Property changes on: contrib/ee ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/ee:r207767-219808 Property changes on: contrib/gdb ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/gdb:r207767-219808 Property changes on: contrib/sendmail ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/sendmail:r207767-219808 Property changes on: contrib/gdtoa ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/gdtoa:r207767-219808 Property changes on: contrib/dialog ___________________________________________________________________ Modified: svn:mergeinfo Merged /head/contrib/dialog:r207766-209025 Merged /vendor/resolver/dist/contrib/dialog:r1540-186085 Merged /projects/quota64/contrib/dialog:r184125-207707 Merged /projects/ofed/base/contrib/dialog:r207767-219808 Property changes on: contrib/xz ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /head/contrib/xz:r207842-216915 Merged /projects/ofed/base/contrib/xz:r211851-219808 Property changes on: contrib/top/install-sh ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/top/install-sh:r207767-219808 Property changes on: contrib/top ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/top:r207767-219808 Property changes on: contrib/bind9 ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/bind9:r207767-219808 Property changes on: contrib/llvm/tools/clang ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /head/contrib/llvm/tools/clang:r208954-216915 Merged /projects/ofed/base/contrib/llvm/tools/clang:r211851-219808 Property changes on: contrib/llvm ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /head/contrib/llvm:r208954-216915 Merged /projects/ofed/base/contrib/llvm:r211851-219808 Property changes on: contrib/tcpdump ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/tcpdump:r207767-219808 Property changes on: contrib/one-true-awk ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/contrib/one-true-awk:r207767-219808 Property changes on: share/zoneinfo ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/share/zoneinfo:r207767-219808 Index: share/mk/bsd.own.mk 
=================================================================== --- share/mk/bsd.own.mk (.../base) (revision 219811) +++ share/mk/bsd.own.mk (.../head) (revision 219811) @@ -373,6 +373,7 @@ NS_CACHING \ NTP \ OBJC \ + OFED \ OPENSSH \ OPENSSL \ PAM \ Property changes on: share/mk/bsd.arch.inc.mk ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /head/share/mk/bsd.arch.inc.mk:r209026-216915 Merged /projects/ofed/base/share/mk/bsd.arch.inc.mk:r216918-219808 Property changes on: usr.sbin/ndiscvt ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/usr.sbin/ndiscvt:r207767-219808 Merged /head/usr.sbin/ndiscvt:r207766-209025 Index: usr.sbin/ndp/ndp.c =================================================================== --- usr.sbin/ndp/ndp.c (.../base) (revision 219811) +++ usr.sbin/ndp/ndp.c (.../head) (revision 219811) @@ -822,11 +822,15 @@ ether_str(struct sockaddr_dl *sdl) { static char hbuf[NI_MAXHOST]; + char *cp; - if (sdl->sdl_alen > 0) + if (sdl->sdl_alen == ETHER_ADDR_LEN) { strlcpy(hbuf, ether_ntoa((struct ether_addr *)LLADDR(sdl)), sizeof(hbuf)); - else + } else if (sdl->sdl_alen) { + int n = sdl->sdl_nlen > 0 ? sdl->sdl_nlen + 1 : 0; + snprintf(hbuf, sizeof(hbuf), "%s", link_ntoa(sdl) + n); + } else snprintf(hbuf, sizeof(hbuf), "(incomplete)"); return(hbuf); Index: usr.sbin/config/mkmakefile.c =================================================================== --- usr.sbin/config/mkmakefile.c (.../base) (revision 219811) +++ usr.sbin/config/mkmakefile.c (.../head) (revision 219811) @@ -312,6 +312,7 @@ struct device *dp; struct opt *op; char *wd, *this, *compilewith, *depends, *clean, *warning; + const char *objprefix; int compile, match, nreqs, std, filetype, imp_rule, no_obj, before_depend, mandatory, nowerror; @@ -326,6 +327,7 @@ * [ compile-with "compile rule" [no-implicit-rule] ] * [ dependency "dependency-list"] [ before-depend ] * [ clean "file-list"] [ warning "text warning" ] + * [ obj-prefix "file prefix"] */ wd = get_word(fp); if (wd == (char *)EOF) { @@ -373,6 +375,7 @@ before_depend = 0; nowerror = 0; filetype = NORMAL; + objprefix = ""; if (eq(wd, "standard")) { std = 1; /* @@ -467,6 +470,16 @@ warning = ns(wd); goto nextparam; } + if (eq(wd, "obj-prefix")) { + next_quoted_word(fp, wd); + if (wd == 0) { + printf("%s: %s missing object prefix string.\n", + fname, this); + exit(1); + } + objprefix = ns(wd); + goto nextparam; + } nreqs++; if (eq(wd, "local")) { filetype = LOCAL; @@ -535,6 +548,7 @@ tp->f_depends = depends; tp->f_clean = clean; tp->f_warn = warning; + tp->f_objprefix = objprefix; goto next; } @@ -619,11 +633,12 @@ cp = sp + (len = strlen(sp)) - 1; och = *cp; *cp = 'o'; + len += strlen(tp->f_objprefix); if (len + lpos > 72) { lpos = 8; fprintf(fp, "\\\n\t"); } - fprintf(fp, "%s ", sp); + fprintf(fp, "%s%s ", tp->f_objprefix, sp); lpos += len + 1; *cp = och; } @@ -699,30 +714,33 @@ och = *cp; if (ftp->f_flags & NO_IMPLCT_RULE) { if (ftp->f_depends) - fprintf(f, "%s: %s\n", np, ftp->f_depends); + fprintf(f, "%s%s: %s\n", + ftp->f_objprefix, np, ftp->f_depends); else - fprintf(f, "%s: \n", np); + fprintf(f, "%s%s: \n", ftp->f_objprefix, np); } else { *cp = '\0'; if (och == 'o') { - fprintf(f, "%so:\n\t-cp $S/%so .\n\n", - tail(np), np); + fprintf(f, "%s%so:\n\t-cp $S/%so .\n\n", + ftp->f_objprefix, tail(np), np); continue; } if (ftp->f_depends) { - fprintf(f, "%sln: $S/%s%c %s\n", tail(np), - np, och, ftp->f_depends); + fprintf(f, 
"%s%sln: $S/%s%c %s\n", + ftp->f_objprefix, tail(np), np, och, + ftp->f_depends); fprintf(f, "\t${NORMAL_LINT}\n\n"); - fprintf(f, "%so: $S/%s%c %s\n", tail(np), - np, och, ftp->f_depends); + fprintf(f, "%s%so: $S/%s%c %s\n", + ftp->f_objprefix, tail(np), np, och, + ftp->f_depends); } else { - fprintf(f, "%sln: $S/%s%c\n", tail(np), - np, och); + fprintf(f, "%s%sln: $S/%s%c\n", + ftp->f_objprefix, tail(np), np, och); fprintf(f, "\t${NORMAL_LINT}\n\n"); - fprintf(f, "%so: $S/%s%c\n", tail(np), - np, och); + fprintf(f, "%s%so: $S/%s%c\n", + ftp->f_objprefix, tail(np), np, och); } } compilewith = ftp->f_compilewith; @@ -750,7 +768,10 @@ compilewith = cmd; } *cp = och; - fprintf(f, "\t%s\n\n", compilewith); + if (strlen(ftp->f_objprefix)) + fprintf(f, "\t%s $S/%s\n\n", compilewith, np); + else + fprintf(f, "\t%s\n\n", compilewith); } } Index: usr.sbin/config/config.h =================================================================== --- usr.sbin/config/config.h (.../base) (revision 219811) +++ usr.sbin/config/config.h (.../head) (revision 219811) @@ -53,6 +53,7 @@ char *f_depends; /* additional dependancies */ char *f_clean; /* File list to add to clean rule */ char *f_warn; /* warning message */ + const char *f_objprefix; /* prefix string for object name */ }; struct files_name { Property changes on: usr.sbin/zic ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/usr.sbin/zic:r207767-219808 Property changes on: cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/cddl/contrib/opensolaris:r207767-219808 Index: Makefile.inc1 =================================================================== --- Makefile.inc1 (.../base) (revision 219811) +++ Makefile.inc1 (.../head) (revision 219811) @@ -76,6 +76,9 @@ SUBDIR+=share .endif SUBDIR+=sys usr.bin usr.sbin +.if ${MK_OFED} != "no" +SUBDIR+=contrib/ofed +.endif # # We must do etc/ last for install/distribute to work. 
# @@ -1210,8 +1213,12 @@ _lib_libthr= lib/libthr .endif -_generic_libs= ${_cddl_lib} gnu/lib ${_kerberos5_lib} lib ${_secure_lib} usr.bin/lex/lib +.if ${MK_OFED} != "no" +_ofed_lib= contrib/ofed/usr.lib/ +.endif +_generic_libs= ${_cddl_lib} gnu/lib ${_kerberos5_lib} lib ${_secure_lib} usr.bin/lex/lib ${_ofed_lib} + lib/libopie__L lib/libtacplus__L: lib/libmd__L .if ${MK_CDDL} != "no" Property changes on: lib/libutil ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/lib/libutil:r207767-219808 Property changes on: lib/libc/stdtime ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/lib/libc/stdtime:r207767-219808 Property changes on: lib/libc ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/lib/libc:r207767-219808 Property changes on: lib/libz ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/lib/libz:r207767-219808 Index: etc/rc.d/Makefile =================================================================== --- etc/rc.d/Makefile (.../base) (revision 219811) +++ etc/rc.d/Makefile (.../head) (revision 219811) @@ -42,6 +42,10 @@ ypset ypupdated ypxfrd \ zfs zvol +.if ${MK_OFED} != "no" +FILES+= opensm +.endif + .if ${MK_OPENSSH} != "no" FILES+= sshd .endif Index: etc/rc.d/opensm =================================================================== --- etc/rc.d/opensm (.../base) (revision 0) +++ etc/rc.d/opensm (.../head) (revision 219811) @@ -0,0 +1,28 @@ +#!/bin/sh +# +# $FreeBSD$ +# + +# PROVIDE: opensm +# BEFORE: netif +# REQUIRE: FILESYSTEMS + +. /etc/rc.subr + +name="opensm" +start_cmd="opensm_start" +rcvar="opensm_enable" + +command=/usr/bin/opensm +command_args="-B" + +opensm_start() +{ + for guid in `ibstat | grep "Port GUID" | cut -d ':' -f2`; do + [ -z "${rc_quiet}" ] && echo "Starting ${guid} opensm." + ${command} ${command_args} -g ${guid} >> /dev/null + done +} + +load_rc_config $name +run_rc_command $* Property changes on: etc/rc.d/opensm ___________________________________________________________________ Added: svn:executable + * Index: etc/defaults/rc.conf =================================================================== --- etc/defaults/rc.conf (.../base) (revision 219811) +++ etc/defaults/rc.conf (.../head) (revision 219811) @@ -647,6 +647,7 @@ newsyslog_enable="YES" # Run newsyslog at startup. newsyslog_flags="-CN" # Newsyslog flags to create marked files mixer_enable="YES" # Run the sound mixer. +opensm_enable="NO" # Opensm(8) for infiniband devices defaults to off ############################################################## ### Jail Configuration ####################################### Index: etc/mtree/BSD.var.dist =================================================================== --- etc/mtree/BSD.var.dist (.../base) (revision 219811) +++ etc/mtree/BSD.var.dist (.../head) (revision 219811) @@ -22,6 +22,8 @@ /set gname=wheel backups .. + cache + .. crash .. cron Index: etc/mtree/BSD.include.dist =================================================================== --- etc/mtree/BSD.include.dist (.../base) (revision 219811) +++ etc/mtree/BSD.include.dist (.../head) (revision 219811) @@ -207,6 +207,16 @@ .. gssapi .. + infiniband + complib + .. + iba + .. + opensm + .. + vendor + .. + .. isofs cd9660 .. @@ -275,6 +285,8 @@ .. protocols .. + rdma + .. readline .. 
rpc Index: sys/conf/kern.pre.mk =================================================================== --- sys/conf/kern.pre.mk (.../base) (revision 219811) +++ sys/conf/kern.pre.mk (.../head) (revision 219811) @@ -142,6 +142,14 @@ NORMAL_LINT= ${LINT} ${LINTFLAGS} ${CFLAGS:M-[DIU]*} ${.IMPSRC} +# Infiniband C flags. Correct include paths and omit errors that linux +# does not honor. +OFEDINCLUDES= -I$S/ofed/include/ +OFEDNOERR= -Wno-cast-qual -Wno-pointer-arith -fms-extensions +OFEDCFLAGS= ${CFLAGS:N-I*} ${OFEDINCLUDES} ${CFLAGS:M-I*} ${OFEDNOERR} +OFED_C_NOIMP= ${CC} -c -o ${.TARGET} ${OFEDCFLAGS} ${WERROR} ${PROF} +OFED_C= ${OFED_C_NOIMP} ${.IMPSRC} + GEN_CFILES= $S/$M/$M/genassym.c ${MFILES:T:S/.m$/.c/} SYSTEM_CFILES= config.c env.c hints.c vnode_if.c SYSTEM_DEP= Makefile ${SYSTEM_OBJS} Index: sys/conf/files =================================================================== --- sys/conf/files (.../base) (revision 219811) +++ sys/conf/files (.../head) (revision 219811) @@ -2791,6 +2791,281 @@ nlm/nlm_prot_svc.c optional nfslockd | nfsd nlm/nlm_prot_xdr.c optional nfslockd | nfsd nlm/sm_inter_xdr.c optional nfslockd | nfsd + +# OpenFabrics Enterprise Distribution (Infiniband) +ofed/include/linux/linux_compat.c optional ofed \ + no-depend compile-with "${OFED_C}" +ofed/include/linux/linux_idr.c optional ofed \ + no-depend compile-with "${OFED_C}" +ofed/include/linux/linux_radix.c optional ofed \ + no-depend compile-with "${OFED_C}" +ofed/drivers/infiniband/core/addr.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/agent.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/cache.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +# XXX Mad.c must be ordered before cm.c for sysinit sets to occur in +# the correct order. 
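+# (Presumably: config(8) emits objects in the order they are listed
+# here, and SYSINITs with equal subsystem/order priorities fire in
+# link order, so listing mad.c first makes its initialization run
+# before cm.c's.)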
+ofed/drivers/infiniband/core/mad.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/cm.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/cma.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/device.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/fmr_pool.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/iwcm.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/local_sa.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/mad_rmpp.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/multicast.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/notice.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/packer.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/sa_query.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/smi.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/sysfs.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/ucm.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/ucma.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/ud_header.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/umem.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/user_mad.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/uverbs_cmd.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/uverbs_main.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/uverbs_marshall.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/verbs.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" + +ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c optional ipoib \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" +#ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c optional ipoib \ +# no-depend \ +# compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" +ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c optional ipoib \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" +ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c optional ipoib \ + no-depend \ + compile-with 
"${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" +ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c optional ipoib \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" +ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c optional ipoib \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" +#ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c optional ipoib \ +# no-depend \ +# compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/ipoib/" + +ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c optional sdp \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/" +ofed/drivers/infiniband/ulp/sdp/sdp_main.c optional sdp \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/" +ofed/drivers/infiniband/ulp/sdp/sdp_rx.c optional sdp \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/" +ofed/drivers/infiniband/ulp/sdp/sdp_cma.c optional sdp \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/" +ofed/drivers/infiniband/ulp/sdp/sdp_tx.c optional sdp \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/ulp/sdp/" + +ofed/drivers/infiniband/hw/mlx4/ah.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/cq.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/doorbell.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/mad.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/main.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/mr.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/qp.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/srq.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" +ofed/drivers/infiniband/hw/mlx4/wc.c optional mlx4ib \ + no-depend obj-prefix "mlx4ib_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/infiniband/hw/mlx4/" + +ofed/drivers/net/mlx4/alloc.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/catas.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/cmd.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/cq.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/eq.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/fw.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" 
+ofed/drivers/net/mlx4/icm.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/intf.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/main.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/mcg.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/mr.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/pd.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/port.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/profile.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/qp.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/reset.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/sense.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/srq.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/xrcd.c optional mlx4ib | mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" + +ofed/drivers/net/mlx4/en_cq.c optional mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/en_frag.c optional mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/en_main.c optional mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/en_netdev.c optional mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/en_port.c optional mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/en_resources.c optional mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/en_rx.c optional mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" +ofed/drivers/net/mlx4/en_tx.c optional mlxen \ + no-depend obj-prefix "mlx4_" \ + compile-with "${OFED_C_NOIMP} -I$S/ofed/drivers/net/mlx4/" + +ofed/drivers/infiniband/hw/mthca/mthca_allocator.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_av.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_catas.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" 
+ofed/drivers/infiniband/hw/mthca/mthca_cmd.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_cq.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_eq.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_mad.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_main.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_mcg.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_memfree.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_mr.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_pd.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_profile.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_provider.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_qp.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_reset.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_srq.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" +ofed/drivers/infiniband/hw/mthca/mthca_uar.c optional mthca \ + no-depend compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/mthca/" + # crypto support opencrypto/cast.c optional crypto | ipsec opencrypto/criov.c optional crypto Index: sys/conf/options =================================================================== --- sys/conf/options (.../base) (revision 219811) +++ sys/conf/options (.../head) (revision 219811) @@ -862,3 +862,11 @@ # Flattened device tree options FDT opt_platform.h FDT_DTB_STATIC opt_platform.h + +# OFED Infiniband stack +OFED opt_ofed.h +OFED_DEBUG_INIT opt_ofed.h +SDP opt_ofed.h +SDP_DEBUG opt_ofed.h +IPOIB_DEBUG opt_ofed.h +IPOIB_CM opt_ofed.h Property changes on: sys/conf ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys/conf:r207767-219808 Merged /head/sys/conf:r207766-209025 Index: sys/kern/kern_jail.c =================================================================== --- sys/kern/kern_jail.c (.../base) (revision 219811) +++ sys/kern/kern_jail.c (.../head) (revision 219811) @@ -4182,7 +4182,7 @@ i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: - snprintf(numbuf, sizeof(numbuf), "%d", arg2); + snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: Index: sys/kern/subr_bus.c =================================================================== --- sys/kern/subr_bus.c (.../base) (revision 219811) +++ sys/kern/subr_bus.c (.../head) (revision 219811) @@ -1038,7 +1038,7 @@ * @param dc the devclass to edit * @param
driver the driver to register */ -static int +int devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp) { driverlink_t dl; @@ -1172,7 +1172,7 @@ * @param dc the devclass to edit * @param driver the driver to unregister */ -static int +int devclass_delete_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); Index: sys/kern/kern_sx.c =================================================================== --- sys/kern/kern_sx.c (.../base) (revision 219811) +++ sys/kern/kern_sx.c (.../head) (revision 219811) @@ -194,7 +194,7 @@ { struct sx_args *sargs = arg; - sx_init(sargs->sa_sx, sargs->sa_desc); + sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags); } void Index: sys/kern/kern_intr.c =================================================================== --- sys/kern/kern_intr.c (.../base) (revision 219811) +++ sys/kern/kern_intr.c (.../head) (revision 219811) @@ -74,6 +74,7 @@ /* Interrupt thread flags kept in it_flags */ #define IT_DEAD 0x000001 /* Thread is waiting to exit. */ +#define IT_WAIT 0x000002 /* Thread is waiting for completion. */ struct intr_entropy { struct thread *td; @@ -735,6 +736,39 @@ return (ie->ie_source); } +/* + * Sleep until an ithread finishes executing an interrupt handler. + * + * XXX Doesn't currently handle interrupt filters or fast interrupt + * handlers. This is intended for compatibility with linux drivers + * only. Do not use in BSD code. + */ +void +_intr_drain(int irq) +{ + struct mtx *mtx; + struct intr_event *ie; + struct intr_thread *ithd; + struct thread *td; + + ie = intr_lookup(irq); + if (ie == NULL) + return; + if (ie->ie_thread == NULL) + return; + ithd = ie->ie_thread; + td = ithd->it_thread; + thread_lock(td); + mtx = td->td_lock; + if (!TD_AWAITING_INTR(td)) { + ithd->it_flags |= IT_WAIT; + msleep_spin(ithd, mtx, "isync", 0); + } + mtx_unlock_spin(mtx); + return; +} + + #ifndef INTR_FILTER int intr_event_remove_handler(void *cookie) @@ -1271,6 +1305,7 @@ struct intr_event *ie; struct thread *td; struct proc *p; + int wake; td = curthread; p = td->td_proc; @@ -1279,6 +1314,7 @@ ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; + wake = 0; /* * As long as we have interrupts outstanding, go through the @@ -1319,12 +1355,20 @@ * set again, so we have to check it again. */ thread_lock(td); - if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) { + if (!ithd->it_need && !(ithd->it_flags & (IT_DEAD | IT_WAIT))) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT, NULL); } + if (ithd->it_flags & IT_WAIT) { + wake = 1; + ithd->it_flags &= ~IT_WAIT; + } thread_unlock(td); + if (wake) { + wakeup(ithd); + wake = 0; + } } } @@ -1439,6 +1483,7 @@ struct thread *td; struct proc *p; int priv; + int wake; td = curthread; p = td->td_proc; @@ -1449,6 +1494,7 @@ ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; + wake = 0; /* * As long as we have interrupts outstanding, go through the @@ -1492,12 +1538,20 @@ * set again, so we have to check it again. 
*/ thread_lock(td); - if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) { + if (!ithd->it_need && !(ithd->it_flags & (IT_DEAD | IT_WAIT))) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT, NULL); } + if (ithd->it_flags & IT_WAIT) { + wake = 1; + ithd->it_flags &= ~IT_WAIT; + } thread_unlock(td); + if (wake) { + wakeup(ithd); + wake = 0; + } } } Index: sys/kern/kern_sysctl.c =================================================================== --- sys/kern/kern_sysctl.c (.../base) (revision 219811) +++ sys/kern/kern_sysctl.c (.../head) (revision 219811) @@ -365,10 +365,31 @@ return (error); } +int +sysctl_remove_name(struct sysctl_oid *parent, const char *name, + int del, int recurse) +{ + struct sysctl_oid *p, *tmp; + int error; + + error = ENOENT; + SYSCTL_XLOCK(); + SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) { + if (strcmp(p->oid_name, name) == 0) { + error = sysctl_remove_oid_locked(p, del, recurse); + break; + } + } + SYSCTL_XUNLOCK(); + + return (error); +} + + static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse) { - struct sysctl_oid *p; + struct sysctl_oid *p, *tmp; int error; SYSCTL_ASSERT_XLOCKED(); @@ -387,7 +408,8 @@ */ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oidp->oid_refcnt == 1) { - SLIST_FOREACH(p, SYSCTL_CHILDREN(oidp), oid_link) { + SLIST_FOREACH_SAFE(p, + SYSCTL_CHILDREN(oidp), oid_link, tmp) { if (!recurse) return (ENOTEMPTY); error = sysctl_remove_oid_locked(p, del, @@ -428,14 +450,13 @@ } return (0); } - /* * Create new sysctls at run time. * clist may point to a valid context initialized with sysctl_ctx_init(). */ struct sysctl_oid * sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, - int number, const char *name, int kind, void *arg1, int arg2, + int number, const char *name, int kind, void *arg1, intptr_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr) { struct sysctl_oid *oidp; @@ -479,6 +500,7 @@ SYSCTL_CHILDREN_SET(oidp, malloc(sizeof(struct sysctl_oid_list), M_SYSCTLOID, M_WAITOK)); SLIST_INIT(SYSCTL_CHILDREN(oidp)); + oidp->oid_arg2 = arg2; } else { oidp->oid_arg1 = arg1; oidp->oid_arg2 = arg2; Property changes on: sys/boot/powerpc/boot1.chrp ___________________________________________________________________ Modified: svn:mergeinfo Merged /head/sys/boot/powerpc/boot1.chrp:r2-4 Merged /head/sys/contrib/dev/acpica/boot/powerpc/boot1.chrp:r207340 Merged /projects/ofed/base/sys/boot/powerpc/boot1.chrp:r216918-219808 Property changes on: sys/boot/powerpc/ofw ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys/boot/powerpc/ofw:r207767-219808 Merged /head/sys/boot/powerpc/ofw:r207766-209025 Property changes on: sys/boot/i386/efi ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys/boot/i386/efi:r207767-219808 Merged /head/sys/boot/i386/efi:r207766-209025 Property changes on: sys/boot/ia64/efi ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys/boot/ia64/efi:r207767-219808 Merged /head/sys/boot/ia64/efi:r207766-209025 Property changes on: sys/boot/ia64/ski ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys/boot/ia64/ski:r207767-219808 Merged /head/sys/boot/ia64/ski:r207766-209025
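The new _intr_drain() is the primitive behind the Linux-compat synchronize_irq() defined later in this patch (sys/ofed/include/linux/hardirq.h): it flags the ithread with IT_WAIT and sleeps until ithread_loop() finishes its current pass over the handlers and issues the wakeup. A minimal sketch of the intended call pattern in a detach path; the softc layout and the quiesce helper are invented for illustration:

/*
 * Hypothetical teardown: stop the hardware from raising new interrupts,
 * then drain the ithread so no handler still references the state that
 * is about to be freed.
 */
static void
mydev_detach(struct mydev_softc *sc)
{
	mydev_disable_interrupts(sc);	/* illustrative quiesce step */
	_intr_drain(sc->sc_irq);	/* sleep until the ithread is idle */
	bus_teardown_intr(sc->sc_dev, sc->sc_irq_res, sc->sc_cookie);
	free(sc->sc_rings, M_DEVBUF);	/* now safe to release shared state */
}

Similarly, the sysctl_remove_name() addition lets code delete a child oid by name rather than by pointer; a hedged usage sketch, with the parent oid and entry name invented:

/* Remove the child named "stats" under my_node, freeing the oid
 * (del = 1) and recursing through any descendants (recurse = 1). */
error = sysctl_remove_name(my_node, "stats", 1, 1);
if (error == ENOENT)
	printf("no oid named \"stats\" under this node\n");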
Property changes on: sys/boot ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys/boot:r207767-219808 Merged /head/sys/boot:r207766-209025 Index: sys/netinet/if_ether.c =================================================================== --- sys/netinet/if_ether.c (.../base) (revision 219811) +++ sys/netinet/if_ether.c (.../head) (revision 219811) @@ -441,7 +441,8 @@ if (ntohs(ar->ar_hrd) != ARPHRD_ETHER && ntohs(ar->ar_hrd) != ARPHRD_IEEE802 && ntohs(ar->ar_hrd) != ARPHRD_ARCNET && - ntohs(ar->ar_hrd) != ARPHRD_IEEE1394) { + ntohs(ar->ar_hrd) != ARPHRD_IEEE1394 && + ntohs(ar->ar_hrd) != ARPHRD_INFINIBAND) { log(LOG_ERR, "arp: unknown hardware address format (0x%2D)\n", (unsigned char *)&ar->ar_hrd, ""); m_freem(m); Index: sys/modules/mlx4/Makefile =================================================================== --- sys/modules/mlx4/Makefile (.../base) (revision 0) +++ sys/modules/mlx4/Makefile (.../head) (revision 219811) @@ -0,0 +1,14 @@ +#$FreeBSD$ +.PATH: ${.CURDIR}/../../ofed/drivers/net/mlx4 +KMOD = mlx4 +SRCS = device_if.h bus_if.h pci_if.h vnode_if.h +SRCS+= alloc.c catas.c cmd.c cq.c eq.c fw.c icm.c intf.c main.c mcg.c mr.c +SRCS+= pd.c port.c profile.c qp.c reset.c sense.c srq.c xrcd.c + +CFLAGS+= -I${.CURDIR}/../../ofed/drivers/net/mlx4 +CFLAGS+= -I${.CURDIR}/../../ofed/include/ +CFLAGS+= -DINET6 + +.include <bsd.kmod.mk> + +CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions Index: sys/modules/mthca/Makefile =================================================================== --- sys/modules/mthca/Makefile (.../base) (revision 0) +++ sys/modules/mthca/Makefile (.../head) (revision 219811) @@ -0,0 +1,14 @@ +#$FreeBSD$ +.PATH: ${.CURDIR}/../../ofed/drivers/infiniband/hw/mthca +KMOD = mthca +SRCS = device_if.h bus_if.h pci_if.h vnode_if.h +SRCS+= mthca_allocator.c mthca_av.c mthca_catas.c mthca_cmd.c mthca_cq.c +SRCS+= mthca_eq.c mthca_mad.c mthca_main.c mthca_mcg.c mthca_memfree.c +SRCS+= mthca_mr.c mthca_pd.c mthca_profile.c mthca_provider.c mthca_qp.c +SRCS+= mthca_reset.c mthca_srq.c mthca_uar.c + +CFLAGS+= -I${.CURDIR}/../../ofed/include/ -DINET6 + +.include <bsd.kmod.mk> + +CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions Index: sys/modules/Makefile =================================================================== --- sys/modules/Makefile (.../base) (revision 219811) +++ sys/modules/Makefile (.../head) (revision 219811) @@ -185,6 +185,9 @@ mfi \ mii \ mlx \ + mlx4 \ + mlx4ib \ + mlxen \ ${_mly} \ mmc \ mmcsd \ @@ -195,6 +198,7 @@ msdosfs_iconv \ ${_mse} \ msk \ + mthca \ mvs \ mwl \ mwlfw \ Index: sys/modules/mlx4ib/Makefile =================================================================== --- sys/modules/mlx4ib/Makefile (.../base) (revision 0) +++ sys/modules/mlx4ib/Makefile (.../head) (revision 219811) @@ -0,0 +1,11 @@ +#$FreeBSD$ +.PATH: ${.CURDIR}/../../ofed/drivers/infiniband/hw/mlx4 +KMOD = mlx4ib +SRCS = device_if.h bus_if.h pci_if.h vnode_if.h +SRCS+= ah.c cq.c doorbell.c mad.c main.c mr.c qp.c srq.c wc.c + +CFLAGS+= -I${.CURDIR}/../../ofed/include/ -DINET6 + +.include <bsd.kmod.mk> + +CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions Index: sys/modules/mlxen/Makefile =================================================================== --- sys/modules/mlxen/Makefile (.../base) (revision 0) +++ sys/modules/mlxen/Makefile (.../head) (revision 219811) @@ -0,0 +1,13 @@ +#$FreeBSD$ +.PATH: ${.CURDIR}/../../ofed/drivers/net/mlx4 +KMOD = mlxen +SRCS = device_if.h bus_if.h pci_if.h vnode_if.h +SRCS += en_cq.c en_frag.c en_main.c en_netdev.c
en_port.c en_resources.c +SRCS += en_rx.c en_tx.c +CFLAGS+= -I${.CURDIR}/../../ofed/drivers/net/mlx4 +CFLAGS+= -I${.CURDIR}/../../ofed/include/ +CFLAGS+= -DINET6 + +.include <bsd.kmod.mk> + +CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions Index: sys/ofed/include/linux/dmapool.h =================================================================== --- sys/ofed/include/linux/dmapool.h (.../base) (revision 0) +++ sys/ofed/include/linux/dmapool.h (.../head) (revision 219811) @@ -0,0 +1,85 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_DMAPOOL_H_ +#define _LINUX_DMAPOOL_H_ + +#include +#include +#include +#include +#include + +struct dma_pool { + uma_zone_t pool_zone; +}; + +static inline struct dma_pool * +dma_pool_create(char *name, struct device *dev, size_t size, + size_t align, size_t boundary) +{ + struct dma_pool *pool; + + pool = kmalloc(sizeof(*pool), GFP_KERNEL); + align--; + /* + * XXX Eventually this could use a separate allocf to honor boundary + * and physical address requirements of the device. + */ + pool->pool_zone = uma_zcreate(name, size, NULL, NULL, NULL, NULL, + align, UMA_ZONE_OFFPAGE|UMA_ZONE_HASH); + + return (pool); +} + +static inline void +dma_pool_destroy(struct dma_pool *pool) +{ + uma_zdestroy(pool->pool_zone); + kfree(pool); +} + +static inline void * +dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle) +{ + void *vaddr; + + vaddr = uma_zalloc(pool->pool_zone, mem_flags); + if (vaddr) + *handle = vtophys(vaddr); + return (vaddr); +} + +static inline void +dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr) +{ + uma_zfree(pool->pool_zone, vaddr); +} + + +#endif /* _LINUX_DMAPOOL_H_ */
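The dmapool shim backs each Linux dma_pool with a UMA zone and derives bus addresses with vtophys(), so it serves devices whose DMA constraints a direct physical mapping already satisfies; the XXX comment above records that boundary and address-range restrictions are not yet honored. A sketch of the usual calling sequence, with the pool name, device pointer, and sizes invented for illustration:

/* Hypothetical pool of 64-byte, 64-byte-aligned DMA descriptors. */
struct dma_pool *pool;
dma_addr_t busaddr;
void *desc;

pool = dma_pool_create("mydesc", dev, 64, 64, 0);
desc = dma_pool_alloc(pool, GFP_KERNEL, &busaddr);
if (desc != NULL) {
	/* Hand busaddr to the device; use desc from the CPU side. */
	dma_pool_free(pool, desc, busaddr);
}
dma_pool_destroy(pool);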
Index: sys/ofed/include/linux/jhash.h =================================================================== --- sys/ofed/include/linux/jhash.h (.../base) (revision 0) +++ sys/ofed/include/linux/jhash.h (.../head) (revision 219811) @@ -0,0 +1,143 @@ +#ifndef _LINUX_JHASH_H_ +#define _LINUX_JHASH_H_ + +/* jhash.h: Jenkins hash support. + * + * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net) + * + * http://burtleburtle.net/bob/hash/ + * + * These are the credits from Bob's sources: + * + * lookup2.c, by Bob Jenkins, December 1996, Public Domain. + * hash(), hash2(), hash3, and mix() are externally useful functions. + * Routines to test the hash are included if SELF_TEST is defined. + * You can use this free for any purpose. It has no warranty. + * + * Copyright (C) 2003 David S. Miller (davem@redhat.com) + * + * I've modified Bob's hash to be useful in the Linux kernel, and + * any bugs present are surely my fault. -DaveM + */ + +/* NOTE: Arguments are modified. */ +#define __jhash_mix(a, b, c) \ +{ \ + a -= b; a -= c; a ^= (c>>13); \ + b -= c; b -= a; b ^= (a<<8); \ + c -= a; c -= b; c ^= (b>>13); \ + a -= b; a -= c; a ^= (c>>12); \ + b -= c; b -= a; b ^= (a<<16); \ + c -= a; c -= b; c ^= (b>>5); \ + a -= b; a -= c; a ^= (c>>3); \ + b -= c; b -= a; b ^= (a<<10); \ + c -= a; c -= b; c ^= (b>>15); \ +} + +/* The golden ratio: an arbitrary value */ +#define JHASH_GOLDEN_RATIO 0x9e3779b9 + +/* The most generic version, hashes an arbitrary sequence + * of bytes. No alignment or length assumptions are made about + * the input key. + */ +static inline u32 jhash(const void *key, u32 length, u32 initval) +{ + u32 a, b, c, len; + const u8 *k = key; + + len = length; + a = b = JHASH_GOLDEN_RATIO; + c = initval; + + while (len >= 12) { + a += (k[0] +((u32)k[1]<<8) +((u32)k[2]<<16) +((u32)k[3]<<24)); + b += (k[4] +((u32)k[5]<<8) +((u32)k[6]<<16) +((u32)k[7]<<24)); + c += (k[8] +((u32)k[9]<<8) +((u32)k[10]<<16)+((u32)k[11]<<24)); + + __jhash_mix(a,b,c); + + k += 12; + len -= 12; + } + + c += length; + switch (len) { + case 11: c += ((u32)k[10]<<24); + case 10: c += ((u32)k[9]<<16); + case 9 : c += ((u32)k[8]<<8); + case 8 : b += ((u32)k[7]<<24); + case 7 : b += ((u32)k[6]<<16); + case 6 : b += ((u32)k[5]<<8); + case 5 : b += k[4]; + case 4 : a += ((u32)k[3]<<24); + case 3 : a += ((u32)k[2]<<16); + case 2 : a += ((u32)k[1]<<8); + case 1 : a += k[0]; + }; + + __jhash_mix(a,b,c); + + return c; +} + +/* A special optimized version that handles one or more u32s. + * The length parameter here is the number of u32s in the key. + */ +static inline u32 jhash2(const u32 *k, u32 length, u32 initval) +{ + u32 a, b, c, len; + + a = b = JHASH_GOLDEN_RATIO; + c = initval; + len = length; + + while (len >= 3) { + a += k[0]; + b += k[1]; + c += k[2]; + __jhash_mix(a, b, c); + k += 3; len -= 3; + } + + c += length * 4; + + switch (len) { + case 2 : b += k[1]; + case 1 : a += k[0]; + }; + + __jhash_mix(a,b,c); + + return c; +}
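jhash() and jhash2() above reproduce the lookup2-era Jenkins mixing used by the Linux code this stack was ported from, so hash values match what the ported drivers expect. A small sketch of bucketing a multi-word key, with the key layout and table size invented for illustration:

/* Hypothetical: hash a three-word flow key into one of 256 buckets. */
struct flow_key {
	u32 saddr;
	u32 daddr;
	u32 ports;
};

static inline u32
flow_bucket(const struct flow_key *key, u32 seed)
{
	/* jhash2() takes the length in u32 words, not bytes. */
	return (jhash2((const u32 *)key, 3, seed) & 255);
}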
+ + +/* Special ultra-optimized versions that know they are hashing exactly + * 3, 2 or 1 word(s). + * + * NOTE: In particular the "c += length; __jhash_mix(a,b,c);" normally + * done at the end is not done here. + */ +static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) +{ + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += initval; + + __jhash_mix(a, b, c); + + return c; +} + +static inline u32 jhash_2words(u32 a, u32 b, u32 initval) +{ + return jhash_3words(a, b, 0, initval); +} + +static inline u32 jhash_1word(u32 a, u32 initval) +{ + return jhash_3words(a, 0, 0, initval); +} + +#endif /* _LINUX_JHASH_H_ */ Index: sys/ofed/include/linux/rwsem.h =================================================================== --- sys/ofed/include/linux/rwsem.h (.../base) (revision 0) +++ sys/ofed/include/linux/rwsem.h (.../head) (revision 219811) @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_RWSEM_H_ +#define _LINUX_RWSEM_H_ + +#include +#include +#include + +struct rw_semaphore { + struct sx sx; +}; + +#define down_write(_rw) sx_xlock(&(_rw)->sx) +#define up_write(_rw) sx_xunlock(&(_rw)->sx) +#define down_read(_rw) sx_slock(&(_rw)->sx) +#define up_read(_rw) sx_sunlock(&(_rw)->sx) +#define down_read_trylock(_rw) !!sx_try_slock(&(_rw)->sx) +#define down_write_trylock(_rw) !!sx_try_xlock(&(_rw)->sx) +#define downgrade_write(_rw) sx_downgrade(&(_rw)->sx) +#define down_read_nested(_rw, _sc) down_read(_rw) + +static inline void +init_rwsem(struct rw_semaphore *rw) +{ + + memset(&rw->sx, 0, sizeof(rw->sx)); + sx_init_flags(&rw->sx, "lnxrwsem", SX_NOWITNESS); +} + +#endif /* _LINUX_RWSEM_H_ */ Index: sys/ofed/include/linux/inet.h =================================================================== --- sys/ofed/include/linux/inet.h (.../base) (revision 0) +++ sys/ofed/include/linux/inet.h (.../head) (revision 219811) @@ -0,0 +1,31 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_INET_H_ +#define _LINUX_INET_H_ +#endif /* _LINUX_INET_H_ */ Index: sys/ofed/include/linux/init.h =================================================================== --- sys/ofed/include/linux/init.h (.../base) (revision 0) +++ sys/ofed/include/linux/init.h (.../head) (revision 219811) @@ -0,0 +1,31 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_INIT_H_ +#define _LINUX_INIT_H_ + +#endif /* _LINUX_INIT_H_ */ Index: sys/ofed/include/linux/in.h =================================================================== --- sys/ofed/include/linux/in.h (.../base) (revision 0) +++ sys/ofed/include/linux/in.h (.../head) (revision 219811) @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_IN_H_ +#define _LINUX_IN_H_ + +#include +#include + +#define ipv4_is_zeronet IN_ZERONET +#define ipv4_is_loopback IN_LOOPBACK + +#endif /* _LINUX_IN_H_ */ Index: sys/ofed/include/linux/netdevice.h =================================================================== --- sys/ofed/include/linux/netdevice.h (.../base) (revision 0) +++ sys/ofed/include/linux/netdevice.h (.../head) (revision 219811) @@ -0,0 +1,159 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _LINUX_NETDEVICE_H_ +#define _LINUX_NETDEVICE_H_ + +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +struct net { +}; + +extern struct net init_net; + +#define MAX_ADDR_LEN 20 + +#define net_device ifnet + +#define dev_get_by_index(n, idx) ifnet_byindex_ref((idx)) +#define dev_hold(d) if_ref((d)) +#define dev_put(d) if_rele((d)) + +#define netif_running(dev) !!((dev)->if_drv_flags & IFF_DRV_RUNNING) +#define netif_oper_up(dev) !!((dev)->if_flags & IFF_UP) +#define netif_carrier_ok(dev) netif_running(dev) + +static inline void * +netdev_priv(const struct net_device *dev) +{ + return (dev->if_softc); +} + +static inline void +_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate) +{ + struct notifier_block *nb; + + nb = arg; + if (linkstate == LINK_STATE_UP) + nb->notifier_call(nb, NETDEV_UP, ifp); + else + nb->notifier_call(nb, NETDEV_DOWN, ifp); +} + +static inline void +_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp) +{ + struct notifier_block *nb; + + nb = arg; + nb->notifier_call(nb, NETDEV_REGISTER, ifp); +} + +static inline void +_handle_ifnet_departure_event(void *arg, struct ifnet *ifp) +{ + struct notifier_block *nb; + + nb = arg; + nb->notifier_call(nb, NETDEV_UNREGISTER, ifp); +} + +static inline int +register_netdevice_notifier(struct notifier_block *nb) +{ + + nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER( + ifnet_link_event, _handle_ifnet_link_event, nb, 0); + nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER( + ifnet_arrival_event, _handle_ifnet_arrival_event, nb, 0); + nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER( + ifnet_departure_event, _handle_ifnet_departure_event, nb, 0); + return (0); +} + +static inline int +unregister_netdevice_notifier(struct notifier_block *nb) +{ + + EVENTHANDLER_DEREGISTER(ifnet_link_event, nb->tags[NETDEV_UP]); + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nb->tags[NETDEV_REGISTER]); + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + nb->tags[NETDEV_UNREGISTER]); + return (0); +} + +#define rtnl_lock() +#define rtnl_unlock() + +static inline int +dev_mc_delete(struct net_device *dev, void *addr, int alen, int all) +{ + struct sockaddr_dl sdl; + + if (alen > sizeof(sdl.sdl_data)) + return (-EINVAL); + memset(&sdl, 0, sizeof(sdl)); + sdl.sdl_len = sizeof(sdl); + sdl.sdl_family = AF_LINK; + sdl.sdl_alen = alen; + memcpy(&sdl.sdl_data, addr, alen); + + return -if_delmulti(dev, (struct sockaddr *)&sdl); +} + +static inline int +dev_mc_add(struct net_device *dev, void *addr, int alen, int newonly) +{ + struct sockaddr_dl sdl; + + if (alen > sizeof(sdl.sdl_data)) + return (-EINVAL); + memset(&sdl, 0, sizeof(sdl)); + sdl.sdl_len = sizeof(sdl); + sdl.sdl_family = AF_LINK; + sdl.sdl_alen = alen; + memcpy(&sdl.sdl_data, addr, alen); + + return -if_addmulti(dev, (struct sockaddr *)&sdl, NULL); +} + +#endif /* _LINUX_NETDEVICE_H_ */ Index: sys/ofed/include/linux/poll.h =================================================================== --- sys/ofed/include/linux/poll.h (.../base) (revision 0) +++ sys/ofed/include/linux/poll.h (.../head) (revision 219811) @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_POLL_H_ +#define _LINUX_POLL_H_ + +#include +#include + +typedef struct poll_table_struct { +} poll_table; + +static inline void +poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) +{ + selrecord(curthread, &filp->f_selinfo); +} + +#endif /* _LINUX_POLL_H_ */ Index: sys/ofed/include/linux/ioctl.h =================================================================== --- sys/ofed/include/linux/ioctl.h (.../base) (revision 0) +++ sys/ofed/include/linux/ioctl.h (.../head) (revision 219811) @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_IOCTL_H_ +#define _LINUX_IOCTL_H_ + +#include + +#endif /* _LINUX_IOCTL_H_ */ Index: sys/ofed/include/linux/rtnetlink.h =================================================================== --- sys/ofed/include/linux/rtnetlink.h (.../base) (revision 0) +++ sys/ofed/include/linux/rtnetlink.h (.../head) (revision 219811) @@ -0,0 +1,27 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. 
+ * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ Index: sys/ofed/include/linux/idr.h =================================================================== --- sys/ofed/include/linux/idr.h (.../base) (revision 0) +++ sys/ofed/include/linux/idr.h (.../head) (revision 219811) @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _LINUX_IDR_H_ +#define _LINUX_IDR_H_ + +#include + +#define IDR_BITS 5 +#define IDR_SIZE (1 << IDR_BITS) +#define IDR_MASK (IDR_SIZE - 1) + +#define MAX_ID_SHIFT ((sizeof(int) * NBBY) - 1) +#define MAX_ID_BIT (1U << MAX_ID_SHIFT) +#define MAX_ID_MASK (MAX_ID_BIT - 1) +#define MAX_LEVEL ((MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS) + +struct idr_layer { + unsigned long bitmap; + struct idr_layer *ary[IDR_SIZE]; +}; + +struct idr { + struct mtx lock; + struct idr_layer *top; + struct idr_layer *free; + int layers; +}; + +#define DEFINE_IDR(name) \ + struct idr name; \ + SYSINIT(name##_idr_sysinit, SI_SUB_DRIVERS, SI_ORDER_FIRST, \ + idr_init, &(name)); + +void *idr_find(struct idr *idp, int id); +int idr_pre_get(struct idr *idp, gfp_t gfp_mask); +int idr_get_new(struct idr *idp, void *ptr, int *id); +int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id); +void *idr_replace(struct idr *idp, void *ptr, int id); +void idr_remove(struct idr *idp, int id); +void idr_remove_all(struct idr *idp); +void idr_destroy(struct idr *idp); +void idr_init(struct idr *idp); + +#endif /* _LINUX_IDR_H_ */
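The idr declarations above follow Linux's two-step allocation protocol: callers first reserve memory with idr_pre_get(), then call idr_get_new() and retry when it reports that another thread consumed the reserve. A hedged sketch of that loop, with names invented for illustration; that the shim reports the retry case as -EAGAIN, exactly as Linux does, is an assumption here:

/* Hypothetical: hand out a small integer id for an object pointer. */
DEFINE_IDR(my_ids);

static int
my_store(void *obj, int *idp)
{
	int error;

	do {
		if (idr_pre_get(&my_ids, GFP_KERNEL) == 0)
			return (-ENOMEM);	/* reserve backing memory */
		error = idr_get_new(&my_ids, obj, idp);
	} while (error == -EAGAIN);		/* assumed Linux-style retry */

	return (error);
}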
Index: sys/ofed/include/linux/linux_radix.c =================================================================== --- sys/ofed/include/linux/linux_radix.c (.../base) (revision 0) +++ sys/ofed/include/linux/linux_radix.c (.../head) (revision 219811) @@ -0,0 +1,170 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MALLOC_DEFINE(M_RADIX, "radix", "Linux radix compat"); + +static inline int +radix_max(struct radix_tree_root *root) +{ + return (1 << (root->height * RADIX_TREE_MAP_SHIFT)) - 1; +} + +static inline int +radix_pos(long id, int height) +{ + return (id >> (RADIX_TREE_MAP_SHIFT * height)) & RADIX_TREE_MAP_MASK; +} + +void * +radix_tree_lookup(struct radix_tree_root *root, unsigned long index) +{ + struct radix_tree_node *node; + void *item; + int height; + + item = NULL; + node = root->rnode; + height = root->height - 1; + if (index > radix_max(root)) + goto out; + while (height && node) + node = node->slots[radix_pos(index, height--)]; + if (node) + item = node->slots[radix_pos(index, 0)]; + +out: + return (item); +} + +void * +radix_tree_delete(struct radix_tree_root *root, unsigned long index) +{ + struct radix_tree_node *stack[RADIX_TREE_MAX_HEIGHT]; + struct radix_tree_node *node; + void *item; + int height; + int idx; + + item = NULL; + node = root->rnode; + height = root->height - 1; + if (index > radix_max(root)) + goto out; + /* + * Find the node and record the path in stack. + */ + while (height && node) { + stack[height] = node; + node = node->slots[radix_pos(index, height--)]; + } + idx = radix_pos(index, 0); + if (node) + item = node->slots[idx]; + /* + * If we removed something reduce the height of the tree. + */ + if (item) + for (;;) { + node->slots[idx] = NULL; + node->count--; + if (node->count > 0) + break; + free(node, M_RADIX); + if (node == root->rnode) { + root->rnode = NULL; + root->height = 0; + break; + } + height++; + node = stack[height]; + idx = radix_pos(index, height); + } +out: + return (item); +} + +int +radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) +{ + struct radix_tree_node *node; + int height; + int idx; + + /* + * Expand the tree to fit indexes as big as requested. + */ + while (root->rnode == NULL || radix_max(root) < index) { + node = malloc(sizeof(*node), M_RADIX, root->gfp_mask | M_ZERO); + if (node == NULL) + return (-ENOMEM); + node->slots[0] = root->rnode; + if (root->rnode) + node->count++; + root->rnode = node; + root->height++; + } + node = root->rnode; + height = root->height - 1; + /* + * Walk down the tree finding the correct node and allocating any + * missing nodes along the way. + */ + while (height) { + idx = radix_pos(index, height); + if (node->slots[idx] == NULL) { + node->slots[idx] = malloc(sizeof(*node), M_RADIX, + root->gfp_mask | M_ZERO); + if (node->slots[idx] == NULL) + return (-ENOMEM); + node->count++; + } + node = node->slots[idx]; + height--; + } + /* + * Insert and adjust count if the item does not already exist. + */ + idx = radix_pos(index, 0); + if (node->slots[idx]) + return (-EEXIST); + node->slots[idx] = item; + node->count++; + + return (0); +} Index: sys/ofed/include/linux/uaccess.h =================================================================== --- sys/ofed/include/linux/uaccess.h (.../base) (revision 0) +++ sys/ofed/include/linux/uaccess.h (.../head) (revision 219811) @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_UACCESS_H_ +#define _LINUX_UACCESS_H_ + +#define get_user(_x, _p) -copyin((_p), &(_x), sizeof(*(_p))) +#define put_user(_x, _p) -copyout(&(_x), (_p), sizeof(*(_p))) + +#endif /* _LINUX_UACCESS_H_ */ Index: sys/ofed/include/linux/hardirq.h =================================================================== --- sys/ofed/include/linux/hardirq.h (.../base) (revision 0) +++ sys/ofed/include/linux/hardirq.h (.../head) (revision 219811) @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_HARDIRQ_H_ +#define _LINUX_HARDIRQ_H_ + +#include + +#include +#include +#include + +#define synchronize_irq(irq) _intr_drain((irq)) + +#endif /* _LINUX_HARDIRQ_H_ */ Index: sys/ofed/include/linux/in6.h =================================================================== --- sys/ofed/include/linux/in6.h (.../base) (revision 0) +++ sys/ofed/include/linux/in6.h (.../head) (revision 219811) @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. 
+ * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_IN6_H_ +#define _LINUX_IN6_H_ + +#ifndef KLD_MODULE +#include "opt_inet6.h" +#endif + +#endif /* _LINUX_IN6_H_ */ Index: sys/ofed/include/linux/dma-attrs.h =================================================================== --- sys/ofed/include/linux/dma-attrs.h (.../base) (revision 0) +++ sys/ofed/include/linux/dma-attrs.h (.../head) (revision 219811) @@ -0,0 +1,47 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _LINUX_DMA_ATTR_H_ +#define _LINUX_DMA_ATTR_H_ + +enum dma_attr { DMA_ATTR_WRITE_BARRIER, DMA_ATTR_WEAK_ORDERING, DMA_ATTR_MAX, }; + +#define __DMA_ATTRS_LONGS BITS_TO_LONGS(DMA_ATTR_MAX) + +struct dma_attrs { + unsigned long flags; +}; + +#define DEFINE_DMA_ATTRS(x) struct dma_attrs x = { } + +static inline void +init_dma_attrs(struct dma_attrs *attrs) +{ + attrs->flags = 0; +} + +#endif /* _LINUX_DMA_ATTR_H_ */ Index: sys/ofed/include/linux/delay.h =================================================================== --- sys/ofed/include/linux/delay.h (.../base) (revision 0) +++ sys/ofed/include/linux/delay.h (.../head) (revision 219811) @@ -0,0 +1,43 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_DELAY_H_ +#define _LINUX_DELAY_H_ + +#include + +static inline void +linux_msleep(int ms) +{ + pause("lnxsleep", msecs_to_jiffies(ms)); +} + +#undef msleep +#define msleep linux_msleep + +#endif /* _LINUX_DELAY_H_ */ Index: sys/ofed/include/linux/device.h =================================================================== --- sys/ofed/include/linux/device.h (.../base) (revision 0) +++ sys/ofed/include/linux/device.h (.../head) (revision 219811) @@ -0,0 +1,388 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
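dma-attrs.h collapses Linux's attribute bitmap into a single flags word, and delay.h retargets msleep() at pause(9) through msecs_to_jiffies(); the #undef guards against colliding with FreeBSD's own msleep(9), which has a different signature. A short sketch of both shims in use; foo_settle() is hypothetical:

    /* Sketch only. */
    static void
    foo_settle(void)
    {
            DEFINE_DMA_ATTRS(attrs);        /* struct dma_attrs attrs = { } */

            init_dma_attrs(&attrs);         /* explicitly clears attrs.flags */
            msleep(10);                     /* pause("lnxsleep", msecs_to_jiffies(10)) */
    }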
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_DEVICE_H_ +#define _LINUX_DEVICE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +enum irqreturn { IRQ_NONE = 0, IRQ_HANDLED, IRQ_WAKE_THREAD, }; +typedef enum irqreturn irqreturn_t; + +struct class { + const char *name; + struct module *owner; + struct kobject kobj; + devclass_t bsdclass; + void (*class_release)(struct class *class); + void (*dev_release)(struct device *dev); +}; + +struct device { + struct device *parent; + struct list_head irqents; + device_t bsddev; + dev_t devt; + struct class *class; + void (*release)(struct device *dev); + struct kobject kobj; + uint64_t *dma_mask; + void *driver_data; + unsigned int irq; + unsigned int msix; + unsigned int msix_max; +}; + +extern struct device linux_rootdev; +extern struct kobject class_root; + +struct class_attribute { + struct attribute attr; + ssize_t (*show)(struct class *, char *); + ssize_t (*store)(struct class *, const char *, size_t); +}; +#define CLASS_ATTR(_name, _mode, _show, _store) \ + struct class_attribute class_attr_##_name = \ + { { #_name, NULL, _mode }, _show, _store } + +struct device_attribute { + struct attribute attr; + ssize_t (*show)(struct device *, + struct device_attribute *, char *); + ssize_t (*store)(struct device *, + struct device_attribute *, const char *, + size_t); +}; + +#define DEVICE_ATTR(_name, _mode, _show, _store) \ + struct device_attribute dev_attr_##_name = \ + { { #_name, NULL, _mode }, _show, _store } + +#define dev_err(dev, fmt, ...) device_printf((dev)->bsddev, fmt, ##__VA_ARGS__) +#define dev_warn(dev, fmt, ...) device_printf((dev)->bsddev, fmt, ##__VA_ARGS__) +#define dev_info(dev, fmt, ...) device_printf((dev)->bsddev, fmt, ##__VA_ARGS__) +#define dev_printk(lvl, dev, fmt, ...) \ + device_printf((dev)->bsddev, fmt, ##__VA_ARGS__) + +static inline void * +dev_get_drvdata(struct device *dev) +{ + + return dev->driver_data; +} + +static inline void +dev_set_drvdata(struct device *dev, void *data) +{ + + dev->driver_data = data; +} + +static inline struct device * +get_device(struct device *dev) +{ + + if (dev) + kobject_get(&dev->kobj); + + return (dev); +} + +static inline char * +dev_name(const struct device *dev) +{ + + return kobject_name(&dev->kobj); +} + +#define dev_set_name(_dev, _fmt, ...) 
\ + kobject_set_name(&(_dev)->kobj, (_fmt), ##__VA_ARGS__) + +static inline void +put_device(struct device *dev) +{ + + if (dev) + kobject_put(&dev->kobj); +} + +static inline ssize_t +class_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct class_attribute *dattr; + ssize_t error; + + dattr = container_of(attr, struct class_attribute, attr); + error = -EIO; + if (dattr->show) + error = dattr->show(container_of(kobj, struct class, kobj), + buf); + return (error); +} + +static inline ssize_t +class_store(struct kobject *kobj, struct attribute *attr, const char *buf, + size_t count) +{ + struct class_attribute *dattr; + ssize_t error; + + dattr = container_of(attr, struct class_attribute, attr); + error = -EIO; + if (dattr->store) + error = dattr->store(container_of(kobj, struct class, kobj), + buf, count); + return (error); +} + +static inline void +class_release(struct kobject *kobj) +{ + struct class *class; + + class = container_of(kobj, struct class, kobj); + if (class->class_release) + class->class_release(class); +} + +static struct sysfs_ops class_sysfs = { + .show = class_show, + .store = class_store, +}; +static struct kobj_type class_ktype = { + .release = class_release, + .sysfs_ops = &class_sysfs +}; + +static inline int +class_register(struct class *class) +{ + + class->bsdclass = devclass_create(class->name); + kobject_init(&class->kobj, &class_ktype); + kobject_set_name(&class->kobj, class->name); + kobject_add(&class->kobj, &class_root, class->name); + + return (0); +} + +static inline void +class_unregister(struct class *class) +{ + + kobject_put(&class->kobj); +} + +static inline void +device_release(struct kobject *kobj) +{ + struct device *dev; + + dev = container_of(kobj, struct device, kobj); + /* This is the precedence defined by linux. */ + if (dev->release) + dev->release(dev); + else if (dev->class && dev->class->dev_release) + dev->class->dev_release(dev); +} + +static inline ssize_t +dev_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct device_attribute *dattr; + ssize_t error; + + dattr = container_of(attr, struct device_attribute, attr); + error = -EIO; + if (dattr->show) + error = dattr->show(container_of(kobj, struct device, kobj), + dattr, buf); + return (error); +} + +static inline ssize_t +dev_store(struct kobject *kobj, struct attribute *attr, const char *buf, + size_t count) +{ + struct device_attribute *dattr; + ssize_t error; + + dattr = container_of(attr, struct device_attribute, attr); + error = -EIO; + if (dattr->store) + error = dattr->store(container_of(kobj, struct device, kobj), + dattr, buf, count); + return (error); +} + +static struct sysfs_ops dev_sysfs = { .show = dev_show, .store = dev_store, }; +static struct kobj_type dev_ktype = { + .release = device_release, + .sysfs_ops = &dev_sysfs +}; + +/* + * Devices are registered and created for exporting to sysfs. create + * implies register and register assumes the device fields have been + * setup appropriately before being called. 
+ */ +static inline int +device_register(struct device *dev) +{ + device_t bsddev; + int unit; + + bsddev = NULL; + if (dev->devt) { + unit = MINOR(dev->devt); + bsddev = devclass_get_device(dev->class->bsdclass, unit); + } else + unit = -1; + if (bsddev == NULL) + bsddev = device_add_child(dev->parent->bsddev, + dev->class->kobj.name, unit); + if (bsddev) { + if (dev->devt == 0) + dev->devt = makedev(0, device_get_unit(bsddev)); + device_set_softc(bsddev, dev); + } + dev->bsddev = bsddev; + kobject_init(&dev->kobj, &dev_ktype); + kobject_add(&dev->kobj, &dev->class->kobj, dev_name(dev)); + + return (0); +} + +static inline void +device_unregister(struct device *dev) +{ + device_t bsddev; + + bsddev = dev->bsddev; + mtx_lock(&Giant); + if (bsddev) + device_delete_child(device_get_parent(bsddev), bsddev); + mtx_unlock(&Giant); + put_device(dev); +} + +struct device *device_create(struct class *class, struct device *parent, + dev_t devt, void *drvdata, const char *fmt, ...); + +static inline void +device_destroy(struct class *class, dev_t devt) +{ + device_t bsddev; + int unit; + + unit = MINOR(devt); + bsddev = devclass_get_device(class->bsdclass, unit); + if (bsddev) + device_unregister(device_get_softc(bsddev)); +} + +static inline void +class_kfree(struct class *class) +{ + + kfree(class); +} + +static inline struct class * +class_create(struct module *owner, const char *name) +{ + struct class *class; + int error; + + class = kzalloc(sizeof(*class), M_WAITOK); + class->owner = owner; + class->name= name; + class->class_release = class_kfree; + error = class_register(class); + if (error) { + kfree(class); + return (NULL); + } + + return (class); +} + +static inline void +class_destroy(struct class *class) +{ + + if (class == NULL) + return; + class_unregister(class); +} + +static inline int +device_create_file(struct device *dev, const struct device_attribute *attr) +{ + + if (dev) + return sysfs_create_file(&dev->kobj, &attr->attr); + return -EINVAL; +} + +static inline void +device_remove_file(struct device *dev, const struct device_attribute *attr) +{ + + if (dev) + sysfs_remove_file(&dev->kobj, &attr->attr); +} + +static inline int +class_create_file(struct class *class, const struct class_attribute *attr) +{ + + if (class) + return sysfs_create_file(&class->kobj, &attr->attr); + return -EINVAL; +} + +static inline void +class_remove_file(struct class *class, const struct class_attribute *attr) +{ + + if (class) + sysfs_remove_file(&class->kobj, &attr->attr); +} + +#endif /* _LINUX_DEVICE_H_ */ Index: sys/ofed/include/linux/bitops.h =================================================================== --- sys/ofed/include/linux/bitops.h (.../base) (revision 0) +++ sys/ofed/include/linux/bitops.h (.../head) (revision 219811) @@ -0,0 +1,312 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
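device.h rebuilds just enough of the Linux driver model over newbus: attribute callbacks are recovered with container_of(), and registration bottoms out in devclass_get_device()/device_add_child(). A sketch of how a driver might drive it end to end; the "foo" names and the "irq" attribute are hypothetical, MKDEV() comes from the kdev_t.h shim merged below, and THIS_MODULE from the module.h shim:

    /* Sketch only: a made-up "foo" class with one read-only attribute. */
    static ssize_t
    irq_show(struct device *dev, struct device_attribute *attr, char *buf)
    {
            return (snprintf(buf, PAGE_SIZE, "%u\n", dev->irq));
    }
    static DEVICE_ATTR(irq, 0444, irq_show, NULL);

    static struct class *foo_class;
    static struct device *foo_dev;

    static int
    foo_init(void)
    {
            foo_class = class_create(THIS_MODULE, "foo");
            if (foo_class == NULL)
                    return (-ENOMEM);
            /* Creates the newbus child and the kobject for unit 0. */
            foo_dev = device_create(foo_class, &linux_rootdev, MKDEV(0, 0),
                NULL, "foo%d", 0);
            return (device_create_file(foo_dev, &dev_attr_irq));
    }

    static void
    foo_fini(void)
    {
            device_remove_file(foo_dev, &dev_attr_irq);
            device_destroy(foo_class, MKDEV(0, 0));
            class_destroy(foo_class);
    }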
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_BITOPS_H_
+#define _LINUX_BITOPS_H_
+
+#ifdef __LP64__
+#define BITS_PER_LONG 64
+#else
+#define BITS_PER_LONG 32
+#endif
+#define BIT_MASK(n) (~0UL >> (BITS_PER_LONG - (n)))
+#define BITS_TO_LONGS(n) howmany((n), BITS_PER_LONG)
+
+static inline int
+__ffs(int mask)
+{
+        return (ffs(mask) - 1);
+}
+
+static inline int
+__fls(int mask)
+{
+        return (fls(mask) - 1);
+}
+
+static inline int
+__ffsl(long mask)
+{
+        return (ffsl(mask) - 1);
+}
+
+static inline int
+__flsl(long mask)
+{
+        return (flsl(mask) - 1);
+}
+
+
+#define ffz(mask) __ffs(~(mask))
+
+static inline unsigned long
+find_first_bit(unsigned long *addr, unsigned long size)
+{
+        long mask;
+        int bit;
+
+        for (bit = 0; size >= BITS_PER_LONG;
+            size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+                if (*addr == 0)
+                        continue;
+                return (bit + __ffsl(*addr));
+        }
+        if (size) {
+                mask = (*addr) & BIT_MASK(size);
+                if (mask)
+                        bit += __ffsl(mask);
+                else
+                        bit += size;
+        }
+        return (bit);
+}
+
+static inline unsigned long
+find_first_zero_bit(unsigned long *addr, unsigned long size)
+{
+        long mask;
+        int bit;
+
+        for (bit = 0; size >= BITS_PER_LONG;
+            size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+                if (~(*addr) == 0)
+                        continue;
+                return (bit + __ffsl(~(*addr)));
+        }
+        if (size) {
+                mask = ~(*addr) & BIT_MASK(size);
+                if (mask)
+                        bit += __ffsl(mask);
+                else
+                        bit += size;
+        }
+        return (bit);
+}
+
+static inline unsigned long
+find_last_bit(unsigned long *addr, unsigned long size)
+{
+        long mask;
+        int offs;
+        int bit;
+        int pos;
+
+        pos = size / BITS_PER_LONG;
+        offs = size % BITS_PER_LONG;
+        bit = BITS_PER_LONG * pos;
+        addr += pos;
+        if (offs) {
+                mask = (*addr) & BIT_MASK(offs);
+                if (mask)
+                        return (bit + __flsl(mask));
+        }
+        while (pos--) {
+                addr--;
+                bit -= BITS_PER_LONG;
+                if (*addr)
+                        return (bit + __flsl(*addr));
+        }
+        return (size);
+}
+
+static inline unsigned long
+find_next_bit(unsigned long *addr, unsigned long size, unsigned long offset)
+{
+        long mask;
+        int offs;
+        int bit;
+        int pos;
+
+        if (offset >= size)
+                return (size);
+        pos = offset / BITS_PER_LONG;
+        offs = offset % BITS_PER_LONG;
+        bit = BITS_PER_LONG * pos;
+        addr += pos;
+        if (offs) {
+                mask = (*addr) & ~BIT_MASK(offs);
+                if (mask)
+                        return (bit + __ffsl(mask));
+                bit += BITS_PER_LONG;
+                addr++;
+        }
+        for (size -= bit; size >= BITS_PER_LONG;
+            size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+                if (*addr == 0)
+                        continue;
+                return (bit + __ffsl(*addr));
+        }
+        if (size) {
+                mask = (*addr) & BIT_MASK(size);
+                if (mask)
+                        bit += __ffsl(mask);
+                else
+                        bit += size;
+        }
+        return (bit);
+}
+
+static inline unsigned long
+find_next_zero_bit(unsigned long *addr, unsigned long size,
+    unsigned long offset)
+{
+        long mask;
+        int offs;
+        int bit;
+        int pos;
+
+        if (offset >= size)
+                return (size);
+        pos = offset / BITS_PER_LONG;
+        offs = offset % BITS_PER_LONG;
+        bit = BITS_PER_LONG * pos;
+        addr += pos;
+        if (offs) {
+                mask = ~(*addr) & ~BIT_MASK(offs);
+                if (mask)
+                        return (bit + __ffsl(mask));
+                bit += BITS_PER_LONG;
+                addr++;
+        }
+        for (size -= bit; size >= BITS_PER_LONG;
+            size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+                if (~(*addr) == 0)
+                        continue;
+                return (bit + __ffsl(~(*addr)));
+        }
+        if (size) {
+                mask = ~(*addr) & BIT_MASK(size);
+                if (mask)
+                        bit += __ffsl(mask);
+                else
+                        bit += size;
+        }
+        return (bit);
+}
+
+static inline void
+bitmap_zero(unsigned long *addr, int size)
+{
+        int len;
+
+        len = BITS_TO_LONGS(size) * sizeof(long);
+        memset(addr, 0, len);
+}
+
+static inline void
+bitmap_fill(unsigned long *addr, int size)
+{
+        int tail;
+        int len;
+
+        len = (size / BITS_PER_LONG) * sizeof(long);
+        memset(addr, 0xff, len);
+        tail = size & (BITS_PER_LONG - 1);
+        if (tail)
+                addr[size / BITS_PER_LONG] = BIT_MASK(tail);
+}
+
+static inline int
+bitmap_full(unsigned long *addr, int size)
+{
+        long mask;
+        int tail;
+        int len;
+        int i;
+
+        len = size / BITS_PER_LONG;
+        for (i = 0; i < len; i++)
+                if (addr[i] != ~0UL)
+                        return (0);
+        tail = size & (BITS_PER_LONG - 1);
+        if (tail) {
+                mask = BIT_MASK(tail);
+                if ((addr[i] & mask) != mask)
+                        return (0);
+        }
+        return (1);
+}
+
+static inline int
+bitmap_empty(unsigned long *addr, int size)
+{
+        long mask;
+        int tail;
+        int len;
+        int i;
+
+        len = size / BITS_PER_LONG;
+        for (i = 0; i < len; i++)
+                if (addr[i] != 0)
+                        return (0);
+        tail = size & (BITS_PER_LONG - 1);
+        if (tail) {
+                mask = BIT_MASK(tail);
+                if ((addr[i] & mask) != 0)
+                        return (0);
+        }
+        return (1);
+}
+
+#define NBINT (NBBY * sizeof(int))
+
+#define set_bit(i, a) \
+        atomic_set_int(&((volatile int *)(a))[(i)/NBINT], 1 << (i) % NBINT)
+
+#define clear_bit(i, a) \
+        atomic_clear_int(&((volatile int *)(a))[(i)/NBINT], 1 << (i) % NBINT)
+
+#define test_bit(i, a) \
+        !!(atomic_load_acq_int(&((volatile int *)(a))[(i)/NBINT]) & 1 << ((i) % NBINT))
+
+static inline long
+test_and_clear_bit(long bit, long *var)
+{
+        long val;
+
+        bit = 1UL << bit;
+        do {
+                val = *(volatile long *)var;
+        } while (atomic_cmpset_long(var, val, val & ~bit) == 0);
+
+        return !!(val & bit);
+}
+
+static inline long
+test_and_set_bit(long bit, long *var)
+{
+        long val;
+
+        bit = 1UL << bit;
+        do {
+                val = *(volatile long *)var;
+        } while (atomic_cmpset_long(var, val, val | bit) == 0);
+
+        return !!(val & bit);
+}
+
+#endif /* _LINUX_BITOPS_H_ */
Index: sys/ofed/include/linux/radix-tree.h
===================================================================
--- sys/ofed/include/linux/radix-tree.h (.../base) (revision 0)
+++ sys/ofed/include/linux/radix-tree.h (.../head) (revision 219811)
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
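The find_*_bit() helpers above scan whole longs first and mask the tail, so bitmap sizes are expressed in bits, and a failed search returns the size argument. A sketch of a free-slot allocator built on them; the "foo" names are hypothetical:

    /* Sketch: allocate the first free slot in a fixed-size table. */
    #define FOO_MAX 128
    static unsigned long foo_map[BITS_TO_LONGS(FOO_MAX)];

    static int
    foo_alloc_slot(void)
    {
            unsigned long bit;

            bit = find_first_zero_bit(foo_map, FOO_MAX);
            if (bit == FOO_MAX)             /* == size means no zero bit found */
                    return (-ENOSPC);
            set_bit(bit, foo_map);
            return (bit);
    }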
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_RADIX_TREE_H_ +#define _LINUX_RADIX_TREE_H_ + +#define RADIX_TREE_MAP_SHIFT 6 +#define RADIX_TREE_MAP_SIZE (1 << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE - 1) +#define RADIX_TREE_MAX_HEIGHT \ + DIV_ROUND_UP((sizeof(long) * NBBY), RADIX_TREE_MAP_SHIFT) + +struct radix_tree_node { + void *slots[RADIX_TREE_MAP_SIZE]; + int count; +}; + +struct radix_tree_root { + struct radix_tree_node *rnode; + gfp_t gfp_mask; + int height; +}; + +#define RADIX_TREE_INIT(mask) \ + { .rnode = NULL, .gfp_mask = mask, .height = 0 }; +#define INIT_RADIX_TREE(root, mask) \ + { (root)->rnode = NULL; (root)->gfp_mask = mask; (root)->height = 0; } +#define RADIX_TREE(name, mask) \ + struct radix_tree_root name = RADIX_TREE_INIT(mask) + +void *radix_tree_lookup(struct radix_tree_root *, unsigned long); +void *radix_tree_delete(struct radix_tree_root *, unsigned long); +int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); + +#endif /* _LINUX_RADIX_TREE_H_ */ Index: sys/ofed/include/linux/fs.h =================================================================== --- sys/ofed/include/linux/fs.h (.../base) (revision 0) +++ sys/ofed/include/linux/fs.h (.../head) (revision 219811) @@ -0,0 +1,182 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
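Only the node geometry (a fixed 6-bit fanout) lives in this header; radix_tree_lookup(), radix_tree_insert(), and radix_tree_delete() are implemented out of line. A sketch of its use as an id-to-pointer map; the "foo" names are hypothetical and GFP_KERNEL comes from the gfp shim:

    /* Sketch only. Note RADIX_TREE() statically initializes the root. */
    static RADIX_TREE(foo_ids, GFP_KERNEL);

    static int
    foo_store(unsigned long id, void *ptr)
    {
            return (radix_tree_insert(&foo_ids, id, ptr));
    }

    static void *
    foo_find(unsigned long id)
    {
            return (radix_tree_lookup(&foo_ids, id));
    }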
+ */ +#ifndef _LINUX_FS_H_ +#define _LINUX_FS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct module; +struct kiocb; +struct iovec; +struct dentry; +struct page; +struct file_lock; +struct pipe_inode_info; +struct vm_area_struct; +struct poll_table_struct; +struct files_struct; + +#define inode vnode +#define i_cdev v_rdev + +#define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH) +#define S_IWUGO (S_IWUSR | S_IWGRP | S_IWOTH) + + +typedef struct files_struct *fl_owner_t; + +struct dentry { + struct inode *d_inode; +}; + +struct file_operations; + +struct linux_file { + struct file *_file; + const struct file_operations *f_op; + void *private_data; + int f_flags; + int f_mode; /* Just starting mode. */ + struct dentry *f_dentry; + struct dentry f_dentry_store; + struct selinfo f_selinfo; + struct sigio *f_sigio; +}; + +#define file linux_file +#define fasync_struct sigio * + +#define fasync_helper(fd, filp, on, queue) \ +({ \ + if ((on)) \ + *(queue) = &(filp)->f_sigio; \ + else \ + *(queue) = NULL; \ + 0; \ +}) + +#define kill_fasync(queue, sig, pollstat) \ +do { \ + if (*(queue) != NULL) \ + pgsigio(*(queue), (sig), 0); \ +} while (0) + +typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned); + +struct file_operations { + struct module *owner; + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + unsigned int (*poll) (struct file *, struct poll_table_struct *); + long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long); + int (*mmap)(struct file *, struct vm_area_struct *); + int (*open)(struct inode *, struct file *); + int (*release)(struct inode *, struct file *); + int (*fasync)(int, struct file *, int); +#if 0 + /* We do not support these methods. Don't permit them to compile. 
*/ + loff_t (*llseek)(struct file *, loff_t, int); + ssize_t (*aio_read)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + ssize_t (*aio_write)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + int (*readdir)(struct file *, void *, filldir_t); + int (*ioctl)(struct inode *, struct file *, unsigned int, + unsigned long); + long (*compat_ioctl)(struct file *, unsigned int, unsigned long); + int (*flush)(struct file *, fl_owner_t id); + int (*fsync)(struct file *, struct dentry *, int datasync); + int (*aio_fsync)(struct kiocb *, int datasync); + int (*lock)(struct file *, int, struct file_lock *); + ssize_t (*sendpage)(struct file *, struct page *, int, size_t, + loff_t *, int); + unsigned long (*get_unmapped_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + int (*check_flags)(int); + int (*flock)(struct file *, int, struct file_lock *); + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); + int (*setlease)(struct file *, long, struct file_lock **); +#endif +}; +#define fops_get(fops) (fops) + +#define FMODE_READ FREAD +#define FMODE_WRITE FWRITE +#define FMODE_EXEC FEXEC + +static inline int +register_chrdev_region(dev_t dev, unsigned range, const char *name) +{ + + return 0; +} + +static inline void +unregister_chrdev_region(dev_t dev, unsigned range) +{ + + return; +} + +static inline dev_t +iminor(struct inode *inode) +{ + + return dev2unit(inode->v_rdev); +} + +static inline struct inode * +igrab(struct inode *inode) +{ + int error; + + error = vget(inode, 0, curthread); + if (error) + return (NULL); + + return (inode); +} + +static inline void +iput(struct inode *inode) +{ + + vrele(inode); +} + +#endif /* _LINUX_FS_H_ */ Index: sys/ofed/include/linux/list.h =================================================================== --- sys/ofed/include/linux/list.h (.../base) (revision 0) +++ sys/ofed/include/linux/list.h (.../head) (revision 219811) @@ -0,0 +1,331 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
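The shim keeps only the file_operations methods OFED's character devices actually use; the rest sit under #if 0 so an accidental consumer fails at compile time instead of misbehaving silently. A sketch of a minimal fops table against it, with stub handlers and hypothetical "foo" names; such a table would typically be handed to cdev_init() from the cdev.h shim below:

    /* Sketch only; note "file" and "inode" are remapped by this header. */
    static int
    foo_open(struct inode *inode, struct file *filp)
    {
            filp->private_data = NULL;      /* per-open state would go here */
            return (0);
    }

    static int
    foo_release(struct inode *inode, struct file *filp)
    {
            return (0);
    }

    static const struct file_operations foo_fops = {
            .owner   = THIS_MODULE,
            .open    = foo_open,
            .release = foo_release,
    };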
+ */ +#ifndef _LINUX_LIST_H_ +#define _LINUX_LIST_H_ + +/* + * Since LIST_HEAD conflicts with the linux definition we must include any + * FreeBSD header which requires it here so it is resolved with the correct + * definition prior to the undef. + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#define prefetch(x) + +struct list_head { + struct list_head *next; + struct list_head *prev; +}; + +static inline void +INIT_LIST_HEAD(struct list_head *list) +{ + + list->next = list->prev = list; +} + +static inline int +list_empty(const struct list_head *head) +{ + + return (head->next == head); +} + +static inline void +list_del(struct list_head *entry) +{ + + entry->next->prev = entry->prev; + entry->prev->next = entry->next; +} + +static inline void +_list_add(struct list_head *new, struct list_head *prev, + struct list_head *next) +{ + + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void +list_del_init(struct list_head *entry) +{ + + list_del(entry); + INIT_LIST_HEAD(entry); +} + +#define list_entry(ptr, type, field) container_of(ptr, type, field) + +#define list_for_each(p, head) \ + for (p = (head)->next; p != (head); p = p->next) + +#define list_for_each_safe(p, n, head) \ + for (p = (head)->next, n = p->next; p != (head); p = n, n = p->next) + +#define list_for_each_entry(p, h, field) \ + for (p = list_entry((h)->next, typeof(*p), field); &p->field != (h); \ + p = list_entry(p->field.next, typeof(*p), field)) + +#define list_for_each_entry_safe(p, n, h, field) \ + for (p = list_entry((h)->next, typeof(*p), field), \ + n = list_entry(p->field.next, typeof(*p), field); &p->field != (h);\ + p = n, n = list_entry(n->field.next, typeof(*n), field)) + +#define list_for_each_entry_reverse(p, h, field) \ + for (p = list_entry((h)->prev, typeof(*p), field); &p->field != (h); \ + p = list_entry(p->field.prev, typeof(*p), field)) + +#define list_for_each_prev(p, h) for (p = (h)->prev; p != (h); p = p->prev) + +static inline void +list_add(struct list_head *new, struct list_head *head) +{ + + _list_add(new, head, head->next); +} + +static inline void +list_add_tail(struct list_head *new, struct list_head *head) +{ + + _list_add(new, head->prev, head); +} + +static inline void +list_move(struct list_head *list, struct list_head *head) +{ + + list_del(list); + list_add(list, head); +} + +static inline void +list_move_tail(struct list_head *entry, struct list_head *head) +{ + + list_del(entry); + list_add_tail(entry, head); +} + +static inline void +_list_splice(const struct list_head *list, struct list_head *prev, + struct list_head *next) +{ + struct list_head *first; + struct list_head *last; + + if (list_empty(list)) + return; + first = list->next; + last = list->prev; + first->prev = prev; + prev->next = first; + last->next = next; + next->prev = last; +} + +static inline void +list_splice(const struct list_head *list, struct list_head *head) +{ + + _list_splice(list, head, head->next); +} + +static inline void +list_splice_tail(struct list_head *list, struct list_head *head) +{ + + _list_splice(list, head->prev, head); +} + +static inline void +list_splice_init(struct list_head *list, struct list_head *head) +{ + + _list_splice(list, head, head->next); + INIT_LIST_HEAD(list); +} + +static inline void +list_splice_tail_init(struct list_head *list, struct list_head *head) 
+{
+
+        _list_splice(list, head->prev, head);
+        INIT_LIST_HEAD(list);
+}
+
+#undef LIST_HEAD
+#define LIST_HEAD(name) struct list_head name = { &(name), &(name) }
+
+
+struct hlist_head {
+        struct hlist_node *first;
+};
+
+struct hlist_node {
+        struct hlist_node *next, **pprev;
+};
+
+#define HLIST_HEAD_INIT { }
+#define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT
+#define INIT_HLIST_HEAD(head) (head)->first = NULL
+#define INIT_HLIST_NODE(node)                                           \
+do {                                                                    \
+        (node)->next = NULL;                                            \
+        (node)->pprev = NULL;                                           \
+} while (0)
+
+static inline int
+hlist_unhashed(const struct hlist_node *h)
+{
+
+        return !h->pprev;
+}
+
+static inline int
+hlist_empty(const struct hlist_head *h)
+{
+
+        return !h->first;
+}
+
+static inline void
+hlist_del(struct hlist_node *n)
+{
+
+        if (n->next)
+                n->next->pprev = n->pprev;
+        *n->pprev = n->next;
+}
+
+static inline void
+hlist_del_init(struct hlist_node *n)
+{
+
+        if (hlist_unhashed(n))
+                return;
+        hlist_del(n);
+        INIT_HLIST_NODE(n);
+}
+
+static inline void
+hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
+
+        n->next = h->first;
+        if (h->first)
+                h->first->pprev = &n->next;
+        h->first = n;
+        n->pprev = &h->first;
+}
+
+static inline void
+hlist_add_before(struct hlist_node *n, struct hlist_node *next)
+{
+
+        n->pprev = next->pprev;
+        n->next = next;
+        next->pprev = &n->next;
+        *(n->pprev) = n;
+}
+
+static inline void
+hlist_add_after(struct hlist_node *n, struct hlist_node *next)
+{
+
+        next->next = n->next;
+        n->next = next;
+        next->pprev = &n->next;
+        if (next->next)
+                next->next->pprev = &next->next;
+}
+
+static inline void
+hlist_move_list(struct hlist_head *old, struct hlist_head *new)
+{
+
+        new->first = old->first;
+        if (new->first)
+                new->first->pprev = &new->first;
+        old->first = NULL;
+}
+
+#define hlist_entry(ptr, type, field) container_of(ptr, type, field)
+
+#define hlist_for_each(p, head) \
+        for (p = (head)->first; p; p = p->next)
+
+#define hlist_for_each_safe(p, n, head) \
+        for (p = (head)->first; p && ({ n = p->next; 1; }); p = n)
+
+#define hlist_for_each_entry(tp, p, head, field) \
+        for (p = (head)->first; \
+            p ? (tp = hlist_entry(p, typeof(*tp), field)): NULL; p = p->next)
+
+#define hlist_for_each_entry_continue(tp, p, field) \
+        for (p = (p)->next; \
+            p ? (tp = hlist_entry(p, typeof(*tp), field)): NULL; p = p->next)
+
+#define hlist_for_each_entry_from(tp, p, field) \
+        for (; p ? (tp = hlist_entry(p, typeof(*tp), field)): NULL; p = p->next)
+
+#define hlist_for_each_entry_safe(tp, p, n, head, field) \
+        for (p = (head)->first; p && \
+            ({ n = p->next; tp = hlist_entry(p, typeof(*tp), field); 1; }); \
+            p = n)
+
+#endif /* _LINUX_LIST_H_ */
Index: sys/ofed/include/linux/kdev_t.h
===================================================================
--- sys/ofed/include/linux/kdev_t.h (.../base) (revision 0)
+++ sys/ofed/include/linux/kdev_t.h (.../head) (revision 219811)
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2.
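These lists are circular and intrusive, with the same node type for heads and entries, which is why LIST_HEAD has to be re-pointed away from queue(3)'s macro of the same name. A short sketch; struct foo is hypothetical:

    /* Sketch: an intrusive doubly linked list of foo records. */
    struct foo {
            int val;
            struct list_head entry;
    };

    static LIST_HEAD(foo_list);     /* the Linux LIST_HEAD, redefined above */

    static void
    foo_drain(struct foo *f)
    {
            struct foo *p, *n;

            list_add_tail(&f->entry, &foo_list);
            list_for_each_entry_safe(p, n, &foo_list, entry)
                    list_del(&p->entry);
    }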
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_KDEV_T_H_ +#define _LINUX_KDEV_T_H_ + +#define MAJOR(dev) major((dev)) +#define MINOR(dev) minor((dev)) +#define MKDEV(ma, mi) makedev((ma), (mi)) + +#endif /* _LINUX_KDEV_T_H_ */ Index: sys/ofed/include/linux/bitmap.h =================================================================== --- sys/ofed/include/linux/bitmap.h (.../base) (revision 0) +++ sys/ofed/include/linux/bitmap.h (.../head) (revision 219811) @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_BITMAP_H_ +#define _LINUX_BITMAP_H_ + +#include +#include + +#endif /* _LINUX_BITMAP_H_ */ Index: sys/ofed/include/linux/kobject.h =================================================================== --- sys/ofed/include/linux/kobject.h (.../base) (revision 0) +++ sys/ofed/include/linux/kobject.h (.../head) (revision 219811) @@ -0,0 +1,153 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_KOBJECT_H_ +#define _LINUX_KOBJECT_H_ + +#include + +#include +#include +#include + +struct kobject; +struct sysctl_oid; + +struct kobj_type { + void (*release)(struct kobject *kobj); + const struct sysfs_ops *sysfs_ops; + struct attribute **default_attrs; +}; + +extern struct kobj_type kfree_type; + +struct kobject { + struct kobject *parent; + char *name; + struct kref kref; + struct kobj_type *ktype; + struct list_head entry; + struct sysctl_oid *oidp; +}; + +static inline void +kobject_init(struct kobject *kobj, struct kobj_type *ktype) +{ + + kref_init(&kobj->kref); + INIT_LIST_HEAD(&kobj->entry); + kobj->ktype = ktype; + kobj->oidp = NULL; +} + +static inline void kobject_put(struct kobject *kobj); +void kobject_release(struct kref *kref); + +static inline void +kobject_put(struct kobject *kobj) +{ + + if (kobj) + kref_put(&kobj->kref, kobject_release); +} + +static inline struct kobject * +kobject_get(struct kobject *kobj) +{ + + if (kobj) + kref_get(&kobj->kref); + return kobj; +} + +static inline int +kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args) +{ + char *old; + char *name; + + old = kobj->name; + + if (old && !fmt) + return 0; + + name = kzalloc(MAXPATHLEN, GFP_KERNEL); + if (!name) + return -ENOMEM; + vsnprintf(name, MAXPATHLEN, fmt, args); + kobj->name = name; + kfree(old); + for (; *name != '\0'; name++) + if (*name == '/') + *name = '!'; + return (0); +} + +int kobject_add(struct kobject *kobj, struct kobject *parent, + const char *fmt, ...); + +static inline struct kobject * +kobject_create(void) +{ + struct kobject *kobj; + + kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + if (kobj == NULL) + return (NULL); + kobject_init(kobj, &kfree_type); + + return (kobj); +} + +static inline struct kobject * +kobject_create_and_add(const char *name, struct kobject *parent) +{ + struct kobject *kobj; + + kobj = kobject_create(); + if (kobj == NULL) + return (NULL); + if (kobject_add(kobj, parent, "%s", name) == 0) + return (kobj); + kobject_put(kobj); + + return (NULL); +} + + +static inline char * +kobject_name(const struct kobject *kobj) +{ + + return kobj->name; +} + +int kobject_set_name(struct kobject *kobj, const char *fmt, ...); +int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, + struct kobject *parent, const char *fmt, ...); + +#endif /* _LINUX_KOBJECT_H_ */ Index: sys/ofed/include/linux/cdev.h 
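The kobject shim keeps Linux's naming and refcount semantics (including the '/'-to-'!' substitution in kobject_set_name_vargs()), with the embedded kref doing the lifetime work. A sketch of the usual create/tear-down pattern; the "foo" names are hypothetical:

    /* Sketch: a throwaway kobject under an existing parent. */
    static struct kobject *foo_kobj;

    static int
    foo_kobj_init(struct kobject *parent)
    {
            foo_kobj = kobject_create_and_add("foo", parent);
            if (foo_kobj == NULL)
                    return (-ENOMEM);
            return (0);
    }

    static void
    foo_kobj_fini(void)
    {
            kobject_put(foo_kobj);  /* last ref: kfree_type releases it */
    }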
=================================================================== --- sys/ofed/include/linux/cdev.h (.../base) (revision 0) +++ sys/ofed/include/linux/cdev.h (.../head) (revision 219811) @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_CDEV_H_ +#define _LINUX_CDEV_H_ + +#include +#include +#include + +struct file_operations; +struct inode; +struct module; + +extern struct cdevsw linuxcdevsw; + +struct linux_cdev { + struct kobject kobj; + struct module *owner; + struct cdev *cdev; + dev_t dev; + const struct file_operations *ops; +}; + +static inline void +cdev_release(struct kobject *kobj) +{ + struct linux_cdev *cdev; + + cdev = container_of(kobj, struct linux_cdev, kobj); + if (cdev->cdev) + destroy_dev(cdev->cdev); + kfree(cdev); +} + +static inline void +cdev_static_release(struct kobject *kobj) +{ + struct linux_cdev *cdev; + + cdev = container_of(kobj, struct linux_cdev, kobj); + if (cdev->cdev) + destroy_dev(cdev->cdev); +} + +static struct kobj_type cdev_ktype = { + .release = cdev_release, +}; + +static struct kobj_type cdev_static_ktype = { + .release = cdev_static_release, +}; + +static inline void +cdev_init(struct linux_cdev *cdev, const struct file_operations *ops) +{ + + kobject_init(&cdev->kobj, &cdev_static_ktype); + cdev->ops = ops; +} + +static inline struct linux_cdev * +cdev_alloc(void) +{ + struct linux_cdev *cdev; + + cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK); + if (cdev) + kobject_init(&cdev->kobj, &cdev_ktype); + return (cdev); +} + +static inline void +cdev_put(struct linux_cdev *p) +{ + kobject_put(&p->kobj); +} + +static inline int +cdev_add(struct linux_cdev *cdev, dev_t dev, unsigned count) +{ + if (count != 1) + panic("cdev_add: Unsupported count: %d", count); + cdev->cdev = make_dev(&linuxcdevsw, MINOR(dev), 0, 0, 0700, + kobject_name(&cdev->kobj)); + cdev->dev = dev; + cdev->cdev->si_drv1 = cdev; + + return (0); +} + +static inline void +cdev_del(struct linux_cdev *cdev) +{ + if (cdev->cdev) { + destroy_dev(cdev->cdev); + cdev->cdev = NULL; + } + kobject_put(&cdev->kobj); +} + +#define cdev linux_cdev + +#endif /* _LINUX_CDEV_H_ */ Index: 
sys/ofed/include/linux/slab.h =================================================================== --- sys/ofed/include/linux/slab.h (.../base) (revision 0) +++ sys/ofed/include/linux/slab.h (.../head) (revision 219811) @@ -0,0 +1,102 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_SLAB_H_ +#define _LINUX_SLAB_H_ + +#include +#include +#include +#include + +#include +#include + +MALLOC_DECLARE(M_KMALLOC); + +#define kmalloc(size, flags) malloc((size), M_KMALLOC, (flags)) +#define kzalloc(size, flags) kmalloc((size), (flags) | M_ZERO) +#define kfree(ptr) free(__DECONST(void *, (ptr)), M_KMALLOC) +#define krealloc(ptr, size, flags) realloc((ptr), (size), M_KMALLOC, (flags)) +#define kcalloc(n, size, flags) kmalloc((n) * (size), flags | M_ZERO) + +struct kmem_cache { + uma_zone_t cache_zone; + void (*cache_ctor)(void *); +}; + +#define SLAB_HWCACHE_ALIGN 0x0001 + +static inline int +kmem_ctor(void *mem, int size, void *arg, int flags) +{ + void (*ctor)(void *); + + ctor = arg; + ctor(mem); + + return (0); +} + +static inline struct kmem_cache * +kmem_cache_create(char *name, size_t size, size_t align, u_long flags, + void (*ctor)(void *)) +{ + struct kmem_cache *c; + + c = malloc(sizeof(*c), M_KMALLOC, M_WAITOK); + if (align) + align--; + if (flags & SLAB_HWCACHE_ALIGN) + align = UMA_ALIGN_CACHE; + c->cache_zone = uma_zcreate(name, size, ctor ? 
kmem_ctor : NULL, + NULL, NULL, NULL, align, 0); + c->cache_ctor = ctor; + + return c; +} + +static inline void * +kmem_cache_alloc(struct kmem_cache *c, int flags) +{ + return uma_zalloc_arg(c->cache_zone, c->cache_ctor, flags); +} + +static inline void +kmem_cache_free(struct kmem_cache *c, void *m) +{ + uma_zfree(c->cache_zone, m); +} + +static inline void +kmem_cache_destroy(struct kmem_cache *c) +{ + uma_zdestroy(c->cache_zone); + free(c, M_KMALLOC); +} + +#endif /* _LINUX_SLAB_H_ */ Index: sys/ofed/include/linux/rbtree.h =================================================================== --- sys/ofed/include/linux/rbtree.h (.../base) (revision 0) +++ sys/ofed/include/linux/rbtree.h (.../head) (revision 219811) @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_RBTREE_H_ +#define _LINUX_RBTREE_H_ + +#include +#include + +struct rb_node { + RB_ENTRY(rb_node) __entry; +}; +#define rb_left __entry.rbe_left +#define rb_right __entry.rbe_right + +/* + * We provide a false structure that has the same bit pattern as tree.h + * presents so it matches the member names expected by linux. + */ +struct rb_root { + struct rb_node *rb_node; +}; + +/* + * In linux all of the comparisons are done by the caller. 
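kmem_cache maps onto uma(9); kmem_ctor() adapts the signatures by passing the Linux one-argument constructor through uma_zalloc_arg()'s opaque argument. A sketch, reusing the hypothetical struct foo from the list.h example:

    /* Sketch: a zone-backed cache with a per-object constructor. */
    static struct kmem_cache *foo_cache;

    static void
    foo_ctor(void *mem)
    {
            memset(mem, 0, sizeof(struct foo));
    }

    static void
    foo_cache_init(void)
    {
            foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
                SLAB_HWCACHE_ALIGN, foo_ctor);
    }

Objects then come from kmem_cache_alloc(foo_cache, GFP_KERNEL) and go back via kmem_cache_free().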
+ */ +int panic_cmp(struct rb_node *one, struct rb_node *two); + +RB_HEAD(linux_root, rb_node); +RB_PROTOTYPE(linux_root, rb_node, __entry, panic_cmp); + +#define rb_parent(r) RB_PARENT(r, __entry) +#define rb_color(r) RB_COLOR(r, __entry) +#define rb_is_red(r) (rb_color(r) == RB_RED) +#define rb_is_black(r) (rb_color(r) == RB_BLACK) +#define rb_set_parent(r, p) rb_parent((r)) = (p) +#define rb_set_color(r, c) rb_color((r)) = (c) +#define rb_entry(ptr, type, member) container_of(ptr, type, member) + +#define RB_EMPTY_ROOT(root) RB_EMPTY((struct linux_root *)root) +#define RB_EMPTY_NODE(node) (rb_parent(node) == node) +#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) + +#define rb_insert_color(node, root) \ + linux_root_RB_INSERT_COLOR((struct linux_root *)(root), (node)) +#define rb_erase(node, root) \ + linux_root_RB_REMOVE((struct linux_root *)(root), (node)) +#define rb_next(node) RB_NEXT(linux_root, NULL, (node)) +#define rb_prev(node) RB_PREV(linux_root, NULL, (node)) +#define rb_first(root) RB_MIN(linux_root, (struct linux_root *)(root)) +#define rb_last(root) RB_MAX(linux_root, (struct linux_root *)(root)) + +static inline void +rb_link_node(struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) +{ + rb_set_parent(node, parent); + rb_set_color(node, RB_RED); + node->__entry.rbe_left = node->__entry.rbe_right = NULL; + *rb_link = node; +} + +static inline void +rb_replace_node(struct rb_node *victim, struct rb_node *new, + struct rb_root *root) +{ + struct rb_node *p; + + p = rb_parent(victim); + if (p) { + if (p->rb_left == victim) + p->rb_left = new; + else + p->rb_right = new; + } else + root->rb_node = new; + if (victim->rb_left) + rb_set_parent(victim->rb_left, new); + if (victim->rb_right) + rb_set_parent(victim->rb_right, new); + *new = *victim; +} + +#undef RB_ROOT +#define RB_ROOT (struct rb_root) { NULL } + +#endif /* _LINUX_RBTREE_H_ */ Index: sys/ofed/include/linux/compat.h =================================================================== --- sys/ofed/include/linux/compat.h (.../base) (revision 0) +++ sys/ofed/include/linux/compat.h (.../head) (revision 219811) @@ -0,0 +1,33 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
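Because tree.h bakes the comparison in at RB_GENERATE() time while Linux callers compare inline, insertion over this shim follows the Linux idiom: walk to the insertion point, rb_link_node(), then rb_insert_color() to rebalance. A sketch keyed on an int; struct foo_node is hypothetical:

    /* Sketch of the canonical Linux insertion idiom over this shim. */
    struct foo_node {
            int key;
            struct rb_node rb;
    };

    static int
    foo_insert(struct rb_root *root, struct foo_node *new)
    {
            struct rb_node **p = &root->rb_node;
            struct rb_node *parent = NULL;

            while (*p) {
                    struct foo_node *cur = rb_entry(*p, struct foo_node, rb);

                    parent = *p;
                    if (new->key < cur->key)
                            p = &(*p)->rb_left;
                    else if (new->key > cur->key)
                            p = &(*p)->rb_right;
                    else
                            return (-EEXIST);
            }
            rb_link_node(&new->rb, parent, p);
            rb_insert_color(&new->rb, root);
            return (0);
    }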
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_COMPAT_H_ +#define _LINUX_COMPAT_H_ + + +#endif /* _LINUX_COMPAT_H_ */ Index: sys/ofed/include/linux/module.h =================================================================== --- sys/ofed/include/linux/module.h (.../base) (revision 0) +++ sys/ofed/include/linux/module.h (.../head) (revision 219811) @@ -0,0 +1,87 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_MODULE_H_ +#define _LINUX_MODULE_H_ + +#include +#include +#include +#include +#include + +#define MODULE_AUTHOR(name) +#define MODULE_DESCRIPTION(name) +#define MODULE_LICENSE(name) +#define MODULE_VERSION(name) + +#define THIS_MODULE ((struct module *)0) + +#define EXPORT_SYMBOL(name) +#define EXPORT_SYMBOL_GPL(name) + +#include + +static inline void +_module_run(void *arg) +{ + void (*fn)(void); +#ifdef OFED_DEBUG_INIT + char name[1024]; + caddr_t pc; + long offset; + + pc = (caddr_t)arg; + if (linker_search_symbol_name(pc, name, sizeof(name), &offset) != 0) + printf("Running ??? (%p)\n", pc); + else + printf("Running %s (%p)\n", name, pc); +#endif + fn = arg; + DROP_GIANT(); + fn(); + PICKUP_GIANT(); +} + +#define module_init(fn) \ + SYSINIT(fn, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, _module_run, (fn)) + +/* + * XXX This is a freebsdism designed to work around not having a module + * load order resolver built in. 
+ */ +#define module_init_order(fn, order) \ + SYSINIT(fn, SI_SUB_RUN_SCHEDULER, (order), _module_run, (fn)) + +#define module_exit(fn) \ + SYSUNINIT(fn, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, _module_run, (fn)) + +#define module_get(module) +#define module_put(module) +#define try_module_get(module) 1 + +#endif /* _LINUX_MODULE_H_ */ Index: sys/ofed/include/linux/kref.h =================================================================== --- sys/ofed/include/linux/kref.h (.../base) (revision 0) +++ sys/ofed/include/linux/kref.h (.../head) (revision 219811) @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_KREF_H_ +#define _LINUX_KREF_H_ + +#include + +struct kref { + volatile u_int count; +}; + +static inline void +kref_init(struct kref *kref) +{ + + refcount_init(&kref->count, 1); +} + +static inline void +kref_get(struct kref *kref) +{ + + refcount_acquire(&kref->count); +} + +static inline int +kref_put(struct kref *kref, void (*rel)(struct kref *kref)) +{ + + if (refcount_release(&kref->count)) { + rel(kref); + return 1; + } + return 0; +} + +#endif /* _LINUX_KREF_H_ */ Index: sys/ofed/include/linux/dma-mapping.h =================================================================== --- sys/ofed/include/linux/dma-mapping.h (.../base) (revision 0) +++ sys/ofed/include/linux/dma-mapping.h (.../head) (revision 219811) @@ -0,0 +1,263 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_DMA_MAPPING_H_ +#define _LINUX_DMA_MAPPING_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include + +enum dma_data_direction { + DMA_BIDIRECTIONAL = 0, + DMA_TO_DEVICE = 1, + DMA_FROM_DEVICE = 2, + DMA_NONE = 3, +}; + +struct dma_map_ops { + void* (*alloc_coherent)(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp); + void (*free_coherent)(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle); + dma_addr_t (*map_page)(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs); + void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, struct dma_attrs *attrs); + int (*map_sg)(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, struct dma_attrs *attrs); + void (*unmap_sg)(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs); + void (*sync_single_for_cpu)(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir); + void (*sync_single_for_device)(struct device *dev, + dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); + void (*sync_single_range_for_cpu)(struct device *dev, + dma_addr_t dma_handle, unsigned long offset, size_t size, + enum dma_data_direction dir); + void (*sync_single_range_for_device)(struct device *dev, + dma_addr_t dma_handle, unsigned long offset, size_t size, + enum dma_data_direction dir); + void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir); + void (*sync_sg_for_device)(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir); + int (*mapping_error)(struct device *dev, dma_addr_t dma_addr); + int (*dma_supported)(struct device *dev, u64 mask); + int is_phys; +}; + +#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1)) + +static inline int +dma_supported(struct device *dev, u64 mask) +{ + + /* XXX busdma takes care of this elsewhere. */ + return (1); +} + +static inline int +dma_set_mask(struct device *dev, u64 dma_mask) +{ + + if (!dev->dma_mask || !dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + return (0); +} + +static inline int +dma_set_coherent_mask(struct device *dev, u64 mask) +{ + + if (!dma_supported(dev, mask)) + return -EIO; + /* XXX Currently we don't support a separate coherent mask. 
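+ *
+ * Illustrative caller (hypothetical driver code, not in this change):
+ * Linux drivers commonly set both masks back to back, and on this shim
+ * the coherent variant only validates its argument:
+ *
+ *	if (dma_set_mask(dev, DMA_BIT_MASK(64)) ||
+ *	    dma_set_coherent_mask(dev, DMA_BIT_MASK(64)))
+ *		return (-EIO);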
*/ + return 0; +} + +static inline void * +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t flag) +{ + vm_paddr_t high; + size_t align; + void *mem; + + if (dev->dma_mask) + high = *dev->dma_mask; + else + high = BUS_SPACE_MAXADDR_32BIT; + align = PAGE_SIZE << get_order(size); + mem = (void *)kmem_alloc_contig(kmem_map, size, flag, 0, high, align, + 0, VM_MEMATTR_DEFAULT); + if (mem) + *dma_handle = vtophys(mem); + else + *dma_handle = 0; + return (mem); +} + +static inline void +dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_handle) +{ + + kmem_free(kmem_map, (vm_offset_t)cpu_addr, size); +} + +/* XXX This only works with no iommu. */ +static inline dma_addr_t +dma_map_single_attrs(struct device *dev, void *ptr, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + + return vtophys(ptr); +} + +static inline void +dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ +} + +static inline int +dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) + sg_dma_address(sg) = sg_phys(sg); + + return (nents); +} + +static inline void +dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ +} + +static inline dma_addr_t +dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction direction) +{ + + return VM_PAGE_TO_PHYS(page) + offset; +} + +static inline void +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, + enum dma_data_direction direction) +{ +} + +static inline void +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ +} + +static inline void +dma_sync_single(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + dma_sync_single_for_cpu(dev, addr, size, dir); +} + +static inline void +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction) +{ +} + +static inline void +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ +} + +static inline void +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ +} + +static inline void +dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, + unsigned long offset, size_t size, int direction) +{ +} + +static inline void +dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, + unsigned long offset, size_t size, int direction) +{ +} + +static inline int +dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + + return (0); +} + +#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL) +#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL) +#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL) +#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL) + +#define DEFINE_DMA_UNMAP_ADDR(name) dma_addr_t name +#define DEFINE_DMA_UNMAP_LEN(name) __u32 name +#define dma_unmap_addr(p, name) ((p)->name) +#define dma_unmap_addr_set(p, name, v) (((p)->name) = (v)) +#define dma_unmap_len(p, name) ((p)->name) 
+#define dma_unmap_len_set(p, name, v) (((p)->name) = (v)) + +extern int uma_align_cache; +#define dma_get_cache_alignment() uma_align_cache + +#endif /* _LINUX_DMA_MAPPING_H_ */ Index: sys/ofed/include/linux/vmalloc.h =================================================================== --- sys/ofed/include/linux/vmalloc.h (.../base) (revision 0) +++ sys/ofed/include/linux/vmalloc.h (.../head) (revision 219811) @@ -0,0 +1,41 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_VMALLOC_H_ +#define _LINUX_VMALLOC_H_ + +#include + +#define VM_MAP 0x0000 +#define PAGE_KERNEL 0x0000 + +void *vmap(struct page **pages, unsigned int count, unsigned long flags, + int prot); +void vunmap(void *addr); + +#endif /* _LINUX_VMALLOC_H_ */ Index: sys/ofed/include/linux/linux_compat.c =================================================================== --- sys/ofed/include/linux/linux_compat.c (.../base) (revision 0) +++ sys/ofed/include/linux/linux_compat.c (.../head) (revision 219811) @@ -0,0 +1,695 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat"); + +#include +/* Undo Linux compat changes. */ +#undef RB_ROOT +#undef file +#undef cdev +#define RB_ROOT(head) (head)->rbh_root +#undef LIST_HEAD +/* From sys/queue.h */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +struct kobject class_root; +struct device linux_rootdev; +struct class miscclass; +struct list_head pci_drivers; +struct list_head pci_devices; +spinlock_t pci_lock; + +int +panic_cmp(struct rb_node *one, struct rb_node *two) +{ + panic("no cmp"); +} + +RB_GENERATE(linux_root, rb_node, __entry, panic_cmp); + +int +kobject_set_name(struct kobject *kobj, const char *fmt, ...) +{ + va_list args; + int error; + + va_start(args, fmt); + error = kobject_set_name_vargs(kobj, fmt, args); + va_end(args); + + return (error); +} + +static inline int +kobject_add_complete(struct kobject *kobj, struct kobject *parent) +{ + struct kobj_type *t; + int error; + + kobj->parent = kobject_get(parent); + error = sysfs_create_dir(kobj); + if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) { + struct attribute **attr; + t = kobj->ktype; + + for (attr = t->default_attrs; *attr != NULL; attr++) { + error = sysfs_create_file(kobj, *attr); + if (error) + break; + } + if (error) + sysfs_remove_dir(kobj); + + } + return (error); +} + +int +kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...) +{ + va_list args; + int error; + + va_start(args, fmt); + error = kobject_set_name_vargs(kobj, fmt, args); + va_end(args); + if (error) + return (error); + + return kobject_add_complete(kobj, parent); +} + +void +kobject_release(struct kref *kref) +{ + struct kobject *kobj; + char *name; + + kobj = container_of(kref, struct kobject, kref); + sysfs_remove_dir(kobj); + if (kobj->parent) + kobject_put(kobj->parent); + kobj->parent = NULL; + name = kobj->name; + if (kobj->ktype && kobj->ktype->release) + kobj->ktype->release(kobj); + kfree(name); +} + +static void +kobject_kfree(struct kobject *kobj) +{ + + kfree(kobj); +} + +struct kobj_type kfree_type = { .release = kobject_kfree }; + +struct device * +device_create(struct class *class, struct device *parent, dev_t devt, + void *drvdata, const char *fmt, ...) +{ + struct device *dev; + va_list args; + + dev = kzalloc(sizeof(*dev), M_WAITOK); + dev->parent = parent; + dev->class = class; + dev->devt = devt; + dev->driver_data = drvdata; + va_start(args, fmt); + kobject_set_name_vargs(&dev->kobj, fmt, args); + va_end(args); + device_register(dev); + + return (dev); +} + +int +kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, + struct kobject *parent, const char *fmt, ...) 
+{ + va_list args; + int error; + + kobject_init(kobj, ktype); + kobj->ktype = ktype; + kobj->parent = parent; + kobj->name = NULL; + + va_start(args, fmt); + error = kobject_set_name_vargs(kobj, fmt, args); + va_end(args); + if (error) + return (error); + return kobject_add_complete(kobj, parent); +} + +static void +linux_file_dtor(void *cdp) +{ + struct linux_file *filp; + + filp = cdp; + filp->f_op->release(curthread->td_fpop->f_vnode, filp); + kfree(filp); +} + +static int +linux_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + struct linux_cdev *ldev; + struct linux_file *filp; + struct file *file; + int error; + + file = curthread->td_fpop; + ldev = dev->si_drv1; + if (ldev == NULL) + return (ENODEV); + filp = kzalloc(sizeof(*filp), GFP_KERNEL); + filp->f_dentry = &filp->f_dentry_store; + filp->f_op = ldev->ops; + filp->f_flags = file->f_flag; + if (filp->f_op->open) { + error = -filp->f_op->open(file->f_vnode, filp); + if (error) { + kfree(filp); + return (error); + } + } + error = devfs_set_cdevpriv(filp, linux_file_dtor); + if (error) { + filp->f_op->release(file->f_vnode, filp); + kfree(filp); + return (error); + } + + return 0; +} + +static int +linux_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td) +{ + struct linux_cdev *ldev; + struct linux_file *filp; + struct file *file; + int error; + + file = curthread->td_fpop; + ldev = dev->si_drv1; + if (ldev == NULL) + return (0); + if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) + return (error); + filp->f_flags = file->f_flag; + devfs_clear_cdevpriv(); + + return (0); +} + +static int +linux_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, + struct thread *td) +{ + struct linux_cdev *ldev; + struct linux_file *filp; + struct file *file; + int error; + + file = curthread->td_fpop; + ldev = dev->si_drv1; + if (ldev == NULL) + return (0); + if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) + return (error); + filp->f_flags = file->f_flag; + /* + * Linux does not have a generic ioctl copyin/copyout layer. All + * linux ioctls must be converted to void ioctls which pass a + * pointer to the address of the data. We want the actual user + * address so we dereference here. 
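+ *
+ * Worked example (FOO_CMD and args are hypothetical): for a userland
+ * call such as
+ *
+ *	ioctl(fd, FOO_CMD, &args);
+ *
+ * the generic layer copies the pointer-sized argument in, so on entry
+ * data points at a kernel copy of &args; the dereference below yields
+ * the user address &args itself, which is what the driver's
+ * unlocked_ioctl() expects to receive.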
+ */ + data = *(void **)data; + if (filp->f_op->unlocked_ioctl) + error = -filp->f_op->unlocked_ioctl(filp, cmd, (u_long)data); + else + error = ENOTTY; + + return (error); +} + +static int +linux_dev_read(struct cdev *dev, struct uio *uio, int ioflag) +{ + struct linux_cdev *ldev; + struct linux_file *filp; + struct file *file; + ssize_t bytes; + int error; + + file = curthread->td_fpop; + ldev = dev->si_drv1; + if (ldev == NULL) + return (0); + if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) + return (error); + filp->f_flags = file->f_flag; + if (uio->uio_iovcnt != 1) + panic("linux_dev_read: uio %p iovcnt %d", + uio, uio->uio_iovcnt); + if (filp->f_op->read) { + bytes = filp->f_op->read(filp, uio->uio_iov->iov_base, + uio->uio_iov->iov_len, &uio->uio_offset); + if (bytes >= 0) { + uio->uio_iov->iov_base += bytes; + uio->uio_iov->iov_len -= bytes; + uio->uio_resid -= bytes; + } else + error = -bytes; + } else + error = ENXIO; + + return (error); +} + +static int +linux_dev_write(struct cdev *dev, struct uio *uio, int ioflag) +{ + struct linux_cdev *ldev; + struct linux_file *filp; + struct file *file; + ssize_t bytes; + int error; + + file = curthread->td_fpop; + ldev = dev->si_drv1; + if (ldev == NULL) + return (0); + if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) + return (error); + filp->f_flags = file->f_flag; + if (uio->uio_iovcnt != 1) + panic("linux_dev_write: uio %p iovcnt %d", + uio, uio->uio_iovcnt); + if (filp->f_op->write) { + bytes = filp->f_op->write(filp, uio->uio_iov->iov_base, + uio->uio_iov->iov_len, &uio->uio_offset); + if (bytes >= 0) { + uio->uio_iov->iov_base += bytes; + uio->uio_iov->iov_len -= bytes; + uio->uio_resid -= bytes; + } else + error = -bytes; + } else + error = ENXIO; + + return (error); +} + +static int +linux_dev_poll(struct cdev *dev, int events, struct thread *td) +{ + struct linux_cdev *ldev; + struct linux_file *filp; + struct file *file; + int revents; + int error; + + file = curthread->td_fpop; + ldev = dev->si_drv1; + if (ldev == NULL) + return (0); + if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) + return (error); + filp->f_flags = file->f_flag; + if (filp->f_op->poll) + revents = filp->f_op->poll(filp, NULL) & events; + else + revents = 0; + + return (revents); +} + +static int +linux_dev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, + int nprot, vm_memattr_t *memattr) +{ + + /* XXX memattr not honored. 
*/ + *paddr = offset; + return (0); +} + +static int +linux_dev_mmap_single(struct cdev *dev, vm_ooffset_t *offset, + vm_size_t size, struct vm_object **object, int nprot) +{ + struct linux_cdev *ldev; + struct linux_file *filp; + struct file *file; + struct vm_area_struct vma; + vm_paddr_t paddr; + vm_page_t m; + int error; + + file = curthread->td_fpop; + ldev = dev->si_drv1; + if (ldev == NULL) + return (ENODEV); + if (size != PAGE_SIZE) + return (EINVAL); + if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) + return (error); + filp->f_flags = file->f_flag; + vma.vm_start = 0; + vma.vm_end = PAGE_SIZE; + vma.vm_pgoff = *offset / PAGE_SIZE; + vma.vm_pfn = 0; + vma.vm_page_prot = 0; + if (filp->f_op->mmap) { + error = -filp->f_op->mmap(filp, &vma); + if (error == 0) { + paddr = (vm_paddr_t)vma.vm_pfn << PAGE_SHIFT; + *offset = paddr; + m = PHYS_TO_VM_PAGE(paddr); + *object = vm_pager_allocate(OBJT_DEVICE, dev, + PAGE_SIZE, nprot, *offset, curthread->td_ucred); + if (*object == NULL) + return (EINVAL); + if (vma.vm_page_prot != VM_MEMATTR_DEFAULT) + pmap_page_set_memattr(m, vma.vm_page_prot); + } + } else + error = ENODEV; + + return (error); +} + +struct cdevsw linuxcdevsw = { + .d_version = D_VERSION, + .d_flags = D_TRACKCLOSE, + .d_open = linux_dev_open, + .d_close = linux_dev_close, + .d_read = linux_dev_read, + .d_write = linux_dev_write, + .d_ioctl = linux_dev_ioctl, + .d_mmap_single = linux_dev_mmap_single, + .d_mmap = linux_dev_mmap, + .d_poll = linux_dev_poll, +}; + +static int +linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + struct linux_file *filp; + ssize_t bytes; + int error; + + error = 0; + filp = (struct linux_file *)file->f_data; + filp->f_flags = file->f_flag; + if (uio->uio_iovcnt != 1) + panic("linux_file_read: uio %p iovcnt %d", + uio, uio->uio_iovcnt); + if (filp->f_op->read) { + bytes = filp->f_op->read(filp, uio->uio_iov->iov_base, + uio->uio_iov->iov_len, &uio->uio_offset); + if (bytes >= 0) { + uio->uio_iov->iov_base += bytes; + uio->uio_iov->iov_len -= bytes; + uio->uio_resid -= bytes; + } else + error = -bytes; + } else + error = ENXIO; + + return (error); +} + +static int +linux_file_poll(struct file *file, int events, struct ucred *active_cred, + struct thread *td) +{ + struct linux_file *filp; + int revents; + + filp = (struct linux_file *)file->f_data; + filp->f_flags = file->f_flag; + if (filp->f_op->poll) + revents = filp->f_op->poll(filp, NULL) & events; + else + revents = 0; + + return (revents); +} + +static int +linux_file_close(struct file *file, struct thread *td) +{ + struct linux_file *filp; + int error; + + filp = (struct linux_file *)file->f_data; + filp->f_flags = file->f_flag; + error = -filp->f_op->release(NULL, filp); + funsetown(&filp->f_sigio); + kfree(filp); + + return (error); +} + +static int +linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred, + struct thread *td) +{ + struct linux_file *filp; + int error; + + filp = (struct linux_file *)fp->f_data; + filp->f_flags = fp->f_flag; + error = 0; + + switch (cmd) { + case FIONBIO: + break; + case FIOASYNC: + if (filp->f_op->fasync == NULL) + break; + error = filp->f_op->fasync(0, filp, fp->f_flag & FASYNC); + break; + case FIOSETOWN: + error = fsetown(*(int *)data, &filp->f_sigio); + if (error == 0) + error = filp->f_op->fasync(0, filp, + fp->f_flag & FASYNC); + break; + case FIOGETOWN: + *(int *)data = fgetown(&filp->f_sigio); + break; + default: + error = ENOTTY; + break; + } + return (error); +} + 
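+
+/*
+ * The fileops below route VFS file operations to a driver's Linux-style
+ * struct file_operations. Illustration (hypothetical driver, not part
+ * of this change): a consumer supplies
+ *
+ *	static const struct file_operations foo_fops = {
+ *		.owner		= THIS_MODULE,
+ *		.read		= foo_read,
+ *		.unlocked_ioctl	= foo_ioctl,
+ *	};
+ *
+ * and a read(2) on a descriptor backed by linuxfileops reaches
+ * linux_file_read() above, which calls foo_read() with the Linux
+ * (buffer, length, offset) convention rebuilt from the uio.
+ */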
+struct fileops linuxfileops = { + .fo_read = linux_file_read, + .fo_poll = linux_file_poll, + .fo_close = linux_file_close, + .fo_ioctl = linux_file_ioctl +}; + +/* + * Hash of vmmap addresses. This is infrequently accessed and does not + * need to be particularly large. This is done because we must store the + * caller's idea of the map size to properly unmap. + */ +struct vmmap { + LIST_ENTRY(vmmap) vm_next; + void *vm_addr; + unsigned long vm_size; +}; + +LIST_HEAD(vmmaphd, vmmap); +#define VMMAP_HASH_SIZE 64 +#define VMMAP_HASH_MASK (VMMAP_HASH_SIZE - 1) +#define VM_HASH(addr) ((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK +static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE]; +static struct mtx vmmaplock; + +static void +vmmap_add(void *addr, unsigned long size) +{ + struct vmmap *vmmap; + + vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL); + mtx_lock(&vmmaplock); + vmmap->vm_size = size; + vmmap->vm_addr = addr; + LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next); + mtx_unlock(&vmmaplock); +} + +static struct vmmap * +vmmap_remove(void *addr) +{ + struct vmmap *vmmap; + + mtx_lock(&vmmaplock); + LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next) + if (vmmap->vm_addr == addr) + break; + if (vmmap) + LIST_REMOVE(vmmap, vm_next); + mtx_unlock(&vmmaplock); + + return (vmmap); +} + +void * +_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr) +{ + void *addr; + + addr = pmap_mapdev_attr(phys_addr, size, attr); + if (addr == NULL) + return (NULL); + vmmap_add(addr, size); + + return (addr); +} + +void +iounmap(void *addr) +{ + struct vmmap *vmmap; + + vmmap = vmmap_remove(addr); + if (vmmap == NULL) + return; + pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size); + kfree(vmmap); +} + + +void * +vmap(struct page **pages, unsigned int count, unsigned long flags, int prot) +{ + vm_offset_t off; + size_t size; + + size = count * PAGE_SIZE; + off = kmem_alloc_nofault(kernel_map, size); + if (off == 0) + return (NULL); + vmmap_add((void *)off, size); + pmap_qenter(off, pages, count); + + return ((void *)off); +} + +void +vunmap(void *addr) +{ + struct vmmap *vmmap; + + vmmap = vmmap_remove(addr); + if (vmmap == NULL) + return; + pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE); + kmem_free(kernel_map, (vm_offset_t)addr, vmmap->vm_size); + kfree(vmmap); +} + +static void +linux_compat_init(void) +{ + struct sysctl_oid *rootoid; + int i; + + rootoid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(), + OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys"); + kobject_init(&class_root, &class_ktype); + kobject_set_name(&class_root, "class"); + class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), + OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class"); + kobject_init(&linux_rootdev.kobj, &dev_ktype); + kobject_set_name(&linux_rootdev.kobj, "device"); + linux_rootdev.kobj.oidp = SYSCTL_ADD_NODE(NULL, + SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD, NULL, + "device"); + linux_rootdev.bsddev = root_bus; + miscclass.name = "misc"; + class_register(&miscclass); + INIT_LIST_HEAD(&pci_drivers); + INIT_LIST_HEAD(&pci_devices); + spin_lock_init(&pci_lock); + mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF); + for (i = 0; i < VMMAP_HASH_SIZE; i++) + LIST_INIT(&vmmaphead[i]); +} + +SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL); Index: sys/ofed/include/linux/if_arp.h =================================================================== --- sys/ofed/include/linux/if_arp.h (.../base) (revision 0) +++ 
sys/ofed/include/linux/if_arp.h (.../head) (revision 219811) @@ -0,0 +1,32 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_IF_ARP_H_ +#define _LINUX_IF_ARP_H_ +#include +#include +#endif /* _LINUX_IF_ARP_H_ */ Index: sys/ofed/include/linux/rwlock.h =================================================================== --- sys/ofed/include/linux/rwlock.h (.../base) (revision 0) +++ sys/ofed/include/linux/rwlock.h (.../head) (revision 219811) @@ -0,0 +1,63 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _LINUX_RWLOCK_H_ +#define _LINUX_RWLOCK_H_ + +#include +#include + +typedef struct { + struct rwlock rw; +} rwlock_t; + +#define read_lock(_l) rw_rlock(&(_l)->rw) +#define write_lock(_l) rw_wlock(&(_l)->rw) +#define read_unlock(_l) rw_runlock(&(_l)->rw) +#define write_unlock(_l) rw_wunlock(&(_l)->rw) +#define read_lock_irq(lock) read_lock((lock)) +#define read_unlock_irq(lock) read_unlock((lock)) +#define write_lock_irq(lock) write_lock((lock)) +#define write_unlock_irq(lock) write_unlock((lock)) +#define read_lock_irqsave(lock, flags) \ + do {(flags) = 0; read_lock(lock); } while (0) +#define write_lock_irqsave(lock, flags) \ + do {(flags) = 0; write_lock(lock); } while (0) +#define read_unlock_irqrestore(lock, flags) \ + do { read_unlock(lock); } while (0) +#define write_unlock_irqrestore(lock, flags) \ + do { write_unlock(lock); } while (0) + +static inline void +rwlock_init(rwlock_t *lock) +{ + + memset(&lock->rw, 0, sizeof(lock->rw)); + rw_init_flags(&lock->rw, "lnxrw", RW_NOWITNESS); +} + +#endif /* _LINUX_RWLOCK_H_ */ Index: sys/ofed/include/linux/mutex.h =================================================================== --- sys/ofed/include/linux/mutex.h (.../base) (revision 0) +++ sys/ofed/include/linux/mutex.h (.../head) (revision 219811) @@ -0,0 +1,61 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _LINUX_MUTEX_H_ +#define _LINUX_MUTEX_H_ + +#include +#include +#include + +#include + +typedef struct mutex { + struct sx sx; +} mutex_t; + +#define mutex_lock(_m) sx_xlock(&(_m)->sx) +#define mutex_lock_nested(_m, _s) mutex_lock(_m) +#define mutex_lock_interruptible(_m) ({ mutex_lock((_m)); 0; }) +#define mutex_unlock(_m) sx_xunlock(&(_m)->sx) +#define mutex_trylock(_m) !!sx_try_xlock(&(_m)->sx) + +#define DEFINE_MUTEX(lock) \ + mutex_t lock; \ + SX_SYSINIT_FLAGS(lock, &(lock).sx, "lnxmtx", SX_NOWITNESS) + +static inline void +linux_mutex_init(mutex_t *m) +{ + + memset(&m->sx, 0, sizeof(m->sx)); + sx_init_flags(&m->sx, "lnxmtx", SX_NOWITNESS); +} + +#define mutex_init linux_mutex_init + +#endif /* _LINUX_MUTEX_H_ */ Index: sys/ofed/include/linux/wait.h =================================================================== --- sys/ofed/include/linux/wait.h (.../base) (revision 0) +++ sys/ofed/include/linux/wait.h (.../head) (revision 219811) @@ -0,0 +1,112 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _LINUX_WAIT_H_ +#define _LINUX_WAIT_H_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +struct __wait_queue_head { + unsigned int wchan; +}; +typedef struct __wait_queue_head wait_queue_head_t; + +#define init_waitqueue_head(x) + +static inline void +__wake_up(struct __wait_queue_head *q, int all) +{ + int wakeup_swapper; + void *c; + + c = &q->wchan; + sleepq_lock(c); + if (all) + wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0); + else + wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0); + sleepq_release(c); + if (wakeup_swapper) + kick_proc0(); +} + +#define wake_up(q) __wake_up(q, 0) +#define wake_up_nr(q, nr) __wake_up(q, 1) +#define wake_up_all(q) __wake_up(q, 1) +#define wake_up_interruptible(q) __wake_up(q, 0) +#define wake_up_interruptible_nr(q, nr) __wake_up(q, 1) +#define wake_up_interruptible_all(q) __wake_up(q, 1) + +#define wait_event(q, cond) \ +do { \ + void *c = &(q).wchan; \ + if (!(cond)) { \ + for (;;) { \ + sleepq_lock(c); \ + if (cond) { \ + sleepq_release(c); \ + break; \ + } \ + sleepq_add(c, NULL, "completion", SLEEPQ_SLEEP, 0); \ + sleepq_wait(c, 0); \ + } \ + } \ +} while (0) + +#define wait_event_interruptible(q, cond) \ +({ \ + void *c = &(q).wchan; \ + int _error; \ + \ + _error = 0; \ + if (!(cond)) { \ + for (; _error == 0;) { \ + sleepq_lock(c); \ + if (cond) { \ + sleepq_release(c); \ + break; \ + } \ + sleepq_add(c, NULL, "completion", \ + SLEEPQ_SLEEP | SLEEPQ_INTERRUPTIBLE, 0); \ + if (sleepq_wait_sig(c, 0)) \ + _error = -ERESTARTSYS; \ + } \ + } \ + -_error; \ +}) + +#define DEFINE_WAIT(x) + +#endif /* _LINUX_WAIT_H_ */ Index: sys/ofed/include/linux/types.h =================================================================== --- sys/ofed/include/linux/types.h (.../base) (revision 0) +++ sys/ofed/include/linux/types.h (.../head) (revision 219811) @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _LINUX_TYPES_H_ +#define _LINUX_TYPES_H_ + +#include +#include +#include +#include + +typedef __u16 __le16; +typedef __u16 __be16; +typedef __u32 __le32; +typedef __u32 __be32; +typedef __u64 __le64; +typedef __u64 __be64; +typedef _Bool bool; +#define true TRUE +#define false FALSE + +typedef unsigned long kernel_ulong_t; +typedef unsigned int uint; +typedef unsigned gfp_t; +typedef uint64_t loff_t; +typedef vm_paddr_t resource_size_t; + +#define DECLARE_BITMAP(n, bits) \ + unsigned long n[howmany(bits, sizeof(long) * 8)] + +#endif /* _LINUX_TYPES_H_ */ Index: sys/ofed/include/linux/string.h =================================================================== --- sys/ofed/include/linux/string.h (.../base) (revision 0) +++ sys/ofed/include/linux/string.h (.../head) (revision 219811) @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_STRING_H_ +#define _LINUX_STRING_H_ + +#include +#include +#include + +#include + +static inline void * +kmemdup(const void *src, size_t len, gfp_t gfp) +{ + void *dst; + + dst = kmalloc(len, gfp); + if (dst) + memcpy(dst, src, len); + return (dst); +} + +#endif /* _LINUX_STRING_H_ */ Index: sys/ofed/include/linux/moduleparam.h =================================================================== --- sys/ofed/include/linux/moduleparam.h (.../base) (revision 0) +++ sys/ofed/include/linux/moduleparam.h (.../head) (revision 219811) @@ -0,0 +1,226 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_MODULEPARAM_H_ +#define _LINUX_MODULEPARAM_H_ + +#include + +/* + * These are presently not hooked up to anything. In linux the parameters + * can be set when modules are loaded. On FreeBSD these could be mapped + * to kenv in the future. + */ +struct kernel_param; + +typedef int (*param_set_fn)(const char *val, struct kernel_param *kp); +typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp); + +struct kernel_param { + const char *name; + u16 perm; + u16 flags; + param_set_fn set; + param_get_fn get; + union { + void *arg; + struct kparam_string *str; + struct kparam_array *arr; + } un; +}; + +#define KPARAM_ISBOOL 2 + +struct kparam_string { + unsigned int maxlen; + char *string; +}; + +struct kparam_array +{ + unsigned int max; + unsigned int *num; + param_set_fn set; + param_get_fn get; + unsigned int elemsize; + void *elem; +}; + +static inline void +param_sysinit(struct kernel_param *param) +{ +} + +#define module_param_call(name, set, get, arg, perm) \ + static struct kernel_param __param_##name = \ + { #name, perm, 0, set, get, { arg } }; \ + SYSINIT(name##_param_sysinit, SI_SUB_DRIVERS, SI_ORDER_FIRST, \ + param_sysinit, &__param_##name); + +#define module_param_named(name, var, type, mode) \ + module_param_call(name, param_set_##type, param_get_##type, &var, mode) + +#define module_param(var, type, mode) \ + module_param_named(var, var, type, mode) + +#define MODULE_PARM_DESC(name, desc) + +static inline int +param_set_byte(const char *val, struct kernel_param *kp) +{ + + return 0; +} + +static inline int +param_get_byte(char *buffer, struct kernel_param *kp) +{ + + return 0; +} + + +static inline int +param_set_short(const char *val, struct kernel_param *kp) +{ + + return 0; +} + +static inline int +param_get_short(char *buffer, struct kernel_param *kp) +{ + + return 0; +} + + +static inline int +param_set_ushort(const char *val, struct kernel_param *kp) +{ + + return 0; +} + +static inline int +param_get_ushort(char *buffer, struct kernel_param *kp) +{ + + return 0; +} + + +static inline int +param_set_int(const char *val, struct kernel_param *kp) +{ + + return 0; +} + +static inline int +param_get_int(char *buffer, struct kernel_param *kp) +{ + + return 0; +} + + +static inline int +param_set_uint(const char *val, struct kernel_param *kp) +{ + + return 0; +} + +static inline int +param_get_uint(char *buffer, struct kernel_param *kp) +{ + + return 0; +} + + +static inline int +param_set_long(const char *val, struct kernel_param *kp) +{ + + return 0; +} + +static inline int +param_get_long(char *buffer, struct kernel_param *kp) +{ + + return 0; +} + + +static inline int +param_set_ulong(const char *val, struct kernel_param *kp) +{ + + return 0; +} + +static inline int +param_get_ulong(char *buffer, struct 
kernel_param *kp) +{ + + return 0; +} + + +static inline int +param_set_charp(const char *val, struct kernel_param *kp) +{ + + return 0; +} + +static inline int +param_get_charp(char *buffer, struct kernel_param *kp) +{ + + return 0; +} + + +static inline int +param_set_bool(const char *val, struct kernel_param *kp) +{ + + return 0; +} + +static inline int +param_get_bool(char *buffer, struct kernel_param *kp) +{ + + return 0; +} + +#endif /* _LINUX_MODULEPARAM_H_ */ Index: sys/ofed/include/linux/io.h =================================================================== --- sys/ofed/include/linux/io.h (.../base) (revision 0) +++ sys/ofed/include/linux/io.h (.../head) (revision 219811) @@ -0,0 +1,125 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_IO_H_ +#define _LINUX_IO_H_ + +#include + +static inline uint32_t +__raw_readl(const volatile void *addr) +{ + return *(const volatile uint32_t *)addr; +} + +static inline void +__raw_writel(uint32_t b, volatile void *addr) +{ + *(volatile uint32_t *)addr = b; +} + +static inline uint64_t +__raw_readq(const volatile void *addr) +{ + return *(const volatile uint64_t *)addr; +} + +static inline void +__raw_writeq(uint64_t b, volatile void *addr) +{ + *(volatile uint64_t *)addr = b; +} + +/* + * XXX This is all x86 specific. It should be bus space access. 
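+ *
+ * A bus-space-backed version (sketch only; the softc tag/handle pair
+ * shown is hypothetical, nothing in this file carries one today) would
+ * replace the raw pointer stores below with, e.g.,
+ *
+ *	bus_space_write_4(sc->st, sc->sh, offset, val);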
+ */ +#define mmiowb() + +#undef writel +static inline void +writel(uint32_t b, void *addr) +{ + *(volatile uint32_t *)addr = b; +} + +#undef writeq +static inline void +writeq(uint64_t b, void *addr) +{ + *(volatile uint64_t *)addr = b; +} + +#undef writeb +static inline void +writeb(uint8_t b, void *addr) +{ + *(volatile uint8_t *)addr = b; +} + +#undef writew +static inline void +writew(uint16_t b, void *addr) +{ + *(volatile uint16_t *)addr = b; +} + +void *_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr); +#define ioremap_nocache(addr, size) \ + _ioremap_attr((addr), (size), VM_MEMATTR_UNCACHED) +#define ioremap_wc(addr, size) \ + _ioremap_attr((addr), (size), VM_MEMATTR_WRITE_COMBINING) +#define ioremap ioremap_nocache +void iounmap(void *addr); + +#define memset_io(a, b, c) memset((a), (b), (c)) +#define memcpy_fromio(a, b, c) memcpy((a), (b), (c)) +#define memcpy_toio(a, b, c) memcpy((a), (b), (c)) + +static inline void +__iowrite64_copy(void *to, void *from, size_t count) +{ +#ifdef __LP64__ + uint64_t *src; + uint64_t *dst; + int i; + + for (i = 0, src = from, dst = to; i < count; i++, src++, dst++) + __raw_writeq(*src, dst); +#else + uint32_t *src; + uint32_t *dst; + int i; + + count *= 2; + for (i = 0, src = from, dst = to; i < count; i++, src++, dst++) + __raw_writel(*src, dst); +#endif +} + + +#endif /* _LINUX_IO_H_ */ Index: sys/ofed/include/linux/stddef.h =================================================================== --- sys/ofed/include/linux/stddef.h (.../base) (revision 0) +++ sys/ofed/include/linux/stddef.h (.../head) (revision 219811) @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_STDDEF_H_ +#define _LINUX_STDDEF_H_ + +#include + +#endif /* _LINUX_STDDEF_H_ */ Index: sys/ofed/include/linux/mm.h =================================================================== --- sys/ofed/include/linux/mm.h (.../base) (revision 0) +++ sys/ofed/include/linux/mm.h (.../head) (revision 219811) @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_MM_H_ +#define _LINUX_MM_H_ + +#include +#include +#include + +#define PAGE_ALIGN(x) ALIGN(x, PAGE_SIZE) + +struct vm_area_struct { + vm_offset_t vm_start; + vm_offset_t vm_end; + vm_offset_t vm_pgoff; + vm_paddr_t vm_pfn; /* PFN For mmap. */ + vm_memattr_t vm_page_prot; +}; + +/* + * Compute log2 of the power of two rounded up count of pages + * needed for size bytes. + */ +static inline int +get_order(unsigned long size) +{ + int order; + + size = (size - 1) >> PAGE_SHIFT; + order = 0; + while (size) { + order++; + size >>= 1; + } + return (order); +} + +static inline void * +lowmem_page_address(struct page *page) +{ + + return page_address(page); +} + +/* + * This only works via mmap ops. + */ +static inline int +io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, unsigned long size, + vm_memattr_t prot) +{ + vma->vm_page_prot = prot; + vma->vm_pfn = pfn; + + return (0); +} + +#endif /* _LINUX_MM_H_ */ Index: sys/ofed/include/linux/miscdevice.h =================================================================== --- sys/ofed/include/linux/miscdevice.h (.../base) (revision 0) +++ sys/ofed/include/linux/miscdevice.h (.../head) (revision 219811) @@ -0,0 +1,72 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_MISCDEVICE_H_ +#define _LINUX_MISCDEVICE_H_ + +#define MISC_DYNAMIC_MINOR -1 + +#include +#include + +struct miscdevice { + const char *name; + struct device *this_device; + const struct file_operations *fops; + struct cdev *cdev; + int minor; +}; + +extern struct class miscclass; + +static inline int +misc_register(struct miscdevice *misc) +{ + misc->this_device = device_create(&miscclass, &linux_rootdev, 0, misc, + misc->name); + misc->cdev = cdev_alloc(); + if (misc->cdev == NULL) + return -ENOMEM; + misc->cdev->owner = THIS_MODULE; + misc->cdev->ops = misc->fops; + kobject_set_name(&misc->cdev->kobj, misc->name); + if (cdev_add(misc->cdev, misc->this_device->devt, 1)) + return -EINVAL; + return (0); +} + +static inline int +misc_deregister(struct miscdevice *misc) +{ + device_destroy(&miscclass, misc->this_device->devt); + cdev_del(misc->cdev); + + return (0); +} + +#endif /* _LINUX_MISCDEVICE_H_ */ Index: sys/ofed/include/linux/gfp.h =================================================================== --- sys/ofed/include/linux/gfp.h (.../base) (revision 0) +++ sys/ofed/include/linux/gfp.h (.../head) (revision 219811) @@ -0,0 +1,122 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
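A sketch of how a driver would consume the miscdevice shim above, assuming a hypothetical "foo" device and a file_operations definition supplied by the companion linux/fs.h header. Worth noting from the code: misc_register()'s failure paths do not unwind the device_create() call, so it is best suited to drivers that cannot fail here.

        static const struct file_operations foo_fops;   /* .open, .read, etc. */

        static struct miscdevice foo_dev = {
                .name   = "foo",
                .minor  = MISC_DYNAMIC_MINOR,
                .fops   = &foo_fops,
        };

        if (misc_register(&foo_dev) == 0) {
                /* ... the device node and cdev now exist ... */
                misc_deregister(&foo_dev);
        }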
+ */ + +#ifndef _LINUX_GFP_H_ +#define _LINUX_GFP_H_ + +#include +#include + +#include + +#include +#include +#include + +#define __GFP_NOWARN 0 +#define __GFP_HIGHMEM 0 +#define __GFP_ZERO M_ZERO + +#define GFP_NOWAIT M_NOWAIT +#define GFP_ATOMIC (M_NOWAIT | M_USE_RESERVE) +#define GFP_KERNEL M_WAITOK +#define GFP_USER M_WAITOK +#define GFP_HIGHUSER M_WAITOK +#define GFP_HIGHUSER_MOVABLE M_WAITOK +#define GFP_IOFS M_NOWAIT + +static inline void * +page_address(struct page *page) +{ + + if (page->object != kmem_object && page->object != kernel_object) + return (NULL); + return (void *)(VM_MIN_KERNEL_ADDRESS + IDX_TO_OFF(page->pindex)); +} + +static inline unsigned long +_get_page(gfp_t mask) +{ + + return kmem_malloc(kmem_map, PAGE_SIZE, mask); +} + +#define get_zeroed_page(mask) _get_page((mask) | M_ZERO) +#define alloc_page(mask) virt_to_page(_get_page((mask))) +#define __get_free_page(mask) _get_page((mask)) + +static inline void +free_page(unsigned long page) +{ + + if (page == 0) + return; + kmem_free(kmem_map, page, PAGE_SIZE); +} + +static inline void +__free_page(struct page *m) +{ + + if (m->object != kmem_object) + panic("__free_page: Freed page %p not allocated via wrappers.", + m); + kmem_free(kmem_map, (vm_offset_t)page_address(m), PAGE_SIZE); +} + +static inline void +__free_pages(void *p, unsigned int order) +{ + size_t size; + + if (p == 0) + return; + size = PAGE_SIZE << order; + kmem_free(kmem_map, (vm_offset_t)p, size); +} + +/* + * Alloc pages allocates directly from the buddy allocator on linux so + * order specifies a power of two bucket of pages and the results + * are expected to be aligned on the size as well. + */ +static inline struct page * +alloc_pages(gfp_t gfp_mask, unsigned int order) +{ + unsigned long page; + size_t size; + + size = PAGE_SIZE << order; + page = kmem_alloc_contig(kmem_map, size, gfp_mask, 0, -1, + size, 0, VM_MEMATTR_DEFAULT); + if (page == 0) + return (NULL); + return (virt_to_page(page)); +} + +#endif /* _LINUX_GFP_H_ */ Index: sys/ofed/include/linux/page.h =================================================================== --- sys/ofed/include/linux/page.h (.../base) (revision 0) +++ sys/ofed/include/linux/page.h (.../head) (revision 219811) @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
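Because alloc_pages() above maps to kmem_alloc_contig() with the size passed as the alignment, an order-n allocation is physically contiguous and size-aligned, which is the buddy-allocator guarantee the comment describes. One divergence worth noting: this shim's __free_pages() takes the kernel virtual address rather than Linux's struct page pointer. A sketch:

        struct page *p;
        void *va;

        p = alloc_pages(GFP_KERNEL, 2);         /* four contiguous, aligned pages */
        if (p != NULL) {
                va = page_address(p);
                memset(va, 0, PAGE_SIZE << 2);
                __free_pages(va, 2);            /* note: takes the KVA, not the page */
        }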
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_PAGE_H_ +#define _LINUX_PAGE_H_ + +#include + +#include + +#include +#include + +#define page vm_page + +#define virt_to_page(x) PHYS_TO_VM_PAGE(vtophys((x))) + +#define clear_page(page) memset((page), 0, PAGE_SIZE) +#define pgprot_noncached(prot) VM_MEMATTR_UNCACHED +#define pgprot_writecombine(prot) VM_MEMATTR_WRITE_COMBINING + +#undef PAGE_MASK +#define PAGE_MASK (~(PAGE_SIZE-1)) + +#endif /* _LINUX_PAGE_H_ */ Index: sys/ofed/include/linux/if_vlan.h =================================================================== --- sys/ofed/include/linux/if_vlan.h (.../base) (revision 0) +++ sys/ofed/include/linux/if_vlan.h (.../head) (revision 219811) @@ -0,0 +1,35 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_IF_VLAN_H_ +#define _LINUX_IF_VLAN_H_ + +#include +#include + +#endif /* _LINUX_IF_VLAN_H_ */ Index: sys/ofed/include/linux/inetdevice.h =================================================================== --- sys/ofed/include/linux/inetdevice.h (.../base) (revision 0) +++ sys/ofed/include/linux/inetdevice.h (.../head) (revision 219811) @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_INETDEVICE_H_ +#define _LINUX_INETDEVICE_H_ + +#include + +static inline struct net_device * +ip_dev_find(struct net *net, uint32_t addr) +{ + struct sockaddr_in sin; + struct ifaddr *ifa; + struct ifnet *ifp; + + ifp = NULL; + memset(&sin, 0, sizeof(sin)); + sin.sin_addr.s_addr = addr; + sin.sin_port = 0; + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + ifa = ifa_ifwithaddr((struct sockaddr *)&sin); + if (ifa) { + ifp = ifa->ifa_ifp; + if_ref(ifp); + ifa_free(ifa); + } + return (ifp); +} + +#endif /* _LINUX_INETDEVICE_H_ */ Index: sys/ofed/include/linux/notifier.h =================================================================== --- sys/ofed/include/linux/notifier.h (.../base) (revision 0) +++ sys/ofed/include/linux/notifier.h (.../head) (revision 219811) @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_NOTIFIER_H_ +#define _LINUX_NOTIFIER_H_ + +#include + +/* + * Max number of FreeBSD events to map to Linux events per notify type. 
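The ip_dev_find() shim above takes the IPv4 address in network byte order (it is stored straight into sin_addr) and returns the owning interface with a reference held, so the caller must drop that reference. A sketch, assuming the companion netdevice.h maps dev_put() to if_rele():

        struct net_device *ndev;

        ndev = ip_dev_find(NULL, htonl(0xc0000201));    /* 192.0.2.1 */
        if (ndev != NULL) {
                /* ... use the interface ... */
                dev_put(ndev);          /* releases the if_ref() taken above */
        }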
+ */ +#define NOTIFY_DONE 0 +#define _NOTIFY_COUNT 5 + +struct notifier_block { + int (*notifier_call)(struct notifier_block *, unsigned long, void *); + struct notifier_block *next; + int priority; + eventhandler_tag tags[_NOTIFY_COUNT]; +}; + +/* Values must be less than NOTIFY_COUNT */ +#define NETDEV_UP 0x0001 +#define NETDEV_DOWN 0x0002 +#define NETDEV_REGISTER 0x0003 +#define NETDEV_UNREGISTER 0x0004 + + +#endif /* _LINUX_NOTIFIER_H_ */ Index: sys/ofed/include/linux/kernel.h =================================================================== --- sys/ofed/include/linux/kernel.h (.../base) (revision 0) +++ sys/ofed/include/linux/kernel.h (.../head) (revision 219811) @@ -0,0 +1,88 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_KERNEL_H_ +#define _LINUX_KERNEL_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KERN_EMERG "<0>" +#define KERN_ALERT "<1>" +#define KERN_CRIT "<2>" +#define KERN_ERR "<3>" +#define KERN_WARNING "<4>" +#define KERN_NOTICE "<5>" +#define KERN_INFO "<6>" +#define KERN_DEBUG "<7>" + +#define BUG() panic("BUG") +#define BUG_ON(condition) do { if (condition) BUG(); } while(0) +#define WARN_ON BUG_ON + +#undef ALIGN +#define ALIGN(x, y) roundup2((x), (y)) +#define DIV_ROUND_UP howmany + +#define printk(X...) printf(X) +#define pr_debug(fmt, ...) printk(KERN_DEBUG # fmt, ##__VA_ARGS__) +#define udelay(t) DELAY(t) + +#define container_of(ptr, type, member) \ +({ \ + __typeof(((type *)0)->member) *_p = (ptr); \ + (type *)((char *)_p - offsetof(type, member)); \ +}) + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define simple_strtoul strtoul + +#define min(x, y) (x < y ? x : y) +#define max(x, y) (x > y ? x : y) +#define min_t(type, _x, _y) (type)(_x) < (type)(_y) ? (type)(_x) : (_y) +#define max_t(type, _x, _y) (type)(_x) > (type)(_y) ? 
(type)(_x) : (_y) + +#define num_possible_cpus() mp_ncpus + +#endif /* _LINUX_KERNEL_H_ */ Index: sys/ofed/include/linux/lockdep.h =================================================================== --- sys/ofed/include/linux/lockdep.h (.../base) (revision 0) +++ sys/ofed/include/linux/lockdep.h (.../head) (revision 219811) @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_LOCKDEP_H_ +#define _LINUX_LOCKDEP_H_ + +struct lock_class_key { +}; + +#define lockdep_set_class(lock, key) + +#endif /* _LINUX_LOCKDEP_H_ */ Index: sys/ofed/include/linux/spinlock.h =================================================================== --- sys/ofed/include/linux/spinlock.h (.../base) (revision 0) +++ sys/ofed/include/linux/spinlock.h (.../head) (revision 219811) @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
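Two usage notes on the kernel.h macros above. container_of() works as in Linux, recovering the enclosing structure from a pointer to one of its members. min()/max() and their _t variants, however, substitute their arguments without adding parentheses around each one, so compound expressions should be parenthesized at the call site. A sketch with a hypothetical struct foo and variables lh, a, b, limit, and len:

        struct foo {
                int             value;
                struct list_head entry;
        };

        struct foo *fp;

        fp = container_of(lh, struct foo, entry);       /* lh: embedded list_head */
        len = min((a + b), limit);                      /* parenthesized on purpose */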
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_SPINLOCK_H_ +#define _LINUX_SPINLOCK_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include + +typedef struct { + struct mtx m; +} spinlock_t; + +#define spin_lock(_l) mtx_lock(&(_l)->m) +#define spin_unlock(_l) mtx_unlock(&(_l)->m) +#define spin_trylock(_l) mtx_trylock(&(_l)->m) +#define spin_lock_nested(_l, _n) mtx_lock_flags(&(_l)->m, MTX_DUPOK) +#define spin_lock_irq(lock) spin_lock(lock) +#define spin_unlock_irq(lock) spin_unlock(lock) +#define spin_lock_irqsave(lock, flags) \ + do {(flags) = 0; spin_lock(lock); } while (0) +#define spin_unlock_irqrestore(lock, flags) \ + do { spin_unlock(lock); } while (0) + +static inline void +spin_lock_init(spinlock_t *lock) +{ + + memset(&lock->m, 0, sizeof(lock->m)); + mtx_init(&lock->m, "lnxspin", NULL, MTX_DEF | MTX_NOWITNESS); +} + +#define DEFINE_SPINLOCK(lock) \ + spinlock_t lock; \ + MTX_SYSINIT(lock, &(lock).m, "lnxspin", MTX_DEF) + +#endif /* _LINUX_SPINLOCK_H_ */ Index: sys/ofed/include/linux/kthread.h =================================================================== --- sys/ofed/include/linux/kthread.h (.../base) (revision 0) +++ sys/ofed/include/linux/kthread.h (.../head) (revision 219811) @@ -0,0 +1,104 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
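The spinlock shim above is a regular blocking FreeBSD mutex (MTX_DEF) rather than a spinning one, and the irqsave/irqrestore variants keep no interrupt state: flags is simply zeroed. A minimal sketch with a hypothetical lock:

        static spinlock_t foo_lock;
        unsigned long flags;

        spin_lock_init(&foo_lock);
        spin_lock_irqsave(&foo_lock, flags);    /* flags is just set to 0 */
        /* ... critical section ... */
        spin_unlock_irqrestore(&foo_lock, flags);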
+ */ +#ifndef _LINUX_KTHREAD_H_ +#define _LINUX_KTHREAD_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include + +static inline void +_kthread_fn(void *arg) +{ + struct task_struct *task; + + task = arg; + task_struct_set(curthread, task); + if (task->should_stop == 0) + task->task_ret = task->task_fn(task->task_data); + PROC_LOCK(task->task_thread->td_proc); + task->should_stop = TASK_STOPPED; + wakeup(task); + PROC_UNLOCK(task->task_thread->td_proc); + kthread_exit(); +} + +static inline struct task_struct * +_kthread_create(int (*threadfn)(void *data), void *data) +{ + struct task_struct *task; + + task = kzalloc(sizeof(*task), GFP_KERNEL); + task->task_fn = threadfn; + task->task_data = data; + + return (task); +} + +struct task_struct *kthread_create(int (*threadfn)(void *data), + void *data, + const char namefmt[], ...) + __attribute__((format(printf, 3, 4))); + +#define kthread_run(fn, data, fmt, ...) \ +({ \ + struct task_struct *_task; \ + \ + _task = _kthread_create((fn), (data)); \ + if (kthread_add(_kthread_fn, _task, NULL, &_task->task_thread, \ + 0, 0, fmt, ## __VA_ARGS__)) { \ + kfree(_task); \ + _task = NULL; \ + } else \ + task_struct_set(_task->task_thread, _task); \ + _task; \ +}) + +#define kthread_should_stop() current->should_stop + +static inline int +kthread_stop(struct task_struct *task) +{ + + PROC_LOCK(task->task_thread->td_proc); + task->should_stop = TASK_SHOULD_STOP; + wake_up_process(task); + while (task->should_stop != TASK_STOPPED) + msleep(task, &task->task_thread->td_proc->p_mtx, PWAIT, + "kstop", hz); + PROC_UNLOCK(task->task_thread->td_proc); + return task->task_ret; +} + +#endif /* _LINUX_KTHREAD_H_ */ Index: sys/ofed/include/linux/semaphore.h =================================================================== --- sys/ofed/include/linux/semaphore.h (.../base) (revision 0) +++ sys/ofed/include/linux/semaphore.h (.../head) (revision 219811) @@ -0,0 +1,66 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_SEMAPHORE_H_ +#define _LINUX_SEMAPHORE_H_ + +#include +#include +#include + +/* + * XXX BSD semaphores are disused and slow. 
They also do not provide a + * sema_wait_sig method. This must be resolved eventually. + */ +struct semaphore { + struct sema sema; +}; + +#define down(_sem) sema_wait(&(_sem)->sema) +#define down_interruptible(_sem) sema_wait(&(_sem)->sema), 0 +#define down_trylock(_sem) !sema_trywait(&(_sem)->sema) +#define up(_sem) sema_post(&(_sem)->sema) + +static inline void +linux_sema_init(struct semaphore *sem, int val) +{ + + memset(&sem->sema, 0, sizeof(sem->sema)); + sema_init(&sem->sema, val, "lnxsema"); +} + +static inline void +init_MUTEX(struct semaphore *sem) +{ + + memset(&sem->sema, 0, sizeof(sem->sema)); + sema_init(&sem->sema, 1, "lnxsema"); +} + +#define sema_init linux_sema_init + +#endif /* _LINUX_SEMAPHORE_H_ */ Index: sys/ofed/include/linux/sched.h =================================================================== --- sys/ofed/include/linux/sched.h (.../base) (revision 0) +++ sys/ofed/include/linux/sched.h (.../head) (revision 219811) @@ -0,0 +1,109 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_SCHED_H_ +#define _LINUX_SCHED_H_ + +#include +#include +#include +#include +#include + +#define MAX_SCHEDULE_TIMEOUT LONG_MAX + +#define TASK_RUNNING 0 +#define TASK_INTERRUPTIBLE 1 +#define TASK_UNINTERRUPTIBLE 2 +#define TASK_DEAD 64 +#define TASK_WAKEKILL 128 +#define TASK_WAKING 256 + +#define TASK_SHOULD_STOP 1 +#define TASK_STOPPED 2 + +/* + * A task_struct is only provided for those tasks created with kthread. + * Using these routines with threads not started via kthread will cause + * panics because no task_struct is allocated and td_retval[1] is + * overwritten by syscalls which kernel threads will not make use of. 
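The semaphore mapping is direct, with the caveat the XXX comment flags: because sema_wait() cannot be interrupted by a signal, down_interruptible() unconditionally reports success (the macro is a comma expression ending in 0). A sketch:

        struct semaphore sem;

        sema_init(&sem, 1);     /* remapped to linux_sema_init() */
        down(&sem);             /* sema_wait() */
        /* ... */
        up(&sem);               /* sema_post() */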
+ */ +struct task_struct { + struct thread *task_thread; + int (*task_fn)(void *data); + void *task_data; + int task_ret; + int state; + int should_stop; +}; + +#define current ((struct task_struct *)curthread->td_retval[1]) +#define task_struct_get(x) (struct task_struct *)(x)->td_retval[1] +#define task_struct_set(x, y) (x)->td_retval[1] = (register_t)(y) + +#define set_current_state(x) \ + atomic_store_rel_int((volatile int *)¤t->state, (x)) +#define __set_current_state(x) current->state = (x) + + +#define schedule() \ +do { \ + void *c; \ + \ + if (cold) \ + break; \ + c = curthread; \ + sleepq_lock(c); \ + if (current->state == TASK_INTERRUPTIBLE || \ + current->state == TASK_UNINTERRUPTIBLE) { \ + sleepq_add(c, NULL, "task", SLEEPQ_SLEEP, 0); \ + sleepq_wait(c, 0); \ + } else { \ + sleepq_release(c); \ + sched_relinquish(curthread); \ + } \ +} while (0) + +#define wake_up_process(x) \ +do { \ + int wakeup_swapper; \ + void *c; \ + \ + c = (x)->task_thread; \ + sleepq_lock(c); \ + (x)->state = TASK_RUNNING; \ + wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0); \ + sleepq_release(c); \ + if (wakeup_swapper) \ + kick_proc0(); \ +} while (0) + +#define cond_resched() if (!cold) sched_relinquish(curthread) + +#define sched_yield() sched_relinquish(curthread) + +#endif /* _LINUX_SCHED_H_ */ Index: sys/ofed/include/linux/err.h =================================================================== --- sys/ofed/include/linux/err.h (.../base) (revision 0) +++ sys/ofed/include/linux/err.h (.../head) (revision 219811) @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
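Putting the kthread and sched shims together: a worker created with kthread_run() gets its task_struct stashed in td_retval[1], so current and kthread_should_stop() work inside it, and kthread_stop() performs the stop/wakeup/wait handshake through should_stop. A sketch with a hypothetical worker function and softc sc:

        static int
        foo_thread(void *arg)
        {

                while (!kthread_should_stop())
                        schedule();     /* yields; state stays TASK_RUNNING */
                return (0);
        }

        struct task_struct *task;
        int error;

        task = kthread_run(foo_thread, sc, "foothread");
        /* ... */
        error = kthread_stop(task);     /* returns foo_thread()'s result */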
+ */ + +#ifndef _LINUX_ERR_H_ +#define _LINUX_ERR_H_ + +#define MAX_ERRNO 4095 + +#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO) + +static inline void * +ERR_PTR(long error) +{ + return (void *)error; +} + +static inline long +PTR_ERR(const void *ptr) +{ + return (long)ptr; +} + +static inline long +IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} + +static inline void * +ERR_CAST(void *ptr) +{ + return (void *)ptr; +} + +#endif /* _LINUX_ERR_H_ */ Index: sys/ofed/include/linux/interrupt.h =================================================================== --- sys/ofed/include/linux/interrupt.h (.../base) (revision 0) +++ sys/ofed/include/linux/interrupt.h (.../head) (revision 219811) @@ -0,0 +1,139 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
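The errno-in-pointer helpers above follow the Linux convention: a pointer value in the top MAX_ERRNO bytes of the address space encodes a negative errno. A sketch of both ends of the convention, with a hypothetical struct foo and kmalloc() assumed from the slab shim:

        static void *
        foo_create(void)
        {
                void *p;

                p = kmalloc(sizeof(struct foo), GFP_ATOMIC);    /* may fail */
                if (p == NULL)
                        return (ERR_PTR(-ENOMEM));
                return (p);
        }

        p = foo_create();
        if (IS_ERR(p))
                error = PTR_ERR(p);     /* recovers -ENOMEM */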
+ */ + +#ifndef _LINUX_INTERRUPT_H_ +#define _LINUX_INTERRUPT_H_ + +#include +#include + +#include +#include + +typedef irqreturn_t (*irq_handler_t)(int, void *); + +#define IRQ_RETVAL(x) ((x) != IRQ_NONE) + +#define IRQF_SHARED RF_SHAREABLE + +struct irq_ent { + struct list_head links; + struct device *dev; + struct resource *res; + void *arg; + irqreturn_t (*handler)(int, void *); + void *tag; + int irq; +}; + +static inline int +_irq_rid(struct device *dev, int irq) +{ + if (irq == dev->irq) + return (0); + return irq - dev->msix + 1; +} + +static void +_irq_handler(void *ent) +{ + struct irq_ent *irqe; + + irqe = ent; + irqe->handler(irqe->irq, irqe->arg); +} + +static inline struct irq_ent * +_irq_ent(struct device *dev, int irq) +{ + struct irq_ent *irqe; + + list_for_each_entry(irqe, &dev->irqents, links) + if (irqe->irq == irq) + return (irqe); + + return (NULL); +} + +static inline int +request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, + const char *name, void *arg) +{ + struct resource *res; + struct irq_ent *irqe; + struct device *dev; + int error; + int rid; + + dev = _pci_find_irq_dev(irq); + if (dev == NULL) + return -ENXIO; + rid = _irq_rid(dev, irq); + res = bus_alloc_resource_any(dev->bsddev, SYS_RES_IRQ, &rid, + flags | RF_ACTIVE); + if (res == NULL) + return (-ENXIO); + irqe = kmalloc(sizeof(*irqe), GFP_KERNEL); + irqe->dev = dev; + irqe->res = res; + irqe->arg = arg; + irqe->handler = handler; + irqe->irq = irq; + error = bus_setup_intr(dev->bsddev, res, INTR_TYPE_NET | INTR_MPSAFE, + NULL, _irq_handler, irqe, &irqe->tag); + if (error) { + bus_release_resource(dev->bsddev, SYS_RES_IRQ, rid, irqe->res); + kfree(irqe); + return (-error); + } + list_add(&irqe->links, &dev->irqents); + + return 0; +} + +static inline void +free_irq(unsigned int irq, void *device) +{ + struct irq_ent *irqe; + struct device *dev; + int rid; + + dev = _pci_find_irq_dev(irq); + if (dev == NULL) + return; + rid = _irq_rid(dev, irq); + irqe = _irq_ent(dev, irq); + if (irqe == NULL) + return; + bus_teardown_intr(dev->bsddev, irqe->res, irqe->tag); + bus_release_resource(dev->bsddev, SYS_RES_IRQ, rid, irqe->res); + list_del(&irqe->links); + kfree(irqe); +} + +#endif /* _LINUX_INTERRUPT_H_ */ Index: sys/ofed/include/linux/mlx4/device.h =================================================================== --- sys/ofed/include/linux/mlx4/device.h (.../base) (revision 219811) +++ sys/ofed/include/linux/mlx4/device.h (.../head) (revision 219811) @@ -513,7 +513,7 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) { - if (BITS_PER_LONG == 64 || buf->nbufs == 1) + if (buf->direct.buf != NULL) return buf->direct.buf + offset; else return buf->page_list[offset >> PAGE_SHIFT].buf + Index: sys/ofed/include/linux/io-mapping.h =================================================================== --- sys/ofed/include/linux/io-mapping.h (.../base) (revision 0) +++ sys/ofed/include/linux/io-mapping.h (.../head) (revision 219811) @@ -0,0 +1,77 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. 
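Note that request_irq() above can only attach to interrupts belonging to PCI devices registered through this shim, since it resolves the irq number back to a device with _pci_find_irq_dev() and allocates the matching SYS_RES_IRQ resource. A sketch, assuming the companion irqreturn.h defines IRQ_HANDLED and that pdev and sc are a shim pci_dev and a driver softc:

        static irqreturn_t
        foo_intr(int irq, void *arg)
        {

                /* ... service the device ... */
                return (IRQ_HANDLED);
        }

        error = request_irq(pdev->irq, foo_intr, IRQF_SHARED, "foo", sc);
        /* ... */
        free_irq(pdev->irq, sc);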
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_IO_MAPPING_H_ +#define _LINUX_IO_MAPPING_H_ + +#include +#include + +struct io_mapping; + +static inline struct io_mapping * +io_mapping_create_wc(resource_size_t base, unsigned long size) +{ + + return ioremap_wc(base, size); +} + +static inline void +io_mapping_free(struct io_mapping *mapping) +{ + + iounmap(mapping); +} + +static inline void * +io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) +{ + + return (((char *)mapping) + offset); +} + +static inline void +io_mapping_unmap_atomic(void *vaddr) +{ + +} + +static inline void * +io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset) +{ + + return (((char *) mapping) + offset); +} + +static inline void +io_mapping_unmap(void *vaddr) +{ + +} + +#endif /* _LINUX_IO_MAPPING_H_ */ Index: sys/ofed/include/linux/scatterlist.h =================================================================== --- sys/ofed/include/linux/scatterlist.h (.../base) (revision 0) +++ sys/ofed/include/linux/scatterlist.h (.../head) (revision 219811) @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
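The io_mapping layer above is a thin veneer: create maps the whole region write-combined once, and the map/unmap calls are plain pointer arithmetic with no per-access mapping cost. A sketch against a hypothetical BAR described by bar_pa and bar_len:

        struct io_mapping *map;
        void *db;

        map = io_mapping_create_wc(bar_pa, bar_len);
        if (map != NULL) {
                db = io_mapping_map_wc(map, 0x800);     /* base + offset */
                writel(1, db);
                io_mapping_unmap(db);                   /* no-op */
                io_mapping_free(map);                   /* iounmap() */
        }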
+ */ +#ifndef _LINUX_SCATTERLIST_H_ +#define _LINUX_SCATTERLIST_H_ + +#include +#include + +struct scatterlist { + union { + struct page *page; + struct scatterlist *sg; + } sl_un; + unsigned long address; + unsigned long offset; + uint32_t length; + uint32_t flags; +}; + +#define sg_dma_address(sg) (sg)->address +#define sg_dma_len(sg) (sg)->length +#define sg_page(sg) (sg)->sl_un.page +#define sg_scatternext(sg) (sg)->sl_un.sg + +#define SG_END 0x01 +#define SG_CHAIN 0x02 + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) +{ + sg_page(sg) = page; + sg_dma_len(sg) = len; + sg->offset = offset; + if (offset > PAGE_SIZE) + panic("sg_set_page: Invalid offset %d\n", offset); +} + +static inline void +sg_set_buf(struct scatterlist *sg, const void *buf, unsigned int buflen) +{ + sg_set_page(sg, virt_to_page(buf), buflen, + ((uintptr_t)buf) & ~PAGE_MASK); +} + +static inline void +sg_init_table(struct scatterlist *sg, unsigned int nents) +{ + bzero(sg, sizeof(*sg) * nents); + sg[nents - 1].flags = SG_END; +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->flags & SG_END) + return (NULL); + sg++; + if (sg->flags & SG_CHAIN) + sg = sg_scatternext(sg); + return (sg); +} + +static inline vm_paddr_t +sg_phys(struct scatterlist *sg) +{ + return sg_page(sg)->phys_addr + sg->offset; +} + +#define for_each_sg(sglist, sg, sgmax, _itr) \ + for (_itr = 0, sg = (sglist); _itr < (sgmax); _itr++, sg = sg_next(sg)) + +#endif /* _LINUX_SCATTERLIST_H_ */ Index: sys/ofed/include/linux/mount.h =================================================================== --- sys/ofed/include/linux/mount.h (.../base) (revision 0) +++ sys/ofed/include/linux/mount.h (.../head) (revision 219811) @@ -0,0 +1,33 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
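One subtlety in sg_set_buf() above: it masks the buffer address with ~PAGE_MASK, which yields the within-page offset only because page.h redefined PAGE_MASK to the Linux sense (~(PAGE_SIZE - 1)). A sketch that builds a two-segment list over hypothetical wired kernel buffers and walks it:

        struct scatterlist sg[2], *s;
        int i;

        sg_init_table(sg, 2);                   /* zeroes both, marks sg[1] SG_END */
        sg_set_buf(&sg[0], hdr, hdrlen);
        sg_set_buf(&sg[1], data, datalen);
        for_each_sg(sg, s, 2, i)
                printf("seg %d: paddr %#jx len %u\n", i,
                    (uintmax_t)sg_phys(s), sg_dma_len(s));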
+ */ + +#ifndef _LINUX_MOUNT_H_ +#define _LINUX_MOUNT_H_ + + +#endif /* _LINUX_MOUNT_H_ */ Index: sys/ofed/include/linux/log2.h =================================================================== --- sys/ofed/include/linux/log2.h (.../base) (revision 0) +++ sys/ofed/include/linux/log2.h (.../head) (revision 219811) @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_LOG2_H_ +#define _LINUX_LOG2_H_ + +#include + +#include + +static inline unsigned long +roundup_pow_of_two(unsigned long x) +{ + return (1UL << flsl(x - 1)); +} + +static inline int +is_power_of_2(unsigned long n) +{ + return (n == roundup_pow_of_two(n)); +} + +static inline unsigned long +rounddown_pow_of_two(unsigned long x) +{ + return (1UL << (flsl(x) - 1)); +} + +static inline unsigned long +ilog2(unsigned long x) +{ + return (flsl(x) - 1); +} + +#endif /* _LINUX_LOG2_H_ */ Index: sys/ofed/include/linux/sysfs.h =================================================================== --- sys/ofed/include/linux/sysfs.h (.../base) (revision 0) +++ sys/ofed/include/linux/sysfs.h (.../head) (revision 219811) @@ -0,0 +1,182 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
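The log2 helpers above are flsl()-based, so they expect arguments greater than zero. Spot checks:

        roundup_pow_of_two(100);        /* -> 128 */
        rounddown_pow_of_two(100);      /* -> 64  */
        ilog2(4096);                    /* -> 12  */
        is_power_of_2(96);              /* -> 0   */
        is_power_of_2(64);              /* -> 1   */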
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_SYSFS_H_ +#define _LINUX_SYSFS_H_ + +#include + +struct attribute { + const char *name; + struct module *owner; + mode_t mode; +}; + +struct sysfs_ops { + ssize_t (*show)(struct kobject *, struct attribute *, char *); + ssize_t (*store)(struct kobject *, struct attribute *, const char *, + size_t); +}; + +struct attribute_group { + const char *name; + mode_t (*is_visible)(struct kobject *, + struct attribute *, int); + struct attribute **attrs; +}; + +#define __ATTR(_name, _mode, _show, _store) { \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, .store = _store, \ +} + +#define __ATTR_RO(_name) { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = _name##_show, \ +} + +#define __ATTR_NULL { .attr = { .name = NULL } } + +/* + * Handle our generic '\0' terminated 'C' string. + * Two cases: + * a variable string: point arg1 at it, arg2 is max length. + * a constant string: point arg1 at it, arg2 is zero. + */ + +static inline int +sysctl_handle_attr(SYSCTL_HANDLER_ARGS) +{ + struct kobject *kobj; + struct attribute *attr; + const struct sysfs_ops *ops; + void *buf; + int error; + ssize_t len; + + kobj = arg1; + attr = (struct attribute *)arg2; + buf = (void *)get_zeroed_page(GFP_KERNEL); + len = 1; /* Copy out a NULL byte at least. */ + if (kobj->ktype == NULL || kobj->ktype->sysfs_ops == NULL) + return (ENODEV); + ops = kobj->ktype->sysfs_ops; + if (buf == NULL) + return (ENOMEM); + if (ops->show) { + len = ops->show(kobj, attr, buf); + /* + * It's valid not to have a 'show' so we just return 1 byte + * of NULL. 
+ */ + if (len < 0) { + error = -len; + len = 1; + if (error != EIO) + goto out; + } + } + error = SYSCTL_OUT(req, buf, len); + if (error || !req->newptr || ops->store == NULL) + goto out; + error = SYSCTL_IN(req, buf, PAGE_SIZE); + if (error) + goto out; + len = ops->store(kobj, attr, buf, req->newlen); + if (len < 0) + error = -len; +out: + free_page((unsigned long)buf); + + return (error); +} + +static inline int +sysfs_create_file(struct kobject *kobj, const struct attribute *attr) +{ + + sysctl_add_oid(NULL, SYSCTL_CHILDREN(kobj->oidp), OID_AUTO, + attr->name, CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_MPSAFE, kobj, + (uintptr_t)attr, sysctl_handle_attr, "A", ""); + + return (0); +} + +static inline void +sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) +{ + + if (kobj->oidp) + sysctl_remove_name(kobj->oidp, attr->name, 1, 1); +} + +static inline void +sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) +{ + + if (kobj->oidp) + sysctl_remove_name(kobj->oidp, grp->name, 1, 1); +} + +static inline int +sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) +{ + struct attribute **attr; + struct sysctl_oid *oidp; + + oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(kobj->oidp), + OID_AUTO, grp->name, CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, grp->name); + for (attr = grp->attrs; *attr != NULL; attr++) { + sysctl_add_oid(NULL, SYSCTL_CHILDREN(oidp), OID_AUTO, + (*attr)->name, CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_MPSAFE, + kobj, (uintptr_t)*attr, sysctl_handle_attr, "A", ""); + } + + return (0); +} + +static inline int +sysfs_create_dir(struct kobject *kobj) +{ + + kobj->oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(kobj->parent->oidp), + OID_AUTO, kobj->name, CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, kobj->name); + + return (0); +} + +static inline void +sysfs_remove_dir(struct kobject *kobj) +{ + + if (kobj->oidp == NULL) + return; + sysctl_remove_oid(kobj->oidp, 1, 1); +} + +#endif /* _LINUX_SYSFS_H_ */ Index: sys/ofed/include/linux/completion.h =================================================================== --- sys/ofed/include/linux/completion.h (.../base) (revision 0) +++ sys/ofed/include/linux/completion.h (.../head) (revision 219811) @@ -0,0 +1,155 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
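The sysfs shim above surfaces attributes as string sysctls: sysfs_create_dir() and sysfs_create_group() build sysctl nodes under the kobject's oid, and each attribute becomes a CTLTYPE_STRING handler routed through sysctl_handle_attr(), which dispatches to the ktype's sysfs_ops show/store. A sketch of publishing a group on an already-registered kobject kobj, with hypothetical attributes:

        static struct attribute rate_attr = { .name = "rate", .mode = 0644 };
        static struct attribute state_attr = { .name = "state", .mode = 0444 };
        static struct attribute *foo_attrs[] = {
                &rate_attr,
                &state_attr,
                NULL,                   /* the table must be NULL-terminated */
        };
        static struct attribute_group foo_group = {
                .name = "foo",
                .attrs = foo_attrs,
        };

        error = sysfs_create_group(kobj, &foo_group);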
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_COMPLETION_H_ +#define _LINUX_COMPLETION_H_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +struct completion { + unsigned int done; +}; + +#define INIT_COMPLETION(c) ((c).done = 0) +#define init_completion(c) ((c)->done = 0) + +static inline void +_complete_common(struct completion *c, int all) +{ + int wakeup_swapper; + + sleepq_lock(c); + c->done++; + if (all) + wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0); + else + wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0); + sleepq_release(c); + if (wakeup_swapper) + kick_proc0(); +} + +#define complete(c) _complete_common(c, 0) +#define complete_all(c) _complete_common(c, 1) + +/* + * Indefinite wait for done != 0 with or without signals. + */ +static inline long +_wait_for_common(struct completion *c, int flags) +{ + + flags |= SLEEPQ_SLEEP; + for (;;) { + sleepq_lock(c); + if (c->done) + break; + sleepq_add(c, NULL, "completion", flags, 0); + if (flags & SLEEPQ_INTERRUPTIBLE) { + if (sleepq_wait_sig(c, 0) != 0) + return (-ERESTARTSYS); + } else + sleepq_wait(c, 0); + } + c->done--; + sleepq_release(c); + + return (0); +} + +#define wait_for_completion(c) _wait_for_common(c, 0) +#define wait_for_completion_interruptible(c) \ + _wait_for_common(c, SLEEPQ_INTERRUPTIBLE) + +static inline long +_wait_for_timeout_common(struct completion *c, long timeout, int flags) +{ + long end; + + end = ticks + timeout; + flags |= SLEEPQ_SLEEP; + for (;;) { + sleepq_lock(c); + if (c->done) + break; + sleepq_add(c, NULL, "completion", flags, 0); + sleepq_set_timeout(c, end - ticks); + if (flags & SLEEPQ_INTERRUPTIBLE) { + if (sleepq_timedwait_sig(c, 0) != 0) + return (-ERESTARTSYS); + } else + sleepq_timedwait(c, 0); + } + c->done--; + sleepq_release(c); + timeout = end - ticks; + + return (timeout > 0 ? timeout : 1); +} + +#define wait_for_completion_timeout(c, timeout) \ + _wait_for_timeout_common(c, timeout, 0) +#define wait_for_completion_interruptible_timeout(c, timeout) \ + _wait_for_timeout_common(c, timeout, SLEEPQ_INTERRUPTIBLE) + +static inline int +try_wait_for_completion(struct completion *c) +{ + int isdone; + + isdone = 1; + sleepq_lock(c); + if (c->done) + c->done--; + else + isdone = 0; + sleepq_release(c); + return (isdone); +} + +static inline int +completion_done(struct completion *c) +{ + int isdone; + + isdone = 1; + sleepq_lock(c); + if (c->done == 0) + isdone = 0; + sleepq_release(c); + return (isdone); +} + +#endif /* _LINUX_COMPLETION_H_ */ Index: sys/ofed/include/linux/compiler.h =================================================================== --- sys/ofed/include/linux/compiler.h (.../base) (revision 0) +++ sys/ofed/include/linux/compiler.h (.../head) (revision 219811) @@ -0,0 +1,65 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved.
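A sketch of the usual one-shot rendezvous on the completion shim above: the waiter sleeps on the completion's address until done is nonzero and then consumes one count, so each complete() releases exactly one waiter.

        struct completion cmd_done;

        init_completion(&cmd_done);
        /* hand &cmd_done to the interrupt handler or worker, which calls
         * complete(&cmd_done) when the command finishes */
        wait_for_completion(&cmd_done);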
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_COMPILER_H_ +#define _LINUX_COMPILER_H_ + +#include + +#define __user +#define __kernel +#define __safe +#define __force +#define __nocast +#define __iomem +#define __chk_user_ptr(x) 0 +#define __chk_io_ptr(x) 0 +#define __builtin_warning(x, y...) (1) +#define __acquires(x) +#define __releases(x) +#define __acquire(x) 0 +#define __release(x) 0 +#define __cond_lock(x,c) (c) +#define __bitwise +#define __devinitdata +#define __init +#define __devinit +#define __devexit +#define __exit +#define __stringify(x) #x +#define __attribute_const__ __attribute__((__const__)) +#undef __always_inline +#define __always_inline inline + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#define typeof(x) __typeof(x) + +#define uninitialized_var(x) x = x + +#endif /* _LINUX_COMPILER_H_ */ Index: sys/ofed/include/linux/pci.h =================================================================== --- sys/ofed/include/linux/pci.h (.../base) (revision 0) +++ sys/ofed/include/linux/pci.h (.../head) (revision 219811) @@ -0,0 +1,580 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_PCI_H_ +#define _LINUX_PCI_H_ + +#define CONFIG_PCI_MSI + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +struct pci_device_id { + uint32_t vendor; + uint32_t device; + uint32_t subvendor; + uint32_t subdevice; + uint32_t class_mask; + uintptr_t driver_data; +}; + +#define MODULE_DEVICE_TABLE(bus, table) +#define PCI_ANY_ID (-1) +#define PCI_VENDOR_ID_MELLANOX 0x15b3 +#define PCI_VENDOR_ID_TOPSPIN 0x1867 +#define PCI_DEVICE_ID_MELLANOX_TAVOR 0x5a44 +#define PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE 0x5a46 +#define PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT 0x6278 +#define PCI_DEVICE_ID_MELLANOX_ARBEL 0x6282 +#define PCI_DEVICE_ID_MELLANOX_SINAI_OLD 0x5e8c +#define PCI_DEVICE_ID_MELLANOX_SINAI 0x6274 + + +#define PCI_VDEVICE(vendor, device) \ + PCI_VENDOR_ID_##vendor, (device), PCI_ANY_ID, PCI_ANY_ID, 0, 0 +#define PCI_DEVICE(vendor, device) \ + (vendor), (device), PCI_ANY_ID, PCI_ANY_ID, 0, 0 + +#define to_pci_dev(n) container_of(n, struct pci_dev, dev) + +#define PCI_VENDOR_ID PCIR_DEVVENDOR +#define PCI_COMMAND PCIR_COMMAND +#define PCI_EXP_DEVCTL PCIR_EXPRESS_DEVICE_CTL +#define PCI_EXP_LNKCTL PCIR_EXPRESS_LINK_CTL + +#define IORESOURCE_MEM SYS_RES_MEMORY +#define IORESOURCE_IO SYS_RES_IOPORT +#define IORESOURCE_IRQ SYS_RES_IRQ + +struct pci_dev; + +struct pci_driver { + struct list_head links; + char *name; + struct pci_device_id *id_table; + int (*probe)(struct pci_dev *dev, const struct pci_device_id *id); + void (*remove)(struct pci_dev *dev); + driver_t driver; + devclass_t bsdclass; +}; + +extern struct list_head pci_drivers; +extern struct list_head pci_devices; +extern spinlock_t pci_lock; + +#define __devexit_p(x) x + +struct pci_dev { + struct device dev; + struct list_head links; + struct pci_driver *pdrv; + uint64_t dma_mask; + uint16_t device; + uint16_t vendor; + unsigned int irq; +}; + +static inline struct resource_list_entry * +_pci_get_rle(struct pci_dev *pdev, int type, int rid) +{ + struct pci_devinfo *dinfo; + struct resource_list *rl; + + dinfo = device_get_ivars(pdev->dev.bsddev); + rl = &dinfo->resources; + return resource_list_find(rl, type, rid); +} + +static inline struct resource_list_entry * +_pci_get_bar(struct pci_dev *pdev, int bar) +{ + struct resource_list_entry *rle; + + bar = PCIR_BAR(bar); + if ((rle = _pci_get_rle(pdev, SYS_RES_MEMORY, bar)) == NULL) + rle = _pci_get_rle(pdev, SYS_RES_IOPORT, bar); + return (rle); +} + +static inline struct device * +_pci_find_irq_dev(unsigned int irq) +{ + struct pci_dev *pdev; + struct device *found; + + found = NULL; + spin_lock(&pci_lock); + list_for_each_entry(pdev, &pci_devices, links) { + if (irq == pdev->dev.irq || + (irq >= pdev->dev.msix && irq < pdev->dev.msix_max)) { + found = &pdev->dev; + break; + } + } + spin_unlock(&pci_lock); + return (found); +} + +static inline unsigned long +pci_resource_start(struct pci_dev *pdev, int bar) +{ + struct resource_list_entry *rle; + + if ((rle = 
_pci_get_bar(pdev, bar)) == NULL) + return (0); + return rle->start; +} + +static inline unsigned long +pci_resource_len(struct pci_dev *pdev, int bar) +{ + struct resource_list_entry *rle; + + if ((rle = _pci_get_bar(pdev, bar)) == NULL) + return (0); + return rle->count; +} + +/* + * All drivers just seem to want to inspect the type not flags. + */ +static inline int +pci_resource_flags(struct pci_dev *pdev, int bar) +{ + struct resource_list_entry *rle; + + if ((rle = _pci_get_bar(pdev, bar)) == NULL) + return (0); + return rle->type; +} + +static inline const char * +pci_name(struct pci_dev *d) +{ + + return device_get_desc(d->dev.bsddev); +} + +static inline void * +pci_get_drvdata(struct pci_dev *pdev) +{ + + return dev_get_drvdata(&pdev->dev); +} + +static inline void +pci_set_drvdata(struct pci_dev *pdev, void *data) +{ + + dev_set_drvdata(&pdev->dev, data); +} + +static inline int +pci_enable_device(struct pci_dev *pdev) +{ + + pci_enable_io(pdev->dev.bsddev, SYS_RES_IOPORT); + pci_enable_io(pdev->dev.bsddev, SYS_RES_MEMORY); + return (0); +} + +static inline void +pci_disable_device(struct pci_dev *pdev) +{ +} + +static inline int +pci_set_master(struct pci_dev *pdev) +{ + + pci_enable_busmaster(pdev->dev.bsddev); + return (0); +} + +static inline int +pci_request_region(struct pci_dev *pdev, int bar, const char *res_name) +{ + int rid; + int type; + + type = pci_resource_flags(pdev, bar); + if (type == 0) + return (-ENODEV); + rid = PCIR_BAR(bar); + if (bus_alloc_resource_any(pdev->dev.bsddev, type, &rid, + RF_ACTIVE) == NULL) + return (-EINVAL); + return (0); +} + +static inline void +pci_release_region(struct pci_dev *pdev, int bar) +{ + struct resource_list_entry *rle; + + if ((rle = _pci_get_bar(pdev, bar)) == NULL) + return; + bus_release_resource(pdev->dev.bsddev, rle->type, rle->rid, rle->res); +} + +static inline void +pci_release_regions(struct pci_dev *pdev) +{ + int i; + + for (i = 0; i <= PCIR_MAX_BAR_0; i++) + pci_release_region(pdev, i); +} + +static inline int +pci_request_regions(struct pci_dev *pdev, const char *res_name) +{ + int error; + int i; + + for (i = 0; i <= PCIR_MAX_BAR_0; i++) { + error = pci_request_region(pdev, i, res_name); + if (error && error != -ENODEV) { + pci_release_regions(pdev); + return (error); + } + } + return (0); +} + +static inline void +pci_disable_msix(struct pci_dev *pdev) +{ + + pci_release_msi(pdev->dev.bsddev); +} + +#define PCI_CAP_ID_EXP PCIY_EXPRESS +#define PCI_CAP_ID_PCIX PCIY_PCIX + +static inline int +pci_find_capability(struct pci_dev *pdev, int capid) +{ + int reg; + + if (pci_find_extcap(pdev->dev.bsddev, capid, ®)) + return (0); + return (reg); +} + +static inline int +pci_read_config_byte(struct pci_dev *pdev, int where, u8 *val) +{ + + *val = (u8)pci_read_config(pdev->dev.bsddev, where, 1); + return (0); +} + +static inline int +pci_read_config_word(struct pci_dev *pdev, int where, u16 *val) +{ + + *val = (u16)pci_read_config(pdev->dev.bsddev, where, 2); + return (0); +} + +static inline int +pci_read_config_dword(struct pci_dev *pdev, int where, u32 *val) +{ + + *val = (u32)pci_read_config(pdev->dev.bsddev, where, 4); + return (0); +} + +static inline int +pci_write_config_byte(struct pci_dev *pdev, int where, u8 val) +{ + + pci_write_config(pdev->dev.bsddev, where, val, 1); + return (0); +} + +static inline int +pci_write_config_word(struct pci_dev *pdev, int where, u16 val) +{ + + pci_write_config(pdev->dev.bsddev, where, val, 2); + return (0); +} + +static inline int +pci_write_config_dword(struct pci_dev *pdev, 
int where, u32 val) +{ + + pci_write_config(pdev->dev.bsddev, where, val, 4); + return (0); +} + +static struct pci_driver * +linux_pci_find(device_t dev, struct pci_device_id **idp) +{ + struct pci_device_id *id; + struct pci_driver *pdrv; + uint16_t vendor; + uint16_t device; + + vendor = pci_get_vendor(dev); + device = pci_get_device(dev); + + spin_lock(&pci_lock); + list_for_each_entry(pdrv, &pci_drivers, links) { + for (id = pdrv->id_table; id->vendor != 0; id++) { + if (vendor == id->vendor && device == id->device) { + *idp = id; + spin_unlock(&pci_lock); + return (pdrv); + } + } + } + spin_unlock(&pci_lock); + return (NULL); +} + +static inline int +linux_pci_probe(device_t dev) +{ + struct pci_device_id *id; + struct pci_driver *pdrv; + + if ((pdrv = linux_pci_find(dev, &id)) == NULL) + return (ENXIO); + if (device_get_driver(dev) != &pdrv->driver) + return (ENXIO); + device_set_desc(dev, pdrv->name); + return (0); +} + +static inline int +linux_pci_attach(device_t dev) +{ + struct resource_list_entry *rle; + struct pci_dev *pdev; + struct pci_driver *pdrv; + struct pci_device_id *id; + int error; + + pdrv = linux_pci_find(dev, &id); + pdev = device_get_softc(dev); + pdev->dev.parent = &linux_rootdev; + pdev->dev.bsddev = dev; + INIT_LIST_HEAD(&pdev->dev.irqents); + pdev->device = id->device; + pdev->vendor = id->vendor; + pdev->dev.dma_mask = &pdev->dma_mask; + pdev->pdrv = pdrv; + kobject_init(&pdev->dev.kobj, &dev_ktype); + kobject_set_name(&pdev->dev.kobj, device_get_nameunit(dev)); + kobject_add(&pdev->dev.kobj, &linux_rootdev.kobj, + kobject_name(&pdev->dev.kobj)); + rle = _pci_get_rle(pdev, SYS_RES_IRQ, 0); + if (rle) + pdev->dev.irq = rle->start; + else + pdev->dev.irq = 0; + pdev->irq = pdev->dev.irq; + mtx_unlock(&Giant); + spin_lock(&pci_lock); + list_add(&pdev->links, &pci_devices); + spin_unlock(&pci_lock); + error = pdrv->probe(pdev, id); + mtx_lock(&Giant); + if (error) { + spin_lock(&pci_lock); + list_del(&pdev->links); + spin_unlock(&pci_lock); + put_device(&pdev->dev); + return (-error); + } + return (0); +} + +static inline int +linux_pci_detach(device_t dev) +{ + struct pci_dev *pdev; + + pdev = device_get_softc(dev); + mtx_unlock(&Giant); + pdev->pdrv->remove(pdev); + mtx_lock(&Giant); + spin_lock(&pci_lock); + list_del(&pdev->links); + spin_unlock(&pci_lock); + put_device(&pdev->dev); + + return (0); +} + +static device_method_t pci_methods[] = { + DEVMETHOD(device_probe, linux_pci_probe), + DEVMETHOD(device_attach, linux_pci_attach), + DEVMETHOD(device_detach, linux_pci_detach), + {0, 0} +}; + +static inline int +pci_register_driver(struct pci_driver *pdrv) +{ + devclass_t bus; + int error; + + spin_lock(&pci_lock); + list_add(&pdrv->links, &pci_drivers); + spin_unlock(&pci_lock); + bus = devclass_find("pci"); + pdrv->driver.name = pdrv->name; + pdrv->driver.methods = pci_methods; + pdrv->driver.size = sizeof(struct pci_dev); + mtx_lock(&Giant); + error = devclass_add_driver(bus, &pdrv->driver, BUS_PASS_DEFAULT, + &pdrv->bsdclass); + mtx_unlock(&Giant); + if (error) + return (-error); + return (0); +} + +static inline void +pci_unregister_driver(struct pci_driver *pdrv) +{ + devclass_t bus; + + list_del(&pdrv->links); + bus = devclass_find("pci"); + mtx_lock(&Giant); + devclass_delete_driver(bus, &pdrv->driver); + mtx_unlock(&Giant); +} + +struct msix_entry { + int entry; + int vector; +}; + +/* + * Enable msix, positive errors indicate actual number of available + * vectors. Negative errors are failures. 
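+ * + * A minimal caller sketch (illustrative only; the pdev pointer and the + * request for four vectors are hypothetical, not part of this header): + * + * struct msix_entry entries[4]; + * int i, error; + * + * for (i = 0; i < 4; i++) + * entries[i].entry = i; + * error = pci_enable_msix(pdev, entries, 4); + * if (error > 0) + * error = pci_enable_msix(pdev, entries, error); + * if (error < 0) + * return (error);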
+ */ +static inline int +pci_enable_msix(struct pci_dev *pdev, struct msix_entry *entries, int nreq) +{ + struct resource_list_entry *rle; + int error; + int avail; + int i; + + avail = pci_msix_count(pdev->dev.bsddev); + if (avail < nreq) { + if (avail == 0) + return -EINVAL; + return avail; + } + avail = nreq; + if ((error = -pci_alloc_msix(pdev->dev.bsddev, &avail)) != 0) + return error; + rle = _pci_get_rle(pdev, SYS_RES_IRQ, 1); + pdev->dev.msix = rle->start; + pdev->dev.msix_max = rle->start + avail; + for (i = 0; i < nreq; i++) + entries[i].vector = pdev->dev.msix + i; + return (0); +} + +/* XXX This should not be necessary. */ +#define pcix_set_mmrbc(d, v) 0 +#define pcix_get_max_mmrbc(d) 0 +#define pcie_set_readrq(d, v) 0 + +#define PCI_DMA_BIDIRECTIONAL 0 +#define PCI_DMA_TODEVICE 1 +#define PCI_DMA_FROMDEVICE 2 +#define PCI_DMA_NONE 3 + +#define pci_pool dma_pool +#define pci_pool_destroy dma_pool_destroy +#define pci_pool_alloc dma_pool_alloc +#define pci_pool_free dma_pool_free +#define pci_pool_create(_name, _pdev, _size, _align, _alloc) \ + dma_pool_create(_name, &(_pdev)->dev, _size, _align, _alloc) +#define pci_free_consistent(_hwdev, _size, _vaddr, _dma_handle) \ + dma_free_coherent((_hwdev) == NULL ? NULL : &(_hwdev)->dev, \ + _size, _vaddr, _dma_handle) +#define pci_map_sg(_hwdev, _sg, _nents, _dir) \ + dma_map_sg((_hwdev) == NULL ? NULL : &(_hwdev->dev), \ + _sg, _nents, (enum dma_data_direction)_dir) +#define pci_map_single(_hwdev, _ptr, _size, _dir) \ + dma_map_single((_hwdev) == NULL ? NULL : &(_hwdev->dev), \ + (_ptr), (_size), (enum dma_data_direction)_dir) +#define pci_unmap_single(_hwdev, _addr, _size, _dir) \ + dma_unmap_single((_hwdev) == NULL ? NULL : &(_hwdev)->dev, \ + _addr, _size, (enum dma_data_direction)_dir) +#define pci_unmap_sg(_hwdev, _sg, _nents, _dir) \ + dma_unmap_sg((_hwdev) == NULL ? NULL : &(_hwdev)->dev, \ + _sg, _nents, (enum dma_data_direction)_dir) +#define pci_map_page(_hwdev, _page, _offset, _size, _dir) \ + dma_map_page((_hwdev) == NULL ? NULL : &(_hwdev)->dev, _page,\ + _offset, _size, (enum dma_data_direction)_dir) +#define pci_unmap_page(_hwdev, _dma_address, _size, _dir) \ + dma_unmap_page((_hwdev) == NULL ? NULL : &(_hwdev)->dev, \ + _dma_address, _size, (enum dma_data_direction)_dir) +#define pci_set_dma_mask(_pdev, mask) dma_set_mask(&(_pdev)->dev, (mask)) +#define pci_dma_mapping_error(_pdev, _dma_addr) \ + dma_mapping_error(&(_pdev)->dev, _dma_addr) +#define pci_set_consistent_dma_mask(_pdev, _mask) \ + dma_set_coherent_mask(&(_pdev)->dev, (_mask)) +#define DECLARE_PCI_UNMAP_ADDR(x) DEFINE_DMA_UNMAP_ADDR(x); +#define DECLARE_PCI_UNMAP_LEN(x) DEFINE_DMA_UNMAP_LEN(x); +#define pci_unmap_addr dma_unmap_addr +#define pci_unmap_addr_set dma_unmap_addr_set +#define pci_unmap_len dma_unmap_len +#define pci_unmap_len_set dma_unmap_len_set + + +#endif /* _LINUX_PCI_H_ */ Index: sys/ofed/include/linux/ethtool.h =================================================================== --- sys/ofed/include/linux/ethtool.h (.../base) (revision 0) +++ sys/ofed/include/linux/ethtool.h (.../head) (revision 219811) @@ -0,0 +1,31 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_ETHTOOL_H_ +#define _LINUX_ETHTOOL_H_ + +#endif /* _LINUX_ETHTOOL_H_ */ Index: sys/ofed/include/linux/jiffies.h =================================================================== --- sys/ofed/include/linux/jiffies.h (.../base) (revision 0) +++ sys/ofed/include/linux/jiffies.h (.../head) (revision 219811) @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _LINUX_JIFFIES_H_ +#define _LINUX_JIFFIES_H_ + +#include +#include + +#include +#include + +static inline int +msecs_to_jiffies(int msec) +{ + struct timeval tv; + + tv.tv_sec = msec / 1000; + tv.tv_usec = (msec % 1000) * 1000; + return (tvtohz(&tv)); +} + +#define jiffies ticks + +#define time_after(a, b) ((long)(b) - (long)(a) < 0) +#define time_before(a, b) time_after(b,a) +#define time_after_eq(a, b) ((long)(a) - (long)(b) >= 0) +#define time_before_eq(a, b) time_after_eq(b, a) + +#define HZ hz + +#endif /* _LINUX_JIFFIES_H_ */ Index: sys/ofed/include/linux/file.h =================================================================== --- sys/ofed/include/linux/file.h (.../base) (revision 0) +++ sys/ofed/include/linux/file.h (.../head) (revision 219811) @@ -0,0 +1,120 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _LINUX_FILE_H_ +#define _LINUX_FILE_H_ + +#include +#include +#include +#include +#include + +#include + +struct linux_file; + +#undef file + +extern struct fileops linuxfileops; + +static inline struct linux_file * +linux_fget(unsigned int fd) +{ + struct file *file; + + file = fget_unlocked(curthread->td_proc->p_fd, fd); + if (file == NULL) + return (NULL); + return (struct linux_file *)file->f_data; +} + +static inline void +fput(struct linux_file *filp) +{ + if (filp->_file == NULL) { + kfree(filp); + return; + } + if (refcount_release(&filp->_file->f_count)) { + _fdrop(filp->_file, curthread); + kfree(filp); + } +} + +static inline void +put_unused_fd(unsigned int fd) +{ + struct file *file; + + file = fget_unlocked(curthread->td_proc->p_fd, fd); + if (file == NULL) + return; + fdclose(curthread->td_proc->p_fd, file, fd, curthread); +} + +static inline void +fd_install(unsigned int fd, struct linux_file *filp) +{ + struct file *file; + + file = fget_unlocked(curthread->td_proc->p_fd, fd); + filp->_file = file; + finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); +} + +static inline int +get_unused_fd(void) +{ + struct file *file; + int error; + int fd; + + error = falloc(curthread, &file, &fd); + if (error) + return -error; + return fd; +} + +static inline struct linux_file * +_alloc_file(int mode, const struct file_operations *fops) +{ + struct linux_file *filp; + + filp = kzalloc(sizeof(*filp), GFP_KERNEL); + if (filp == NULL) + return (NULL); + filp->f_op = fops; + filp->f_mode = mode; + + return filp; +} + +#define alloc_file(mnt, root, mode, fops) _alloc_file((mode), (fops)) + +#define file linux_file +#define fget linux_fget + +#endif /* _LINUX_FILE_H_ */ Index: sys/ofed/include/linux/random.h =================================================================== --- sys/ofed/include/linux/random.h (.../base) (revision 0) +++ sys/ofed/include/linux/random.h (.../head) (revision 219811) @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#ifndef _LINUX_RANDOM_H_ +#define _LINUX_RANDOM_H_ + +#include + +static inline void +get_random_bytes(void *buf, int nbytes) +{ + read_random(buf, nbytes); +} + +#endif /* _LINUX_RANDOM_H_ */ Index: sys/ofed/include/linux/timer.h =================================================================== --- sys/ofed/include/linux/timer.h (.../base) (revision 0) +++ sys/ofed/include/linux/timer.h (.../head) (revision 219811) @@ -0,0 +1,87 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_TIMER_H_ +#define _LINUX_TIMER_H_ + +#include + +#include +#include +#include + +struct timer_list { + struct callout timer_callout; + void (*function)(unsigned long); + unsigned long data; +}; + +#define expires timer_callout.c_time + +static inline void +_timer_fn(void *context) +{ + struct timer_list *timer; + + timer = context; + timer->function(timer->data); +} + +#define setup_timer(timer, func, dat) \ +do { \ + (timer)->function = (func); \ + (timer)->data = (dat); \ + callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE); \ +} while (0) + +#define init_timer(timer) \ +do { \ + (timer)->function = NULL; \ + (timer)->data = 0; \ + callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE); \ +} while (0) + +#define mod_timer(timer, expire) \ + callout_reset(&(timer)->timer_callout, (expire) - jiffies, \ + _timer_fn, (timer)) + +#define add_timer(timer) \ + callout_reset(&(timer)->timer_callout, \ + (timer)->timer_callout.c_time - jiffies, _timer_fn, (timer)) + +#define del_timer(timer) callout_stop(&(timer)->timer_callout) +#define del_timer_sync(timer) callout_drain(&(timer)->timer_callout) + +#define timer_pending(timer) callout_pending(&(timer)->timer_callout) + +static inline unsigned long +round_jiffies(unsigned long j) +{ + return roundup(j, hz); +} + +#endif /* _LINUX_TIMER_H_ */ Index: sys/ofed/include/linux/ctype.h =================================================================== --- sys/ofed/include/linux/ctype.h (.../base) (revision 0) +++ sys/ofed/include/linux/ctype.h (.../head) (revision 219811) @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. 
+ * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_CTYPE_H_ +#define _LINUX_CTYPE_H_ + +#include + +#endif /* _LINUX_CTYPE_H_ */ Index: sys/ofed/include/linux/errno.h =================================================================== --- sys/ofed/include/linux/errno.h (.../base) (revision 0) +++ sys/ofed/include/linux/errno.h (.../head) (revision 219811) @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _LINUX_ERRNO_H_ +#define _LINUX_ERRNO_H_ + +#include + +#define ECOMM ESTALE +#define ENODATA ECONNREFUSED +#define ENOIOCTLCMD ENOIOCTL /* XXX this is negative */ +#define ERESTARTSYS ERESTART /* XXX this is negative */ + +#endif /* _LINUX_ERRNO_H_ */ Index: sys/ofed/include/linux/if_ether.h =================================================================== --- sys/ofed/include/linux/if_ether.h (.../base) (revision 0) +++ sys/ofed/include/linux/if_ether.h (.../head) (revision 219811) @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_IF_ETHER_H_ +#define _LINUX_IF_ETHER_H_ + +#include + +#include + +#define ETH_P_8021Q ETHERTYPE_VLAN + +#endif /* _LINUX_IF_ETHER_H_ */ Index: sys/ofed/include/linux/net.h =================================================================== --- sys/ofed/include/linux/net.h (.../base) (revision 0) +++ sys/ofed/include/linux/net.h (.../head) (revision 219811) @@ -0,0 +1,73 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_NET_H_ +#define _LINUX_NET_H_ + +#include +#include +#include + +static inline int +sock_create_kern(int family, int type, int proto, struct socket **res) +{ + return -socreate(family, res, type, proto, curthread->td_ucred, + curthread); +} + +static inline int +sock_getname(struct socket *so, struct sockaddr *addr, int *sockaddr_len, + int peer) +{ + struct sockaddr *nam; + int error; + + nam = NULL; + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) + return (-ENOTCONN); + + if (peer) + error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &nam); + else + error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &nam); + if (error) + return (-error); + *addr = *nam; + *sockaddr_len = addr->sa_len; + + free(nam, M_SONAME); + return (0); +} + +static inline void +sock_release(struct socket *so) +{ + soclose(so); +} + +#endif /* _LINUX_NET_H_ */ Index: sys/ofed/include/linux/workqueue.h =================================================================== --- sys/ofed/include/linux/workqueue.h (.../base) (revision 0) +++ sys/ofed/include/linux/workqueue.h (.../head) (revision 219811) @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ +#ifndef _LINUX_WORKQUEUE_H_ +#define _LINUX_WORKQUEUE_H_ + +#include +#include +#include +#include + +#include + +struct workqueue_struct { + struct taskqueue *taskqueue; +}; + +struct work_struct { + struct task work_task; + struct taskqueue *taskqueue; + void (*fn)(struct work_struct *); +}; + +struct delayed_work { + struct work_struct work; + struct callout timer; +}; + +static inline struct delayed_work * +to_delayed_work(struct work_struct *work) +{ + + return container_of(work, struct delayed_work, work); +} + + +static inline void +_work_fn(void *context, int pending) +{ + struct work_struct *work; + + work = context; + work->fn(work); +} + +#define INIT_WORK(work, func) \ +do { \ + (work)->fn = (func); \ + (work)->taskqueue = NULL; \ + TASK_INIT(&(work)->work_task, 0, _work_fn, (work)); \ +} while (0) + +#define INIT_DELAYED_WORK(_work, func) \ +do { \ + INIT_WORK(&(_work)->work, func); \ + callout_init(&(_work)->timer, CALLOUT_MPSAFE); \ +} while (0) + +#define INIT_DELAYED_WORK_DEFERRABLE INIT_DELAYED_WORK + +#define schedule_work(work) \ +do { \ + (work)->taskqueue = taskqueue_thread; \ + taskqueue_enqueue(taskqueue_thread, &(work)->work_task); \ +} while (0) + +#define flush_scheduled_work() flush_taskqueue(taskqueue_thread) + +#define queue_work(q, work) \ +do { \ + (work)->taskqueue = (q)->taskqueue; \ + taskqueue_enqueue((q)->taskqueue, &(work)->work_task); \ +} while (0) + +static inline void +_delayed_work_fn(void *arg) +{ + struct delayed_work *work; + + work = arg; + taskqueue_enqueue(work->work.taskqueue, &work->work.work_task); +} + +static inline int +queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *work, + unsigned long delay) +{ + int pending; + + pending = work->work.work_task.ta_pending; + work->work.taskqueue = wq->taskqueue; + if (delay != 0) + callout_reset(&work->timer, delay, _delayed_work_fn, work); + else + _delayed_work_fn((void *)work); + + return (!pending); +} + +static inline struct workqueue_struct * +_create_workqueue_common(char *name, int cpus) +{ + struct workqueue_struct *wq; + + wq = kmalloc(sizeof(*wq), M_WAITOK); + wq->taskqueue = taskqueue_create((name), M_WAITOK, + taskqueue_thread_enqueue, &wq->taskqueue); + taskqueue_start_threads(&wq->taskqueue, cpus, PWAIT, (name)); + + return (wq); +} + + +#define create_singlethread_workqueue(name) \ + _create_workqueue_common(name, 1) + +#define create_workqueue(name) \ + _create_workqueue_common(name, MAXCPU) + +static inline void +destroy_workqueue(struct workqueue_struct *wq) +{ + taskqueue_free(wq->taskqueue); + kfree(wq); +} + +#define flush_workqueue(wq) flush_taskqueue((wq)->taskqueue) + +static inline void +_flush_fn(void *context, int pending) +{ +} + +static inline void +flush_taskqueue(struct taskqueue *tq) +{ + struct task flushtask; + + TASK_INIT(&flushtask, 0, _flush_fn, NULL); + taskqueue_enqueue(tq, &flushtask); + taskqueue_drain(tq, &flushtask); +} + +static inline int +cancel_work_sync(struct work_struct *work) +{ + if (work->taskqueue && + taskqueue_cancel(work->taskqueue, &work->work_task, NULL)) + taskqueue_drain(work->taskqueue, &work->work_task); + return 0; +} + +/* + * This may leave work running on another CPU as it does on Linux. 
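+ * Callers that must not observe the handler still running afterwards + * can follow up with cancel_work_sync() on the embedded work, e.g. + * (sketch only; dwork is a hypothetical struct delayed_work pointer): + * + * cancel_delayed_work(dwork); + * cancel_work_sync(&dwork->work);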
+ */ +static inline int +cancel_delayed_work(struct delayed_work *work) +{ + + callout_stop(&work->timer); + if (work->work.taskqueue && + taskqueue_cancel(work->work.taskqueue, &work->work.work_task, NULL)) + taskqueue_drain(work->work.taskqueue, &work->work.work_task); + return 0; +} + +#endif /* _LINUX_WORKQUEUE_H_ */ Index: sys/ofed/include/linux/linux_idr.c =================================================================== --- sys/ofed/include/linux/linux_idr.c (.../base) (revision 0) +++ sys/ofed/include/linux/linux_idr.c (.../head) (revision 219811) @@ -0,0 +1,447 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +/* + * IDR Implementation. + * + * This is quick and dirty and not as re-entrant as the linux version + * however it should be fairly fast. It is basically a radix tree with + * a builtin bitmap for allocation. + */ +MALLOC_DEFINE(M_IDR, "idr", "Linux IDR compat"); + +static inline int +idr_max(struct idr *idr) +{ + return (1 << (idr->layers * IDR_BITS)) - 1; +} + +static inline int +idr_pos(int id, int layer) +{ + return (id >> (IDR_BITS * layer)) & IDR_MASK; +} + +void +idr_init(struct idr *idr) +{ + bzero(idr, sizeof(*idr)); + mtx_init(&idr->lock, "idr", NULL, MTX_DEF); +} + +/* Only frees cached pages. 
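+ * Callers are expected to remove all entries first; a typical teardown + * sequence would be (sketch): + * + * idr_remove_all(idr); + * idr_destroy(idr);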
*/ +void +idr_destroy(struct idr *idr) +{ + struct idr_layer *il, *iln; + + mtx_lock(&idr->lock); + for (il = idr->free; il != NULL; il = iln) { + iln = il->ary[0]; + free(il, M_IDR); + } + mtx_unlock(&idr->lock); +} + +static void +idr_remove_layer(struct idr_layer *il, int layer) +{ + int i; + + if (il == NULL) + return; + if (layer == 0) { + free(il, M_IDR); + return; + } + for (i = 0; i < IDR_SIZE; i++) + if (il->ary[i]) + idr_remove_layer(il->ary[i], layer - 1); + free(il, M_IDR); +} + +void +idr_remove_all(struct idr *idr) +{ + + mtx_lock(&idr->lock); + idr_remove_layer(idr->top, idr->layers - 1); + idr->top = NULL; + idr->layers = 0; + mtx_unlock(&idr->lock); +} + +void +idr_remove(struct idr *idr, int id) +{ + struct idr_layer *il; + int layer; + int idx; + + id &= MAX_ID_MASK; + mtx_lock(&idr->lock); + il = idr->top; + layer = idr->layers - 1; + if (il == NULL || id > idr_max(idr)) { + mtx_unlock(&idr->lock); + return; + } + /* + * Walk down the tree to this item setting bitmaps along the way + * as we know at least one item will be free along this path. + */ + while (layer && il) { + idx = idr_pos(id, layer); + il->bitmap |= 1 << idx; + il = il->ary[idx]; + layer--; + } + idx = id & IDR_MASK; + /* + * At this point we've set free space bitmaps up the whole tree. + * We could make this non-fatal and unwind but linux dumps a stack + * and a warning so I don't think it's necessary. + */ + if (il == NULL || (il->bitmap & (1 << idx)) != 0) + panic("idr_remove: Item %d not allocated (%p, %p)\n", + id, idr, il); + il->ary[idx] = NULL; + il->bitmap |= 1 << idx; + mtx_unlock(&idr->lock); + return; +} + +void * +idr_replace(struct idr *idr, void *ptr, int id) +{ + struct idr_layer *il; + void *res; + int layer; + int idx; + + res = ERR_PTR(-EINVAL); + id &= MAX_ID_MASK; + mtx_lock(&idr->lock); + il = idr->top; + layer = idr->layers - 1; + if (il == NULL || id > idr_max(idr)) + goto out; + while (layer && il) { + il = il->ary[idr_pos(id, layer)]; + layer--; + } + idx = id & IDR_MASK; + /* + * Replace still returns an error if the item was not allocated.
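+ * On success the previous pointer is returned, so a caller can + * distinguish failure with IS_ERR() from the err.h compat code (sketch + * only; nptr is a hypothetical replacement pointer): + * + * old = idr_replace(idr, nptr, id); + * if (IS_ERR(old)) + * return (PTR_ERR(old));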
+ */ + if (il != NULL && (il->bitmap & (1 << idx)) == 0) { + res = il->ary[idx]; + il->ary[idx] = ptr; + } +out: + mtx_unlock(&idr->lock); + return (res); +} + +void * +idr_find(struct idr *idr, int id) +{ + struct idr_layer *il; + void *res; + int layer; + + res = NULL; + id &= MAX_ID_MASK; + mtx_lock(&idr->lock); + il = idr->top; + layer = idr->layers - 1; + if (il == NULL || id > idr_max(idr)) + goto out; + while (layer && il) { + il = il->ary[idr_pos(id, layer)]; + layer--; + } + if (il != NULL) + res = il->ary[id & IDR_MASK]; +out: + mtx_unlock(&idr->lock); + return (res); +} + +int +idr_pre_get(struct idr *idr, gfp_t gfp_mask) +{ + struct idr_layer *il, *iln; + struct idr_layer *head; + int need; + + mtx_lock(&idr->lock); + for (;;) { + need = idr->layers + 1; + for (il = idr->free; il != NULL; il = il->ary[0]) + need--; + mtx_unlock(&idr->lock); + if (need <= 0) + break; + for (head = NULL; need; need--) { + iln = malloc(sizeof(*il), M_IDR, M_ZERO | gfp_mask); + if (iln == NULL) + break; + bitmap_fill(&iln->bitmap, IDR_SIZE); + if (head != NULL) { + il->ary[0] = iln; + il = iln; + } else + head = il = iln; + } + if (head == NULL) + return (0); + mtx_lock(&idr->lock); + il->ary[0] = idr->free; + idr->free = head; + } + return (1); +} + +static inline struct idr_layer * +idr_get(struct idr *idr) +{ + struct idr_layer *il; + + il = idr->free; + if (il) { + idr->free = il->ary[0]; + il->ary[0] = NULL; + return (il); + } + il = malloc(sizeof(*il), M_IDR, M_ZERO | M_NOWAIT); + if (il != NULL) + bitmap_fill(&il->bitmap, IDR_SIZE); + return (il); +} + +/* + * Could be implemented as get_new_above(idr, ptr, 0, idp) but written + * first for simplicity sake. + */ +int +idr_get_new(struct idr *idr, void *ptr, int *idp) +{ + struct idr_layer *stack[MAX_LEVEL]; + struct idr_layer *il; + int error; + int layer; + int idx; + int id; + + error = -EAGAIN; + mtx_lock(&idr->lock); + /* + * Expand the tree until there is free space. + */ + if (idr->top == NULL || idr->top->bitmap == 0) { + if (idr->layers == MAX_LEVEL + 1) { + error = -ENOSPC; + goto out; + } + il = idr_get(idr); + if (il == NULL) + goto out; + il->ary[0] = idr->top; + if (idr->top) + il->bitmap &= ~1; + idr->top = il; + idr->layers++; + } + il = idr->top; + id = 0; + /* + * Walk the tree following free bitmaps, record our path. + */ + for (layer = idr->layers - 1;; layer--) { + stack[layer] = il; + idx = ffsl(il->bitmap); + if (idx == 0) + panic("idr_get_new: Invalid leaf state (%p, %p)\n", + idr, il); + idx--; + id |= idx << (layer * IDR_BITS); + if (layer == 0) + break; + if (il->ary[idx] == NULL) { + il->ary[idx] = idr_get(idr); + if (il->ary[idx] == NULL) + goto out; + } + il = il->ary[idx]; + } + /* + * Allocate the leaf to the consumer. + */ + il->bitmap &= ~(1 << idx); + il->ary[idx] = ptr; + *idp = id; + /* + * Clear bitmaps potentially up to the root. + */ + while (il->bitmap == 0 && ++layer < idr->layers) { + il = stack[layer]; + il->bitmap &= ~(1 << idr_pos(id, layer)); + } + error = 0; +out: + mtx_unlock(&idr->lock); +#ifdef INVARIANTS + if (error == 0 && idr_find(idr, id) != ptr) { + panic("idr_get_new: Failed for idr %p, id %d, ptr %p\n", + idr, id, ptr); + } +#endif + return (error); +} + +int +idr_get_new_above(struct idr *idr, void *ptr, int starting_id, int *idp) +{ + struct idr_layer *stack[MAX_LEVEL]; + struct idr_layer *il; + int error; + int layer; + int idx, sidx; + int id; + + error = -EAGAIN; + mtx_lock(&idr->lock); + /* + * Compute the layers required to support starting_id and the mask + * at the top layer.
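+ * For example, with an illustrative IDR_BITS of 5 (IDR_MASK 0x1f), a + * starting_id of 70 shifts once before the high bits clear, so the + * loop below exits with layer == 1 and two layers are required.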
+ */ +restart: + idx = starting_id; + layer = 0; + while (idx & ~IDR_MASK) { + layer++; + idx >>= IDR_BITS; + } + if (layer == MAX_LEVEL + 1) { + error = -ENOSPC; + goto out; + } + /* + * Expand the tree until there is free space at or beyond starting_id. + */ + while (idr->layers <= layer || + idr->top->bitmap < (1 << idr_pos(starting_id, idr->layers - 1))) { + if (idr->layers == MAX_LEVEL + 1) { + error = -ENOSPC; + goto out; + } + il = idr_get(idr); + if (il == NULL) + goto out; + il->ary[0] = idr->top; + if (idr->top && idr->top->bitmap == 0) + il->bitmap &= ~1; + idr->top = il; + idr->layers++; + } + il = idr->top; + id = 0; + /* + * Walk the tree following free bitmaps, record our path. + */ + for (layer = idr->layers - 1;; layer--) { + stack[layer] = il; + sidx = idr_pos(starting_id, layer); + /* Returns index numbered from 0 or size if none exists. */ + idx = find_next_bit(&il->bitmap, IDR_SIZE, sidx); + if (idx == IDR_SIZE && sidx == 0) + panic("idr_get_new: Invalid leaf state (%p, %p)\n", + idr, il); + /* + * We may have walked a path where there was a free bit but + * it was lower than what we wanted. Restart the search with + * a larger starting id. id contains the progress we made so + * far. Search the leaf one above this level. This may + * restart as many as MAX_LEVEL times but that is expected + * to be rare. + */ + if (idx == IDR_SIZE) { + starting_id = id + (1 << ((layer + 1) * IDR_BITS)); + goto restart; + } + if (idx > sidx) + starting_id = 0; /* Search the whole subtree. */ + id |= idx << (layer * IDR_BITS); + if (layer == 0) + break; + if (il->ary[idx] == NULL) { + il->ary[idx] = idr_get(idr); + if (il->ary[idx] == NULL) + goto out; + } + il = il->ary[idx]; + } + /* + * Allocate the leaf to the consumer. + */ + il->bitmap &= ~(1 << idx); + il->ary[idx] = ptr; + *idp = id; + /* + * Clear bitmaps potentially up to the root. + */ + while (il->bitmap == 0 && ++layer < idr->layers) { + il = stack[layer]; + il->bitmap &= ~(1 << idr_pos(id, layer)); + } + error = 0; +out: + mtx_unlock(&idr->lock); +#ifdef INVARIANTS + if (error == 0 && idr_find(idr, id) != ptr) { + panic("idr_get_new_above: Failed for idr %p, id %d, ptr %p\n", + idr, id, ptr); + } +#endif + return (error); +} Index: sys/ofed/include/linux/socket.h =================================================================== --- sys/ofed/include/linux/socket.h (.../base) (revision 0) +++ sys/ofed/include/linux/socket.h (.../head) (revision 219811) @@ -0,0 +1,66 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LINUX_SOCKET_H_ +#define _LINUX_SOCKET_H_ + +#include + +#ifdef notyet +static inline int +memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len) +{ + struct uio uio; + int error; + + uio.uio_iov = v; + uio.uio_iovcnt = -1; + uio.uio_offset = 0; + uio.uio_resid = len; + uio.uio_segflg = UIO_USERSPACE; + uio.uio_rw = UIO_READ; + error = -uiomove(kdata, len, &uio); + return (error); +} + +static inline int +memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) +{ + struct uio uio; + int error; + + uio.uio_iov = iov; + uio.uio_iovcnt = -1; + uio.uio_offset = 0; + uio.uio_resid = len; + uio.uio_segflg = UIO_USERSPACE; + uio.uio_rw = UIO_WRITE; + error = -uiomove(kdata, len, &uio); + return (error); +} +#endif + +#endif /* _LINUX_SOCKET_H_ */ Index: sys/ofed/include/asm/atomic-long.h =================================================================== --- sys/ofed/include/asm/atomic-long.h (.../base) (revision 0) +++ sys/ofed/include/asm/atomic-long.h (.../head) (revision 219811) @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ */ +#ifndef _ATOMIC_LONG_H_ +#define _ATOMIC_LONG_H_ + +#include +#include +#include + +typedef struct { + volatile u_long counter; +} atomic_long_t; + +#define atomic_long_add(i, v) atomic_long_add_return((i), (v)) +#define atomic_long_inc_return(v) atomic_long_add_return(1, (v)) + +static inline long +atomic_long_add_return(long i, atomic_long_t *v) +{ + return i + atomic_fetchadd_long(&v->counter, i); +} + +static inline void +atomic_long_set(atomic_long_t *v, long i) +{ + atomic_store_rel_long(&v->counter, i); +} + +static inline long +atomic_long_read(atomic_long_t *v) +{ + return atomic_load_acq_long(&v->counter); +} + +static inline long +atomic_long_inc(atomic_long_t *v) +{ + return atomic_fetchadd_long(&v->counter, 1) + 1; +} + +static inline long +atomic_long_dec(atomic_long_t *v) +{ + return atomic_fetchadd_long(&v->counter, -1) - 1; +} + +static inline long +atomic_long_dec_and_test(atomic_long_t *v) +{ + long i = atomic_long_add(-1, v); + return i == 0 ; +} + +#endif /* _ATOMIC_LONG_H_ */ Index: sys/ofed/include/asm/atomic.h =================================================================== --- sys/ofed/include/asm/atomic.h (.../base) (revision 0) +++ sys/ofed/include/asm/atomic.h (.../head) (revision 219811) @@ -0,0 +1,85 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
Index: sys/ofed/include/asm/atomic.h =================================================================== --- sys/ofed/include/asm/atomic.h (.../base) (revision 0) +++ sys/ofed/include/asm/atomic.h (.../head) (revision 219811) @@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ASM_ATOMIC_H_
+#define _ASM_ATOMIC_H_
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <machine/atomic.h>
+
+typedef struct {
+	volatile u_int counter;
+} atomic_t;
+
+#define atomic_add(i, v)		atomic_add_return((i), (v))
+#define atomic_sub(i, v)		atomic_sub_return((i), (v))
+#define atomic_inc_return(v)		atomic_add_return(1, (v))
+#define atomic_add_negative(i, v)	(atomic_add_return((i), (v)) < 0)
+#define atomic_sub_and_test(i, v)	(atomic_sub_return((i), (v)) == 0)
+#define atomic_dec_and_test(v)		(atomic_sub_return(1, (v)) == 0)
+#define atomic_inc_and_test(v)		(atomic_add_return(1, (v)) == 0)
+
+static inline int
+atomic_add_return(int i, atomic_t *v)
+{
+	return i + atomic_fetchadd_int(&v->counter, i);
+}
+
+static inline int
+atomic_sub_return(int i, atomic_t *v)
+{
+	return atomic_fetchadd_int(&v->counter, -i) - i;
+}
+
+static inline void
+atomic_set(atomic_t *v, int i)
+{
+	atomic_store_rel_int(&v->counter, i);
+}
+
+static inline int
+atomic_read(atomic_t *v)
+{
+	return atomic_load_acq_int(&v->counter);
+}
+
+static inline int
+atomic_inc(atomic_t *v)
+{
+	return atomic_fetchadd_int(&v->counter, 1) + 1;
+}
+
+static inline int
+atomic_dec(atomic_t *v)
+{
+	return atomic_fetchadd_int(&v->counter, -1) - 1;
+}
+
+#endif	/* _ASM_ATOMIC_H_ */
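[Editor's note] Same pattern as atomic-long.h above, for the 32-bit atomic_t. One subtlety worth a worked example: atomic_fetchadd_int(9) returns the pre-add value, so both return-style wrappers compensate, and the *_and_test macros test the post-operation result as Linux expects. Illustrative arithmetic only:

	atomic_t a = { .counter = 5 };
	int after;

	/* fetchadd returns 5, wrapper yields 5 + 3 = 8. */
	after = atomic_add_return(3, &a);	/* after == 8, a.counter == 8 */

	/* fetchadd(-8) returns 8, wrapper yields 8 - 8 = 0. */
	if (atomic_sub_and_test(8, &a))
		;				/* taken: counter reached zero */
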
Index: sys/ofed/include/asm/page.h =================================================================== --- sys/ofed/include/asm/page.h (.../base) (revision 0) +++ sys/ofed/include/asm/page.h (.../head) (revision 219811) @@ -0,0 +1,29 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include
Index: sys/ofed/include/asm/pgtable.h =================================================================== --- sys/ofed/include/asm/pgtable.h (.../base) (revision 0) +++ sys/ofed/include/asm/pgtable.h (.../head) (revision 219811) @@ -0,0 +1,33 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _ASM_BYTEORDER_H_
+#define _ASM_BYTEORDER_H_
+
+#include <sys/types.h>
+#include <sys/endian.h>
+#include <asm/types.h>
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define __LITTLE_ENDIAN
+#else
+#define __BIG_ENDIAN
+#endif
+
+#define cpu_to_le64	htole64
+#define le64_to_cpu	le64toh
+#define cpu_to_le32	htole32
+#define le32_to_cpu	le32toh
+#define cpu_to_le16	htole16
+#define le16_to_cpu	le16toh
+#define cpu_to_be64	htobe64
+#define be64_to_cpu	be64toh
+#define cpu_to_be32	htobe32
+#define be32_to_cpu	be32toh
+#define cpu_to_be16	htobe16
+#define be16_to_cpu	be16toh
+#define __be16_to_cpu	be16toh
+
+#define cpu_to_le64p(x)	htole64(*((uint64_t *)x))
+#define le64_to_cpup(x)	le64toh(*((uint64_t *)x))
+#define cpu_to_le32p(x)	htole32(*((uint32_t *)x))
+#define le32_to_cpup(x)	le32toh(*((uint32_t *)x))
+#define cpu_to_le16p(x)	htole16(*((uint16_t *)x))
+#define le16_to_cpup(x)	le16toh(*((uint16_t *)x))
+#define cpu_to_be64p(x)	htobe64(*((uint64_t *)x))
+#define be64_to_cpup(x)	be64toh(*((uint64_t *)x))
+#define cpu_to_be32p(x)	htobe32(*((uint32_t *)x))
+#define be32_to_cpup(x)	be32toh(*((uint32_t *)x))
+#define cpu_to_be16p(x)	htobe16(*((uint16_t *)x))
+#define be16_to_cpup(x)	be16toh(*((uint16_t *)x))
+
+#define cpu_to_le64s(x)	do { *((uint64_t *)x) = cpu_to_le64p((x)); } while (0)
+#define le64_to_cpus(x)	do { *((uint64_t *)x) = le64_to_cpup((x)); } while (0)
+#define cpu_to_le32s(x)	do { *((uint32_t *)x) = cpu_to_le32p((x)); } while (0)
+#define le32_to_cpus(x)	do { *((uint32_t *)x) = le32_to_cpup((x)); } while (0)
+#define cpu_to_le16s(x)	do { *((uint16_t *)x) = cpu_to_le16p((x)); } while (0)
+#define le16_to_cpus(x)	do { *((uint16_t *)x) = le16_to_cpup((x)); } while (0)
+#define cpu_to_be64s(x)	do { *((uint64_t *)x) = cpu_to_be64p((x)); } while (0)
+#define be64_to_cpus(x)	do { *((uint64_t *)x) = be64_to_cpup((x)); } while (0)
+#define cpu_to_be32s(x)	do { *((uint32_t *)x) = cpu_to_be32p((x)); } while (0)
+#define be32_to_cpus(x)	do { *((uint32_t *)x) = be32_to_cpup((x)); } while (0)
+#define cpu_to_be16s(x)	do { *((uint16_t *)x) = cpu_to_be16p((x)); } while (0)
+#define be16_to_cpus(x)	do { *((uint16_t *)x) = be16_to_cpup((x)); } while (0)
+
+#define swab16	bswap16
+#define swab32	bswap32
+#define swab64	bswap64
+
+static inline void
+be16_add_cpu(u16 *var, u16 val)
+{
+	*var = cpu_to_be16(be16_to_cpu(*var) + val);
+}
+
+#endif	/* _ASM_BYTEORDER_H_ */
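[Editor's note] These wrappers introduce no new conversion logic; they only map the Linux spellings onto byteorder(9)/bswap. The plain names take and return values, the "p" variants dereference a pointer, and the "s" variants swap in place. A quick illustration:

	uint32_t lanes = 4;
	uint32_t wire, back;

	wire = cpu_to_be32(lanes);	/* same as htobe32(4) */
	back = be32_to_cpup(&wire);	/* back == 4 */
	cpu_to_be32s(&lanes);		/* lanes now holds the big-endian form */

	uint16_t cnt = cpu_to_be16(7);
	be16_add_cpu(&cnt, 1);		/* big-endian representation of 8 */
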
Index: sys/ofed/include/asm/current.h =================================================================== --- sys/ofed/include/asm/current.h (.../base) (revision 0) +++ sys/ofed/include/asm/current.h (.../head) (revision 219811) @@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ASM_CURRENT_H_
+#define _ASM_CURRENT_H_
+
+#endif	/* _ASM_CURRENT_H_ */
Index: sys/ofed/include/asm/semaphore.h =================================================================== --- sys/ofed/include/asm/semaphore.h (.../base) (revision 0) +++ sys/ofed/include/asm/semaphore.h (.../head) (revision 219811) @@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ASM_SEMAPHORE_H_
+#define _ASM_SEMAPHORE_H_
+
+#include <linux/semaphore.h>
+
+#endif	/* _ASM_SEMAPHORE_H_ */
Index: sys/ofed/include/asm/system.h =================================================================== --- sys/ofed/include/asm/system.h (.../base) (revision 0) +++ sys/ofed/include/asm/system.h (.../head) (revision 219811) @@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ Index: sys/ofed/include/asm/types.h =================================================================== --- sys/ofed/include/asm/types.h (.../base) (revision 0) +++ sys/ofed/include/asm/types.h (.../head) (revision 219811) @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _ASM_TYPES_H_ +#define _ASM_TYPES_H_ + +typedef unsigned short umode_t; + +typedef __signed__ char __s8; +typedef unsigned char __u8; + +typedef __signed__ short __s16; +typedef unsigned short __u16; + +typedef __signed__ int __s32; +typedef unsigned int __u32; + +#if defined(__GNUC__) // && !defined(__STRICT_ANSI__) +typedef __signed__ long long __s64; +typedef unsigned long long __u64; +#endif + +#ifdef _KERNEL + +typedef signed char s8; +typedef unsigned char u8; + +typedef signed short s16; +typedef unsigned short u16; + +typedef signed int s32; +typedef unsigned int u32; + +typedef signed long long s64; +typedef unsigned long long u64; + +/* DMA addresses come in generic and 64-bit flavours. */ +typedef vm_paddr_t dma_addr_t; +typedef vm_paddr_t dma64_addr_t; + +#endif /* _KERNEL */ + +#endif /* _ASM_TYPES_H_ */ Index: sys/ofed/include/asm/fcntl.h =================================================================== --- sys/ofed/include/asm/fcntl.h (.../base) (revision 0) +++ sys/ofed/include/asm/fcntl.h (.../head) (revision 219811) @@ -0,0 +1,33 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _ASM_FCNTL_H_ +#define _ASM_FCNTL_H_ + +#include + +#endif /* _ASM_FCNTL_H_ */ Index: sys/ofed/include/asm/uaccess.h =================================================================== --- sys/ofed/include/asm/uaccess.h (.../base) (revision 0) +++ sys/ofed/include/asm/uaccess.h (.../head) (revision 219811) @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#ifndef _ASM_UACCESS_H_
+#define _ASM_UACCESS_H_
+
+#include <sys/systm.h>
+
+static inline long
+copy_to_user(void *to, const void *from, unsigned long n)
+{
+	if (copyout(from, to, n) != 0)
+		return n;
+	return 0;
+}
+
+static inline long
+copy_from_user(void *to, const void *from, unsigned long n)
+{
+	if (copyin(from, to, n) != 0)
+		return n;
+	return 0;
+}
+
+#endif	/* _ASM_UACCESS_H_ */
Index: sys/ofed/include/asm/io.h =================================================================== --- sys/ofed/include/asm/io.h (.../base) (revision 0) +++ sys/ofed/include/asm/io.h (.../head) (revision 219811) @@ -0,0 +1,29 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include Index: sys/ofed/include/rdma/ib_user_mad.h =================================================================== --- sys/ofed/include/rdma/ib_user_mad.h (.../base) (revision 219811) +++ sys/ofed/include/rdma/ib_user_mad.h (.../head) (revision 219811) @@ -193,10 +193,9 @@ #define IB_IOCTL_MAGIC 0x1b -#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \ - struct ib_user_mad_reg_req) +#define IB_USER_MAD_REGISTER_AGENT _IO(IB_IOCTL_MAGIC, 1) -#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, __u32) +#define IB_USER_MAD_UNREGISTER_AGENT _IO(IB_IOCTL_MAGIC, 2) #define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3) Index: sys/ofed/include/rdma/ib_umem.h =================================================================== --- sys/ofed/include/rdma/ib_umem.h (.../base) (revision 219811) +++ sys/ofed/include/rdma/ib_umem.h (.../head) (revision 219811) @@ -48,8 +48,12 @@ int writable; int hugetlb; struct list_head chunk_list; +#ifdef __linux__ struct work_struct work; struct mm_struct *mm; +#else + unsigned long start; +#endif unsigned long diff; }; @@ -61,25 +65,9 @@ struct scatterlist page_list[0]; }; -#ifdef CONFIG_INFINIBAND_USER_MEM - struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); -#else /* CONFIG_INFINIBAND_USER_MEM */ - -#include - -static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, - unsigned long addr, size_t size, - int access, int dmasync) { - return ERR_PTR(-EINVAL); -} -static inline void ib_umem_release(struct ib_umem *umem) { } -static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } - -#endif /* CONFIG_INFINIBAND_USER_MEM */ - #endif /* IB_UMEM_H */ Index: sys/ofed/include/rdma/ib_addr.h =================================================================== --- sys/ofed/include/rdma/ib_addr.h (.../base) (revision 219811) +++ sys/ofed/include/rdma/ib_addr.h (.../head) (revision 219811) @@ -150,8 +150,16 @@ static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) { +#ifdef __linux__ return dev->priv_flags & IFF_802_1Q_VLAN ? vlan_dev_vlan_id(dev) : 0xffff; +#else + uint16_t tag; + + if (VLAN_TAG(__DECONST(struct ifnet *, dev), &tag) != 0) + return 0xffff; + return tag; +#endif } static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, @@ -216,6 +224,7 @@ return 0; } +#ifdef __linux__ static inline int iboe_get_rate(struct net_device *dev) { struct ethtool_cmd cmd; @@ -235,6 +244,21 @@ else return IB_RATE_PORT_CURRENT; } +#else +static inline int iboe_get_rate(struct net_device *dev) +{ + if (dev->if_baudrate >= IF_Gbps(40ULL)) + return IB_RATE_40_GBPS; + else if (dev->if_baudrate >= IF_Gbps(30ULL)) + return IB_RATE_30_GBPS; + else if (dev->if_baudrate >= IF_Gbps(20ULL)) + return IB_RATE_20_GBPS; + else if (dev->if_baudrate >= IF_Gbps(10ULL)) + return IB_RATE_10_GBPS; + else + return IB_RATE_PORT_CURRENT; +} +#endif static inline int rdma_link_local_addr(struct in6_addr *addr) { @@ -277,8 +301,12 @@ static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) { +#ifdef __linux__ return dev->priv_flags & IFF_802_1Q_VLAN ? 
vlan_dev_real_dev(dev) : 0; +#else + return VLAN_TRUNKDEV(__DECONST(struct ifnet *, dev)); +#endif } #endif /* IB_ADDR_H */ Index: sys/ofed/include/net/ip6_route.h =================================================================== --- sys/ofed/include/net/ip6_route.h (.../base) (revision 0) +++ sys/ofed/include/net/ip6_route.h (.../head) (revision 219811) @@ -0,0 +1,27 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ Index: sys/ofed/include/net/addrconf.h =================================================================== --- sys/ofed/include/net/addrconf.h (.../base) (revision 0) +++ sys/ofed/include/net/addrconf.h (.../head) (revision 219811) @@ -0,0 +1,27 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ Index: sys/ofed/include/net/arp.h =================================================================== --- sys/ofed/include/net/arp.h (.../base) (revision 0) +++ sys/ofed/include/net/arp.h (.../head) (revision 219811) @@ -0,0 +1,27 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ Index: sys/ofed/include/net/neighbour.h =================================================================== --- sys/ofed/include/net/neighbour.h (.../base) (revision 0) +++ sys/ofed/include/net/neighbour.h (.../head) (revision 219811) @@ -0,0 +1,27 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ Index: sys/ofed/include/net/ipv6.h =================================================================== --- sys/ofed/include/net/ipv6.h (.../base) (revision 0) +++ sys/ofed/include/net/ipv6.h (.../head) (revision 219811) @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_NET_IPV6_H_ +#define _LINUX_NET_IPV6_H_ + +#ifndef KLD_MODULE +#include "opt_inet6.h" +#endif + +#define ipv6_addr_loopback IN6_IS_ADDR_LOOPBACK +#define ipv6_addr_copy(dst, src) \ + memcpy((dst), (src), sizeof(struct in6_addr)) + +#ifdef INET6 +static inline void +ipv6_ib_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, + char *buf) +{ + unsigned char scope; + + scope = broadcast[5] & 0xF; + buf[0] = 0; + buf[1] = 0xff; + buf[2] = 0xff; + buf[3] = 0xff; + buf[4] = 0xff; + buf[5] = 0x10 | scope; + buf[6] = 0x60; + buf[7] = 0x1b; + buf[8] = broadcast[8]; + buf[9] = broadcast[9]; + memcpy(&buf[10], &addr->s6_addr[6], 10); +} +#endif + +#endif /* _LINUX_NET_IPV6_H_ */ Index: sys/ofed/include/net/tcp.h =================================================================== --- sys/ofed/include/net/tcp.h (.../base) (revision 0) +++ sys/ofed/include/net/tcp.h (.../head) (revision 219811) @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_NET_TCP_H_ +#define _LINUX_NET_TCP_H_ + +#include +#include +#include + +#include + +#endif /* _LINUX_NET_TCP_H_ */ Index: sys/ofed/include/net/ip.h =================================================================== --- sys/ofed/include/net/ip.h (.../base) (revision 0) +++ sys/ofed/include/net/ip.h (.../head) (revision 219811) @@ -0,0 +1,77 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_NET_IP_H_ +#define _LINUX_NET_IP_H_ + +#include +#include + +#include +#include +#include + +#include +#include + +static inline void inet_get_local_port_range(int *low, int *high) +{ + *low = V_ipport_firstauto; + *high = V_ipport_lastauto; +} + +static inline void +ip_ib_mc_map(uint32_t addr, const unsigned char *bcast, char *buf) +{ + unsigned char scope; + + addr = ntohl(addr); + scope = bcast[5] & 0xF; + buf[0] = 0; + buf[1] = 0xff; + buf[2] = 0xff; + buf[3] = 0xff; + buf[4] = 0xff; + buf[5] = 0x10 | scope; + buf[6] = 0x40; + buf[7] = 0x1b; + buf[8] = bcast[8]; + buf[9] = bcast[9]; + buf[10] = 0; + buf[11] = 0; + buf[12] = 0; + buf[13] = 0; + buf[14] = 0; + buf[15] = 0; + buf[16] = (addr >> 24) & 0x0f; + buf[17] = (addr >> 16) & 0xff; + buf[18] = (addr >> 8) & 0xff; + buf[19] = addr & 0xff; +} + +#endif /* _LINUX_NET_IP_H_ */ Index: sys/ofed/include/net/netevent.h =================================================================== --- sys/ofed/include/net/netevent.h (.../base) (revision 0) +++ sys/ofed/include/net/netevent.h (.../head) (revision 219811) @@ -0,0 +1,71 @@ +/*- + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. 
+ * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_NET_NETEVENT_H_ +#define _LINUX_NET_NETEVENT_H_ + +#include + +enum netevent_notif_type { + NETEVENT_NEIGH_UPDATE = 0, +#if 0 /* Unsupported events. */ + NETEVENT_PMTU_UPDATE, + NETEVENT_REDIRECT, +#endif +}; + +struct llentry; + +static inline void +_handle_arp_update_event(void *arg, struct llentry *lle) +{ + struct notifier_block *nb; + + nb = arg; + nb->notifier_call(nb, NETEVENT_NEIGH_UPDATE, lle); +} + +static inline int +register_netevent_notifier(struct notifier_block *nb) +{ + nb->tags[NETEVENT_NEIGH_UPDATE] = EVENTHANDLER_REGISTER( + arp_update_event, _handle_arp_update_event, nb, 0); + return (0); +} + +static inline int +unregister_netevent_notifier(struct notifier_block *nb) +{ + + EVENTHANDLER_DEREGISTER(arp_update_event, + nb->tags[NETEVENT_NEIGH_UPDATE]); + + return (0); +} + +#endif /* _LINUX_NET_NETEVENT_H_ */ Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c (.../head) (revision 219811) @@ -36,23 +36,23 @@ #include "ipoib.h" -static void ipoib_get_drvinfo(struct net_device *netdev, +static void ipoib_get_drvinfo(struct ifnet *netdev, struct ethtool_drvinfo *drvinfo) { strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1); } -static u32 ipoib_get_rx_csum(struct net_device *dev) +static u32 ipoib_get_rx_csum(struct ifnet *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = dev->if_softc; return test_bit(IPOIB_FLAG_CSUM, &priv->flags) && !test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); } -static int ipoib_get_coalesce(struct net_device *dev, +static int ipoib_get_coalesce(struct ifnet *dev, struct ethtool_coalesce *coal) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = dev->if_softc; coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; coal->tx_coalesce_usecs = priv->ethtool.coalesce_usecs; @@ -62,10 +62,10 @@ return 0; } -static int ipoib_set_coalesce(struct net_device *dev, +static int ipoib_set_coalesce(struct ifnet *dev, struct 
ethtool_coalesce *coal) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = dev->if_softc; int ret; /* @@ -105,7 +105,7 @@ "LRO avg aggr", "LRO no desc" }; -static void ipoib_get_strings(struct net_device *netdev, u32 stringset, u8 *data) +static void ipoib_get_strings(struct ifnet *netdev, u32 stringset, u8 *data) { switch (stringset) { case ETH_SS_STATS: @@ -114,7 +114,7 @@ } } -static int ipoib_get_sset_count(struct net_device *dev, int sset) +static int ipoib_get_sset_count(struct ifnet *dev, int sset) { switch (sset) { case ETH_SS_STATS: @@ -124,10 +124,10 @@ } } -static void ipoib_get_ethtool_stats(struct net_device *dev, +static void ipoib_get_ethtool_stats(struct ifnet *dev, struct ethtool_stats *stats, uint64_t *data) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = dev->if_softc; int index = 0; /* Get LRO statistics */ @@ -153,7 +153,7 @@ .get_ethtool_stats = ipoib_get_ethtool_stats, }; -void ipoib_set_ethtool_ops(struct net_device *dev) +void ipoib_set_ethtool_ops(struct ifnet *dev) { SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops); } Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c (.../head) (revision 219811) @@ -30,16 +30,18 @@ * SOFTWARE. */ +#include "ipoib.h" + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + +#include +#include +#include + #include #include -#include -#include -#include #include -#include -#include "ipoib.h" - int ipoib_max_conn_qp = 128; module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444); @@ -76,133 +78,91 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); -static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, - u64 mapping[IPOIB_CM_RX_SG]) +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req) { - int i; - ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + ipoib_dma_unmap_rx(priv, (struct ipoib_rx_buf *)rx_req); - for (i = 0; i < frags; ++i) - ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); } -static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) +static int ipoib_cm_post_receive_srq(struct ipoib_dev_priv *priv, int id) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_recv_wr *bad_wr; - int i, ret; + struct ipoib_rx_buf *rx_req; + struct mbuf *m; + int ret; + int i; + rx_req = (struct ipoib_rx_buf *)&priv->cm.srq_ring[id]; + for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) { + priv->cm.rx_sge[i].addr = rx_req->mapping[i]; + priv->cm.rx_sge[i].length = m->m_len; + } + + priv->cm.rx_wr.num_sge = i; priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; - for (i = 0; i < priv->cm.num_frags; ++i) - priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; - ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); - ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1, - priv->cm.srq_ring[id].mapping); - dev_kfree_skb_any(priv->cm.srq_ring[id].skb); - priv->cm.srq_ring[id].skb = NULL; + ipoib_dma_unmap_rx(priv, rx_req); + m_freem(priv->cm.srq_ring[id].mb); + priv->cm.srq_ring[id].mb = NULL; } return ret; } -static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, +static int 
ipoib_cm_post_receive_nonsrq(struct ipoib_dev_priv *priv, struct ipoib_cm_rx *rx, struct ib_recv_wr *wr, struct ib_sge *sge, int id) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_rx_buf *rx_req; struct ib_recv_wr *bad_wr; - int i, ret; + struct mbuf *m; + int ret; + int i; + rx_req = (struct ipoib_rx_buf *)&rx->rx_ring[id]; + for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) { + sge[i].addr = rx_req->mapping[i]; + sge[i].length = m->m_len; + } + + wr->num_sge = i; wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; - for (i = 0; i < IPOIB_CM_RX_SG; ++i) - sge[i].addr = rx->rx_ring[id].mapping[i]; - ret = ib_post_recv(rx->qp, wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); - ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, - rx->rx_ring[id].mapping); - dev_kfree_skb_any(rx->rx_ring[id].skb); - rx->rx_ring[id].skb = NULL; + ipoib_dma_unmap_rx(priv, rx_req); + m_freem(rx->rx_ring[id].mb); + rx->rx_ring[id].mb = NULL; } return ret; } -static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, - struct ipoib_cm_rx_buf *rx_ring, - int id, int frags, - u64 mapping[IPOIB_CM_RX_SG]) +static struct mbuf * +ipoib_cm_alloc_rx_mb(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct sk_buff *skb; - int i; - - skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); - if (unlikely(!skb)) - return NULL; - - /* - * IPoIB adds a 4 byte header. So we need 12 more bytes to align the - * IP header to a multiple of 16. - */ - skb_reserve(skb, 12); - - mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, - DMA_FROM_DEVICE); - if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) { - dev_kfree_skb_any(skb); - return NULL; - } - - for (i = 0; i < frags; i++) { - struct page *page = alloc_page(GFP_ATOMIC); - - if (!page) - goto partial_error; - skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE); - - mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page, - 0, PAGE_SIZE, DMA_FROM_DEVICE); - if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1]))) - goto partial_error; - } - - rx_ring[id].skb = skb; - return skb; - -partial_error: - - ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); - - for (; i > 0; --i) - ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); - - dev_kfree_skb_any(skb); - return NULL; + return ipoib_alloc_map_mb(priv, (struct ipoib_rx_buf *)rx_req, + priv->cm.max_cm_mtu); } -static void ipoib_cm_free_rx_ring(struct net_device *dev, +static void ipoib_cm_free_rx_ring(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_ring) { - struct ipoib_dev_priv *priv = netdev_priv(dev); int i; for (i = 0; i < ipoib_recvq_size; ++i) - if (rx_ring[i].skb) { - ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, - rx_ring[i].mapping); - dev_kfree_skb_any(rx_ring[i].skb); + if (rx_ring[i].mb) { + ipoib_cm_dma_unmap_rx(priv, &rx_ring[i]); + m_freem(rx_ring[i].mb); } - vfree(rx_ring); + kfree(rx_ring); } static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) @@ -230,7 +190,7 @@ static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) { struct ipoib_cm_rx *p = ctx; - struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_dev_priv *priv = p->priv; unsigned long flags; if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) @@ -243,35 +203,33 @@ spin_unlock_irqrestore(&priv->lock, flags); } -static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, 
+static struct ib_qp *ipoib_cm_create_rx_qp(struct ipoib_dev_priv *priv, struct ipoib_cm_rx *p) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_init_attr attr = { .event_handler = ipoib_cm_rx_event_handler, .send_cq = priv->recv_cq, /* For drain WR */ .recv_cq = priv->recv_cq, .srq = priv->cm.srq, .cap.max_send_wr = 1, /* For drain WR */ - .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */ + .cap.max_send_sge = 1, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, .qp_context = p, }; - if (!ipoib_cm_has_srq(dev)) { + if (!ipoib_cm_has_srq(priv)) { attr.cap.max_recv_wr = ipoib_recvq_size; - attr.cap.max_recv_sge = IPOIB_CM_RX_SG; + attr.cap.max_recv_sge = priv->cm.num_frags; } return ib_create_qp(priv->pd, &attr); } -static int ipoib_cm_modify_rx_qp(struct net_device *dev, +static int ipoib_cm_modify_rx_qp(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ib_qp *qp, unsigned psn) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_attr qp_attr; int qp_attr_mask, ret; @@ -322,29 +280,23 @@ return 0; } -static void ipoib_cm_init_rx_wr(struct net_device *dev, +static void ipoib_cm_init_rx_wr(struct ipoib_dev_priv *priv, struct ib_recv_wr *wr, struct ib_sge *sge) { - struct ipoib_dev_priv *priv = netdev_priv(dev); int i; - for (i = 0; i < priv->cm.num_frags; ++i) + for (i = 0; i < IPOIB_CM_RX_SG; i++) sge[i].lkey = priv->mr->lkey; - sge[0].length = IPOIB_CM_HEAD_SIZE; - for (i = 1; i < priv->cm.num_frags; ++i) - sge[i].length = PAGE_SIZE; - wr->next = NULL; wr->sg_list = sge; - wr->num_sge = priv->cm.num_frags; + wr->num_sge = 1; } -static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id, - struct ipoib_cm_rx *rx) +static int ipoib_cm_nonsrq_init_rx(struct ipoib_dev_priv *priv, + struct ib_cm_id *cm_id, struct ipoib_cm_rx *rx) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct { struct ib_recv_wr wr; struct ib_sge sge[IPOIB_CM_RX_SG]; @@ -352,7 +304,7 @@ int ret; int i; - rx->rx_ring = vmalloc(ipoib_recvq_size * sizeof *rx->rx_ring); + rx->rx_ring = kzalloc(ipoib_recvq_size * sizeof *rx->rx_ring, GFP_KERNEL); if (!rx->rx_ring) { printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n", priv->ca->name, ipoib_recvq_size); @@ -367,7 +319,7 @@ goto err_free; } - ipoib_cm_init_rx_wr(dev, &t->wr, t->sge); + ipoib_cm_init_rx_wr(priv, &t->wr, t->sge); spin_lock_irq(&priv->lock); @@ -382,13 +334,12 @@ spin_unlock_irq(&priv->lock); for (i = 0; i < ipoib_recvq_size; ++i) { - if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, - rx->rx_ring[i].mapping)) { + if (!ipoib_cm_alloc_rx_mb(priv, &rx->rx_ring[i])) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); ret = -ENOMEM; goto err_count; } - ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i); + ret = ipoib_cm_post_receive_nonsrq(priv, rx, &t->wr, t->sge, i); if (ret) { ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq " "failed for buf %d\n", i); @@ -410,27 +361,26 @@ err_free: kfree(t); - ipoib_cm_free_rx_ring(dev, rx->rx_ring); + ipoib_cm_free_rx_ring(priv, rx->rx_ring); return ret; } -static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, +static int ipoib_cm_send_rep(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ib_qp *qp, struct ib_cm_req_event_param *req, unsigned psn) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_cm_data data = {}; struct ib_cm_rep_param rep = {}; data.qpn = cpu_to_be32(priv->qp->qp_num); - data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + 
data.mtu = cpu_to_be32(priv->cm.max_cm_mtu); rep.private_data = &data; rep.private_data_len = sizeof data; rep.flow_control = 0; rep.rnr_retry_count = req->rnr_retry_count; - rep.srq = ipoib_cm_has_srq(dev); + rep.srq = ipoib_cm_has_srq(priv); rep.qp_num = qp->qp_num; rep.starting_psn = psn; return ib_send_cm_rep(cm_id, &rep); @@ -438,8 +388,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { - struct net_device *dev = cm_id->context; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = cm_id->context; struct ipoib_cm_rx *p; unsigned psn; int ret; @@ -448,26 +397,26 @@ p = kzalloc(sizeof *p, GFP_KERNEL); if (!p) return -ENOMEM; - p->dev = dev; + p->priv = priv; p->id = cm_id; cm_id->context = p; p->state = IPOIB_CM_RX_LIVE; p->jiffies = jiffies; INIT_LIST_HEAD(&p->list); - p->qp = ipoib_cm_create_rx_qp(dev, p); + p->qp = ipoib_cm_create_rx_qp(priv, p); if (IS_ERR(p->qp)) { ret = PTR_ERR(p->qp); goto err_qp; } - psn = random32() & 0xffffff; - ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); + psn = random() & 0xffffff; + ret = ipoib_cm_modify_rx_qp(priv, cm_id, p->qp, psn); if (ret) goto err_modify; - if (!ipoib_cm_has_srq(dev)) { - ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p); + if (!ipoib_cm_has_srq(priv)) { + ret = ipoib_cm_nonsrq_init_rx(priv, cm_id, p); if (ret) goto err_modify; } @@ -482,7 +431,7 @@ list_move(&p->list, &priv->cm.passive_ids); spin_unlock_irq(&priv->lock); - ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn); + ret = ipoib_cm_send_rep(priv, cm_id, p->qp, &event->param.req_rcvd, psn); if (ret) { ipoib_warn(priv, "failed to send REP: %d\n", ret); if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) @@ -512,7 +461,7 @@ /* Fall through */ case IB_CM_REJ_RECEIVED: p = cm_id->context; - priv = netdev_priv(p->dev); + priv = p->priv; if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) ipoib_warn(priv, "unable to move qp to error state\n"); /* Fall through */ @@ -520,62 +469,30 @@ return 0; } } -/* Adjust length of skb with fragments to match received data */ -static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, - unsigned int length, struct sk_buff *toskb) -{ - int i, num_frags; - unsigned int size; - /* put header into skb */ - size = min(length, hdr_space); - skb->tail += size; - skb->len += size; - length -= size; - - num_frags = skb_shinfo(skb)->nr_frags; - for (i = 0; i < num_frags; i++) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - if (length == 0) { - /* don't need this page */ - skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE); - --skb_shinfo(skb)->nr_frags; - } else { - size = min(length, (unsigned) PAGE_SIZE); - - frag->size = size; - skb->data_len += size; - skb->truesize += size; - skb->len += size; - length -= size; - } - } -} - -void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx_buf saverx; struct ipoib_cm_rx_buf *rx_ring; unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); - struct sk_buff *skb, *newskb; + struct ifnet *dev = priv->dev; + struct mbuf *mb, *newmb; struct ipoib_cm_rx *p; - unsigned long flags; - u64 mapping[IPOIB_CM_RX_SG]; - int frags; int has_srq; - struct sk_buff *small_skb; + u_short proto; ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n", wr_id, wc->status); if (unlikely(wr_id >= ipoib_recvq_size)) { if (wr_id == 
(IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) { - spin_lock_irqsave(&priv->lock, flags); + spin_lock(&priv->lock); list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); ipoib_cm_start_rx_drain(priv); - queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); - spin_unlock_irqrestore(&priv->lock, flags); + if (priv->cm.id != NULL) + queue_work(ipoib_workqueue, + &priv->cm.rx_reap_task); + spin_unlock(&priv->lock); } else ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", wr_id, ipoib_recvq_size); @@ -584,24 +501,24 @@ p = wc->qp->qp_context; - has_srq = ipoib_cm_has_srq(dev); + has_srq = ipoib_cm_has_srq(priv); rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring; - skb = rx_ring[wr_id].skb; + mb = rx_ring[wr_id].mb; if (unlikely(wc->status != IB_WC_SUCCESS)) { ipoib_dbg(priv, "cm recv error " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); - ++dev->stats.rx_dropped; + ++dev->if_ierrors; if (has_srq) goto repost; else { if (!--p->recv_count) { - spin_lock_irqsave(&priv->lock, flags); + spin_lock(&priv->lock); list_move(&p->list, &priv->cm.rx_reap_list); - spin_unlock_irqrestore(&priv->lock, flags); queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + spin_unlock(&priv->lock); } return; } @@ -609,76 +526,51 @@ if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) { if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { - spin_lock_irqsave(&priv->lock, flags); p->jiffies = jiffies; /* Move this entry to list head, but do not re-add it * if it has been moved out of list. */ if (p->state == IPOIB_CM_RX_LIVE) list_move(&p->list, &priv->cm.passive_ids); - spin_unlock_irqrestore(&priv->lock, flags); } } - if (wc->byte_len < IPOIB_CM_COPYBREAK) { - int dlen = wc->byte_len; - - small_skb = dev_alloc_skb(dlen + 12); - if (small_skb) { - skb_reserve(small_skb, 12); - ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], - dlen, DMA_FROM_DEVICE); - skb_copy_from_linear_data(skb, small_skb->data, dlen); - ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0], - dlen, DMA_FROM_DEVICE); - skb_put(small_skb, dlen); - skb = small_skb; - goto copied; - } - } - - frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, - (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; - - newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping); - if (unlikely(!newskb)) { + memcpy(&saverx, &rx_ring[wr_id], sizeof(saverx)); + newmb = ipoib_cm_alloc_rx_mb(priv, &rx_ring[wr_id]); + if (unlikely(!newmb)) { /* * If we can't allocate a new RX buffer, dump * this packet and reuse the old buffer. 
*/ ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id); - ++dev->stats.rx_dropped; + ++dev->if_ierrors; + memcpy(&rx_ring[wr_id], &saverx, sizeof(saverx)); goto repost; } - ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping); - memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping); + ipoib_cm_dma_unmap_rx(priv, &saverx); ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", wc->byte_len, wc->slid); - skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb); + ipoib_dma_mb(priv, mb, wc->byte_len); -copied: - skb->protocol = ((struct ipoib_header *) skb->data)->proto; - skb_reset_mac_header(skb); - skb_pull(skb, IPOIB_ENCAP_LEN); + ++dev->if_opackets; + dev->if_obytes += mb->m_pkthdr.len; - dev->last_rx = jiffies; - ++dev->stats.rx_packets; - dev->stats.rx_bytes += skb->len; + mb->m_pkthdr.rcvif = dev; + proto = *mtod(mb, uint16_t *); + m_adj(mb, IPOIB_ENCAP_LEN); - skb->dev = dev; - /* XXX get correct PACKET_ type here */ - skb->pkt_type = PACKET_HOST; - netif_receive_skb(skb); + IPOIB_MTAP_PROTO(dev, mb, proto); + ipoib_demux(dev, mb, ntohs(proto)); repost: if (has_srq) { - if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id))) + if (unlikely(ipoib_cm_post_receive_srq(priv, wr_id))) ipoib_warn(priv, "ipoib_cm_post_receive_srq failed " "for buf %d\n", wr_id); } else { - if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, + if (unlikely(ipoib_cm_post_receive_nonsrq(priv, p, &priv->cm.rx_wr, priv->cm.rx_sge, wr_id))) { @@ -691,64 +583,70 @@ static inline int post_send(struct ipoib_dev_priv *priv, struct ipoib_cm_tx *tx, - unsigned int wr_id, - u64 addr, int len) + struct ipoib_cm_tx_buf *tx_req, + unsigned int wr_id) { struct ib_send_wr *bad_wr; + struct mbuf *mb = tx_req->mb; + u64 *mapping = tx_req->mapping; + struct mbuf *m; + int i; - priv->tx_sge[0].addr = addr; - priv->tx_sge[0].length = len; + for (m = mb, i = 0; m != NULL; m = m->m_next, i++) { + priv->tx_sge[i].addr = mapping[i]; + priv->tx_sge[i].length = m->m_len; + } + priv->tx_wr.num_sge = i; + priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; + priv->tx_wr.opcode = IB_WR_SEND; - priv->tx_wr.num_sge = 1; - priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; - return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); } -void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_cm_tx_buf *tx_req; - u64 addr; + struct ifnet *dev = priv->dev; - if (unlikely(skb->len > tx->mtu)) { + if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) + while (ipoib_poll_tx(priv)); /* nothing */ + + m_adj(mb, sizeof(struct ipoib_pseudoheader)); + if (unlikely(mb->m_pkthdr.len > tx->mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", - skb->len, tx->mtu); - ++dev->stats.tx_dropped; - ++dev->stats.tx_errors; - ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN); + mb->m_pkthdr.len, tx->mtu); + ++dev->if_oerrors; + ipoib_cm_mb_too_long(priv, mb, IPOIB_CM_MTU(tx->mtu)); return; } ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", - tx->tx_head, skb->len, tx->qp->qp_num); + tx->tx_head, mb->m_pkthdr.len, tx->qp->qp_num); + /* - * We put the skb into the tx_ring _before_ we call post_send() + * We put the mb into the tx_ring _before_ we call post_send() * because it's entirely possible that the completion handler will * run before we execute anything after the post_send(). 
That * means we have to make sure everything is properly recorded and * our state is consistent before we call post_send(). */ tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; - tx_req->skb = skb; - addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE); - if (unlikely(ib_dma_mapping_error(priv->ca, addr))) { - ++dev->stats.tx_errors; - dev_kfree_skb_any(skb); + tx_req->mb = mb; + if (unlikely(ipoib_dma_map_tx(priv->ca, (struct ipoib_tx_buf *)tx_req, + priv->cm.num_frags))) { + ++dev->if_oerrors; + if (tx_req->mb) + m_freem(tx_req->mb); return; } - tx_req->mapping = addr; - - if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), - addr, skb->len))) { + if (unlikely(post_send(priv, tx, tx_req, tx->tx_head & (ipoib_sendq_size - 1)))) { ipoib_warn(priv, "post_send failed\n"); - ++dev->stats.tx_errors; - ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); - dev_kfree_skb_any(skb); + ++dev->if_oerrors; + ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req); + m_freem(mb); } else { - dev->trans_start = jiffies; ++tx->tx_head; if (++priv->tx_outstanding == ipoib_sendq_size) { @@ -756,18 +654,18 @@ tx->qp->qp_num); if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) ipoib_warn(priv, "request notify on send CQ failed\n"); - netif_stop_queue(dev); + dev->if_drv_flags |= IFF_DRV_OACTIVE; } } + } -void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_cm_tx *tx = wc->qp->qp_context; unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; + struct ifnet *dev = priv->dev; struct ipoib_cm_tx_buf *tx_req; - unsigned long flags; ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", wr_id, wc->status); @@ -780,41 +678,34 @@ tx_req = &tx->tx_ring[wr_id]; - ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); + ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req); /* FIXME: is this right? Shouldn't we only increment on success? 
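 *
 * A success-only variant would read (hypothetical, not what the
 * driver does today):
 *
 *	if (wc->status == IB_WC_SUCCESS) {
 *		++dev->if_opackets;
 *		dev->if_obytes += tx_req->mb->m_pkthdr.len;
 *	}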
*/ - ++dev->stats.tx_packets; - dev->stats.tx_bytes += tx_req->skb->len; + ++dev->if_opackets; + dev->if_obytes += tx_req->mb->m_pkthdr.len; - dev_kfree_skb_any(tx_req->skb); + m_freem(tx_req->mb); - netif_tx_lock(dev); - ++tx->tx_tail; if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && - netif_queue_stopped(dev) && + (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - netif_wake_queue(dev); + dev->if_drv_flags &= ~IFF_DRV_OACTIVE; if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) { - struct ipoib_neigh *neigh; + struct ipoib_path *path; ipoib_dbg(priv, "failed cm send event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); - spin_lock_irqsave(&priv->lock, flags); - neigh = tx->neigh; + path = tx->path; - if (neigh) { - neigh->cm = NULL; - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); - - tx->neigh = NULL; + if (path) { + path->cm = NULL; + rb_erase(&path->rb_node, &priv->path_tree); + list_del(&path->list); } if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { @@ -823,22 +714,18 @@ } clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); - - spin_unlock_irqrestore(&priv->lock, flags); } - netif_tx_unlock(dev); } -int ipoib_cm_dev_open(struct net_device *dev) +int ipoib_cm_dev_open(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; - if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) + if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev))) return 0; - priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev); + priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, priv); if (IS_ERR(priv->cm.id)) { printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); ret = PTR_ERR(priv->cm.id); @@ -862,9 +749,8 @@ return ret; } -static void ipoib_cm_free_rx_reap_list(struct net_device *dev) +static void ipoib_cm_free_rx_reap_list(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_cm_rx *rx, *n; LIST_HEAD(list); @@ -875,8 +761,8 @@ list_for_each_entry_safe(rx, n, &list, list) { ib_destroy_cm_id(rx->id); ib_destroy_qp(rx->qp); - if (!ipoib_cm_has_srq(dev)) { - ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring); + if (!ipoib_cm_has_srq(priv)) { + ipoib_cm_free_rx_ring(priv, rx->rx_ring); spin_lock_irq(&priv->lock); --priv->cm.nonsrq_conn_qp; spin_unlock_irq(&priv->lock); @@ -885,19 +771,20 @@ } } -void ipoib_cm_dev_stop(struct net_device *dev) +void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_cm_rx *p; unsigned long begin; int ret; - if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id) + if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)) || !priv->cm.id) return; ib_destroy_cm_id(priv->cm.id); priv->cm.id = NULL; + cancel_work_sync(&priv->cm.rx_reap_task); + spin_lock_irq(&priv->lock); while (!list_empty(&priv->cm.passive_ids)) { p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); @@ -932,13 +819,13 @@ } spin_unlock_irq(&priv->lock); msleep(1); - ipoib_drain_cq(dev); + ipoib_drain_cq(priv); spin_lock_irq(&priv->lock); } spin_unlock_irq(&priv->lock); - ipoib_cm_free_rx_reap_list(dev); + ipoib_cm_free_rx_reap_list(priv); cancel_delayed_work(&priv->cm.stale_task); } @@ -946,13 +833,14 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ipoib_cm_tx *p = cm_id->context; - struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_dev_priv *priv = p->priv; 
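/*
 * The send-completion path above is also where ifnet flow control is
 * undone: the transmit side sets IFF_DRV_OACTIVE when the ring fills,
 * and the completion side clears it at the half-empty mark. A minimal
 * sketch of the pair, assuming a ring of "size" slots and an
 * "outstanding" counter:
 *
 *	if (++outstanding == size)			send side
 *		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 *	...
 *	if (--outstanding == size >> 1 &&		completion side
 *	    (ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0)
 *		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 */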
struct ipoib_cm_data *data = event->private_data; - struct sk_buff_head skqueue; + struct ifqueue mbqueue; struct ib_qp_attr qp_attr; int qp_attr_mask, ret; - struct sk_buff *skb; + struct mbuf *mb; + ipoib_dbg(priv, "cm rep handler\n"); p->mtu = be32_to_cpu(data->mtu); if (p->mtu <= IPOIB_ENCAP_LEN) { @@ -987,18 +875,26 @@ return ret; } - skb_queue_head_init(&skqueue); + bzero(&mbqueue, sizeof(mbqueue)); spin_lock_irq(&priv->lock); set_bit(IPOIB_FLAG_OPER_UP, &p->flags); - if (p->neigh) - while ((skb = __skb_dequeue(&p->neigh->queue))) - __skb_queue_tail(&skqueue, skb); + if (p->path) + for (;;) { + _IF_DEQUEUE(&p->path->queue, mb); + if (mb == NULL) + break; + _IF_ENQUEUE(&mbqueue, mb); + } spin_unlock_irq(&priv->lock); - while ((skb = __skb_dequeue(&skqueue))) { - skb->dev = p->dev; - if (dev_queue_xmit(skb)) + for (;;) { + struct ifnet *dev = p->priv->dev; + _IF_DEQUEUE(&mbqueue, mb); + if (mb == NULL) + break; + mb->m_pkthdr.rcvif = dev; + if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } @@ -1011,15 +907,15 @@ return 0; } -static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx) +static struct ib_qp *ipoib_cm_create_tx_qp(struct ipoib_dev_priv *priv, + struct ipoib_cm_tx *tx) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_init_attr attr = { - .send_cq = priv->recv_cq, + .send_cq = priv->send_cq, .recv_cq = priv->recv_cq, .srq = priv->cm.srq, .cap.max_send_wr = ipoib_sendq_size, - .cap.max_send_sge = 1, + .cap.max_send_sge = priv->cm.num_frags, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, .qp_context = tx @@ -1028,17 +924,18 @@ return ib_create_qp(priv->pd, &attr); } -static int ipoib_cm_send_req(struct net_device *dev, +static int ipoib_cm_send_req(struct ipoib_dev_priv *priv, struct ib_cm_id *id, struct ib_qp *qp, u32 qpn, struct ib_sa_path_rec *pathrec) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_cm_data data = {}; struct ib_cm_req_param req = {}; + ipoib_dbg(priv, "cm send req\n"); + data.qpn = cpu_to_be32(priv->qp->qp_num); - data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + data.mtu = cpu_to_be32(priv->cm.max_cm_mtu); req.primary_path = pathrec; req.alternate_path = NULL; @@ -1061,14 +958,13 @@ req.retry_count = 0; /* RFC draft warns against retries */ req.rnr_retry_count = 0; /* RFC draft warns against retries */ req.max_cm_retries = 15; - req.srq = ipoib_cm_has_srq(dev); + req.srq = ipoib_cm_has_srq(priv); return ib_send_cm_req(id, &req); } -static int ipoib_cm_modify_tx_init(struct net_device *dev, +static int ipoib_cm_modify_tx_init(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ib_qp *qp) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_attr qp_attr; int qp_attr_mask, ret; ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index); @@ -1093,10 +989,10 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, struct ib_sa_path_rec *pathrec) { - struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_dev_priv *priv = p->priv; int ret; - p->tx_ring = vmalloc(ipoib_sendq_size * sizeof *p->tx_ring); + p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring, GFP_KERNEL); if (!p->tx_ring) { ipoib_warn(priv, "failed to allocate tx ring\n"); ret = -ENOMEM; @@ -1104,7 +1000,7 @@ } memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); - p->qp = ipoib_cm_create_tx_qp(p->dev, p); + p->qp = ipoib_cm_create_tx_qp(p->priv, p); if (IS_ERR(p->qp)) { ret = PTR_ERR(p->qp); ipoib_warn(priv, "failed to 
allocate tx qp: %d\n", ret); @@ -1118,13 +1014,13 @@ goto err_id; } - ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp); + ret = ipoib_cm_modify_tx_init(p->priv, p->id, p->qp); if (ret) { ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); goto err_modify; } - ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec); + ret = ipoib_cm_send_req(p->priv, p->id, p->qp, qpn, pathrec); if (ret) { ipoib_warn(priv, "failed to send cm req: %d\n", ret); goto err_send_cm; @@ -1143,20 +1039,24 @@ ib_destroy_qp(p->qp); err_qp: p->qp = NULL; - vfree(p->tx_ring); + kfree(p->tx_ring); err_tx: return ret; } static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) { - struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_dev_priv *priv = p->priv; + struct ifnet *dev = priv->dev; struct ipoib_cm_tx_buf *tx_req; unsigned long begin; ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail); + if (p->path) + ipoib_path_free(priv, p->path); + if (p->id) ib_destroy_cm_id(p->id); @@ -1178,22 +1078,19 @@ while ((int) p->tx_tail - (int) p->tx_head < 0) { tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; - ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, - DMA_TO_DEVICE); - dev_kfree_skb_any(tx_req->skb); + ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req); + m_freem(tx_req->mb); ++p->tx_tail; - netif_tx_lock_bh(p->dev); if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && - netif_queue_stopped(p->dev) && + (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - netif_wake_queue(p->dev); - netif_tx_unlock_bh(p->dev); + dev->if_drv_flags &= ~IFF_DRV_OACTIVE; } if (p->qp) ib_destroy_qp(p->qp); - vfree(p->tx_ring); + kfree(p->tx_ring); kfree(p); } @@ -1201,9 +1098,8 @@ struct ib_cm_event *event) { struct ipoib_cm_tx *tx = cm_id->context; - struct ipoib_dev_priv *priv = netdev_priv(tx->dev); - struct net_device *dev = priv->dev; - struct ipoib_neigh *neigh; + struct ipoib_dev_priv *priv = tx->priv; + struct ipoib_path *path; unsigned long flags; int ret; @@ -1223,18 +1119,14 @@ case IB_CM_REJ_RECEIVED: case IB_CM_TIMEWAIT_EXIT: ipoib_dbg(priv, "CM error %d.\n", event->event); - netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); - neigh = tx->neigh; + path = tx->path; - if (neigh) { - neigh->cm = NULL; - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); - - tx->neigh = NULL; + if (path) { + path->cm = NULL; + tx->path = NULL; + rb_erase(&path->rb_node, &priv->path_tree); + list_del(&path->list); } if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { @@ -1243,7 +1135,8 @@ } spin_unlock_irqrestore(&priv->lock, flags); - netif_tx_unlock_bh(dev); + if (path) + ipoib_path_free(tx->priv, path); break; default: break; @@ -1252,20 +1145,19 @@ return 0; } -struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, - struct ipoib_neigh *neigh) +struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, + struct ipoib_path *path) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_cm_tx *tx; tx = kzalloc(sizeof *tx, GFP_ATOMIC); if (!tx) return NULL; - neigh->cm = tx; - tx->neigh = neigh; + ipoib_dbg(priv, "Creating cm tx\n"); + path->cm = tx; tx->path = path; - tx->dev = dev; + tx->priv = priv; list_add(&tx->list, &priv->cm.start_list); set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); queue_work(ipoib_workqueue, &priv->cm.start_task); @@ 
-1274,13 +1166,15 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) { - struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + struct ipoib_dev_priv *priv = tx->priv; if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + spin_lock(&priv->lock); list_move(&tx->list, &priv->cm.reap_list); + spin_unlock(&priv->lock); queue_work(ipoib_workqueue, &priv->cm.reap_task); ipoib_dbg(priv, "Reap connection for gid %pI6\n", - tx->neigh->dgid.raw); - tx->neigh = NULL; + tx->path->pathrec.dgid.raw); + tx->path = NULL; } } @@ -1288,8 +1182,7 @@ { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, cm.start_task); - struct net_device *dev = priv->dev; - struct ipoib_neigh *neigh; + struct ipoib_path *path; struct ipoib_cm_tx *p; unsigned long flags; int ret; @@ -1297,32 +1190,29 @@ struct ib_sa_path_rec pathrec; u32 qpn; - netif_tx_lock_bh(dev); + ipoib_dbg(priv, "cm start task\n"); spin_lock_irqsave(&priv->lock, flags); while (!list_empty(&priv->cm.start_list)) { p = list_entry(priv->cm.start_list.next, typeof(*p), list); list_del_init(&p->list); - neigh = p->neigh; - qpn = IPOIB_QPN(neigh->neighbour->ha); + path = p->path; + qpn = IPOIB_QPN(path->hwaddr); memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); spin_unlock_irqrestore(&priv->lock, flags); - netif_tx_unlock_bh(dev); ret = ipoib_cm_tx_init(p, qpn, &pathrec); - netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); if (ret) { - neigh = p->neigh; - if (neigh) { - neigh->cm = NULL; - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); + path = p->path; + if (path) { + path->cm = NULL; + rb_erase(&path->rb_node, &priv->path_tree); + list_del(&path->list); + ipoib_path_free(priv, path); } list_del(&p->list); kfree(p); @@ -1330,84 +1220,76 @@ } spin_unlock_irqrestore(&priv->lock, flags); - netif_tx_unlock_bh(dev); } static void ipoib_cm_tx_reap(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, cm.reap_task); - struct net_device *dev = priv->dev; struct ipoib_cm_tx *p; unsigned long flags; - netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); while (!list_empty(&priv->cm.reap_list)) { p = list_entry(priv->cm.reap_list.next, typeof(*p), list); list_del(&p->list); spin_unlock_irqrestore(&priv->lock, flags); - netif_tx_unlock_bh(dev); ipoib_cm_tx_destroy(p); - netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); - netif_tx_unlock_bh(dev); } -static void ipoib_cm_skb_reap(struct work_struct *work) +static void ipoib_cm_mb_reap(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, - cm.skb_task); - struct net_device *dev = priv->dev; - struct sk_buff *skb; + cm.mb_task); + struct mbuf *mb; unsigned long flags; unsigned mtu = priv->mcast_mtu; + uint16_t proto; - netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); - while ((skb = skb_dequeue(&priv->cm.skb_queue))) { + for (;;) { + IF_DEQUEUE(&priv->cm.mb_queue, mb); + if (mb == NULL) + break; spin_unlock_irqrestore(&priv->lock, flags); - netif_tx_unlock_bh(dev); - if (skb->protocol == htons(ETH_P_IP)) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - else if (skb->protocol == htons(ETH_P_IPV6)) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, priv->dev); + proto = htons(*mtod(mb, uint16_t *)); + m_adj(mb, IPOIB_ENCAP_LEN); + if (proto == ETHERTYPE_IP) + icmp_error(mb, ICMP_UNREACH, 
ICMP_UNREACH_NEEDFRAG, 0, mtu); +#if defined(INET6) + else if (proto == ETHERTYPE_IPV6) + icmp6_error(mb, ICMP6_PACKET_TOO_BIG, 0, mtu); #endif - dev_kfree_skb_any(skb); + else + m_freem(mb); - netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); - netif_tx_unlock_bh(dev); } -void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, - unsigned int mtu) +void +ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - int e = skb_queue_empty(&priv->cm.skb_queue); + int e = priv->cm.mb_queue.ifq_len; - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); - - skb_queue_tail(&priv->cm.skb_queue, skb); - if (e) - queue_work(ipoib_workqueue, &priv->cm.skb_task); + IF_ENQUEUE(&priv->cm.mb_queue, mb); + if (e == 0) + queue_work(ipoib_workqueue, &priv->cm.mb_task); } static void ipoib_cm_rx_reap(struct work_struct *work) { ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv, - cm.rx_reap_task)->dev); + cm.rx_reap_task)); } static void ipoib_cm_stale_task(struct work_struct *work) @@ -1440,72 +1322,8 @@ } -static ssize_t show_mode(struct device *d, struct device_attribute *attr, - char *buf) +static void ipoib_cm_create_srq(struct ipoib_dev_priv *priv, int max_sge) { - struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d)); - - if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) - return sprintf(buf, "connected\n"); - else - return sprintf(buf, "datagram\n"); -} - -static ssize_t set_mode(struct device *d, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct net_device *dev = to_net_dev(d); - struct ipoib_dev_priv *priv = netdev_priv(dev); - - /* flush paths if we switch modes so that connections are restarted */ - if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { - set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); - ipoib_warn(priv, "enabling connected mode " - "will cause multicast packet drops\n"); - - rtnl_lock(); - dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO); - priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; - - if (ipoib_cm_max_mtu(dev) > priv->mcast_mtu) - ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", - priv->mcast_mtu); - dev_set_mtu(dev, ipoib_cm_max_mtu(dev)); - rtnl_unlock(); - - ipoib_flush_paths(dev); - return count; - } - - if (!strcmp(buf, "datagram\n")) { - clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); - - rtnl_lock(); - if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) { - dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG; - if (priv->hca_caps & IB_DEVICE_UD_TSO) - dev->features |= NETIF_F_TSO; - } - dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); - rtnl_unlock(); - ipoib_flush_paths(dev); - - return count; - } - - return -EINVAL; -} - -static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode); - -int ipoib_cm_add_mode_attr(struct net_device *dev) -{ - return device_create_file(&dev->dev, &dev_attr_mode); -} - -static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_srq_init_attr srq_init_attr = { .attr = { .max_wr = ipoib_recvq_size, @@ -1522,7 +1340,7 @@ return; } - priv->cm.srq_ring = vmalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring); + priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring, GFP_KERNEL); if (!priv->cm.srq_ring) { printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n", priv->ca->name, ipoib_recvq_size); @@ 
-1534,9 +1352,9 @@ memset(priv->cm.srq_ring, 0, ipoib_recvq_size * sizeof *priv->cm.srq_ring); } -int ipoib_cm_dev_init(struct net_device *dev) +int ipoib_cm_dev_init(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ifnet *dev = priv->dev; int i, ret; struct ib_device_attr attr; @@ -1549,11 +1367,13 @@ INIT_LIST_HEAD(&priv->cm.rx_reap_list); INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); - INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap); + INIT_WORK(&priv->cm.mb_task, ipoib_cm_mb_reap); INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap); INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); - skb_queue_head_init(&priv->cm.skb_queue); + bzero(&priv->cm.mb_queue, sizeof(priv->cm.mb_queue)); + mtx_init(&priv->cm.mb_queue.ifq_mtx, + dev->if_xname, "if send queue", MTX_DEF); ret = ib_query_device(priv->ca, &attr); if (ret) { @@ -1564,47 +1384,43 @@ ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge); attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge); - ipoib_cm_create_srq(dev, attr.max_srq_sge); - if (ipoib_cm_has_srq(dev)) { - - priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10; + ipoib_cm_create_srq(priv, attr.max_srq_sge); + if (ipoib_cm_has_srq(priv)) { + priv->cm.max_cm_mtu = attr.max_srq_sge * MJUMPAGESIZE; priv->cm.num_frags = attr.max_srq_sge; ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", priv->cm.max_cm_mtu, priv->cm.num_frags); } else { - priv->cm.max_cm_mtu = IPOIB_CM_MTU; + priv->cm.max_cm_mtu = IPOIB_CM_MAX_MTU; priv->cm.num_frags = IPOIB_CM_RX_SG; } - ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge); + ipoib_cm_init_rx_wr(priv, &priv->cm.rx_wr, priv->cm.rx_sge); - if (ipoib_cm_has_srq(dev)) { + if (ipoib_cm_has_srq(priv)) { for (i = 0; i < ipoib_recvq_size; ++i) { - if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, - priv->cm.num_frags - 1, - priv->cm.srq_ring[i].mapping)) { + if (!ipoib_cm_alloc_rx_mb(priv, &priv->cm.srq_ring[i])) { ipoib_warn(priv, "failed to allocate " "receive buffer %d\n", i); - ipoib_cm_dev_cleanup(dev); + ipoib_cm_dev_cleanup(priv); return -ENOMEM; } - if (ipoib_cm_post_receive_srq(dev, i)) { + if (ipoib_cm_post_receive_srq(priv, i)) { ipoib_warn(priv, "ipoib_cm_post_receive_srq " "failed for buf %d\n", i); - ipoib_cm_dev_cleanup(dev); + ipoib_cm_dev_cleanup(priv); return -EIO; } } } - priv->dev->dev_addr[0] = IPOIB_FLAGS_RC; + IF_LLADDR(priv->dev)[0] = IPOIB_FLAGS_RC; return 0; } -void ipoib_cm_dev_cleanup(struct net_device *dev) +void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; if (!priv->cm.srq) @@ -1620,6 +1436,10 @@ if (!priv->cm.srq_ring) return; - ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring); + ipoib_cm_free_rx_ring(priv, priv->cm.srq_ring); priv->cm.srq_ring = NULL; + + mtx_destroy(&priv->cm.mb_queue.ifq_mtx); } + +#endif /* CONFIG_INFINIBAND_IPOIB_CM */ Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c (.../head) (revision 219811) @@ -43,14 +43,14 @@ static ssize_t show_parent(struct device *d, struct device_attribute *attr, char *buf) { - struct net_device *dev = to_net_dev(d); - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ifnet *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = dev->if_softc; return 
sprintf(buf, "%s\n", priv->parent->name); } static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); -int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +int ipoib_vlan_add(struct ifnet *pdev, unsigned short pkey) { struct ipoib_dev_priv *ppriv, *priv; char intf_name[IFNAMSIZ]; @@ -59,7 +59,7 @@ if (!capable(CAP_NET_ADMIN)) return -EPERM; - ppriv = netdev_priv(pdev); + ppriv = pdev->if_softc; rtnl_lock(); mutex_lock(&ppriv->vlan_mutex); @@ -102,9 +102,9 @@ priv->pkey = pkey; - memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); - priv->dev->broadcast[8] = pkey >> 8; - priv->dev->broadcast[9] = pkey & 0xff; + memcpy(IF_LLADDR(priv->dev), ppriv->dev->dev_addr, INFINIBAND_ALEN); + priv->broadcastaddr[8] = pkey >> 8; + priv->broadcastaddr[9] = pkey & 0xff; result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port); if (result < 0) { @@ -157,15 +157,15 @@ return result; } -int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) +int ipoib_vlan_delete(struct ifnet *pdev, unsigned short pkey) { struct ipoib_dev_priv *ppriv, *priv, *tpriv; - struct net_device *dev = NULL; + struct ifnet *dev = NULL; if (!capable(CAP_NET_ADMIN)) return -EPERM; - ppriv = netdev_priv(pdev); + ppriv = pdev->if_softc; rtnl_lock(); mutex_lock(&ppriv->vlan_mutex); Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c (.../head) (revision 219811) @@ -34,9 +34,8 @@ #include "ipoib.h" #include -int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey) +int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid, union ib_gid *mgid, int set_qkey) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_attr *qp_attr = NULL; int ret; u16 pkey_index; @@ -73,9 +72,8 @@ return ret; } -int ipoib_init_qp(struct net_device *dev) +int ipoib_init_qp(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; struct ib_qp_attr qp_attr; int attr_mask; @@ -127,9 +125,8 @@ return ret; } -int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) +int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_init_attr init_attr = { .cap = { .max_send_wr = ipoib_sendq_size, @@ -143,7 +140,7 @@ int ret, size; int i; - struct ethtool_coalesce *coal; + /* XXX struct ethtool_coalesce *coal; */ priv->pd = ib_alloc_pd(priv->ca); if (IS_ERR(priv->pd)) { @@ -158,23 +155,23 @@ } size = ipoib_recvq_size + 1; - ret = ipoib_cm_dev_init(dev); + ret = ipoib_cm_dev_init(priv); if (!ret) { size += ipoib_sendq_size; - if (ipoib_cm_has_srq(dev)) + if (ipoib_cm_has_srq(priv)) size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */ else size += ipoib_recvq_size * ipoib_max_conn_qp; } - priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0); + priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, priv, size, 0); if (IS_ERR(priv->recv_cq)) { printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); goto out_free_mr; } priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, - dev, ipoib_sendq_size, 0); + priv, ipoib_sendq_size, 0); if (IS_ERR(priv->send_cq)) { printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name); goto out_free_recv_cq; @@ -183,6 +180,8 @@ if 
(ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP)) goto out_free_send_cq; +#if 0 + /* XXX */ coal = kzalloc(sizeof *coal, GFP_KERNEL); if (coal) { coal->rx_coalesce_usecs = 10; @@ -192,6 +191,7 @@ dev->ethtool_ops->set_coalesce(dev, coal); kfree(coal); } +#endif init_attr.send_cq = priv->send_cq; init_attr.recv_cq = priv->recv_cq; @@ -202,8 +202,7 @@ if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; - if (dev->features & NETIF_F_SG) - init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; + init_attr.cap.max_send_sge = IPOIB_UD_TX_SG; priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { @@ -211,27 +210,19 @@ goto out_free_send_cq; } - priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; - priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff; - priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff; + IF_LLADDR(priv->dev)[1] = (priv->qp->qp_num >> 16) & 0xff; + IF_LLADDR(priv->dev)[2] = (priv->qp->qp_num >> 8) & 0xff; + IF_LLADDR(priv->dev)[3] = (priv->qp->qp_num ) & 0xff; - for (i = 0; i < MAX_SKB_FRAGS + 1; ++i) + for (i = 0; i < IPOIB_MAX_TX_SG; ++i) priv->tx_sge[i].lkey = priv->mr->lkey; priv->tx_wr.opcode = IB_WR_SEND; priv->tx_wr.sg_list = priv->tx_sge; priv->tx_wr.send_flags = IB_SEND_SIGNALED; - priv->rx_sge[0].lkey = priv->mr->lkey; - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE; - priv->rx_sge[1].length = PAGE_SIZE; - priv->rx_sge[1].lkey = priv->mr->lkey; - priv->rx_wr.num_sge = IPOIB_UD_RX_SG; - } else { - priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); - priv->rx_wr.num_sge = 1; - } + for (i = 0; i < IPOIB_UD_RX_SG; ++i) + priv->rx_sge[i].lkey = priv->mr->lkey; priv->rx_wr.next = NULL; priv->rx_wr.sg_list = priv->rx_sge; @@ -245,16 +236,15 @@ out_free_mr: ib_dereg_mr(priv->mr); - ipoib_cm_dev_cleanup(dev); + ipoib_cm_dev_cleanup(priv); out_free_pd: ib_dealloc_pd(priv->pd); return -ENODEV; } -void ipoib_transport_dev_cleanup(struct net_device *dev) +void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); if (priv->qp) { if (ib_destroy_qp(priv->qp)) @@ -270,7 +260,7 @@ if (ib_destroy_cq(priv->recv_cq)) ipoib_warn(priv, "ib_cq_destroy (recv) failed\n"); - ipoib_cm_dev_cleanup(dev); + ipoib_cm_dev_cleanup(priv); if (ib_dereg_mr(priv->mr)) ipoib_warn(priv, "ib_dereg_mr failed\n"); Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (.../head) (revision 219811) @@ -34,6 +34,10 @@ #include "ipoib.h" +static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **, + struct sockaddr *); + + #include #include @@ -42,42 +46,31 @@ #include #include /* For ARPHRD_xxx */ +#include +#include +#include -#include -#include - -#include - MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); -int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; -int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; +int ipoib_sendq_size = IPOIB_TX_RING_SIZE; +int ipoib_recvq_size = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); 
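/*
 * The verbs initialization earlier packs the 24-bit QP number into
 * bytes 1-3 of the 20-byte IPoIB link-layer address. A minimal sketch
 * of the matching unpack (ipoib_qpn_of() is an illustrative name, not
 * a driver symbol):
 */
static inline uint32_t
ipoib_qpn_of(const uint8_t *lladdr)
{

	return (((uint32_t)lladdr[1] << 16) | (lladdr[2] << 8) | lladdr[3]);
}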
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); -static int lro; -module_param(lro, bool, 0444); -MODULE_PARM_DESC(lro, "Enable LRO (Large Receive Offload)"); - -static int lro_max_aggr = IPOIB_LRO_MAX_AGGR; -module_param(lro_max_aggr, int, 0644); -MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated " - "(default = 64)"); - #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG -int ipoib_debug_level; +int ipoib_debug_level = 1; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { - struct net_device *dev; + struct ipoib_dev_priv *priv; struct ipoib_path path; }; @@ -93,28 +86,73 @@ static void ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device); +static void ipoib_start(struct ifnet *dev); +static int ipoib_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro); +static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data); +static void ipoib_input(struct ifnet *ifp, struct mbuf *m); +#define IPOIB_MTAP(_ifp, _m) \ +do { \ + if (bpf_peers_present((_ifp)->if_bpf)) { \ + M_ASSERTVALID(_m); \ + ipoib_mtap_mb((_ifp), (_m)); \ + } \ +} while (0) + +/* + * This is for clients that have an ipoib_header in the mbuf. + */ +static void +ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb) +{ + struct ipoib_header *ih; + struct ether_header eh; + + ih = mtod(mb, struct ipoib_header *); + eh.ether_type = ih->proto; + bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN); + bzero(&eh.ether_shost, ETHER_ADDR_LEN); + mb->m_data += sizeof(struct ipoib_header); + mb->m_len -= sizeof(struct ipoib_header); + bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); + mb->m_data -= sizeof(struct ipoib_header); + mb->m_len += sizeof(struct ipoib_header); +} + +void +ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto) +{ + struct ether_header eh; + + eh.ether_type = proto; + bzero(&eh.ether_shost, ETHER_ADDR_LEN); + bzero(&eh.ether_dhost, ETHER_ADDR_LEN); + bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); +} + static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one }; -int ipoib_open(struct net_device *dev) +int +ipoib_open(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ifnet *dev = priv->dev; ipoib_dbg(priv, "bringing up interface\n"); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); - if (ipoib_pkey_dev_delay_open(dev)) + if (ipoib_pkey_dev_delay_open(priv)) return 0; - if (ipoib_ib_dev_open(dev)) + if (ipoib_ib_dev_open(priv)) goto err_disable; - if (ipoib_ib_dev_up(dev)) + if (ipoib_ib_dev_up(priv)) goto err_stop; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { @@ -122,24 +160,18 @@ /* Bring up any child interfaces too */ mutex_lock(&priv->vlan_mutex); - list_for_each_entry(cpriv, &priv->child_intfs, list) { - int flags; - - flags = cpriv->dev->flags; - if (flags & IFF_UP) - continue; - - dev_change_flags(cpriv->dev, flags | IFF_UP); - } + list_for_each_entry(cpriv, &priv->child_intfs, list) + if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0) + ipoib_open(cpriv); mutex_unlock(&priv->vlan_mutex); } + dev->if_drv_flags |= IFF_DRV_RUNNING; + dev->if_drv_flags &= ~IFF_DRV_OACTIVE; - netif_start_queue(dev); - return 0; err_stop: - ipoib_ib_dev_stop(dev, 1); + ipoib_ib_dev_stop(priv, 1); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); @@ -147,53 +179,63 @@ return -EINVAL; } -static int 
ipoib_stop(struct net_device *dev) +static void +ipoib_init(void *arg) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ifnet *dev; + struct ipoib_dev_priv *priv; + priv = arg; + dev = priv->dev; + if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) + ipoib_open(priv); + queue_work(ipoib_workqueue, &priv->flush_light); +} + + +static int +ipoib_stop(struct ipoib_dev_priv *priv) +{ + struct ifnet *dev = priv->dev; + ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); - netif_stop_queue(dev); + dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); - ipoib_ib_dev_down(dev, 0); - ipoib_ib_dev_stop(dev, 0); + ipoib_ib_dev_down(priv, 0); + ipoib_ib_dev_stop(priv, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ mutex_lock(&priv->vlan_mutex); - list_for_each_entry(cpriv, &priv->child_intfs, list) { - int flags; - - flags = cpriv->dev->flags; - if (!(flags & IFF_UP)) - continue; - - dev_change_flags(cpriv->dev, flags & ~IFF_UP); - } + list_for_each_entry(cpriv, &priv->child_intfs, list) + if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0) + ipoib_stop(cpriv); mutex_unlock(&priv->vlan_mutex); } return 0; } -static int ipoib_change_mtu(struct net_device *dev, int new_mtu) +int +ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ifnet *dev = priv->dev; - /* dev->mtu > 2K ==> connected mode */ - if (ipoib_cm_admin_enabled(dev)) { - if (new_mtu > ipoib_cm_max_mtu(dev)) + /* dev->if_mtu > 2K ==> connected mode */ + if (ipoib_cm_admin_enabled(priv)) { + if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv))) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); - dev->mtu = new_mtu; + dev->if_mtu = new_mtu; return 0; } @@ -202,16 +244,78 @@ priv->admin_mtu = new_mtu; - dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); + dev->if_mtu = min(priv->mcast_mtu, priv->admin_mtu); queue_work(ipoib_workqueue, &priv->flush_light); return 0; } -static struct ipoib_path *__path_find(struct net_device *dev, void *gid) +static int +ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ifp->if_softc; + struct ifaddr *ifa = (struct ifaddr *) data; + struct ifreq *ifr = (struct ifreq *) data; + int error = 0; + + switch (command) { + case SIOCSIFFLAGS: + if (ifp->if_flags & IFF_UP) { + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + error = -ipoib_open(priv); + } else + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + ipoib_stop(priv); + break; + case SIOCADDMULTI: + case SIOCDELMULTI: + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + queue_work(ipoib_workqueue, &priv->restart_task); + break; + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + ifp->if_init(ifp->if_softc); /* before arpwhohas */ + arp_ifinit(ifp, ifa); + break; +#endif + default: + ifp->if_init(ifp->if_softc); + break; + } + break; + + case SIOCGIFADDR: + { + struct sockaddr *sa; + + sa = (struct sockaddr *) & ifr->ifr_data; + bcopy(IF_LLADDR(ifp), + (caddr_t) sa->sa_data, INFINIBAND_ALEN); + } + break; + + case SIOCSIFMTU: + /* + * Set the interface MTU. 
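+	 *
+	 * The driver keeps Linux-style negative errno returns
+	 * internally; the ioctl path negates them into the positive
+	 * errno values FreeBSD expects. A minimal sketch of the
+	 * convention:
+	 *
+	 *	rc = ipoib_change_mtu(priv, ifr->ifr_mtu);	0 or -EINVAL
+	 *	error = -rc;					0 or EINVAL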
+ */ + error = -ipoib_change_mtu(priv, ifr->ifr_mtu); + break; + default: + error = EINVAL; + break; + } + return (error); +} + + +static struct ipoib_path * +__path_find(struct ipoib_dev_priv *priv, void *gid) +{ struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; @@ -233,9 +337,9 @@ return NULL; } -static int __path_add(struct net_device *dev, struct ipoib_path *path) +static int +__path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; @@ -263,42 +367,24 @@ return 0; } -static void path_free(struct net_device *dev, struct ipoib_path *path) +void +ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_neigh *neigh, *tn; - struct sk_buff *skb; - unsigned long flags; - while ((skb = __skb_dequeue(&path->queue))) - dev_kfree_skb_irq(skb); + _IF_DRAIN(&path->queue); - spin_lock_irqsave(&priv->lock, flags); - - list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { - /* - * It's safe to call ipoib_put_ah() inside priv->lock - * here, because we know that path->ah will always - * hold one more reference, so ipoib_put_ah() will - * never do more than decrement the ref count. - */ - if (neigh->ah) - ipoib_put_ah(neigh->ah); - - ipoib_neigh_free(dev, neigh); - } - - spin_unlock_irqrestore(&priv->lock, flags); - if (path->ah) ipoib_put_ah(path->ah); + if (ipoib_cm_get(path)) + ipoib_cm_destroy_tx(ipoib_cm_get(path)); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG -struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) +struct ipoib_path_iter * +ipoib_path_iter_init(struct ipoib_dev_priv *priv) { struct ipoib_path_iter *iter; @@ -306,7 +392,7 @@ if (!iter) return NULL; - iter->dev = dev; + iter->priv = priv; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { @@ -317,9 +403,10 @@ return iter; } -int ipoib_path_iter_next(struct ipoib_path_iter *iter) +int +ipoib_path_iter_next(struct ipoib_path_iter *iter) { - struct ipoib_dev_priv *priv = netdev_priv(iter->dev); + struct ipoib_dev_priv *priv = iter->priv; struct rb_node *n; struct ipoib_path *path; int ret = 1; @@ -346,39 +433,38 @@ return ret; } -void ipoib_path_iter_read(struct ipoib_path_iter *iter, - struct ipoib_path *path) +void +ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ -void ipoib_mark_paths_invalid(struct net_device *dev) +void +ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { - ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n", + ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n", be16_to_cpu(path->pathrec.dlid), - path->pathrec.dgid.raw); + path->pathrec.dgid.raw, ":"); path->valid = 0; } spin_unlock_irq(&priv->lock); } -void ipoib_flush_paths(struct net_device *dev) +void +ipoib_flush_paths(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; - netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); @@ -390,45 +476,40 @@ if (path->query) ib_sa_cancel_query(path->query_id, 
path->query); spin_unlock_irqrestore(&priv->lock, flags); - netif_tx_unlock_bh(dev); wait_for_completion(&path->done); - path_free(dev, path); - netif_tx_lock_bh(dev); + ipoib_path_free(priv, path); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); - netif_tx_unlock_bh(dev); } -static void path_rec_completion(int status, - struct ib_sa_path_rec *pathrec, - void *path_ptr) +static void +path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr) { struct ipoib_path *path = path_ptr; - struct net_device *dev = path->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = path->priv; + struct ifnet *dev = priv->dev; struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; - struct ipoib_neigh *neigh, *tn; - struct sk_buff_head skqueue; - struct sk_buff *skb; + struct ifqueue mbqueue; + struct mbuf *mb; unsigned long flags; if (!status) - ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", - be16_to_cpu(pathrec->dlid), pathrec->dgid.raw); + ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n", + be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":"); else - ipoib_dbg(priv, "PathRec status %d for GID %pI6\n", - status, path->pathrec.dgid.raw); + ipoib_dbg(priv, "PathRec status %d for GID %16D\n", + status, path->pathrec.dgid.raw, ":"); - skb_queue_head_init(&skqueue); + bzero(&mbqueue, sizeof(mbqueue)); if (!status) { struct ib_ah_attr av; if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) - ah = ipoib_create_ah(dev, priv->pd, &av); + ah = ipoib_create_ah(priv, priv->pd, &av); } spin_lock_irqsave(&priv->lock, flags); @@ -442,43 +523,18 @@ ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be16_to_cpu(pathrec->dlid), pathrec->sl); - while ((skb = __skb_dequeue(&path->queue))) - __skb_queue_tail(&skqueue, skb); + for (;;) { + _IF_DEQUEUE(&path->queue, mb); + if (mb == NULL) + break; + _IF_ENQUEUE(&mbqueue, mb); + } - list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { - if (neigh->ah) { - WARN_ON(neigh->ah != old_ah); - /* - * Dropping the ah reference inside - * priv->lock is safe here, because we - * will hold one more reference from - * the original value of path->ah (ie - * old_ah). 
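- *
- * A sketch of why the in-lock drop is safe under the kref-style
- * counting used for address handles here:
- *
- *	kref_get(&path->ah->ref);	the neigh's extra reference
- *	...
- *	ipoib_put_ah(neigh->ah);	count stays >= 1, never frees
- *
- * so the final free cannot run while priv->lock is held.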
- */ - ipoib_put_ah(neigh->ah); - } - kref_get(&path->ah->ref); - neigh->ah = path->ah; - memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, - sizeof(union ib_gid)); +#ifdef CONFIG_INFINIBAND_IPOIB_CM + if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path)) + ipoib_cm_set(path, ipoib_cm_create_tx(priv, path)); +#endif - if (ipoib_cm_enabled(dev, neigh->neighbour)) { - if (!ipoib_cm_get(neigh)) - ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, - path, - neigh)); - if (!ipoib_cm_get(neigh)) { - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); - continue; - } - } - - while ((skb = __skb_dequeue(&neigh->queue))) - __skb_queue_tail(&skqueue, skb); - } path->valid = 1; } @@ -490,17 +546,20 @@ if (old_ah) ipoib_put_ah(old_ah); - while ((skb = __skb_dequeue(&skqueue))) { - skb->dev = dev; - if (dev_queue_xmit(skb)) + for (;;) { + _IF_DEQUEUE(&mbqueue, mb); + if (mb == NULL) + break; + mb->m_pkthdr.rcvif = dev; + if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } } -static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) +static struct ipoib_path * +path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path; if (!priv->broadcast) @@ -510,13 +569,14 @@ if (!path) return NULL; - path->dev = dev; + path->priv = priv; - skb_queue_head_init(&path->queue); + bzero(&path->queue, sizeof(path->queue)); - INIT_LIST_HEAD(&path->neigh_list); - - memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); +#ifdef CONFIG_INFINIBAND_IPOIB_CM + memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN); +#endif + memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; @@ -525,17 +585,18 @@ return path; } -static int path_rec_start(struct net_device *dev, - struct ipoib_path *path) +static int +path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ifnet *dev = priv->dev; + ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU; struct ib_sa_path_rec p_rec; p_rec = path->pathrec; p_rec.mtu_selector = IB_SA_GT; - switch (roundup_pow_of_two(dev->mtu + IPOIB_ENCAP_LEN)) { + switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) { case 512: p_rec.mtu = IB_MTU_256; break; @@ -555,8 +616,8 @@ p_rec.mtu_selector = 0; } - ipoib_dbg(priv, "Start path record lookup for %pI6 MTU > %d\n", - p_rec.dgid.raw, + ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n", + p_rec.dgid.raw, ":", comp_mask ? 
ib_mtu_enum_to_int(p_rec.mtu) : 0); init_completion(&path->done); @@ -582,367 +643,118 @@ return 0; } -static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) +static void +ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path; - struct ipoib_neigh *neigh; - unsigned long flags; - neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev); - if (!neigh) { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); - return; - } - - spin_lock_irqsave(&priv->lock, flags); - - path = __path_find(dev, skb->dst->neighbour->ha + 4); - if (!path) { - path = path_rec_create(dev, skb->dst->neighbour->ha + 4); - if (!path) - goto err_path; - - __path_add(dev, path); - } - - list_add_tail(&neigh->list, &path->neigh_list); - - if (path->ah) { - kref_get(&path->ah->ref); - neigh->ah = path->ah; - memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, - sizeof(union ib_gid)); - - if (ipoib_cm_enabled(dev, neigh->neighbour)) { - if (!ipoib_cm_get(neigh)) - ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); - if (!ipoib_cm_get(neigh)) { - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); - goto err_drop; - } - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) - __skb_queue_tail(&neigh->queue, skb); - else { - ipoib_warn(priv, "queue length limit %d. Packet drop.\n", - skb_queue_len(&neigh->queue)); - goto err_drop; - } - } else - ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha)); - } else { - neigh->ah = NULL; - - if (!path->query && path_rec_start(dev, path)) - goto err_list; - - __skb_queue_tail(&neigh->queue, skb); - } - - spin_unlock_irqrestore(&priv->lock, flags); - return; - -err_list: - list_del(&neigh->list); - -err_path: - ipoib_neigh_free(dev, neigh); -err_drop: - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); - - spin_unlock_irqrestore(&priv->lock, flags); -} - -static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(skb->dev); - - /* Look up path record for unicasts */ - if (skb->dst->neighbour->ha[4] != 0xff) { - neigh_add_path(skb, dev); - return; - } - - /* Add in the P_Key for multicasts */ - skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff; - skb->dst->neighbour->ha[9] = priv->pkey & 0xff; - ipoib_mcast_send(dev, skb->dst->neighbour->ha + 4, skb); -} - -static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, - struct ipoib_pseudoheader *phdr) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_path *path; - unsigned long flags; - - spin_lock_irqsave(&priv->lock, flags); - - path = __path_find(dev, phdr->hwaddr + 4); + path = __path_find(priv, eh->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; if (!path) { - path = path_rec_create(dev, phdr->hwaddr + 4); + path = path_rec_create(priv, eh->hwaddr); new_path = 1; } if (path) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof *phdr); - __skb_queue_tail(&path->queue, skb); - - if (!path->query && path_rec_start(dev, path)) { + _IF_ENQUEUE(&path->queue, mb); + if (!path->query && path_rec_start(priv, path)) { spin_unlock_irqrestore(&priv->lock, flags); if (new_path) - path_free(dev, path); + ipoib_path_free(priv, path); return; } else - __path_add(dev, path); + __path_add(priv, path); } else { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); + ++priv->dev->if_oerrors; + m_freem(mb); } - 
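/*
 * While a path-record query is in flight, packets are parked on the
 * per-path queue, bounded by IPOIB_MAX_PATH_REC_QUEUE. A minimal
 * sketch of the queue-or-drop rule used above:
 *
 *	if (path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE)
 *		_IF_ENQUEUE(&path->queue, mb);	hold until resolved
 *	else {
 *		++priv->dev->if_oerrors;
 *		m_freem(mb);			bounded queue: drop
 *	}
 *
 * path_rec_completion() later drains the queue straight into
 * if_transmit() once an address handle exists.
 */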
spin_unlock_irqrestore(&priv->lock, flags); return; } - if (path->ah) { - ipoib_dbg(priv, "Send unicast ARP to %04x\n", - be16_to_cpu(path->pathrec.dlid)); - - ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr)); - } else if ((path->query || !path_rec_start(dev, path)) && - skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof *phdr); - __skb_queue_tail(&path->queue, skb); + if (ipoib_cm_get(path) && ipoib_cm_up(path)) { + ipoib_cm_send(priv, mb, ipoib_cm_get(path)); + } else if (path->ah) { + ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr)); + } else if ((path->query || !path_rec_start(priv, path)) && + path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) { + _IF_ENQUEUE(&path->queue, mb); } else { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); + ++priv->dev->if_oerrors; + m_freem(mb); } - - spin_unlock_irqrestore(&priv->lock, flags); } -static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) +static int +ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_neigh *neigh; - unsigned long flags; + struct ipoib_header *eh; - if (likely(skb->dst && skb->dst->neighbour)) { - if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) { - ipoib_path_lookup(skb, dev); - return NETDEV_TX_OK; - } + eh = mtod(mb, struct ipoib_header *); + if (IPOIB_IS_MULTICAST(eh->hwaddr)) { + /* Add in the P_Key for multicast*/ + eh->hwaddr[8] = (priv->pkey >> 8) & 0xff; + eh->hwaddr[9] = priv->pkey & 0xff; - neigh = *to_ipoib_neigh(skb->dst->neighbour); + ipoib_mcast_send(priv, eh->hwaddr + 4, mb); + } else + ipoib_unicast_send(mb, priv, eh); - if (unlikely((memcmp(&neigh->dgid.raw, - skb->dst->neighbour->ha + 4, - sizeof(union ib_gid))) || - (neigh->dev != dev))) { - spin_lock_irqsave(&priv->lock, flags); - /* - * It's safe to call ipoib_put_ah() inside - * priv->lock here, because we know that - * path->ah will always hold one more reference, - * so ipoib_put_ah() will never do more than - * decrement the ref count. - */ - if (neigh->ah) - ipoib_put_ah(neigh->ah); - list_del(&neigh->list); - ipoib_neigh_free(dev, neigh); - spin_unlock_irqrestore(&priv->lock, flags); - ipoib_path_lookup(skb, dev); - return NETDEV_TX_OK; - } - - if (ipoib_cm_get(neigh)) { - if (ipoib_cm_up(neigh)) { - ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); - return NETDEV_TX_OK; - } - } else if (neigh->ah) { - ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha)); - return NETDEV_TX_OK; - } - - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - spin_lock_irqsave(&priv->lock, flags); - __skb_queue_tail(&neigh->queue, skb); - spin_unlock_irqrestore(&priv->lock, flags); - } else { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); - } - } else { - struct ipoib_pseudoheader *phdr = - (struct ipoib_pseudoheader *) skb->data; - skb_pull(skb, sizeof *phdr); - - if (phdr->hwaddr[4] == 0xff) { - /* Add in the P_Key for multicast*/ - phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; - phdr->hwaddr[9] = priv->pkey & 0xff; - - ipoib_mcast_send(dev, phdr->hwaddr + 4, skb); - } else { - /* unicast GID -- should be ARP or RARP reply */ - - if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) && - (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) { - ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n", - skb->dst ? 
"neigh" : "dst", - be16_to_cpup((__be16 *) skb->data), - IPOIB_QPN(phdr->hwaddr), - phdr->hwaddr + 4); - dev_kfree_skb_any(skb); - ++dev->stats.tx_dropped; - return NETDEV_TX_OK; - } - - unicast_arp_send(skb, dev, phdr); - } - } - - return NETDEV_TX_OK; + return 0; } -static void ipoib_timeout(struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - ipoib_warn(priv, "transmit timeout: latency %d msecs\n", - jiffies_to_msecs(jiffies - dev->trans_start)); - ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n", - netif_queue_stopped(dev), - priv->tx_head, priv->tx_tail); - /* XXX reset QP, etc. */ -} - -static int ipoib_hard_header(struct sk_buff *skb, - struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned len) +static void +_ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv) { - struct ipoib_header *header; + struct mbuf *mb; - header = (struct ipoib_header *) skb_push(skb, sizeof *header); + if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING) + return; - header->proto = htons(type); - header->reserved = 0; - - /* - * If we don't have a neighbour structure, stuff the - * destination address onto the front of the skb so we can - * figure out where to send the packet later. - */ - if ((!skb->dst || !skb->dst->neighbour) && daddr) { - struct ipoib_pseudoheader *phdr = - (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr); - memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); + spin_lock(&priv->lock); + while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) && + (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) { + IFQ_DRV_DEQUEUE(&dev->if_snd, mb); + if (mb == NULL) + break; + IPOIB_MTAP(dev, mb); + ipoib_send_one(priv, mb); } - - return 0; + spin_unlock(&priv->lock); } -static void ipoib_set_mcast_list(struct net_device *dev) +static void +ipoib_start(struct ifnet *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - - if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { - ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); - return; - } - - queue_work(ipoib_workqueue, &priv->restart_task); + _ipoib_start(dev, dev->if_softc); } -static void ipoib_neigh_cleanup(struct neighbour *n) +static void +ipoib_vlan_start(struct ifnet *dev) { - struct ipoib_neigh *neigh; - struct ipoib_dev_priv *priv = netdev_priv(n->dev); - unsigned long flags; - struct ipoib_ah *ah = NULL; + struct ipoib_dev_priv *priv; + struct mbuf *mb; - if (n->dev->type != ARPHRD_INFINIBAND) - return; - - neigh = *to_ipoib_neigh(n); - if (neigh) - priv = netdev_priv(neigh->dev); - else - return; - ipoib_dbg(priv, - "neigh_cleanup for %06x %pI6\n", - IPOIB_QPN(n->ha), - n->ha + 4); - - spin_lock_irqsave(&priv->lock, flags); - - if (neigh->ah) - ah = neigh->ah; - list_del(&neigh->list); - ipoib_neigh_free(n->dev, neigh); - - spin_unlock_irqrestore(&priv->lock, flags); - - if (ah) - ipoib_put_ah(ah); -} - -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, - struct net_device *dev) -{ - struct ipoib_neigh *neigh; - - neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); - if (!neigh) - return NULL; - - neigh->neighbour = neighbour; - neigh->dev = dev; - memset(&neigh->dgid.raw, 0, sizeof (union ib_gid)); - *to_ipoib_neigh(neighbour) = neigh; - skb_queue_head_init(&neigh->queue); - ipoib_cm_set(neigh, NULL); - - return neigh; -} - -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) -{ - struct sk_buff *skb; - *to_ipoib_neigh(neigh->neighbour) = NULL; - while ((skb = __skb_dequeue(&neigh->queue))) { - 
++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); + priv = VLAN_COOKIE(dev); + if (priv != NULL) + return _ipoib_start(dev, priv); + while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) { + IFQ_DRV_DEQUEUE(&dev->if_snd, mb); + if (mb == NULL) + break; + m_freem(mb); + dev->if_oerrors++; } - if (ipoib_cm_get(neigh)) - ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); - kfree(neigh); } -static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) +int +ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { - parms->neigh_cleanup = ipoib_neigh_cleanup; - return 0; -} - -int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - - /* Allocate RX/TX "rings" to hold queued skbs */ + /* Allocate RX/TX "rings" to hold queued mbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { @@ -951,7 +763,7 @@ goto out; } - priv->tx_ring = vmalloc(ipoib_sendq_size * sizeof *priv->tx_ring); + priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", ca->name, ipoib_sendq_size); @@ -961,13 +773,13 @@ /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ - if (ipoib_ib_dev_init(dev, ca, port)) + if (ipoib_ib_dev_init(priv, ca, port)) goto out_tx_ring_cleanup; return 0; out_tx_ring_cleanup: - vfree(priv->tx_ring); + kfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); @@ -976,133 +788,56 @@ return -ENOMEM; } -void ipoib_dev_cleanup(struct net_device *dev) +static void +ipoib_detach(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; + struct ifnet *dev; - ipoib_delete_debug_files(dev); + dev = priv->dev; + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + bpfdetach(dev); + if_detach(dev); + if_free(dev); + } else + VLAN_SETCOOKIE(priv->dev, NULL); + free(priv, M_TEMP); +} + +void +ipoib_dev_cleanup(struct ipoib_dev_priv *priv) +{ + struct ipoib_dev_priv *cpriv, *tcpriv; + /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { - unregister_netdev(cpriv->dev); - ipoib_dev_cleanup(cpriv->dev); - free_netdev(cpriv->dev); + ipoib_dev_cleanup(cpriv); + ipoib_detach(cpriv); } - ipoib_ib_dev_cleanup(dev); + ipoib_ib_dev_cleanup(priv); kfree(priv->rx_ring); - vfree(priv->tx_ring); + kfree(priv->tx_ring); priv->rx_ring = NULL; priv->tx_ring = NULL; } -static const struct header_ops ipoib_header_ops = { - .create = ipoib_hard_header, -}; +static volatile int ipoib_unit; -static int get_skb_hdr(struct sk_buff *skb, void **iphdr, - void **tcph, u64 *hdr_flags, void *priv) +static struct ipoib_dev_priv * +ipoib_priv_alloc(void) { - unsigned int ip_len; - struct iphdr *iph; + struct ipoib_dev_priv *priv; - if (unlikely(skb->protocol != htons(ETH_P_IP))) - return -1; - - /* - * In the future we may add an else clause that verifies the - * checksum and allows devices which do not calculate checksum - * to use LRO. 
- */ - if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) - return -1; - - /* Check for non-TCP packet */ - skb_reset_network_header(skb); - iph = ip_hdr(skb); - if (iph->protocol != IPPROTO_TCP) - return -1; - - ip_len = ip_hdrlen(skb); - skb_set_transport_header(skb, ip_len); - *tcph = tcp_hdr(skb); - - /* check if IP header and TCP header are complete */ - if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb)) - return -1; - - *hdr_flags = LRO_IPV4 | LRO_TCP; - *iphdr = iph; - - return 0; -} - -static void ipoib_lro_setup(struct ipoib_dev_priv *priv) -{ - priv->lro.lro_mgr.max_aggr = lro_max_aggr; - priv->lro.lro_mgr.max_desc = IPOIB_MAX_LRO_DESCRIPTORS; - priv->lro.lro_mgr.lro_arr = priv->lro.lro_desc; - priv->lro.lro_mgr.get_skb_header = get_skb_hdr; - priv->lro.lro_mgr.features = LRO_F_NAPI; - priv->lro.lro_mgr.dev = priv->dev; - priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; -} - -static const struct net_device_ops ipoib_netdev_ops = { - .ndo_open = ipoib_open, - .ndo_stop = ipoib_stop, - .ndo_change_mtu = ipoib_change_mtu, - .ndo_start_xmit = ipoib_start_xmit, - .ndo_tx_timeout = ipoib_timeout, - .ndo_set_multicast_list = ipoib_set_mcast_list, - .ndo_neigh_setup = ipoib_neigh_setup_dev, -}; - -static void ipoib_setup(struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - - dev->netdev_ops = &ipoib_netdev_ops; - dev->header_ops = &ipoib_header_ops; - - ipoib_set_ethtool_ops(dev); - - netif_napi_add(dev, &priv->napi, ipoib_poll, 100); - - dev->watchdog_timeo = HZ; - - dev->flags |= IFF_BROADCAST | IFF_MULTICAST; - - /* - * We add in INFINIBAND_ALEN to allow for the destination - * address "pseudoheader" for skbs without neighbour struct. - */ - dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN; - dev->addr_len = INFINIBAND_ALEN; - dev->type = ARPHRD_INFINIBAND; - dev->tx_queue_len = ipoib_sendq_size * 2; - dev->features = (NETIF_F_VLAN_CHALLENGED | - NETIF_F_HIGHDMA); - - memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); - - netif_carrier_off(dev); - - priv->dev = dev; - - ipoib_lro_setup(priv); - + priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK); spin_lock_init(&priv->lock); - mutex_init(&priv->vlan_mutex); - INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); - INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); @@ -1111,112 +846,52 @@ INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); + memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN); + + return (priv); } -struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) +struct ipoib_dev_priv * +ipoib_intf_alloc(const char *name) { - struct net_device *dev; + struct ipoib_dev_priv *priv; + struct sockaddr_dl *sdl; + struct ifnet *dev; - dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name, - ipoib_setup); - if (!dev) + priv = ipoib_priv_alloc(); + dev = priv->dev = if_alloc(IFT_INFINIBAND); + if (!dev) { + free(priv, M_TEMP); return NULL; + } + dev->if_softc = priv; + if_initname(dev, name, atomic_fetchadd_int(&ipoib_unit, 1)); + dev->if_flags = IFF_BROADCAST | IFF_MULTICAST; + dev->if_addrlen = INFINIBAND_ALEN; + dev->if_hdrlen = IPOIB_HEADER_LEN; + if_attach(dev); + dev->if_init = ipoib_init; + dev->if_ioctl 
= ipoib_ioctl; + dev->if_start = ipoib_start; + dev->if_output = ipoib_output; + dev->if_input = ipoib_input; + dev->if_resolvemulti = ipoib_resolvemulti; + dev->if_baudrate = IF_Gbps(10LL); + dev->if_broadcastaddr = priv->broadcastaddr; + dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2; + sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr; + sdl->sdl_type = IFT_INFINIBAND; + sdl->sdl_alen = dev->if_addrlen; + priv->dev = dev; + if_link_state_change(dev, LINK_STATE_DOWN); + bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN); - return netdev_priv(dev); + return dev->if_softc; } -static ssize_t show_pkey(struct device *dev, - struct device_attribute *attr, char *buf) +int +ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) { - struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); - - return sprintf(buf, "0x%04x\n", priv->pkey); -} -static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); - -static ssize_t show_umcast(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); - - return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); -} - -static ssize_t set_umcast(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); - unsigned long umcast_val = simple_strtoul(buf, NULL, 0); - - if (umcast_val > 0) { - set_bit(IPOIB_FLAG_UMCAST, &priv->flags); - ipoib_warn(priv, "ignoring multicast groups joined directly " - "by userspace\n"); - } else - clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); - - return count; -} -static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast); - -int ipoib_add_umcast_attr(struct net_device *dev) -{ - return device_create_file(&dev->dev, &dev_attr_umcast); -} - -static ssize_t create_child(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - int pkey; - int ret; - - if (sscanf(buf, "%i", &pkey) != 1) - return -EINVAL; - - if (pkey < 0 || pkey > 0xffff) - return -EINVAL; - - /* - * Set the full membership bit, so that we join the right - * broadcast group, etc. - */ - pkey |= 0x8000; - - ret = ipoib_vlan_add(to_net_dev(dev), pkey); - - return ret ? ret : count; -} -static DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child); - -static ssize_t delete_child(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - int pkey; - int ret; - - if (sscanf(buf, "%i", &pkey) != 1) - return -EINVAL; - - if (pkey < 0 || pkey > 0xffff) - return -EINVAL; - - ret = ipoib_vlan_delete(to_net_dev(dev), pkey); - - return ret ? 
ret : count; - -} -static DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child); - -int ipoib_add_pkey_attr(struct net_device *dev) -{ - return device_create_file(&dev->dev, &dev_attr_pkey); -} - -int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) -{ struct ib_device_attr *device_attr; int result = -ENOMEM; @@ -1238,23 +913,31 @@ kfree(device_attr); + priv->dev->if_hwassist = 0; + priv->dev->if_capabilities = 0; + +#ifndef CONFIG_INFINIBAND_IPOIB_CM if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { set_bit(IPOIB_FLAG_CSUM, &priv->flags); - priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; + priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP; + priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; } - if (lro) - priv->dev->features |= NETIF_F_LRO; - +#if 0 if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) - priv->dev->features |= NETIF_F_TSO; + priv->dev->if_capabilities |= IFCAP_TSO4 | CSUM_TSO; +#endif +#endif + priv->dev->if_capabilities |= + IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE; + priv->dev->if_capenable = priv->dev->if_capabilities; return 0; } -static struct net_device *ipoib_add_port(const char *format, - struct ib_device *hca, u8 port) +static struct ifnet * +ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { struct ipoib_dev_priv *priv; struct ib_port_attr attr; @@ -1264,9 +947,6 @@ if (!priv) goto alloc_mem_failed; - SET_NETDEV_DEV(priv->dev, hca->dma_device); - priv->dev->dev_id = port - 1; - if (!ib_query_port(hca, port, &attr)) priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); else { @@ -1276,8 +956,8 @@ } /* MTU will be reset when mcast join happens */ - priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); - priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu; result = ib_query_pkey(hca, port, 0, &priv->pkey); if (result) { @@ -1295,23 +975,25 @@ */ priv->pkey |= 0x8000; - priv->dev->broadcast[8] = priv->pkey >> 8; - priv->dev->broadcast[9] = priv->pkey & 0xff; + priv->broadcastaddr[8] = priv->pkey >> 8; + priv->broadcastaddr[9] = priv->pkey & 0xff; result = ib_query_gid(hca, port, 0, &priv->local_gid); if (result) { printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; - } else - memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + } + memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); - result = ipoib_dev_init(priv->dev, hca, port); + result = ipoib_dev_init(priv, hca, port); if (result < 0) { printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", hca->name, port, result); goto device_init_failed; } + if (ipoib_cm_admin_enabled(priv)) + priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)); INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); @@ -1322,51 +1004,25 @@ hca->name, port, result); goto event_failed; } + if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port); - result = register_netdev(priv->dev); - if (result) { - printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n", - hca->name, port, result); - goto register_failed; - } - - ipoib_create_debug_files(priv->dev); - - if (ipoib_cm_add_mode_attr(priv->dev)) - goto sysfs_failed; - if (ipoib_add_pkey_attr(priv->dev)) - goto sysfs_failed; - if (ipoib_add_umcast_attr(priv->dev)) - goto sysfs_failed; - if 
(device_create_file(&priv->dev->dev, &dev_attr_create_child)) - goto sysfs_failed; - if (device_create_file(&priv->dev->dev, &dev_attr_delete_child)) - goto sysfs_failed; - return priv->dev; -sysfs_failed: - ipoib_delete_debug_files(priv->dev); - unregister_netdev(priv->dev); - -register_failed: - ib_unregister_event_handler(&priv->event_handler); - flush_workqueue(ipoib_workqueue); - event_failed: - ipoib_dev_cleanup(priv->dev); + ipoib_dev_cleanup(priv); device_init_failed: - free_netdev(priv->dev); + ipoib_detach(priv); alloc_mem_failed: return ERR_PTR(result); } -static void ipoib_add_one(struct ib_device *device) +static void +ipoib_add_one(struct ib_device *device) { struct list_head *dev_list; - struct net_device *dev; + struct ifnet *dev; struct ipoib_dev_priv *priv; int s, e, p; @@ -1390,9 +1046,9 @@ for (p = s; p <= e; ++p) { if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) continue; - dev = ipoib_add_port("ib%d", device, p); + dev = ipoib_add_port("ib", device, p); if (!IS_ERR(dev)) { - priv = netdev_priv(dev); + priv = dev->if_softc; list_add_tail(&priv->list, dev_list); } } @@ -1400,7 +1056,8 @@ ib_set_client_data(device, &ipoib_client, dev_list); } -static void ipoib_remove_one(struct ib_device *device) +static void +ipoib_remove_one(struct ib_device *device) { struct ipoib_dev_priv *priv, *tmp; struct list_head *dev_list; @@ -1416,22 +1073,118 @@ ib_unregister_event_handler(&priv->event_handler); - rtnl_lock(); - dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); - rtnl_unlock(); + /* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */ flush_workqueue(ipoib_workqueue); - unregister_netdev(priv->dev); - ipoib_dev_cleanup(priv->dev); - free_netdev(priv->dev); + ipoib_dev_cleanup(priv); + ipoib_detach(priv); } kfree(dev_list); } -static int __init ipoib_init_module(void) +static void +ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { + struct ipoib_dev_priv *parent; + struct ipoib_dev_priv *priv; + struct ifnet *dev; + uint16_t pkey; + int error; + + if (ifp->if_type != IFT_INFINIBAND) + return; + dev = VLAN_DEVAT(ifp, vtag); + if (dev == NULL) + return; + priv = NULL; + error = 0; + parent = ifp->if_softc; + /* We only support 15 bits of pkey. */ + if (vtag & 0x8000) + return; + pkey = vtag | 0x8000; /* Set full membership bit. 
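Only the low 15 bits of a P_Key name the partition; the high bit is the IB full-membership flag, which is why a vtag that already has 0x8000 set was rejected above. Worked example (illustrative, not part of the change): attaching a vlan with tag 0x10 to an ib parent yields a child interface that joins on P_Key 0x8010, i.e. pkey = 0x0010 | 0x8000.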
*/ + if (pkey == parent->pkey) + return; + /* Check for dups */ + mutex_lock(&parent->vlan_mutex); + list_for_each_entry(priv, &parent->child_intfs, list) { + if (priv->pkey == pkey) { + priv = NULL; + error = EBUSY; + goto out; + } + } + priv = ipoib_priv_alloc(); + priv->dev = dev; + priv->max_ib_mtu = parent->max_ib_mtu; + priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu; + set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); + error = ipoib_set_dev_features(priv, parent->ca); + if (error) + goto out; + priv->pkey = pkey; + priv->broadcastaddr[8] = pkey >> 8; + priv->broadcastaddr[9] = pkey & 0xff; + dev->if_broadcastaddr = priv->broadcastaddr; + error = ipoib_dev_init(priv, parent->ca, parent->port); + if (error) + goto out; + priv->parent = parent->dev; + list_add_tail(&priv->list, &parent->child_intfs); + VLAN_SETCOOKIE(dev, priv); + dev->if_start = ipoib_vlan_start; + dev->if_drv_flags &= ~IFF_DRV_RUNNING; + dev->if_hdrlen = IPOIB_HEADER_LEN; + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + ipoib_open(priv); + mutex_unlock(&parent->vlan_mutex); + return; +out: + mutex_unlock(&parent->vlan_mutex); + if (priv) + free(priv, M_TEMP); + if (error) + ipoib_warn(parent, + "failed to initialize subinterface: device %s, port %d vtag 0x%X", + parent->ca->name, parent->port, vtag); + return; +} + +static void +ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) +{ + struct ipoib_dev_priv *parent; + struct ipoib_dev_priv *priv; + struct ifnet *dev; + uint16_t pkey; + + if (ifp->if_type != IFT_INFINIBAND) + return; + + dev = VLAN_DEVAT(ifp, vtag); + if (dev) + VLAN_SETCOOKIE(dev, NULL); + pkey = vtag | 0x8000; + parent = ifp->if_softc; + mutex_lock(&parent->vlan_mutex); + list_for_each_entry(priv, &parent->child_intfs, list) { + if (priv->pkey == pkey) { + ipoib_dev_cleanup(priv); + list_del(&priv->list); + break; + } + } + mutex_unlock(&parent->vlan_mutex); +} + +eventhandler_tag ipoib_vlan_attach; +eventhandler_tag ipoib_vlan_detach; + +static int __init +ipoib_init_module(void) +{ int ret; ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); @@ -1446,16 +1199,11 @@ ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif - /* - * When copying small received packets, we only copy from the - * linear data part of the SKB, so we rely on this condition. - */ - BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); + ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, + ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST); + ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, + ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST); - ret = ipoib_register_debugfs(); - if (ret) - return ret; - /* * We create our own workqueue mainly because we want to be * able to flush it when devices are being removed. We can't @@ -1483,18 +1231,307 @@ destroy_workqueue(ipoib_workqueue); err_fs: - ipoib_unregister_debugfs(); - return ret; } -static void __exit ipoib_cleanup_module(void) +static void __exit +ipoib_cleanup_module(void) { + + EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach); + EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); - ipoib_unregister_debugfs(); destroy_workqueue(ipoib_workqueue); } +/* + * Infiniband output routine. 
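+ * Resolves the destination link-layer address (arpresolve() for INET, + * nd6_storelladdr() for INET6, ip_ib_mc_map()/ipv6_ib_mc_map() for + * multicast), prepends a struct ipoib_header carrying the 20-byte + * hardware address and an Ethertype-style protocol word, and hands the + * framed mbuf to if_transmit. A minimal caller sketch, assuming only + * the stock ifnet KPI (illustrative, not part of the change): + * error = ifp->if_output(ifp, m, dst, ro);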
+ */ +static int +ipoib_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro) +{ + u_char edst[INFINIBAND_ALEN]; + struct llentry *lle = NULL; + struct rtentry *rt0 = NULL; + struct ipoib_header *eh; + int error = 0; + short type; + + if (ro != NULL) { + if (!(m->m_flags & (M_BCAST | M_MCAST))) + lle = ro->ro_lle; + rt0 = ro->ro_rt; + } +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) + goto bad; +#endif + + M_PROFILE(m); + if (ifp->if_flags & IFF_MONITOR) { + error = ENETDOWN; + goto bad; + } + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) { + error = ENETDOWN; + goto bad; + } + + switch (dst->sa_family) { +#ifdef INET + case AF_INET: + if (lle != NULL && (lle->la_flags & LLE_VALID)) + memcpy(edst, &lle->ll_addr.mac8, sizeof(edst)); + else if (m->m_flags & M_MCAST) + ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst); + else + error = arpresolve(ifp, rt0, m, dst, edst, &lle); + if (error) + return (error == EWOULDBLOCK ? 0 : error); + type = htons(ETHERTYPE_IP); + break; + case AF_ARP: + { + struct arphdr *ah; + ah = mtod(m, struct arphdr *); + ah->ar_hrd = htons(ARPHRD_INFINIBAND); + + switch(ntohs(ah->ar_op)) { + case ARPOP_REVREQUEST: + case ARPOP_REVREPLY: + type = htons(ETHERTYPE_REVARP); + break; + case ARPOP_REQUEST: + case ARPOP_REPLY: + default: + type = htons(ETHERTYPE_ARP); + break; + } + + if (m->m_flags & M_BCAST) + bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN); + else + bcopy(ar_tha(ah), edst, INFINIBAND_ALEN); + + } + break; +#endif +#ifdef INET6 + case AF_INET6: + if (lle != NULL && (lle->la_flags & LLE_VALID)) + memcpy(edst, &lle->ll_addr.mac8, sizeof(edst)); + else if (m->m_flags & M_MCAST) + ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst); + else + error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle); + if (error) + return error; + type = htons(ETHERTYPE_IPV6); + break; +#endif + + default: + if_printf(ifp, "can't handle af%d\n", dst->sa_family); + error = EAFNOSUPPORT; + goto bad; + } + + /* + * Add local net header. If no space in first mbuf, + * allocate another. + */ + M_PREPEND(m, IPOIB_HEADER_LEN, M_DONTWAIT); + if (m == NULL) { + error = ENOBUFS; + goto bad; + } + eh = mtod(m, struct ipoib_header *); + (void)memcpy(&eh->proto, &type, sizeof(eh->proto)); + (void)memcpy(&eh->hwaddr, edst, sizeof (edst)); + + /* + * Queue message on interface, update output statistics if + * successful, and start output if interface not yet active. + */ + return ((ifp->if_transmit)(ifp, m)); +bad: + if (m != NULL) + m_freem(m); + return (error); +} + +/* + * Upper layer processing for a received Infiniband packet. + */ +void +ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto) +{ + int isr; + +#ifdef MAC + /* + * Tag the mbuf with an appropriate MAC label before any other + * consumers can get to it. + */ + mac_ifnet_create_mbuf(ifp, m); +#endif + /* Allow monitor mode to claim this frame, after stats are updated. */ + if (ifp->if_flags & IFF_MONITOR) { + if_printf(ifp, "discard frame at IFF_MONITOR\n"); + m_freem(m); + return; + } + /* + * Dispatch frame to upper layer. 
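+ * The protocol word from the IPoIB header is an Ethertype and is mapped + * onto a netisr by the switch below (ETHERTYPE_IP -> NETISR_IP, + * ETHERTYPE_ARP -> NETISR_ARP, ETHERTYPE_IPV6 -> NETISR_IPV6); the mbuf + * is then queued with netisr_dispatch() and anything unrecognized is + * dropped.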
+ */ + switch (proto) { +#ifdef INET + case ETHERTYPE_IP: + isr = NETISR_IP; + break; + + case ETHERTYPE_ARP: + if (ifp->if_flags & IFF_NOARP) { + /* Discard packet if ARP is disabled on interface */ + m_freem(m); + return; + } + isr = NETISR_ARP; + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + isr = NETISR_IPV6; + break; +#endif + default: + goto discard; + } + netisr_dispatch(isr, m); + return; + +discard: + m_freem(m); +} + +/* + * Process a received Infiniband packet. + */ +static void +ipoib_input(struct ifnet *ifp, struct mbuf *m) +{ + struct ipoib_header *eh; + + if ((ifp->if_flags & IFF_UP) == 0) { + m_freem(m); + return; + } + CURVNET_SET_QUIET(ifp->if_vnet); + + /* Let BPF have it before we strip the header. */ + IPOIB_MTAP(ifp, m); + eh = mtod(m, struct ipoib_header *); + /* + * Reset layer specific mbuf flags to avoid confusing upper layers. + * Strip off Infiniband header. + */ + m->m_flags &= ~M_VLANTAG; + m->m_flags &= ~(M_PROTOFLAGS); + m_adj(m, IPOIB_HEADER_LEN); + + if (IPOIB_IS_MULTICAST(eh->hwaddr)) { + if (memcmp(eh->hwaddr, ifp->if_broadcastaddr, + ifp->if_addrlen) == 0) + m->m_flags |= M_BCAST; + else + m->m_flags |= M_MCAST; + ifp->if_imcasts++; + } + + ipoib_demux(ifp, m, ntohs(eh->proto)); + CURVNET_RESTORE(); +} + +static int +ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, + struct sockaddr *sa) +{ + struct sockaddr_dl *sdl; +#ifdef INET + struct sockaddr_in *sin; +#endif +#ifdef INET6 + struct sockaddr_in6 *sin6; +#endif + u_char *e_addr; + + switch(sa->sa_family) { + case AF_LINK: + /* + * No mapping needed. Just check that it's a valid MC address. + */ + sdl = (struct sockaddr_dl *)sa; + e_addr = LLADDR(sdl); + if (!IPOIB_IS_MULTICAST(e_addr)) + return EADDRNOTAVAIL; + *llsa = 0; + return 0; + +#ifdef INET + case AF_INET: + sin = (struct sockaddr_in *)sa; + if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + return EADDRNOTAVAIL; + sdl = malloc(sizeof *sdl, M_IFMADDR, + M_NOWAIT|M_ZERO); + if (sdl == NULL) + return ENOMEM; + sdl->sdl_len = sizeof *sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = IFT_INFINIBAND; + sdl->sdl_alen = INFINIBAND_ALEN; + e_addr = LLADDR(sdl); + ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr, + e_addr); + *llsa = (struct sockaddr *)sdl; + return 0; +#endif +#ifdef INET6 + case AF_INET6: + sin6 = (struct sockaddr_in6 *)sa; + /* + * An IP6 address of 0 means listen to all + * of the multicast address used for IP6. + * This has no meaning in ipoib. 
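+ * Every other IPv6 multicast address is translated into the 20-byte + * IPoIB group address by ipv6_ib_mc_map() below, mirroring the + * ip_ib_mc_map() conversion in the INET case above.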
+ */ + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + return EADDRNOTAVAIL; + if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) + return EADDRNOTAVAIL; + sdl = malloc(sizeof *sdl, M_IFMADDR, + M_NOWAIT|M_ZERO); + if (sdl == NULL) + return (ENOMEM); + sdl->sdl_len = sizeof *sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = IFT_INFINIBAND; + sdl->sdl_alen = INFINIBAND_ALEN; + e_addr = LLADDR(sdl); + ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); + *llsa = (struct sockaddr *)sdl; + return 0; +#endif + + default: + return EAFNOSUPPORT; + } +} + module_init(ipoib_init_module); module_exit(ipoib_cleanup_module); Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h (.../head) (revision 219811) @@ -35,25 +35,72 @@ #ifndef _IPOIB_H #define _IPOIB_H +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ofed.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(INET) || defined(INET6) +#include +#include +#include +#include +#include +#include +#endif +#ifdef INET6 +#include +#endif + +#include + #include -#include -#include + #include #include -#include #include -#include - #include #include #include #include -#include /* constants */ +#define INFINIBAND_ALEN 20 /* Octets in IPoIB HW addr */ + +#ifdef IPOIB_CM +#define CONFIG_INFINIBAND_IPOIB_CM +#endif + +#ifdef IPOIB_DEBUG +#define CONFIG_INFINIBAND_IPOIB_DEBUG +#define CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +#endif + enum ipoib_flush_level { IPOIB_FLUSH_LIGHT, IPOIB_FLUSH_NORMAL, @@ -62,16 +109,17 @@ enum { IPOIB_ENCAP_LEN = 4, - - IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN, - IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */ - - IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */ - IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, - IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, - IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, + IPOIB_HEADER_LEN = IPOIB_ENCAP_LEN + INFINIBAND_ALEN, + IPOIB_UD_MAX_MTU = 4 * 1024, + IPOIB_UD_RX_SG = (IPOIB_UD_MAX_MTU / MJUMPAGESIZE), + IPOIB_UD_TX_SG = (IPOIB_UD_MAX_MTU / MCLBYTES) + 2, + IPOIB_CM_MAX_MTU = (64 * 1024), + IPOIB_CM_TX_SG = (IPOIB_CM_MAX_MTU / MCLBYTES) + 2, + IPOIB_CM_RX_SG = (IPOIB_CM_MAX_MTU / MJUMPAGESIZE), IPOIB_RX_RING_SIZE = 256, IPOIB_TX_RING_SIZE = 128, + IPOIB_MAX_RX_SG = MAX(IPOIB_CM_RX_SG, IPOIB_UD_RX_SG), + IPOIB_MAX_TX_SG = MAX(IPOIB_CM_TX_SG, IPOIB_UD_TX_SG), IPOIB_MAX_QUEUE_SIZE = 8192, IPOIB_MIN_QUEUE_SIZE = 2, IPOIB_CM_MAX_CONN_QP = 4096, @@ -89,7 +137,6 @@ IPOIB_FLAG_SUBINTERFACE = 5, IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, - IPOIB_FLAG_ADMIN_CM = 9, IPOIB_FLAG_UMCAST = 10, IPOIB_FLAG_CSUM = 11, @@ -117,6 +164,7 @@ /* structs */ struct ipoib_header { + u8 hwaddr[INFINIBAND_ALEN]; __be16 proto; u16 reserved; }; @@ -140,28 +188,31 @@ unsigned long flags; unsigned char logcount; - struct list_head neigh_list; + struct ifqueue pkt_queue; - struct sk_buff_head pkt_queue; + struct ipoib_dev_priv *priv; +}; - struct net_device *dev; +struct ipoib_cm_rx_buf { + struct mbuf *mb; + u64 mapping[IPOIB_CM_RX_SG]; }; +struct ipoib_cm_tx_buf { + struct mbuf *mb; + u64 mapping[IPOIB_CM_TX_SG]; +}; + struct 
ipoib_rx_buf { - struct sk_buff *skb; + struct mbuf *mb; u64 mapping[IPOIB_UD_RX_SG]; }; struct ipoib_tx_buf { - struct sk_buff *skb; - u64 mapping[MAX_SKB_FRAGS + 1]; + struct mbuf *mb; + u64 mapping[IPOIB_UD_TX_SG]; }; -struct ipoib_cm_tx_buf { - struct sk_buff *skb; - u64 mapping; -}; - struct ib_cm_id; struct ipoib_cm_data { @@ -207,7 +258,7 @@ struct ib_qp *qp; struct ipoib_cm_rx_buf *rx_ring; struct list_head list; - struct net_device *dev; + struct ipoib_dev_priv *priv; unsigned long jiffies; enum ipoib_cm_state state; int recv_count; @@ -217,21 +268,15 @@ struct ib_cm_id *id; struct ib_qp *qp; struct list_head list; - struct net_device *dev; - struct ipoib_neigh *neigh; + struct ipoib_dev_priv *priv; struct ipoib_path *path; struct ipoib_cm_tx_buf *tx_ring; unsigned tx_head; unsigned tx_tail; unsigned long flags; - u32 mtu; + u32 mtu; /* remote specified mtu, with grh. */ }; -struct ipoib_cm_rx_buf { - struct sk_buff *skb; - u64 mapping[IPOIB_CM_RX_SG]; -}; - struct ipoib_cm_dev_priv { struct ib_srq *srq; struct ipoib_cm_rx_buf *srq_ring; @@ -243,17 +288,16 @@ struct list_head rx_reap_list; /* state: FLUSH, drain done */ struct work_struct start_task; struct work_struct reap_task; - struct work_struct skb_task; + struct work_struct mb_task; struct work_struct rx_reap_task; struct delayed_work stale_task; - struct sk_buff_head skb_queue; + struct ifqueue mb_queue; struct list_head start_list; struct list_head reap_list; - struct ib_wc ibwc[IPOIB_NUM_WC]; struct ib_sge rx_sge[IPOIB_CM_RX_SG]; - struct ib_recv_wr rx_wr; + struct ib_recv_wr rx_wr; int nonsrq_conn_qp; - int max_cm_mtu; + int max_cm_mtu; /* Actual buf size. */ int num_frags; }; @@ -262,11 +306,6 @@ u16 max_coalesced_frames; }; -struct ipoib_lro { - struct net_lro_mgr lro_mgr; - struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS]; -}; - /* * Device private locking: network stack tx_lock protects members used * in TX fast path, lock protects everything else. lock nests inside @@ -275,9 +314,9 @@ struct ipoib_dev_priv { spinlock_t lock; - struct net_device *dev; + struct ifnet *dev; - struct napi_struct napi; + u8 broadcastaddr[INFINIBAND_ALEN]; unsigned long flags; @@ -313,22 +352,22 @@ union ib_gid local_gid; u16 local_lid; - unsigned int admin_mtu; - unsigned int mcast_mtu; - unsigned int max_ib_mtu; + unsigned int admin_mtu; /* User selected MTU, no GRH. */ + unsigned int mcast_mtu; /* Minus GRH bytes, from mcast group. */ + unsigned int max_ib_mtu; /* Without header, actual buf size. 
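Taken from the port's maximum MTU via ib_mtu_enum_to_int() when the port is attached.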
*/ struct ipoib_rx_buf *rx_ring; struct ipoib_tx_buf *tx_ring; unsigned tx_head; unsigned tx_tail; - struct ib_sge tx_sge[MAX_SKB_FRAGS + 1]; + struct ib_sge tx_sge[IPOIB_MAX_TX_SG]; struct ib_send_wr tx_wr; unsigned tx_outstanding; struct ib_wc send_wc[MAX_SEND_CQE]; struct ib_recv_wr rx_wr; - struct ib_sge rx_sge[IPOIB_UD_RX_SG]; + struct ib_sge rx_sge[IPOIB_MAX_RX_SG]; struct ib_wc ibwc[IPOIB_NUM_WC]; @@ -336,7 +375,7 @@ struct ib_event_handler event_handler; - struct net_device *parent; + struct ifnet *parent; struct list_head child_intfs; struct list_head list; @@ -352,12 +391,10 @@ int hca_caps; struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; - - struct ipoib_lro lro; }; struct ipoib_ah { - struct net_device *dev; + struct ipoib_dev_priv *priv; struct ib_ah *ah; struct list_head list; struct kref ref; @@ -365,69 +402,46 @@ }; struct ipoib_path { - struct net_device *dev; + struct ipoib_dev_priv *priv; + struct rb_node rb_node; + struct list_head list; +#ifdef CONFIG_INFINIBAND_IPOIB_CM + uint8_t hwaddr[INFINIBAND_ALEN]; + struct ipoib_cm_tx *cm; +#endif + struct ipoib_ah *ah; struct ib_sa_path_rec pathrec; - struct ipoib_ah *ah; - struct sk_buff_head queue; + struct ifqueue queue; - struct list_head neigh_list; - int query_id; struct ib_sa_query *query; struct completion done; - struct rb_node rb_node; - struct list_head list; int valid; }; -struct ipoib_neigh { - struct ipoib_ah *ah; -#ifdef CONFIG_INFINIBAND_IPOIB_CM - struct ipoib_cm_tx *cm; -#endif - union ib_gid dgid; - struct sk_buff_head queue; - - struct neighbour *neighbour; - struct net_device *dev; - - struct list_head list; -}; - +/* UD Only transmits encap len but we want the two sizes to be symmetrical. */ #define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) -#define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) +#define IPOIB_CM_MTU(ib_mtu) (ib_mtu - 0x10) -static inline int ipoib_ud_need_sg(unsigned int ib_mtu) -{ - return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; -} +#define IPOIB_IS_MULTICAST(addr) ((addr)[4] == 0xff) -/* - * We stash a pointer to our private neighbour information after our - * hardware address in neigh->ha. The ALIGN() expression here makes - * sure that this pointer is stored aligned so that an unaligned - * load is not needed to dereference it. 
- */ -static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh) -{ - return (void*) neigh + ALIGN(offsetof(struct neighbour, ha) + - INFINIBAND_ALEN, sizeof(void *)); -} +extern struct workqueue_struct *ipoib_workqueue; -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh, - struct net_device *dev); -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh); +#define IPOIB_MTAP_PROTO(_ifp, _m, _proto) \ +do { \ + if (bpf_peers_present((_ifp)->if_bpf)) { \ + M_ASSERTVALID(_m); \ + ipoib_mtap_proto((_ifp), (_m), (_proto)); \ + } \ +} while (0) -extern struct workqueue_struct *ipoib_workqueue; - /* functions */ - -int ipoib_poll(struct napi_struct *napi, int budget); +void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto); void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); -struct ipoib_ah *ipoib_create_ah(struct net_device *dev, +struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *, struct ib_pd *pd, struct ib_ah_attr *attr); void ipoib_free_ah(struct kref *kref); static inline void ipoib_put_ah(struct ipoib_ah *ah) @@ -435,46 +449,51 @@ kref_put(&ah->ref, ipoib_free_ah); } -int ipoib_open(struct net_device *dev); -int ipoib_add_pkey_attr(struct net_device *dev); -int ipoib_add_umcast_attr(struct net_device *dev); +int ipoib_open(struct ipoib_dev_priv *priv); +int ipoib_add_pkey_attr(struct ipoib_dev_priv *priv); +int ipoib_add_umcast_attr(struct ipoib_dev_priv *priv); -void ipoib_send(struct net_device *dev, struct sk_buff *skb, +void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto); + +void ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_ah *address, u32 qpn); void ipoib_reap_ah(struct work_struct *work); -void ipoib_mark_paths_invalid(struct net_device *dev); -void ipoib_flush_paths(struct net_device *dev); +void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv); +void ipoib_flush_paths(struct ipoib_dev_priv *priv); struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); -int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port); +int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, + int port); void ipoib_ib_dev_flush_light(struct work_struct *work); void ipoib_ib_dev_flush_normal(struct work_struct *work); void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); -void ipoib_ib_dev_cleanup(struct net_device *dev); +void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv); -int ipoib_ib_dev_open(struct net_device *dev); -int ipoib_ib_dev_up(struct net_device *dev); -int ipoib_ib_dev_down(struct net_device *dev, int flush); -int ipoib_ib_dev_stop(struct net_device *dev, int flush); +int ipoib_ib_dev_open(struct ipoib_dev_priv *priv); +int ipoib_ib_dev_up(struct ipoib_dev_priv *priv); +int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush); +int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush); -int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); -void ipoib_dev_cleanup(struct net_device *dev); +int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port); +void ipoib_dev_cleanup(struct ipoib_dev_priv *priv); void ipoib_mcast_join_task(struct work_struct *work); void ipoib_mcast_carrier_on_task(struct work_struct *work); -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb); +void ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct 
mbuf *mb); void ipoib_mcast_restart_task(struct work_struct *work); -int ipoib_mcast_start_thread(struct net_device *dev); -int ipoib_mcast_stop_thread(struct net_device *dev, int flush); +void ipoib_mcast_restart(struct ipoib_dev_priv *); +int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv); +int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush); -void ipoib_mcast_dev_down(struct net_device *dev); -void ipoib_mcast_dev_flush(struct net_device *dev); +void ipoib_mcast_dev_down(struct ipoib_dev_priv *priv); +void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv); +void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG -struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev); +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv); int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, union ib_gid *gid, @@ -483,30 +502,38 @@ unsigned int *complete, unsigned int *send_only); -struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev); +struct ipoib_path_iter *ipoib_path_iter_init(struct ipoib_dev_priv *priv); int ipoib_path_iter_next(struct ipoib_path_iter *iter); void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path); #endif -int ipoib_mcast_attach(struct net_device *dev, u16 mlid, +int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu); + +int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid, union ib_gid *mgid, int set_qkey); -int ipoib_init_qp(struct net_device *dev); -int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca); -void ipoib_transport_dev_cleanup(struct net_device *dev); +int ipoib_init_qp(struct ipoib_dev_priv *priv); +int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca); +void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv); void ipoib_event(struct ib_event_handler *handler, struct ib_event *record); -int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey); -int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); - void ipoib_pkey_poll(struct work_struct *work); -int ipoib_pkey_dev_delay_open(struct net_device *dev); -void ipoib_drain_cq(struct net_device *dev); +int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv); +void ipoib_drain_cq(struct ipoib_dev_priv *priv); -void ipoib_set_ethtool_ops(struct net_device *dev); +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max); +void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req); +int ipoib_poll_tx(struct ipoib_dev_priv *priv); + +void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req); +void ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length); +struct mbuf *ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, int size); + + +void ipoib_set_ethtool_ops(struct ifnet *dev); int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); #ifdef CONFIG_INFINIBAND_IPOIB_CM @@ -519,135 +546,128 @@ extern int ipoib_max_conn_qp; -static inline int ipoib_cm_admin_enabled(struct net_device *dev) +static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - return IPOIB_CM_SUPPORTED(dev->dev_addr) && - test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + return IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)); } -static inline int 
ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) +static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - return IPOIB_CM_SUPPORTED(n->ha) && - test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + return IPOIB_CM_SUPPORTED(hwaddr); } -static inline int ipoib_cm_up(struct ipoib_neigh *neigh) +static inline int ipoib_cm_up(struct ipoib_path *path) { - return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags); + return test_bit(IPOIB_FLAG_OPER_UP, &path->cm->flags); } -static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path) { - return neigh->cm; + return path->cm; } -static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx) { - neigh->cm = tx; + path->cm = tx; } -static inline int ipoib_cm_has_srq(struct net_device *dev) +static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); return !!priv->cm.srq; } -static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); return priv->cm.max_cm_mtu; } -void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx); -int ipoib_cm_dev_open(struct net_device *dev); -void ipoib_cm_dev_stop(struct net_device *dev); -int ipoib_cm_dev_init(struct net_device *dev); -int ipoib_cm_add_mode_attr(struct net_device *dev); -void ipoib_cm_dev_cleanup(struct net_device *dev); -struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, - struct ipoib_neigh *neigh); +void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx); +int ipoib_cm_dev_open(struct ipoib_dev_priv *priv); +void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv); +int ipoib_cm_dev_init(struct ipoib_dev_priv *priv); +int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv); +void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv); +struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, + struct ipoib_path *path); void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); -void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, +void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu); -void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc); -void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc); +void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc); +void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc); #else struct ipoib_cm_tx; #define ipoib_max_conn_qp 0 -static inline int ipoib_cm_admin_enabled(struct net_device *dev) +static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv) { return 0; } -static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) +static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { return 0; } -static inline int ipoib_cm_up(struct ipoib_neigh *neigh) +static inline int ipoib_cm_up(struct ipoib_path *path) { return 0; } -static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path) { return NULL; } -static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct 
ipoib_cm_tx *tx) +static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx) { } -static inline int ipoib_cm_has_srq(struct net_device *dev) +static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv) { return 0; } -static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv) { return 0; } static inline -void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx) { return; } static inline -int ipoib_cm_dev_open(struct net_device *dev) +int ipoib_cm_dev_open(struct ipoib_dev_priv *priv) { return 0; } static inline -void ipoib_cm_dev_stop(struct net_device *dev) +void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv) { return; } static inline -int ipoib_cm_dev_init(struct net_device *dev) +int ipoib_cm_dev_init(struct ipoib_dev_priv *priv) { return -ENOSYS; } static inline -void ipoib_cm_dev_cleanup(struct net_device *dev) +void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv) { return; } static inline -struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, - struct ipoib_neigh *neigh) +struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, struct ipoib_path *path) { return NULL; } @@ -659,40 +679,40 @@ } static inline -int ipoib_cm_add_mode_attr(struct net_device *dev) +int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv) { return 0; } -static inline void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, +static inline void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu) { - dev_kfree_skb_any(skb); + m_freem(mb); } -static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +static inline void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { } -static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +static inline void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { } #endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG -void ipoib_create_debug_files(struct net_device *dev); -void ipoib_delete_debug_files(struct net_device *dev); +void ipoib_create_debug_files(struct ipoib_dev_priv *priv); +void ipoib_delete_debug_files(struct ipoib_dev_priv *priv); int ipoib_register_debugfs(void); void ipoib_unregister_debugfs(void); #else -static inline void ipoib_create_debug_files(struct net_device *dev) { } -static inline void ipoib_delete_debug_files(struct net_device *dev) { } +static inline void ipoib_create_debug_files(struct ipoib_dev_priv *priv) { } +static inline void ipoib_delete_debug_files(struct ipoib_dev_priv *priv) { } static inline int ipoib_register_debugfs(void) { return 0; } static inline void ipoib_unregister_debugfs(void) { } #endif #define ipoib_printk(level, priv, format, arg...) \ - printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg) + printk(level "%s: " format, if_name(((struct ipoib_dev_priv *) priv)->dev), ## arg) #define ipoib_warn(priv, format, arg...) \ ipoib_printk(KERN_WARNING, priv, format , ## arg) Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c (.../head) (revision 219811) @@ -32,21 +32,13 @@ * SOFTWARE. 
*/ -#include -#include -#include -#include -#include -#include +#include "ipoib.h" + #include #include -#include - -#include "ipoib.h" - #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG -static int mcast_debug_level; +static int mcast_debug_level = 1; module_param(mcast_debug_level, int, 0644); MODULE_PARM_DESC(mcast_debug_level, @@ -56,7 +48,7 @@ static DEFINE_MUTEX(mcast_mutex); struct ipoib_mcast_iter { - struct net_device *dev; + struct ipoib_dev_priv *priv; union ib_gid mgid; unsigned long created; unsigned int queuelen; @@ -66,46 +58,24 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast) { - struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_neigh *neigh, *tmp; + struct ifnet *dev = mcast->priv->dev; int tx_dropped = 0; - ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n", - mcast->mcmember.mgid.raw); + ipoib_dbg_mcast(mcast->priv, "deleting multicast group %16D\n", + mcast->mcmember.mgid.raw, ":"); - spin_lock_irq(&priv->lock); - - list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) { - /* - * It's safe to call ipoib_put_ah() inside priv->lock - * here, because we know that mcast->ah will always - * hold one more reference, so ipoib_put_ah() will - * never do more than decrement the ref count. - */ - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); - } - - spin_unlock_irq(&priv->lock); - if (mcast->ah) ipoib_put_ah(mcast->ah); - while (!skb_queue_empty(&mcast->pkt_queue)) { - ++tx_dropped; - dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); - } + tx_dropped = mcast->pkt_queue.ifq_len; + _IF_DRAIN(&mcast->pkt_queue); /* XXX Locking. */ - netif_tx_lock_bh(dev); - dev->stats.tx_dropped += tx_dropped; - netif_tx_unlock_bh(dev); + dev->if_oerrors += tx_dropped; kfree(mcast); } -static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, +static struct ipoib_mcast *ipoib_mcast_alloc(struct ipoib_dev_priv *priv, int can_sleep) { struct ipoib_mcast *mcast; @@ -114,20 +84,19 @@ if (!mcast) return NULL; - mcast->dev = dev; + mcast->priv = priv; mcast->created = jiffies; mcast->backoff = 1; INIT_LIST_HEAD(&mcast->list); - INIT_LIST_HEAD(&mcast->neigh_list); - skb_queue_head_init(&mcast->pkt_queue); + bzero(&mcast->pkt_queue, sizeof(mcast->pkt_queue)); return mcast; } -static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) +static struct ipoib_mcast *__ipoib_mcast_find(struct ipoib_dev_priv *priv, + void *mgid) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct rb_node *n = priv->multicast_tree.rb_node; while (n) { @@ -149,9 +118,9 @@ return NULL; } -static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast) +static int __ipoib_mcast_add(struct ipoib_dev_priv *priv, + struct ipoib_mcast *mcast) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL; while (*n) { @@ -180,8 +149,8 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, struct ib_sa_mcmember_rec *mcmember) { - struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = mcast->priv; + struct ifnet *dev = priv->dev; struct ipoib_ah *ah; int ret; int set_qkey = 0; @@ -189,7 +158,7 @@ mcast->mcmember = *mcmember; /* Set the cached Q_Key before we attach if it's the broadcast group */ - if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + if (!memcmp(mcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4, sizeof (union ib_gid))) { 
spin_lock_irq(&priv->lock); if (!priv->broadcast) { @@ -204,17 +173,17 @@ if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { - ipoib_warn(priv, "multicast group %pI6 already attached\n", - mcast->mcmember.mgid.raw); + ipoib_warn(priv, "multicast group %16D already attached\n", + mcast->mcmember.mgid.raw, ":"); return 0; } - ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid), + ret = ipoib_mcast_attach(priv, be16_to_cpu(mcast->mcmember.mlid), &mcast->mcmember.mgid, set_qkey); if (ret < 0) { - ipoib_warn(priv, "couldn't attach QP to multicast group %pI6\n", - mcast->mcmember.mgid.raw); + ipoib_warn(priv, "couldn't attach QP to multicast group %16D\n", + mcast->mcmember.mgid.raw, ":"); clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags); return ret; @@ -237,7 +206,7 @@ }; av.grh.dgid = mcast->mcmember.mgid; - ah = ipoib_create_ah(dev, priv->pd, &av); + ah = ipoib_create_ah(priv, priv->pd, &av); if (!ah) { ipoib_warn(priv, "ib_address_create failed\n"); } else { @@ -245,8 +214,8 @@ mcast->ah = ah; spin_unlock_irq(&priv->lock); - ipoib_dbg_mcast(priv, "MGID %pI6 AV %p, LID 0x%04x, SL %d\n", - mcast->mcmember.mgid.raw, + ipoib_dbg_mcast(priv, "MGID %16D AV %p, LID 0x%04x, SL %d\n", + mcast->mcmember.mgid.raw, ":", mcast->ah->ah, be16_to_cpu(mcast->mcmember.mlid), mcast->mcmember.sl); @@ -254,23 +223,14 @@ } /* actually send any queued packets */ - netif_tx_lock_bh(dev); - while (!skb_queue_empty(&mcast->pkt_queue)) { - struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); - netif_tx_unlock_bh(dev); + while (mcast->pkt_queue.ifq_len) { + struct mbuf *mb; + _IF_DEQUEUE(&mcast->pkt_queue, mb); + mb->m_pkthdr.rcvif = dev; - skb->dev = dev; - - if (!skb->dst || !skb->dst->neighbour) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof (struct ipoib_pseudoheader)); - } - - if (dev_queue_xmit(skb)) + if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); - netif_tx_lock_bh(dev); } - netif_tx_unlock_bh(dev); return 0; } @@ -280,7 +240,7 @@ struct ib_sa_multicast *multicast) { struct ipoib_mcast *mcast = multicast->context; - struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = mcast->priv; /* We trap for port events ourselves. 
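An -ENETRESET completion only reports that the SA tore the group down on a port event, which the driver handles through its own flush tasks; hence the early bail-out on -ENETRESET just below instead of treating it as a join failure.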
*/ if (status == -ENETRESET) @@ -291,16 +251,12 @@ if (status) { if (mcast->logcount++ < 20) - ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); + ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n", + mcast->mcmember.mgid.raw, ":", status); /* Flush out any queued packets */ - netif_tx_lock_bh(dev); - while (!skb_queue_empty(&mcast->pkt_queue)) { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); - } - netif_tx_unlock_bh(dev); + priv->dev->if_oerrors += mcast->pkt_queue.ifq_len; + _IF_DRAIN(&mcast->pkt_queue); /* Clear the busy flag so we try again */ status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, @@ -311,8 +267,7 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) { - struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = mcast->priv; struct ib_sa_mcmember_rec rec = { #if 0 /* Some SMs don't support send-only yet */ .join_state = 4 @@ -351,8 +306,8 @@ ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", ret); } else { - ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", - mcast->mcmember.mgid.raw); + ipoib_dbg_mcast(priv, "no multicast record for %16D, starting join\n", + mcast->mcmember.mgid.raw, ":"); } return ret; @@ -374,21 +329,17 @@ ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return; } - - rtnl_lock(); - netif_carrier_on(priv->dev); - rtnl_unlock(); + if_link_state_change(priv->dev, LINK_STATE_UP); } static int ipoib_mcast_join_complete(int status, struct ib_sa_multicast *multicast) { struct ipoib_mcast *mcast = multicast->context; - struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = mcast->priv; - ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", - mcast->mcmember.mgid.raw, status); + ipoib_dbg_mcast(priv, "join completion for %16D (status %d)\n", + mcast->mcmember.mgid.raw, ":", status); /* We trap for port events ourselves. */ if (status == -ENETRESET) @@ -417,11 +368,11 @@ if (mcast->logcount++ < 20) { if (status == -ETIMEDOUT || status == -EAGAIN) { - ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); + ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n", + mcast->mcmember.mgid.raw, ":", status); } else { - ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); + ipoib_warn(priv, "multicast join failed for %16D, status %d\n", + mcast->mcmember.mgid.raw, ":", status); } } @@ -443,17 +394,17 @@ return status; } -static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, - int create) +static void ipoib_mcast_join(struct ipoib_dev_priv *priv, + struct ipoib_mcast *mcast, int create) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_sa_mcmember_rec rec = { .join_state = 1 }; ib_sa_comp_mask comp_mask; int ret = 0; - ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw); + ipoib_dbg_mcast(priv, "joining MGID %16D\n", + mcast->mcmember.mgid.raw, ":"); rec.mgid = mcast->mcmember.mgid; rec.port_gid = priv->local_gid; @@ -514,15 +465,17 @@ { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, mcast_task.work); - struct net_device *dev = priv->dev; + struct ifnet *dev = priv->dev; + ipoib_dbg_mcast(priv, "Running join task. 
flags 0x%lX\n", priv->flags); + if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) ipoib_warn(priv, "ib_query_gid() failed\n"); else - memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + memcpy(IF_LLADDR(dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); { struct ib_port_attr attr; @@ -539,7 +492,7 @@ if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) return; - broadcast = ipoib_mcast_alloc(dev, 1); + broadcast = ipoib_mcast_alloc(priv, 1); if (!broadcast) { ipoib_warn(priv, "failed to allocate broadcast group\n"); mutex_lock(&mcast_mutex); @@ -551,11 +504,11 @@ } spin_lock_irq(&priv->lock); - memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + memcpy(broadcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4, sizeof (union ib_gid)); priv->broadcast = broadcast; - __ipoib_mcast_add(dev, priv->broadcast); + __ipoib_mcast_add(priv, priv->broadcast); spin_unlock_irq(&priv->lock); } @@ -563,7 +516,7 @@ !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { if (priv->broadcast && !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) - ipoib_mcast_join(dev, priv->broadcast, 0); + ipoib_mcast_join(priv, priv->broadcast, 0); return; } @@ -586,7 +539,7 @@ break; } - ipoib_mcast_join(dev, mcast, 1); + ipoib_mcast_join(priv, mcast, 1); return; } @@ -597,23 +550,19 @@ priv->mcast_mtu = priv->admin_mtu; spin_unlock_irq(&priv->lock); - if (!ipoib_cm_admin_enabled(dev)) { - rtnl_lock(); - dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); - rtnl_unlock(); - } + if (!ipoib_cm_admin_enabled(priv)) + ipoib_change_mtu(priv, min(priv->mcast_mtu, priv->admin_mtu)); ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); clear_bit(IPOIB_MCAST_RUN, &priv->flags); } -int ipoib_mcast_start_thread(struct net_device *dev) +int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + ipoib_dbg_mcast(priv, "starting multicast thread flags 0x%lX\n", + priv->flags); - ipoib_dbg_mcast(priv, "starting multicast thread\n"); - mutex_lock(&mcast_mutex); if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); @@ -622,9 +571,8 @@ return 0; } -int ipoib_mcast_stop_thread(struct net_device *dev, int flush) +int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush) { - struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_dbg_mcast(priv, "stopping multicast thread\n"); @@ -639,17 +587,16 @@ return 0; } -static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) +static int ipoib_mcast_leave(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast) { - struct ipoib_dev_priv *priv = netdev_priv(dev); int ret = 0; if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) ib_sa_free_multicast(mcast->mc); if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { - ipoib_dbg_mcast(priv, "leaving MGID %pI6\n", - mcast->mcmember.mgid.raw); + ipoib_dbg_mcast(priv, "leaving MGID %16D\n", + mcast->mcmember.mgid.raw, ":"); /* Remove ourselves from the multicast group */ ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid, @@ -661,49 +608,47 @@ return 0; } -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) +void +ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ifnet *dev = priv->dev; struct ipoib_mcast *mcast; - unsigned long flags; - 
spin_lock_irqsave(&priv->lock, flags); - if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) || !priv->broadcast || !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); - goto unlock; + ++dev->if_oerrors; + m_freem(mb); + return; } - mcast = __ipoib_mcast_find(dev, mgid); + mcast = __ipoib_mcast_find(priv, mgid); if (!mcast) { /* Let's create a new send only group now */ - ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", - mgid); + ipoib_dbg_mcast(priv, "setting up send only multicast group for %16D\n", + mgid, ":"); - mcast = ipoib_mcast_alloc(dev, 0); + mcast = ipoib_mcast_alloc(priv, 0); if (!mcast) { ipoib_warn(priv, "unable to allocate memory for " "multicast structure\n"); - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); + ++dev->if_oerrors; + m_freem(mb); goto out; } set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); - __ipoib_mcast_add(dev, mcast); + __ipoib_mcast_add(priv, mcast); list_add_tail(&mcast->list, &priv->multicast_list); } if (!mcast->ah) { - if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) - skb_queue_tail(&mcast->pkt_queue, skb); - else { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); + if (mcast->pkt_queue.ifq_len < IPOIB_MAX_MCAST_QUEUE) { + _IF_ENQUEUE(&mcast->pkt_queue, mb); + } else { + ++dev->if_oerrors; + m_freem(mb); } if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) @@ -720,30 +665,12 @@ } out: - if (mcast && mcast->ah) { - if (skb->dst && - skb->dst->neighbour && - !*to_ipoib_neigh(skb->dst->neighbour)) { - struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour, - skb->dev); - - if (neigh) { - kref_get(&mcast->ah->ref); - neigh->ah = mcast->ah; - list_add_tail(&neigh->list, &mcast->neigh_list); - } - } - - ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); - } - -unlock: - spin_unlock_irqrestore(&priv->lock, flags); + if (mcast && mcast->ah) + ipoib_send(priv, mb, mcast->ah, IB_MULTICAST_QPN); } -void ipoib_mcast_dev_flush(struct net_device *dev) +void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); LIST_HEAD(remove_list); struct ipoib_mcast *mcast, *tmcast; unsigned long flags; @@ -767,7 +694,7 @@ spin_unlock_irqrestore(&priv->lock, flags); list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { - ipoib_mcast_leave(dev, mcast); + ipoib_mcast_leave(priv, mcast); ipoib_mcast_free(mcast); } } @@ -790,19 +717,24 @@ { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, restart_task); - struct net_device *dev = priv->dev; - struct dev_mc_list *mclist; + ipoib_mcast_restart(priv); +} + +void ipoib_mcast_restart(struct ipoib_dev_priv *priv) +{ + struct ifnet *dev = priv->dev; + struct ifmultiaddr *ifma; struct ipoib_mcast *mcast, *tmcast; LIST_HEAD(remove_list); - unsigned long flags; struct ib_sa_mcmember_rec rec; + int addrlen; - ipoib_dbg_mcast(priv, "restarting multicast task\n"); + ipoib_dbg_mcast(priv, "restarting multicast task flags 0x%lX\n", + priv->flags); - ipoib_mcast_stop_thread(dev, 0); + ipoib_mcast_stop_thread(priv, 0); - local_irq_save(flags); - netif_addr_lock(dev); + if_maddr_rlock(dev); spin_lock(&priv->lock); /* @@ -816,33 +748,39 @@ clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); /* Mark all of the entries that are found or don't exist */ - for (mclist = dev->mc_list; mclist; mclist = mclist->next) { + + + TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) { union ib_gid mgid; +
uint8_t *addr; - if (!ipoib_mcast_addr_is_valid(mclist->dmi_addr, - mclist->dmi_addrlen, - dev->broadcast)) + if (ifma->ifma_addr->sa_family != AF_LINK) continue; + addr = LLADDR((struct sockaddr_dl *)ifma->ifma_addr); + addrlen = ((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen; + if (!ipoib_mcast_addr_is_valid(addr, addrlen, + dev->if_broadcastaddr)) + continue; - memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid); + memcpy(mgid.raw, addr + 4, sizeof mgid); - mcast = __ipoib_mcast_find(dev, &mgid); + mcast = __ipoib_mcast_find(priv, &mgid); if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { struct ipoib_mcast *nmcast; /* ignore group which is directly joined by userspace */ if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) { - ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %pI6\n", - mgid.raw); + ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %16D\n", + mgid.raw, ":"); continue; } /* Not found or send-only group, let's add a new entry */ - ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n", - mgid.raw); + ipoib_dbg_mcast(priv, "adding multicast entry for mgid %16D\n", + mgid.raw, ":"); - nmcast = ipoib_mcast_alloc(dev, 0); + nmcast = ipoib_mcast_alloc(priv, 0); if (!nmcast) { ipoib_warn(priv, "unable to allocate memory for multicast structure\n"); continue; @@ -860,7 +798,7 @@ &nmcast->rb_node, &priv->multicast_tree); } else - __ipoib_mcast_add(dev, nmcast); + __ipoib_mcast_add(priv, nmcast); list_add_tail(&nmcast->list, &priv->multicast_list); } @@ -873,8 +811,8 @@ list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) && !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { - ipoib_dbg_mcast(priv, "deleting multicast group %pI6\n", - mcast->mcmember.mgid.raw); + ipoib_dbg_mcast(priv, "deleting multicast group %16D\n", + mcast->mcmember.mgid.raw, ":"); rb_erase(&mcast->rb_node, &priv->multicast_tree); @@ -884,22 +822,21 @@ } spin_unlock(&priv->lock); - netif_addr_unlock(dev); - local_irq_restore(flags); + if_maddr_runlock(dev); /* We have to cancel outside of the spinlock */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { - ipoib_mcast_leave(mcast->dev, mcast); + ipoib_mcast_leave(mcast->priv, mcast); ipoib_mcast_free(mcast); } if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - ipoib_mcast_start_thread(dev); + ipoib_mcast_start_thread(priv); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG -struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv) { struct ipoib_mcast_iter *iter; @@ -907,7 +844,7 @@ if (!iter) return NULL; - iter->dev = dev; + iter->priv = priv; memset(iter->mgid.raw, 0, 16); if (ipoib_mcast_iter_next(iter)) { @@ -920,7 +857,7 @@ int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter) { - struct ipoib_dev_priv *priv = netdev_priv(iter->dev); + struct ipoib_dev_priv *priv = iter->priv; struct rb_node *n; struct ipoib_mcast *mcast; int ret = 1; @@ -936,7 +873,7 @@ sizeof (union ib_gid)) < 0) { iter->mgid = mcast->mcmember.mgid; iter->created = mcast->created; - iter->queuelen = skb_queue_len(&mcast->pkt_queue); + iter->queuelen = mcast->pkt_queue.ifq_len; iter->complete = !!mcast->ah; iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY)); Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c =================================================================== --- 
sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c (.../head) (revision 219811) @@ -258,27 +258,27 @@ .release = seq_release }; -void ipoib_create_debug_files(struct net_device *dev) +void ipoib_create_debug_files(struct ifnet *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = dev->if_softc; char name[IFNAMSIZ + sizeof "_path"]; - snprintf(name, sizeof name, "%s_mcg", dev->name); + snprintf(name, sizeof name, "%s_mcg", if_name(dev)); priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, ipoib_root, dev, &ipoib_mcg_fops); if (!priv->mcg_dentry) ipoib_warn(priv, "failed to create mcg debug file\n"); - snprintf(name, sizeof name, "%s_path", dev->name); + snprintf(name, sizeof name, "%s_path", if_name(dev)); priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, ipoib_root, dev, &ipoib_path_fops); if (!priv->path_dentry) ipoib_warn(priv, "failed to create path debug file\n"); } -void ipoib_delete_debug_files(struct net_device *dev) +void ipoib_delete_debug_files(struct ifnet *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = dev->if_softc; if (priv->mcg_dentry) debugfs_remove(priv->mcg_dentry); Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c (.../head) (revision 219811) @@ -33,15 +33,15 @@ * SOFTWARE. */ -#include -#include +#include "ipoib.h" #include -#include -#include -#include "ipoib.h" +#include +#include +#include + #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA static int data_debug_level; @@ -52,7 +52,7 @@ static DEFINE_MUTEX(pkey_mutex); -struct ipoib_ah *ipoib_create_ah(struct net_device *dev, +struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *priv, struct ib_pd *pd, struct ib_ah_attr *attr) { struct ipoib_ah *ah; @@ -61,7 +61,7 @@ if (!ah) return NULL; - ah->dev = dev; + ah->priv = priv; ah->last_send = 0; kref_init(&ah->ref); @@ -70,7 +70,7 @@ kfree(ah); ah = NULL; } else - ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah); + ipoib_dbg(priv, "Created ah %p\n", ah->ah); return ah; } @@ -78,7 +78,7 @@ void ipoib_free_ah(struct kref *kref) { struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref); - struct ipoib_dev_priv *priv = netdev_priv(ah->dev); + struct ipoib_dev_priv *priv = ah->priv; unsigned long flags; @@ -87,128 +87,102 @@ spin_unlock_irqrestore(&priv->lock, flags); } -static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, - u64 mapping[IPOIB_UD_RX_SG]) +void +ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req) { - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE, - DMA_FROM_DEVICE); - ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE, - DMA_FROM_DEVICE); - } else - ib_dma_unmap_single(priv->ca, mapping[0], - IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), - DMA_FROM_DEVICE); + struct mbuf *m; + int i; + + for (i = 0, m = rx_req->mb; m != NULL; m = m->m_next, i++) + ib_dma_unmap_single(priv->ca, rx_req->mapping[i], m->m_len, + DMA_FROM_DEVICE); } -static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv, - struct sk_buff *skb, - unsigned int length) +void +ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length) { - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - skb_frag_t 
*frag = &skb_shinfo(skb)->frags[0]; - unsigned int size; - /* - * There is only two buffers needed for max_payload = 4K, - * first buf size is IPOIB_UD_HEAD_SIZE - */ - skb->tail += IPOIB_UD_HEAD_SIZE; - skb->len += length; - size = length - IPOIB_UD_HEAD_SIZE; + m_adj(mb, -(mb->m_pkthdr.len - length)); +} - frag->size = size; - skb->data_len += size; - skb->truesize += size; - } else - skb_put(skb, length); +struct mbuf * +ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, + int size) +{ + struct mbuf *mb, *m; + int i, j; + rx_req->mb = NULL; + mb = m_getm2(NULL, size, M_NOWAIT, MT_DATA, M_PKTHDR); + if (mb == NULL) + return (NULL); + for (i = 0, m = mb; m != NULL; m = m->m_next, i++) { + m->m_len = (m->m_flags & M_EXT) ? m->m_ext.ext_size : + ((m->m_flags & M_PKTHDR) ? MHLEN : MLEN); + mb->m_pkthdr.len += m->m_len; + rx_req->mapping[i] = ib_dma_map_single(priv->ca, + mtod(m, void *), m->m_len, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, + rx_req->mapping[i]))) + goto error; + + } + rx_req->mb = mb; + return (mb); +error: + for (j = 0, m = mb; j < i; m = m->m_next, j++) + ib_dma_unmap_single(priv->ca, rx_req->mapping[j], m->m_len, + DMA_FROM_DEVICE); + m_freem(mb); + return (NULL); + } -static int ipoib_ib_post_receive(struct net_device *dev, int id) +static int ipoib_ib_post_receive(struct ipoib_dev_priv *priv, int id) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_rx_buf *rx_req; struct ib_recv_wr *bad_wr; + struct mbuf *m; int ret; + int i; - priv->rx_wr.wr_id = id | IPOIB_OP_RECV; - priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0]; - priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1]; + rx_req = &priv->rx_ring[id]; + for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) { + priv->rx_sge[i].addr = rx_req->mapping[i]; + priv->rx_sge[i].length = m->m_len; + } + priv->rx_wr.num_sge = i; + priv->rx_wr.wr_id = id | IPOIB_OP_RECV; - ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); - ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping); - dev_kfree_skb_any(priv->rx_ring[id].skb); - priv->rx_ring[id].skb = NULL; + ipoib_dma_unmap_rx(priv, &priv->rx_ring[id]); + m_freem(priv->rx_ring[id].mb); + priv->rx_ring[id].mb = NULL; } return ret; } -static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) +static struct mbuf * +ipoib_alloc_rx_mb(struct ipoib_dev_priv *priv, int id) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct sk_buff *skb; - int buf_size; - u64 *mapping; - if (ipoib_ud_need_sg(priv->max_ib_mtu)) - buf_size = IPOIB_UD_HEAD_SIZE; - else - buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); - - skb = dev_alloc_skb(buf_size + 4); - if (unlikely(!skb)) - return NULL; - - /* - * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte - * header. So we need 4 more bytes to get to 48 and align the - * IP header to a multiple of 16. 
- */ - skb_reserve(skb, 4); - - mapping = priv->rx_ring[id].mapping; - mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size, - DMA_FROM_DEVICE); - if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) - goto error; - - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - struct page *page = alloc_page(GFP_ATOMIC); - if (!page) - goto partial_error; - skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE); - mapping[1] = - ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[0].page, - 0, PAGE_SIZE, DMA_FROM_DEVICE); - if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1]))) - goto partial_error; - } - - priv->rx_ring[id].skb = skb; - return skb; - -partial_error: - ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE); -error: - dev_kfree_skb_any(skb); - return NULL; + return ipoib_alloc_map_mb(priv, &priv->rx_ring[id], + priv->max_ib_mtu + IB_GRH_BYTES); } -static int ipoib_ib_post_receives(struct net_device *dev) +static int ipoib_ib_post_receives(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); int i; for (i = 0; i < ipoib_recvq_size; ++i) { - if (!ipoib_alloc_rx_skb(dev, i)) { + if (!ipoib_alloc_rx_mb(priv, i)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); return -ENOMEM; } - if (ipoib_ib_post_receive(dev, i)) { + if (ipoib_ib_post_receive(priv, i)) { ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i); return -EIO; } @@ -217,12 +191,14 @@ return 0; } -static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +static void +ipoib_ib_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_rx_buf saverx; unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; - struct sk_buff *skb; - u64 mapping[IPOIB_UD_RX_SG]; + struct ifnet *dev = priv->dev; + struct ipoib_header *eh; + struct mbuf *mb; ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", wr_id, wc->status); @@ -233,16 +209,20 @@ return; } - skb = priv->rx_ring[wr_id].skb; + mb = priv->rx_ring[wr_id].mb; if (unlikely(wc->status != IB_WC_SUCCESS)) { - if (wc->status != IB_WC_WR_FLUSH_ERR) + if (wc->status != IB_WC_WR_FLUSH_ERR) { ipoib_warn(priv, "failed recv event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); - ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping); - dev_kfree_skb_any(skb); - priv->rx_ring[wr_id].skb = NULL; + goto repost; + } + if (mb) { + ipoib_dma_unmap_rx(priv, &priv->rx_ring[wr_id]); + m_freem(mb); + priv->rx_ring[wr_id].mb = NULL; + } return; } @@ -253,116 +233,101 @@ if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) goto repost; - memcpy(mapping, priv->rx_ring[wr_id].mapping, - IPOIB_UD_RX_SG * sizeof *mapping); - + memcpy(&saverx, &priv->rx_ring[wr_id], sizeof(saverx)); /* * If we can't allocate a new RX buffer, dump * this packet and reuse the old buffer. 
*/ - if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) { - ++dev->stats.rx_dropped; + if (unlikely(!ipoib_alloc_rx_mb(priv, wr_id))) { + memcpy(&priv->rx_ring[wr_id], &saverx, sizeof(saverx)); + dev->if_iqdrops++; goto repost; } ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", wc->byte_len, wc->slid); - ipoib_ud_dma_unmap_rx(priv, mapping); - ipoib_ud_skb_put_frags(priv, skb, wc->byte_len); + ipoib_dma_unmap_rx(priv, &saverx); + ipoib_dma_mb(priv, mb, wc->byte_len); - skb_pull(skb, IB_GRH_BYTES); + ++dev->if_ipackets; + dev->if_ibytes += mb->m_pkthdr.len; + mb->m_pkthdr.rcvif = dev; + m_adj(mb, sizeof(struct ib_grh) - INFINIBAND_ALEN); + eh = mtod(mb, struct ipoib_header *); + bzero(eh->hwaddr, 4); /* Zero the queue pair, only dgid is in grh */ - skb->protocol = ((struct ipoib_header *) skb->data)->proto; - skb_reset_mac_header(skb); - skb_pull(skb, IPOIB_ENCAP_LEN); - - dev->last_rx = jiffies; - ++dev->stats.rx_packets; - dev->stats.rx_bytes += skb->len; - - skb->dev = dev; - /* XXX get correct PACKET_ type here */ - skb->pkt_type = PACKET_HOST; - if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) - skb->ip_summed = CHECKSUM_UNNECESSARY; + mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID; - if (dev->features & NETIF_F_LRO) - lro_receive_skb(&priv->lro.lro_mgr, skb, NULL); - else - netif_receive_skb(skb); + dev->if_input(dev, mb); repost: - if (unlikely(ipoib_ib_post_receive(dev, wr_id))) + if (unlikely(ipoib_ib_post_receive(priv, wr_id))) ipoib_warn(priv, "ipoib_ib_post_receive failed " "for buf %d\n", wr_id); } -static int ipoib_dma_map_tx(struct ib_device *ca, - struct ipoib_tx_buf *tx_req) +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max) { - struct sk_buff *skb = tx_req->skb; + struct mbuf *mb = tx_req->mb; u64 *mapping = tx_req->mapping; + struct mbuf *m, *p; + int error; int i; - int off; - if (skb_headlen(skb)) { - mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb), - DMA_TO_DEVICE); - if (unlikely(ib_dma_mapping_error(ca, mapping[0]))) + for (m = mb, p = NULL, i = 0; m != NULL; p = m, m = m->m_next, i++) { + if (m->m_len != 0) + continue; + if (p == NULL) + panic("ipoib_dma_map_tx: First mbuf empty\n"); + p->m_next = m_free(m); + m = p; + i--; + } + i--; + if (i >= max) { + tx_req->mb = mb = m_defrag(mb, M_DONTWAIT); + if (mb == NULL) return -EIO; - - off = 1; - } else - off = 0; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - mapping[i + off] = ib_dma_map_page(ca, frag->page, - frag->page_offset, frag->size, - DMA_TO_DEVICE); - if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) - goto partial_error; + for (m = mb, i = 0; m != NULL; m = m->m_next, i++); + if (i >= max) + return -EIO; } - return 0; - -partial_error: - for (; i > 0; --i) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; - ib_dma_unmap_page(ca, mapping[i - !off], frag->size, DMA_TO_DEVICE); + error = 0; + for (m = mb, i = 0; m != NULL; m = m->m_next, i++) { + mapping[i] = ib_dma_map_single(ca, mtod(m, void *), + m->m_len, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[i]))) { + error = -EIO; + break; + } } + if (error) { + int end; - if (off) - ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); - - return -EIO; + end = i; + for (m = mb, i = 0; i < end; m = m->m_next, i++) + ib_dma_unmap_single(ca, mapping[i], m->m_len, + DMA_TO_DEVICE); + } + return error; } -static void ipoib_dma_unmap_tx(struct ib_device *ca, - struct ipoib_tx_buf 
*tx_req) +void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req) { - struct sk_buff *skb = tx_req->skb; + struct mbuf *mb = tx_req->mb; u64 *mapping = tx_req->mapping; + struct mbuf *m; int i; - int off; - if (skb_headlen(skb)) { - ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); - off = 1; - } else - off = 0; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - ib_dma_unmap_page(ca, mapping[i + off], frag->size, - DMA_TO_DEVICE); - } + for (m = mb, i = 0; m != NULL; m = m->m_next, i++) + ib_dma_unmap_single(ca, mapping[i], m->m_len, DMA_TO_DEVICE); } -static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +static void ipoib_ib_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ifnet *dev = priv->dev; unsigned int wr_id = wc->wr_id; struct ipoib_tx_buf *tx_req; @@ -379,16 +344,16 @@ ipoib_dma_unmap_tx(priv->ca, tx_req); - ++dev->stats.tx_packets; - dev->stats.tx_bytes += tx_req->skb->len; + ++dev->if_opackets; + dev->if_obytes += tx_req->mb->m_pkthdr.len; - dev_kfree_skb_any(tx_req->skb); + m_freem(tx_req->mb); ++priv->tx_tail; if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && - netif_queue_stopped(dev) && + (dev->if_drv_flags & IFF_DRV_OACTIVE) && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - netif_wake_queue(dev); + dev->if_drv_flags &= ~IFF_DRV_OACTIVE; if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) @@ -397,126 +362,104 @@ wc->status, wr_id, wc->vendor_err); } -static int poll_tx(struct ipoib_dev_priv *priv) +int +ipoib_poll_tx(struct ipoib_dev_priv *priv) { int n, i; n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); - for (i = 0; i < n; ++i) - ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i); + for (i = 0; i < n; ++i) { + struct ib_wc *wc = priv->send_wc + i; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_tx_wc(priv, wc); + else + ipoib_ib_handle_tx_wc(priv, wc); + } return n == MAX_SEND_CQE; } -int ipoib_poll(struct napi_struct *napi, int budget) +static void +ipoib_poll(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi); - struct net_device *dev = priv->dev; - int done; - int t; int n, i; - done = 0; - poll_more: - while (done < budget) { - int max = (budget - done); + for (;;) { + n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); - t = min(IPOIB_NUM_WC, max); - n = ib_poll_cq(priv->recv_cq, t, priv->ibwc); - for (i = 0; i < n; i++) { struct ib_wc *wc = priv->ibwc + i; - if (wc->wr_id & IPOIB_OP_RECV) { - ++done; - if (wc->wr_id & IPOIB_OP_CM) - ipoib_cm_handle_rx_wc(dev, wc); - else - ipoib_ib_handle_rx_wc(dev, wc); - } else - ipoib_cm_handle_tx_wc(priv->dev, wc); + if ((wc->wr_id & IPOIB_OP_RECV) == 0) + panic("ipoib_poll: Bad wr_id 0x%jX\n", + (intmax_t)wc->wr_id); + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(priv, wc); + else + ipoib_ib_handle_rx_wc(priv, wc); } - if (n != t) + if (n != IPOIB_NUM_WC) break; } - if (done < budget) { - if (dev->features & NETIF_F_LRO) - lro_flush_all(&priv->lro.lro_mgr); - - napi_complete(napi); - if (unlikely(ib_req_notify_cq(priv->recv_cq, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS)) && - napi_reschedule(napi)) - goto poll_more; - } - - return done; + if (ib_req_notify_cq(priv->recv_cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)) + goto poll_more; } void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) { - struct net_device 
*dev = dev_ptr; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = dev_ptr; - napi_schedule(&priv->napi); + ipoib_poll(priv); } -static void drain_tx_cq(struct net_device *dev) +static void drain_tx_cq(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ifnet *dev = priv->dev; - netif_tx_lock(dev); - while (poll_tx(priv)) + spin_lock(&priv->lock); + while (ipoib_poll_tx(priv)) ; /* nothing */ - if (netif_queue_stopped(dev)) + if (dev->if_drv_flags & IFF_DRV_OACTIVE) mod_timer(&priv->poll_timer, jiffies + 1); - netif_tx_unlock(dev); + spin_unlock(&priv->lock); } void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr) { - struct ipoib_dev_priv *priv = netdev_priv(dev_ptr); + struct ipoib_dev_priv *priv = dev_ptr; mod_timer(&priv->poll_timer, jiffies); } -static inline int post_send(struct ipoib_dev_priv *priv, - unsigned int wr_id, - struct ib_ah *address, u32 qpn, - struct ipoib_tx_buf *tx_req, - void *head, int hlen) +static inline int +post_send(struct ipoib_dev_priv *priv, unsigned int wr_id, + struct ib_ah *address, u32 qpn, struct ipoib_tx_buf *tx_req, void *head, + int hlen) { struct ib_send_wr *bad_wr; - int i, off; - struct sk_buff *skb = tx_req->skb; - skb_frag_t *frags = skb_shinfo(skb)->frags; - int nr_frags = skb_shinfo(skb)->nr_frags; + struct mbuf *mb = tx_req->mb; u64 *mapping = tx_req->mapping; + struct mbuf *m; + int i; - if (skb_headlen(skb)) { - priv->tx_sge[0].addr = mapping[0]; - priv->tx_sge[0].length = skb_headlen(skb); - off = 1; - } else - off = 0; - - for (i = 0; i < nr_frags; ++i) { - priv->tx_sge[i + off].addr = mapping[i + off]; - priv->tx_sge[i + off].length = frags[i].size; + for (m = mb, i = 0; m != NULL; m = m->m_next, i++) { + priv->tx_sge[i].addr = mapping[i]; + priv->tx_sge[i].length = m->m_len; } - priv->tx_wr.num_sge = nr_frags + off; + priv->tx_wr.num_sge = i; priv->tx_wr.wr_id = wr_id; priv->tx_wr.wr.ud.remote_qpn = qpn; priv->tx_wr.wr.ud.ah = address; + if (head) { - priv->tx_wr.wr.ud.mss = skb_shinfo(skb)->gso_size; + priv->tx_wr.wr.ud.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */ priv->tx_wr.wr.ud.header = head; priv->tx_wr.wr.ud.hlen = hlen; priv->tx_wr.opcode = IB_WR_LSO; @@ -526,31 +469,36 @@ return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr); } -void ipoib_send(struct net_device *dev, struct sk_buff *skb, - struct ipoib_ah *address, u32 qpn) +void +ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb, + struct ipoib_ah *address, u32 qpn) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ifnet *dev = priv->dev; struct ipoib_tx_buf *tx_req; int hlen; void *phead; - if (skb_is_gso(skb)) { - hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); - phead = skb->data; - if (unlikely(!skb_pull(skb, hlen))) { + if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) + while (ipoib_poll_tx(priv)) + ; /* nothing */ + + m_adj(mb, sizeof (struct ipoib_pseudoheader)); + if (0 /* XXX segment offload mb_is_gso(mb) */) { + /* XXX hlen = mb_transport_offset(mb) + tcp_hdrlen(mb); */ + phead = mtod(mb, void *); + if (mb->m_len < hlen) { ipoib_warn(priv, "linear data too small\n"); - ++dev->stats.tx_dropped; - ++dev->stats.tx_errors; - dev_kfree_skb_any(skb); + ++dev->if_oerrors; + m_freem(mb); return; } + m_adj(mb, hlen); } else { - if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) { + if (unlikely(mb->m_pkthdr.len - IPOIB_ENCAP_LEN > priv->mcast_mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", - skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN); - 
++dev->stats.tx_dropped; - ++dev->stats.tx_errors; - ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); + mb->m_pkthdr.len, priv->mcast_mtu); + ++dev->if_oerrors; + ipoib_cm_mb_too_long(priv, mb, priv->mcast_mtu); return; } phead = NULL; @@ -558,24 +506,25 @@ } ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n", - skb->len, address, qpn); + mb->m_pkthdr.len, address, qpn); /* - * We put the skb into the tx_ring _before_ we call post_send() + * We put the mb into the tx_ring _before_ we call post_send() * because it's entirely possible that the completion handler will * run before we execute anything after the post_send(). That * means we have to make sure everything is properly recorded and * our state is consistent before we call post_send(). */ tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)]; - tx_req->skb = skb; - if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { - ++dev->stats.tx_errors; - dev_kfree_skb_any(skb); + tx_req->mb = mb; + if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req, IPOIB_UD_TX_SG))) { + ++dev->if_oerrors; + if (tx_req->mb) + m_freem(tx_req->mb); return; } - if (skb->ip_summed == CHECKSUM_PARTIAL) + if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP)) priv->tx_wr.send_flags |= IB_SEND_IP_CSUM; else priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; @@ -584,40 +533,31 @@ ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) ipoib_warn(priv, "request notify on send CQ failed\n"); - netif_stop_queue(dev); + dev->if_drv_flags |= IFF_DRV_OACTIVE; } - if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), - address->ah, qpn, tx_req, phead, hlen))) { + if (unlikely(post_send(priv, + priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn, + tx_req, phead, hlen))) { ipoib_warn(priv, "post_send failed\n"); - ++dev->stats.tx_errors; + ++dev->if_oerrors; --priv->tx_outstanding; ipoib_dma_unmap_tx(priv->ca, tx_req); - dev_kfree_skb_any(skb); - if (netif_queue_stopped(dev)) - netif_wake_queue(dev); + m_freem(mb); + if (dev->if_drv_flags & IFF_DRV_OACTIVE) + dev->if_drv_flags &= ~IFF_DRV_OACTIVE; } else { - dev->trans_start = jiffies; - address->last_send = priv->tx_head; ++priv->tx_head; - skb_orphan(skb); - } - - if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) - while (poll_tx(priv)) - ; /* nothing */ } -static void __ipoib_reap_ah(struct net_device *dev) +static void __ipoib_reap_ah(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_ah *ah, *tah; LIST_HEAD(remove_list); unsigned long flags; - netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) @@ -628,31 +568,28 @@ } spin_unlock_irqrestore(&priv->lock, flags); - netif_tx_unlock_bh(dev); } void ipoib_reap_ah(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, ah_reap_task.work); - struct net_device *dev = priv->dev; - __ipoib_reap_ah(dev); + __ipoib_reap_ah(priv); if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, - round_jiffies_relative(HZ)); + HZ); } -static void ipoib_ah_dev_cleanup(struct net_device *dev) +static void ipoib_ah_dev_cleanup(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); unsigned long begin; begin = jiffies; while (!list_empty(&priv->dead_ahs)) { - __ipoib_reap_ah(dev); + __ipoib_reap_ah(priv); if (time_after(jiffies, begin + HZ)) { ipoib_warn(priv, "timing 
out; will leak address handles\n"); @@ -665,12 +602,11 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx) { - drain_tx_cq((struct net_device *)ctx); + drain_tx_cq((struct ipoib_dev_priv *)ctx); } -int ipoib_ib_dev_open(struct net_device *dev) +int ipoib_ib_dev_open(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) { @@ -680,39 +616,34 @@ } set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - ret = ipoib_init_qp(dev); + ret = ipoib_init_qp(priv); if (ret) { ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); return -1; } - ret = ipoib_ib_post_receives(dev); + ret = ipoib_ib_post_receives(priv); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); + ipoib_ib_dev_stop(priv, 1); return -1; } - ret = ipoib_cm_dev_open(dev); + ret = ipoib_cm_dev_open(priv); if (ret) { ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); + ipoib_ib_dev_stop(priv, 1); return -1; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, - round_jiffies_relative(HZ)); + queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); - if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) - napi_enable(&priv->napi); - return 0; } -static void ipoib_pkey_dev_check_presence(struct net_device *dev) +static void ipoib_pkey_dev_check_presence(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); u16 pkey_index = 0; if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) @@ -721,11 +652,10 @@ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } -int ipoib_ib_dev_up(struct net_device *dev) +int ipoib_ib_dev_up(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - ipoib_pkey_dev_check_presence(dev); + ipoib_pkey_dev_check_presence(priv); if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { ipoib_dbg(priv, "PKEY is not assigned.\n"); @@ -734,17 +664,16 @@ set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); - return ipoib_mcast_start_thread(dev); + return ipoib_mcast_start_thread(priv); } -int ipoib_ib_dev_down(struct net_device *dev, int flush) +int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush) { - struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_dbg(priv, "downing ib_dev\n"); clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); - netif_carrier_off(dev); + if_link_state_change(priv->dev, LINK_STATE_DOWN); /* Shutdown the P_Key thread if still active */ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { @@ -756,39 +685,30 @@ flush_workqueue(ipoib_workqueue); } - ipoib_mcast_stop_thread(dev, flush); - ipoib_mcast_dev_flush(dev); + ipoib_mcast_stop_thread(priv, flush); + ipoib_mcast_dev_flush(priv); - ipoib_flush_paths(dev); + ipoib_flush_paths(priv); return 0; } -static int recvs_pending(struct net_device *dev) +static int recvs_pending(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); int pending = 0; int i; for (i = 0; i < ipoib_recvq_size; ++i) - if (priv->rx_ring[i].skb) + if (priv->rx_ring[i].mb) ++pending; return pending; } -void ipoib_drain_cq(struct net_device *dev) +void ipoib_drain_cq(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); int i, n; - /* - * We call completion handling routines that expect to be - * called from the BH-disabled NAPI poll context, so disable - * BHs here too. 
- */ - local_bh_disable(); - do { n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); for (i = 0; i < n; ++i) { @@ -800,35 +720,32 @@ if (priv->ibwc[i].status == IB_WC_SUCCESS) priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; - if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) { - if (priv->ibwc[i].wr_id & IPOIB_OP_CM) - ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); - else - ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); - } else - ipoib_cm_handle_tx_wc(dev, priv->ibwc + i); + if ((priv->ibwc[i].wr_id & IPOIB_OP_RECV) == 0) + panic("ipoib_drain_cq: Bad wrid 0x%jX\n", + (intmax_t)priv->ibwc[i].wr_id); + if (priv->ibwc[i].wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(priv, priv->ibwc + i); + else + ipoib_ib_handle_rx_wc(priv, priv->ibwc + i); } } while (n == IPOIB_NUM_WC); - while (poll_tx(priv)) + spin_lock(&priv->lock); + while (ipoib_poll_tx(priv)) ; /* nothing */ - local_bh_enable(); + spin_unlock(&priv->lock); } -int ipoib_ib_dev_stop(struct net_device *dev, int flush) +int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_attr qp_attr; unsigned long begin; struct ipoib_tx_buf *tx_req; int i; - if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) - napi_disable(&priv->napi); + ipoib_cm_dev_stop(priv); - ipoib_cm_dev_stop(dev); - /* * Move our QP to the error state and then reinitialize in * when all work requests have completed or have been flushed. @@ -840,10 +757,10 @@ /* Wait for all sends and receives to complete */ begin = jiffies; - while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) { + while (priv->tx_head != priv->tx_tail || recvs_pending(priv)) { if (time_after(jiffies, begin + 5 * HZ)) { ipoib_warn(priv, "timing out; %d sends %d receives not completed\n", - priv->tx_head - priv->tx_tail, recvs_pending(dev)); + priv->tx_head - priv->tx_tail, recvs_pending(priv)); /* * assume the HW is wedged and just free up @@ -853,7 +770,7 @@ tx_req = &priv->tx_ring[priv->tx_tail & (ipoib_sendq_size - 1)]; ipoib_dma_unmap_tx(priv->ca, tx_req); - dev_kfree_skb_any(tx_req->skb); + m_freem(tx_req->mb); ++priv->tx_tail; --priv->tx_outstanding; } @@ -862,18 +779,17 @@ struct ipoib_rx_buf *rx_req; rx_req = &priv->rx_ring[i]; - if (!rx_req->skb) + if (!rx_req->mb) continue; - ipoib_ud_dma_unmap_rx(priv, - priv->rx_ring[i].mapping); - dev_kfree_skb_any(rx_req->skb); - rx_req->skb = NULL; + ipoib_dma_unmap_rx(priv, &priv->rx_ring[i]); + m_freem(rx_req->mb); + rx_req->mb = NULL; } goto timeout; } - ipoib_drain_cq(dev); + ipoib_drain_cq(priv); msleep(1); } @@ -892,32 +808,32 @@ if (flush) flush_workqueue(ipoib_workqueue); - ipoib_ah_dev_cleanup(dev); + ipoib_ah_dev_cleanup(priv); ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); return 0; } -int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) +int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ifnet *dev = priv->dev; priv->ca = ca; priv->port = port; priv->qp = NULL; - if (ipoib_transport_dev_init(dev, ca)) { + if (ipoib_transport_dev_init(priv, ca)) { printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name); return -ENODEV; } setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func, - (unsigned long) dev); + (unsigned long) priv); - if (dev->flags & IFF_UP) { - if (ipoib_ib_dev_open(dev)) { - ipoib_transport_dev_cleanup(dev); + if (dev->if_flags & IFF_UP) { + if (ipoib_ib_dev_open(priv)) { + ipoib_transport_dev_cleanup(priv); return -ENODEV; } } @@ 
-929,7 +845,6 @@ enum ipoib_flush_level level) { struct ipoib_dev_priv *cpriv; - struct net_device *dev = priv->dev; u16 new_index; mutex_lock(&priv->vlan_mutex); @@ -956,9 +871,9 @@ if (level == IPOIB_FLUSH_HEAVY) { if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - ipoib_ib_dev_down(dev, 0); - ipoib_ib_dev_stop(dev, 0); - if (ipoib_pkey_dev_delay_open(dev)) + ipoib_ib_dev_down(priv, 0); + ipoib_ib_dev_stop(priv, 0); + if (ipoib_pkey_dev_delay_open(priv)) return; } @@ -972,16 +887,16 @@ } if (level == IPOIB_FLUSH_LIGHT) { - ipoib_mark_paths_invalid(dev); - ipoib_mcast_dev_flush(dev); + ipoib_mark_paths_invalid(priv); + ipoib_mcast_dev_flush(priv); } if (level >= IPOIB_FLUSH_NORMAL) - ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_down(priv, 0); if (level == IPOIB_FLUSH_HEAVY) { - ipoib_ib_dev_stop(dev, 0); - ipoib_ib_dev_open(dev); + ipoib_ib_dev_stop(priv, 0); + ipoib_ib_dev_open(priv); } /* @@ -990,7 +905,7 @@ */ if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { if (level >= IPOIB_FLUSH_NORMAL) - ipoib_ib_dev_up(dev); + ipoib_ib_dev_up(priv); ipoib_mcast_restart_task(&priv->restart_task); } } @@ -1019,17 +934,16 @@ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY); } -void ipoib_ib_dev_cleanup(struct net_device *dev) +void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_dbg(priv, "cleaning up ib_dev\n"); - ipoib_mcast_stop_thread(dev, 1); - ipoib_mcast_dev_flush(dev); + ipoib_mcast_stop_thread(priv, 1); + ipoib_mcast_dev_flush(priv); - ipoib_ah_dev_cleanup(dev); - ipoib_transport_dev_cleanup(dev); + ipoib_ah_dev_cleanup(priv); + ipoib_transport_dev_cleanup(priv); } /* @@ -1046,12 +960,11 @@ { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, pkey_poll_task.work); - struct net_device *dev = priv->dev; - ipoib_pkey_dev_check_presence(dev); + ipoib_pkey_dev_check_presence(priv); if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) - ipoib_open(dev); + ipoib_open(priv); else { mutex_lock(&pkey_mutex); if (!test_bit(IPOIB_PKEY_STOP, &priv->flags)) @@ -1062,13 +975,12 @@ } } -int ipoib_pkey_dev_delay_open(struct net_device *dev) +int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = netdev_priv(dev); /* Look for the interface pkey value in the IB Port P_Key table and */ /* set the interface pkey assigment flag */ - ipoib_pkey_dev_check_presence(dev); + ipoib_pkey_dev_check_presence(priv); /* P_Key value not assigned yet - start polling */ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c (.../head) (revision 219811) @@ -29,371 +29,257 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include -#include -#include -#include #include "sdp.h" SDP_MODPARAM_INT(rcvbuf_initial_size, 32 * 1024, "Receive buffer initial size in bytes."); -SDP_MODPARAM_SINT(rcvbuf_scale, 0x10, +SDP_MODPARAM_SINT(rcvbuf_scale, 0x8, "Receive buffer size scale factor."); -SDP_MODPARAM_INT(top_mem_usage, 0, - "Top system wide sdp memory usage for recv (in MB)."); -#ifdef CONFIG_PPC -SDP_MODPARAM_SINT(max_large_sockets, 100, - "Max number of large sockets (32k buffers)."); -#else -SDP_MODPARAM_SINT(max_large_sockets, 1000, - "Max number of large sockets (32k buffers)."); -#endif - -static int curr_large_sockets; -atomic_t sdp_current_mem_usage; -spinlock_t sdp_large_sockets_lock; - -static int sdp_get_large_socket(const struct sdp_sock *ssk) -{ - int ret; - - if (ssk->recv_request) - return 1; - - spin_lock_irq(&sdp_large_sockets_lock); - ret = curr_large_sockets < max_large_sockets; - if (ret) - curr_large_sockets++; - spin_unlock_irq(&sdp_large_sockets_lock); - - return ret; -} - -void sdp_remove_large_sock(const struct sdp_sock *ssk) -{ - if (ssk->recv_frags) { - spin_lock_irq(&sdp_large_sockets_lock); - curr_large_sockets--; - spin_unlock_irq(&sdp_large_sockets_lock); - } -} - /* Like tcp_fin - called when SDP_MID_DISCONNECT is received */ -void sdp_handle_disconn(struct sock *sk) +static void +sdp_handle_disconn(struct sdp_sock *ssk) { - sdp_dbg(sk, "%s\n", __func__); - sk->sk_shutdown |= RCV_SHUTDOWN; - sock_set_flag(sk, SOCK_DONE); + sdp_dbg(ssk->socket, "%s\n", __func__); - switch (sk->sk_state) { - case TCP_SYN_RECV: - case TCP_ESTABLISHED: - sdp_exch_state(sk, TCPF_SYN_RECV | TCPF_ESTABLISHED, - TCP_CLOSE_WAIT); + SDP_WLOCK_ASSERT(ssk); + if (TCPS_HAVERCVDFIN(ssk->state) == 0) + socantrcvmore(ssk->socket); + + switch (ssk->state) { + case TCPS_SYN_RECEIVED: + case TCPS_ESTABLISHED: + ssk->state = TCPS_CLOSE_WAIT; break; - case TCP_FIN_WAIT1: + case TCPS_FIN_WAIT_1: /* Received a reply FIN - start Infiniband tear down */ - sdp_dbg(sk, "%s: Starting Infiniband tear down sending DREQ\n", - __func__); + sdp_dbg(ssk->socket, + "%s: Starting Infiniband tear down sending DREQ\n", + __func__); - sdp_cancel_dreq_wait_timeout(sdp_sk(sk)); + sdp_cancel_dreq_wait_timeout(ssk); + ssk->qp_active = 0; + if (ssk->id) { + struct rdma_cm_id *id; - sdp_exch_state(sk, TCPF_FIN_WAIT1, TCP_TIME_WAIT); - - if (sdp_sk(sk)->id) { - sdp_sk(sk)->qp_active = 0; - rdma_disconnect(sdp_sk(sk)->id); + id = ssk->id; + SDP_WUNLOCK(ssk); + rdma_disconnect(id); + SDP_WLOCK(ssk); } else { - /* possible in a case of device removal */ - sdp_dbg(sk, "sdp_sk(sk)->id is NULL\n"); + sdp_warn(ssk->socket, + "%s: ssk->id is NULL\n", __func__); return; } break; - case TCP_TIME_WAIT: + case TCPS_TIME_WAIT: /* This is a mutual close situation and we've got the DREQ from the peer before the SDP_MID_DISCONNECT */ break; - case TCP_CLOSE: + case TCPS_CLOSED: /* FIN arrived after IB teardown started - do nothing */ - sdp_dbg(sk, "%s: fin in state %s\n", - __func__, sdp_state_str(sk->sk_state)); + sdp_dbg(ssk->socket, "%s: fin in state %s\n", + __func__, sdp_state_str(ssk->state)); return; default: - sdp_warn(sk, "%s: FIN in unexpected state. sk->sk_state=%s\n", - __func__, sdp_state_str(sk->sk_state)); + sdp_warn(ssk->socket, + "%s: FIN in unexpected state. state=%d\n", + __func__, ssk->state); break; } - - - sk_mem_reclaim(sk); - - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_state_change(sk); - - /* Do not send POLL_HUP for half duplex close. 
*/ - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, 1, POLL_HUP); - else - sk_wake_async(sk, 1, POLL_IN); - } } -static int sdp_post_recv(struct sdp_sock *ssk) +static int +sdp_post_recv(struct sdp_sock *ssk) { struct sdp_buf *rx_req; - int i, rc, frags; + int i, rc; u64 addr; struct ib_device *dev; struct ib_recv_wr rx_wr = { NULL }; struct ib_sge ibsge[SDP_MAX_RECV_SGES]; struct ib_sge *sge = ibsge; struct ib_recv_wr *bad_wr; - struct sk_buff *skb; - struct page *page; - skb_frag_t *frag; + struct mbuf *mb, *m; struct sdp_bsdh *h; int id = ring_head(ssk->rx_ring); - gfp_t gfp_page; - int pages_alloced = 0; /* Now, allocate and repost recv */ - /* TODO: allocate from cache */ - - if (unlikely(sk_ssk(ssk)->sk_allocation)) { - skb = sdp_stream_alloc_skb(sk_ssk(ssk), SDP_SKB_HEAD_SIZE, - sk_ssk(ssk)->sk_allocation); - gfp_page = sk_ssk(ssk)->sk_allocation | __GFP_HIGHMEM; - } else { - skb = sdp_stream_alloc_skb(sk_ssk(ssk), SDP_SKB_HEAD_SIZE, - GFP_KERNEL); - gfp_page = GFP_HIGHUSER; - } - - if (unlikely(!skb)) + sdp_prf(ssk->socket, mb, "Posting mb"); + mb = m_getm2(NULL, ssk->recv_bytes, M_NOWAIT, MT_DATA, M_PKTHDR); + if (mb == NULL) { + /* Retry so we can't stall out with no memory. */ + if (!rx_ring_posted(ssk)) + queue_work(rx_comp_wq, &ssk->rx_comp_work); return -1; - - sdp_prf(sk_ssk(ssk), skb, "Posting skb"); - h = (struct sdp_bsdh *)skb->head; - - rx_req = ssk->rx_ring.buffer + (id & (SDP_RX_SIZE - 1)); - rx_req->skb = skb; - - for (i = 0; i < ssk->recv_frags; ++i) { - if (rx_req->mapping[i + 1]) - page = rx_req->pages[i]; - else { - if (unlikely(!sdp_has_free_mem())) - goto err; - rx_req->pages[i] = page = alloc_pages(gfp_page, 0); - if (unlikely(!page)) - goto err; - pages_alloced++; - } - frag = &skb_shinfo(skb)->frags[i]; - frag->page = page; - frag->page_offset = 0; - frag->size = min(PAGE_SIZE, SDP_MAX_PAYLOAD); - ++skb_shinfo(skb)->nr_frags; } - skb->truesize += ssk->recv_frags * min(PAGE_SIZE, SDP_MAX_PAYLOAD); - + for (m = mb; m != NULL; m = m->m_next) { + m->m_len = (m->m_flags & M_EXT) ? m->m_ext.ext_size : + ((m->m_flags & M_PKTHDR) ? 
MHLEN : MLEN); + mb->m_pkthdr.len += m->m_len; + } + h = mtod(mb, struct sdp_bsdh *); + rx_req = ssk->rx_ring.buffer + (id & (SDP_RX_SIZE - 1)); + rx_req->mb = mb; dev = ssk->ib_device; - addr = ib_dma_map_single(dev, h, SDP_SKB_HEAD_SIZE, DMA_FROM_DEVICE); - BUG_ON(ib_dma_mapping_error(dev, addr)); - - rx_req->mapping[0] = addr; - - /* TODO: proper error handling */ - sge->addr = (u64)addr; - sge->length = SDP_SKB_HEAD_SIZE; - sge->lkey = ssk->sdp_dev->mr->lkey; - frags = skb_shinfo(skb)->nr_frags; - for (i = 0; i < frags; ++i) { - ++sge; - if (rx_req->mapping[i + 1]) { - addr = rx_req->mapping[i + 1]; - } else { - addr = ib_dma_map_page(dev, skb_shinfo(skb)->frags[i].page, - skb_shinfo(skb)->frags[i].page_offset, - skb_shinfo(skb)->frags[i].size, - DMA_FROM_DEVICE); - BUG_ON(ib_dma_mapping_error(dev, addr)); - rx_req->mapping[i + 1] = addr; - } + for (i = 0; mb != NULL; i++, mb = mb->m_next, sge++) { + addr = ib_dma_map_single(dev, mb->m_data, mb->m_len, + DMA_TO_DEVICE); + /* TODO: proper error handling */ + BUG_ON(ib_dma_mapping_error(dev, addr)); + BUG_ON(i >= SDP_MAX_RECV_SGES); + rx_req->mapping[i] = addr; sge->addr = addr; - sge->length = skb_shinfo(skb)->frags[i].size; + sge->length = mb->m_len; sge->lkey = ssk->sdp_dev->mr->lkey; - } + } rx_wr.next = NULL; rx_wr.wr_id = id | SDP_OP_RECV; rx_wr.sg_list = ibsge; - rx_wr.num_sge = frags + 1; + rx_wr.num_sge = i; rc = ib_post_recv(ssk->qp, &rx_wr, &bad_wr); if (unlikely(rc)) { - sdp_warn(sk_ssk(ssk), "ib_post_recv failed. status %d\n", rc); - goto err; + sdp_warn(ssk->socket, "ib_post_recv failed. status %d\n", rc); + + sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE); + m_freem(mb); + + sdp_notify(ssk, ECONNRESET); + + return -1; } atomic_inc(&ssk->rx_ring.head); SDPSTATS_COUNTER_INC(post_recv); - atomic_add(pages_alloced, &sdp_current_mem_usage); return 0; - -err: - atomic_add(pages_alloced, &sdp_current_mem_usage); - sdp_cleanup_sdp_buf(ssk, rx_req, SDP_SKB_HEAD_SIZE, DMA_FROM_DEVICE); - sdp_free_skb(skb); - sdp_reset(sk_ssk(ssk)); - return -1; } -static inline int sdp_post_recvs_needed(struct sdp_sock *ssk) +static inline int +sdp_post_recvs_needed(struct sdp_sock *ssk) { - struct sock *sk = sk_ssk(ssk); - int buffer_size = SDP_SKB_HEAD_SIZE + ssk->recv_frags * PAGE_SIZE; - unsigned long max_bytes = ssk->rcvbuf_scale; unsigned long bytes_in_process; - int posted = rx_ring_posted(ssk); + unsigned long max_bytes; + int buffer_size; + int posted; - if (unlikely(!ssk->qp_active || !sdp_has_free_mem())) + if (!ssk->qp_active || !ssk->socket) return 0; - if (likely(posted >= SDP_RX_SIZE)) + posted = rx_ring_posted(ssk); + if (posted >= SDP_RX_SIZE) return 0; - - if (unlikely(posted < SDP_MIN_TX_CREDITS)) + if (posted < SDP_MIN_TX_CREDITS) return 1; - /* If rcvbuf is very small, must leave at least 1 skb for data, - * in addition to SDP_MIN_TX_CREDITS */ - max_bytes *= max(sk->sk_rcvbuf, (1 + SDP_MIN_TX_CREDITS) * buffer_size); - - /* Bytes posted to HW */ + buffer_size = ssk->recv_bytes; + max_bytes = max(ssk->socket->so_snd.sb_hiwat, + (1 + SDP_MIN_TX_CREDITS) * buffer_size); + max_bytes *= rcvbuf_scale; + /* + * Compute bytes in the receive queue and socket buffer. 
+ */ bytes_in_process = (posted - SDP_MIN_TX_CREDITS) * buffer_size; + bytes_in_process += ssk->socket->so_rcv.sb_cc; - /* Bytes waiting in socket RX queue */ - bytes_in_process += rcv_nxt(ssk) - ssk->copied_seq; - return bytes_in_process < max_bytes; } -static inline void sdp_post_recvs(struct sdp_sock *ssk) +static inline void +sdp_post_recvs(struct sdp_sock *ssk) { -again: - while (sdp_post_recvs_needed(ssk)) { - if (sdp_post_recv(ssk)) - goto out; - } - sk_mem_reclaim(sk_ssk(ssk)); - - if (sdp_post_recvs_needed(ssk)) - goto again; -out: - sk_mem_reclaim(sk_ssk(ssk)); + while (sdp_post_recvs_needed(ssk)) + if (sdp_post_recv(ssk)) + return; } -static inline struct sk_buff *sdp_sock_queue_rcv_skb(struct sock *sk, - struct sk_buff *skb) +static inline struct mbuf * +sdp_sock_queue_rcv_mb(struct socket *sk, struct mbuf *mb) { - int skb_len; struct sdp_sock *ssk = sdp_sk(sk); - struct sdp_bsdh *h = (struct sdp_bsdh *)skb_transport_header(skb); + struct sdp_bsdh *h; - /* not needed since sk_rmem_alloc is not currently used - * TODO - remove this? - skb_set_owner_r(skb, sk); */ + h = mtod(mb, struct sdp_bsdh *); - SDP_SKB_CB(skb)->seq = rcv_nxt(ssk); - if (unlikely(h->flags & SDP_OOB_PRES)) - sdp_urg(ssk, skb); - +#ifdef SDP_ZCOPY + SDP_SKB_CB(mb)->seq = rcv_nxt(ssk); if (h->mid == SDP_MID_SRCAVAIL) { struct sdp_srcah *srcah = (struct sdp_srcah *)(h+1); struct rx_srcavail_state *rx_sa; + + ssk->srcavail_cancel_mseq = 0; - SDP_WARN_ON(ssk->rx_sa); - ssk->rx_sa = rx_sa = RX_SRCAVAIL_STATE(skb) = kzalloc( - sizeof(struct rx_srcavail_state), GFP_ATOMIC); - if (unlikely(!rx_sa)) { - /* if there is no space, fall to BCopy. */ - sdp_dbg(sk, "Can't allocate memory for rx_sa\n"); - h->mid = SDP_MID_DATA; - goto mid_data; - } + ssk->rx_sa = rx_sa = RX_SRCAVAIL_STATE(mb) = kzalloc( + sizeof(struct rx_srcavail_state), M_NOWAIT); rx_sa->mseq = ntohl(h->mseq); - rx_sa->len = skb_len = ntohl(srcah->len); + rx_sa->used = 0; + rx_sa->len = mb_len = ntohl(srcah->len); rx_sa->rkey = ntohl(srcah->rkey); rx_sa->vaddr = be64_to_cpu(srcah->vaddr); - rx_sa->skb = skb; + rx_sa->flags = 0; if (ssk->tx_sa) { - sdp_dbg_data(sk_ssk(ssk), "got RX SrcAvail while waiting " + sdp_dbg_data(ssk->socket, "got RX SrcAvail while waiting " "for TX SrcAvail. waking up TX SrcAvail" "to be aborted\n"); wake_up(sk->sk_sleep); } - atomic_add(skb->len, &ssk->rcv_nxt); - sdp_dbg_data(sk, "queueing SrcAvail. skb_len = %d vaddr = %lld\n", - skb_len, rx_sa->vaddr); - } else { -mid_data: - skb_len = skb->len; - - atomic_add(skb_len, &ssk->rcv_nxt); + atomic_add(mb->len, &ssk->rcv_nxt); + sdp_dbg_data(sk, "queueing SrcAvail. 
mb_len = %d vaddr = %lld\n", + mb_len, rx_sa->vaddr); + } else +#endif + { + atomic_add(mb->m_pkthdr.len, &ssk->rcv_nxt); } - skb_queue_tail(&sk->sk_receive_queue, skb); - - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); - return skb; + m_adj(mb, SDP_HEAD_SIZE); + SOCKBUF_LOCK(&sk->so_rcv); + if (unlikely(h->flags & SDP_OOB_PRES)) + sdp_urg(ssk, mb); + sbappend_locked(&sk->so_rcv, mb); + sorwakeup_locked(sk); + return mb; } -static int sdp_get_recv_sges(struct sdp_sock *ssk, u32 new_size) +static int +sdp_get_recv_bytes(struct sdp_sock *ssk, u32 new_size) { - int recv_sges = ssk->max_sge - 1; /* 1 sge is dedicated to sdp header */ - recv_sges = MIN(recv_sges, PAGE_ALIGN(new_size) >> PAGE_SHIFT); - recv_sges = MIN(recv_sges, SDP_MAX_RECV_SGES - 1); - - return recv_sges; + return MIN(new_size, SDP_MAX_PACKET); } -int sdp_init_buffers(struct sdp_sock *ssk, u32 new_size) +int +sdp_init_buffers(struct sdp_sock *ssk, u32 new_size) { - ssk->recv_frags = sdp_get_recv_sges(ssk, new_size); - ssk->rcvbuf_scale = rcvbuf_scale; + ssk->recv_bytes = sdp_get_recv_bytes(ssk, new_size); sdp_post_recvs(ssk); return 0; } -int sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size) +int +sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size) { - u32 curr_size = ssk->recv_frags << PAGE_SHIFT; - u32 max_size = (ssk->max_sge - 1) << PAGE_SHIFT; + u32 curr_size = ssk->recv_bytes; + u32 max_size = SDP_MAX_PACKET; - if (new_size > curr_size && new_size <= max_size && - sdp_get_large_socket(ssk)) { - ssk->rcvbuf_scale = rcvbuf_scale; - ssk->recv_frags = sdp_get_recv_sges(ssk, new_size); + if (new_size > curr_size && new_size <= max_size) { + ssk->recv_bytes = sdp_get_recv_bytes(ssk, new_size); return 0; - } else - return -1; + } + return -1; } -static void sdp_handle_resize_request(struct sdp_sock *ssk, - struct sdp_chrecvbuf *buf) +static void +sdp_handle_resize_request(struct sdp_sock *ssk, struct sdp_chrecvbuf *buf) { if (sdp_resize_buffers(ssk, ntohl(buf->size)) == 0) ssk->recv_request_head = ring_head(ssk->rx_ring) + 1; @@ -402,143 +288,138 @@ ssk->recv_request = 1; } -static void sdp_handle_resize_ack(struct sdp_sock *ssk, - struct sdp_chrecvbuf *buf) +static void +sdp_handle_resize_ack(struct sdp_sock *ssk, struct sdp_chrecvbuf *buf) { u32 new_size = ntohl(buf->size); - if (new_size > ssk->xmit_size_goal) { - ssk->sent_request = -1; + if (new_size > ssk->xmit_size_goal) ssk->xmit_size_goal = new_size; - ssk->send_frags = - PAGE_ALIGN(ssk->xmit_size_goal) / PAGE_SIZE + 1; - } else - ssk->sent_request = 0; } -static void sdp_reuse_sdp_buf(struct sdp_sock *ssk, struct sdp_buf *sbuf, int len) +static struct mbuf * +sdp_recv_completion(struct sdp_sock *ssk, int id) { - int i; - struct sk_buff *skb; - struct ib_device *dev = ssk->ib_device; - enum dma_data_direction dir = DMA_FROM_DEVICE; - int used; - - skb = sbuf->skb; - - ib_dma_unmap_single(dev, sbuf->mapping[0], SDP_SKB_HEAD_SIZE, dir); - used = SDP_SKB_HEAD_SIZE; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (used >= len) { - skb->truesize -= min(PAGE_SIZE, SDP_MAX_PAYLOAD) * - (skb_shinfo(skb)->nr_frags - i); - skb_shinfo(skb)->nr_frags = i; - break; - } - - ib_dma_unmap_page(dev, sbuf->mapping[i + 1], - skb_shinfo(skb)->frags[i].size, - dir); - sbuf->mapping[i + 1] = 0; - - used += skb_shinfo(skb)->frags[i].size; - } -} - -static struct sk_buff *sdp_recv_completion(struct sdp_sock *ssk, int id, int len) -{ struct sdp_buf *rx_req; struct ib_device *dev; - struct sk_buff *skb; + struct mbuf *mb; if (unlikely(id != 
ring_tail(ssk->rx_ring))) { - sdp_warn(sk_ssk(ssk), "Bogus recv completion id %d tail %d\n", + printk(KERN_WARNING "Bogus recv completion id %d tail %d\n", id, ring_tail(ssk->rx_ring)); return NULL; } dev = ssk->ib_device; rx_req = &ssk->rx_ring.buffer[id & (SDP_RX_SIZE - 1)]; - skb = rx_req->skb; - sdp_reuse_sdp_buf(ssk, rx_req, len); + mb = rx_req->mb; + sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE); atomic_inc(&ssk->rx_ring.tail); atomic_dec(&ssk->remote_credits); - return skb; + return mb; } /* socket lock should be taken before calling this */ -static int sdp_process_rx_ctl_skb(struct sdp_sock *ssk, struct sk_buff *skb) +static int +sdp_process_rx_ctl_mb(struct sdp_sock *ssk, struct mbuf *mb) { - struct sdp_bsdh *h = (struct sdp_bsdh *)skb_transport_header(skb); - struct sock *sk = sk_ssk(ssk); + struct sdp_bsdh *h; + struct socket *sk; - sdp_dbg_data(sk, "Handling %s\n", mid2str(h->mid)); - sdp_prf(sk, skb, "Handling %s", mid2str(h->mid)); - + SDP_WLOCK_ASSERT(ssk); + sk = ssk->socket; + h = mtod(mb, struct sdp_bsdh *); switch (h->mid) { case SDP_MID_DATA: case SDP_MID_SRCAVAIL: - SDP_WARN_ON(!(sk->sk_shutdown & RCV_SHUTDOWN)); - sdp_dbg(sk, "DATA after socket rcv was shutdown\n"); /* got data in RCV_SHUTDOWN */ - if (sk->sk_state == TCP_FIN_WAIT1) { + if (ssk->state == TCPS_FIN_WAIT_1) { sdp_dbg(sk, "RX data when state = FIN_WAIT1\n"); - /* go into abortive close */ - sdp_exch_state(sk, TCPF_FIN_WAIT1, - TCP_TIME_WAIT); - - sk->sk_prot->disconnect(sk, 0); + sdp_notify(ssk, ECONNRESET); } + m_freem(mb); + break; +#ifdef SDP_ZCOPY case SDP_MID_RDMARDCOMPL: - sdp_warn(sk, "Handling RdmaRdCompl - ERROR\n"); + m_freem(mb); break; case SDP_MID_SENDSM: sdp_handle_sendsm(ssk, ntohl(h->mseq_ack)); + m_freem(mb); break; case SDP_MID_SRCAVAIL_CANCEL: - if (ssk->rx_sa && after(ntohl(h->mseq), ssk->rx_sa->mseq) && - !ssk->tx_ring.rdma_inflight) { - sdp_abort_rx_srcavail(sk); - sdp_post_sendsm(sk); + sdp_dbg_data(sk, "Handling SrcAvailCancel\n"); + sdp_prf(sk, NULL, "Handling SrcAvailCancel"); + if (ssk->rx_sa) { + ssk->srcavail_cancel_mseq = ntohl(h->mseq); + ssk->rx_sa->flags |= RX_SA_ABORTED; + ssk->rx_sa = NULL; /* TODO: change it into SDP_MID_DATA and get + the dirty logic from recvmsg */ + } else { + sdp_dbg(sk, "Got SrcAvailCancel - " + "but no SrcAvail in process\n"); } + m_freem(mb); break; case SDP_MID_SINKAVAIL: + sdp_dbg_data(sk, "Got SinkAvail - not supported: ignored\n"); + sdp_prf(sk, NULL, "Got SinkAvail - not supported: ignored"); + /* FALLTHROUGH */ +#endif case SDP_MID_ABORT: - sdp_reset(sk); + sdp_dbg_data(sk, "Handling ABORT\n"); + sdp_prf(sk, NULL, "Handling ABORT"); + sdp_notify(ssk, ECONNRESET); + m_freem(mb); break; case SDP_MID_DISCONN: - sdp_handle_disconn(sk); + sdp_dbg_data(sk, "Handling DISCONN\n"); + sdp_prf(sk, NULL, "Handling DISCONN"); + sdp_handle_disconn(ssk); break; case SDP_MID_CHRCVBUF: + sdp_dbg_data(sk, "Handling RX CHRCVBUF\n"); sdp_handle_resize_request(ssk, (struct sdp_chrecvbuf *)(h+1)); + m_freem(mb); break; case SDP_MID_CHRCVBUF_ACK: + sdp_dbg_data(sk, "Handling RX CHRCVBUF_ACK\n"); sdp_handle_resize_ack(ssk, (struct sdp_chrecvbuf *)(h+1)); + m_freem(mb); break; default: /* TODO: Handle other messages */ sdp_warn(sk, "SDP: FIXME MID %d\n", h->mid); + m_freem(mb); } - sdp_free_skb(skb); return 0; } -static int sdp_process_rx_skb(struct sdp_sock *ssk, struct sk_buff *skb) +static int +sdp_process_rx_mb(struct sdp_sock *ssk, struct mbuf *mb) { - struct sock *sk = sk_ssk(ssk); - int frags; + struct socket *sk; struct sdp_bsdh *h; - int pagesz, i; 
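/*
 * Editor's note -- annotation added for this review, not part of the
 * original patch.  The hunk below swaps the Linux skb bookkeeping for
 * mbufs, but the SDP flow-control arithmetic is unchanged: each
 * arriving header carries the peer's ack (mseq_ack) and its advertised
 * receive-buffer count (h->bufs), from which the new send-credit total
 * is derived as credits = mseq_ack - ring_head + 1 + bufs.  A minimal
 * standalone sketch of that accounting follows, assuming ring_head is
 * the sequence number of the next packet to be posted and that every
 * posted-but-unacked packet pins one peer receive buffer; the helper
 * name sdp_sketch_tx_credits is hypothetical, and u32/u16 are the
 * kernel fixed-width types already used throughout this file.
 */
static inline int
sdp_sketch_tx_credits(u32 mseq_ack, u32 ring_head, u16 bufs)
{
	/* Packets still in flight: posted after the last acked mseq. */
	u32 inflight = ring_head - (mseq_ack + 1);

	/*
	 * Advertised buffers minus in-flight packets; algebraically the
	 * same as mseq_ack - ring_head + 1 + bufs used in the hunk below.
	 */
	return ((int)(bufs - inflight));
}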
unsigned long mseq_ack; int credits_before; - h = (struct sdp_bsdh *)skb_transport_header(skb); + h = mtod(mb, struct sdp_bsdh *); + sk = ssk->socket; + /* + * If another thread is in so_pcbfree this may be partially torn + * down but no further synchronization is required as the destroying + * thread will wait for receive to shutdown before discarding the + * socket. + */ + if (sk == NULL) { + m_freem(mb); + return 0; + } SDPSTATS_HIST_LINEAR(credits_before_update, tx_credits(ssk)); @@ -546,96 +427,78 @@ credits_before = tx_credits(ssk); atomic_set(&ssk->tx_ring.credits, mseq_ack - ring_head(ssk->tx_ring) + 1 + ntohs(h->bufs)); - if (!before(mseq_ack, ssk->nagle_last_unacked)) + if (mseq_ack >= ssk->nagle_last_unacked) ssk->nagle_last_unacked = 0; - sdp_prf1(sk_ssk(ssk), skb, "RX: %s +%d c:%d->%d mseq:%d ack:%d", + sdp_prf1(ssk->socket, mb, "RX %s +%d c:%d->%d mseq:%d ack:%d\n", mid2str(h->mid), ntohs(h->bufs), credits_before, tx_credits(ssk), ntohl(h->mseq), ntohl(h->mseq_ack)); - frags = skb_shinfo(skb)->nr_frags; - pagesz = PAGE_ALIGN(skb->data_len); - skb_shinfo(skb)->nr_frags = pagesz / PAGE_SIZE; - - for (i = skb_shinfo(skb)->nr_frags; i < frags; ++i) { - put_page(skb_shinfo(skb)->frags[i].page); - } - skb->truesize -= frags * PAGE_SIZE; - - if (unlikely(h->flags & SDP_OOB_PEND)) - sk_send_sigurg(sk); - - skb_pull(skb, sizeof(struct sdp_bsdh)); - - if (unlikely(h->mid == SDP_MID_SRCAVAIL)) { - if (ssk->rx_sa) { - sdp_dbg_data(sk, "SrcAvail in the middle of another SrcAvail. Aborting\n"); - h->mid = SDP_MID_DATA; - sdp_post_sendsm(sk); - } else { - skb_pull(skb, sizeof(struct sdp_srcah)); - } - } - - if (unlikely(h->mid == SDP_MID_DATA && skb->len == 0)) { + if (unlikely(h->mid == SDP_MID_DATA && + mb->m_pkthdr.len == SDP_HEAD_SIZE)) { /* Credit update is valid even after RCV_SHUTDOWN */ - sdp_free_skb(skb); + m_freem(mb); return 0; } - if ((h->mid != SDP_MID_DATA && h->mid != SDP_MID_SRCAVAIL && - h->mid != SDP_MID_DISCONN) || - unlikely(sk->sk_shutdown & RCV_SHUTDOWN)) { - sdp_prf(sk, NULL, "Control skb - queing to control queue"); + if ((h->mid != SDP_MID_DATA && h->mid != SDP_MID_SRCAVAIL) || + TCPS_HAVERCVDFIN(ssk->state)) { + sdp_prf(sk, NULL, "Control mb - queing to control queue"); +#ifdef SDP_ZCOPY if (h->mid == SDP_MID_SRCAVAIL_CANCEL) { sdp_dbg_data(sk, "Got SrcAvailCancel. 
" "seq: 0x%d seq_ack: 0x%d\n", ntohl(h->mseq), ntohl(h->mseq_ack)); - ssk->sa_cancel_mseq = ntohl(h->mseq); - ssk->sa_cancel_arrived = 1; - if (ssk->rx_sa) - wake_up(sk->sk_sleep); + ssk->srcavail_cancel_mseq = ntohl(h->mseq); + } - skb_queue_tail(&ssk->rx_ctl_q, skb); - } else if (h->mid == SDP_MID_RDMARDCOMPL) { + + if (h->mid == SDP_MID_RDMARDCOMPL) { struct sdp_rrch *rrch = (struct sdp_rrch *)(h+1); sdp_dbg_data(sk, "RdmaRdCompl message arrived\n"); sdp_handle_rdma_read_compl(ssk, ntohl(h->mseq_ack), ntohl(rrch->len)); - sdp_free_skb(skb); - } else - skb_queue_tail(&ssk->rx_ctl_q, skb); + } +#endif + mb->m_nextpkt = NULL; + if (ssk->rx_ctl_tail) + ssk->rx_ctl_tail->m_nextpkt = mb; + else + ssk->rx_ctl_q = mb; + ssk->rx_ctl_tail = mb; return 0; } - sdp_prf(sk, NULL, "queueing %s skb", mid2str(h->mid)); - skb = sdp_sock_queue_rcv_skb(sk, skb); + sdp_prf1(sk, NULL, "queueing %s mb\n", mid2str(h->mid)); + mb = sdp_sock_queue_rcv_mb(sk, mb); + return 0; } -static struct sk_buff *sdp_process_rx_wc(struct sdp_sock *ssk, - struct ib_wc *wc) +/* called only from irq */ +static struct mbuf * +sdp_process_rx_wc(struct sdp_sock *ssk, struct ib_wc *wc) { - struct sk_buff *skb; + struct mbuf *mb; struct sdp_bsdh *h; - struct sock *sk = sk_ssk(ssk); + struct socket *sk = ssk->socket; int mseq; - skb = sdp_recv_completion(ssk, wc->wr_id, wc->byte_len); - if (unlikely(!skb)) + mb = sdp_recv_completion(ssk, wc->wr_id); + if (unlikely(!mb)) return NULL; if (unlikely(wc->status)) { - if (ssk->qp_active) { + if (ssk->qp_active && sk) { sdp_dbg(sk, "Recv completion with error. " "Status %d, vendor: %d\n", wc->status, wc->vendor_err); - sdp_reset(sk); + sdp_abort(sk); ssk->qp_active = 0; } - sdp_free_skb(skb); + m_freem(mb); return NULL; } @@ -644,64 +507,46 @@ if (unlikely(wc->byte_len < sizeof(struct sdp_bsdh))) { sdp_warn(sk, "SDP BUG! byte_len %d < %zd\n", wc->byte_len, sizeof(struct sdp_bsdh)); - sdp_free_skb(skb); + m_freem(mb); return NULL; } - skb->len = wc->byte_len; - skb->data = skb->head; + /* Use m_adj to trim the tail of data we didn't use. */ + m_adj(mb, -(mb->m_pkthdr.len - wc->byte_len)); + h = mtod(mb, struct sdp_bsdh *); - h = (struct sdp_bsdh *)skb->data; + SDP_DUMP_PACKET(ssk->socket, "RX", mb, h); - if (likely(wc->byte_len > SDP_SKB_HEAD_SIZE)) - skb->data_len = wc->byte_len - SDP_SKB_HEAD_SIZE; - else - skb->data_len = 0; - -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->tail = skb_headlen(skb); -#else - skb->tail = skb->head + skb_headlen(skb); -#endif - SDP_DUMP_PACKET(sk_ssk(ssk), "RX", skb, h); - skb_reset_transport_header(skb); - ssk->rx_packets++; - ssk->rx_bytes += skb->len; + ssk->rx_bytes += mb->m_pkthdr.len; mseq = ntohl(h->mseq); atomic_set(&ssk->mseq_ack, mseq); - if (unlikely(mseq != (int)wc->wr_id)) + if (mseq != (int)wc->wr_id) sdp_warn(sk, "SDP BUG! mseq %d != wrid %d\n", mseq, (int)wc->wr_id); - return skb; + return mb; } -/* like sk_stream_write_space - execpt measures remote credits */ -static void sdp_bzcopy_write_space(struct sdp_sock *ssk) +/* Wakeup writers if we now have credits. 
*/ +static void +sdp_bzcopy_write_space(struct sdp_sock *ssk) { - struct sock *sk = sk_ssk(ssk); - struct socket *sock = sk->sk_socket; + struct socket *sk = ssk->socket; - if (tx_credits(ssk) < ssk->min_bufs || !sock) - return; - - clear_bit(SOCK_NOSPACE, &sock->flags); - sdp_prf1(sk, NULL, "Waking up sleepers"); - - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); - if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, 2, POLL_OUT); + if (tx_credits(ssk) >= ssk->min_bufs && sk) + sowwakeup(sk); } -int sdp_poll_rx_cq(struct sdp_sock *ssk) +/* only from interrupt. */ +static int +sdp_poll_rx_cq(struct sdp_sock *ssk) { struct ib_cq *cq = ssk->rx_ring.cq; struct ib_wc ibwc[SDP_NUM_WC]; int n, i; int wc_processed = 0; - struct sk_buff *skb; + struct mbuf *mb; do { n = ib_poll_cq(cq, SDP_NUM_WC, ibwc); @@ -709,75 +554,71 @@ struct ib_wc *wc = &ibwc[i]; BUG_ON(!(wc->wr_id & SDP_OP_RECV)); - skb = sdp_process_rx_wc(ssk, wc); - if (!skb) + mb = sdp_process_rx_wc(ssk, wc); + if (!mb) continue; - sdp_process_rx_skb(ssk, skb); + sdp_process_rx_mb(ssk, mb); wc_processed++; } } while (n == SDP_NUM_WC); - if (wc_processed) { - sdp_prf(sk_ssk(ssk), NULL, "processed %d", wc_processed); + if (wc_processed) sdp_bzcopy_write_space(ssk); - } return wc_processed; } -static void sdp_rx_comp_work(struct work_struct *work) +static void +sdp_rx_comp_work(struct work_struct *work) { struct sdp_sock *ssk = container_of(work, struct sdp_sock, rx_comp_work); - struct sock *sk = sk_ssk(ssk); - SDPSTATS_COUNTER_INC(rx_wq); + sdp_prf(ssk->socket, NULL, "%s", __func__); - sdp_prf(sk, NULL, "%s", __func__); - + SDP_WLOCK(ssk); if (unlikely(!ssk->qp)) { - sdp_prf(sk, NULL, "qp was destroyed"); - return; + sdp_prf(ssk->socket, NULL, "qp was destroyed"); + goto out; } if (unlikely(!ssk->rx_ring.cq)) { - sdp_prf(sk, NULL, "rx_ring.cq is NULL"); - return; + sdp_prf(ssk->socket, NULL, "rx_ring.cq is NULL"); + goto out; } if (unlikely(!ssk->poll_cq)) { struct rdma_cm_id *id = ssk->id; if (id && id->qp) rdma_notify(id, RDMA_CM_EVENT_ESTABLISHED); - return; + goto out; } - lock_sock(sk); - - posts_handler_get(ssk); sdp_do_posts(ssk); - posts_handler_put(ssk, SDP_RX_ARMING_DELAY); - release_sock(sk); +out: + SDP_WUNLOCK(ssk); } -void sdp_do_posts(struct sdp_sock *ssk) +void +sdp_do_posts(struct sdp_sock *ssk) { - struct sock *sk = sk_ssk(ssk); + struct socket *sk = ssk->socket; int xmit_poll_force; - struct sk_buff *skb; + struct mbuf *mb; + SDP_WLOCK_ASSERT(ssk); if (!ssk->qp_active) { sdp_dbg(sk, "QP is deactivated\n"); return; } - if (likely(ssk->rx_ring.cq)) - sdp_poll_rx_cq(ssk); + while ((mb = ssk->rx_ctl_q)) { + ssk->rx_ctl_q = mb->m_nextpkt; + mb->m_nextpkt = NULL; + sdp_process_rx_ctl_mb(ssk, mb); + } - while ((skb = skb_dequeue(&ssk->rx_ctl_q))) - sdp_process_rx_ctl_skb(ssk, skb); - - if (sk->sk_state == TCP_TIME_WAIT) + if (ssk->state == TCPS_TIME_WAIT) return; if (!ssk->rx_ring.cq || !ssk->tx_ring.cq) @@ -788,36 +629,56 @@ if (tx_ring_posted(ssk)) sdp_xmit_poll(ssk, 1); - sdp_post_sends(ssk, 0); + sdp_post_sends(ssk, M_NOWAIT); - sk_mem_reclaim(sk); + xmit_poll_force = tx_credits(ssk) < SDP_MIN_TX_CREDITS; - xmit_poll_force = sk->sk_write_pending && - (tx_credits(ssk) > SDP_MIN_TX_CREDITS); - if (credit_update_needed(ssk) || xmit_poll_force) { /* if has pending tx because run out of tx_credits - xmit it */ sdp_prf(sk, NULL, "Processing to free pending sends"); sdp_xmit_poll(ssk, xmit_poll_force); sdp_prf(sk, NULL, "Sending credit update"); - 
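
[Note: the rx_ctl_q/rx_ctl_tail pair threaded through m_nextpkt in the hunks above is a plain head/tail FIFO, enqueued from the completion path and drained in sdp_do_posts(). A self-contained sketch of the same linkage pattern; struct pkt stands in for struct mbuf, and the tail reset on drain is added here for the standalone case rather than claimed to match the driver:]

	#include <assert.h>
	#include <stddef.h>

	struct pkt {
		struct pkt *next;	/* plays the role of m_nextpkt */
		int id;
	};

	struct pkt_fifo {
		struct pkt *head;	/* rx_ctl_q in the diff */
		struct pkt *tail;	/* rx_ctl_tail in the diff */
	};

	static void
	fifo_enqueue(struct pkt_fifo *f, struct pkt *p)
	{
		p->next = NULL;
		if (f->tail != NULL)
			f->tail->next = p;
		else
			f->head = p;
		f->tail = p;
	}

	static struct pkt *
	fifo_dequeue(struct pkt_fifo *f)
	{
		struct pkt *p;

		if ((p = f->head) == NULL)
			return (NULL);
		f->head = p->next;
		if (f->head == NULL)
			f->tail = NULL;	/* keep tail consistent on drain */
		p->next = NULL;
		return (p);
	}

	int
	main(void)
	{
		struct pkt a = { NULL, 1 }, b = { NULL, 2 };
		struct pkt_fifo f = { NULL, NULL };

		fifo_enqueue(&f, &a);
		fifo_enqueue(&f, &b);
		assert(fifo_dequeue(&f)->id == 1);
		assert(fifo_dequeue(&f)->id == 2);
		assert(fifo_dequeue(&f) == NULL);
		return (0);
	}
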
sdp_post_sends(ssk, 0); + sdp_post_sends(ssk, M_NOWAIT); } } -static inline int should_wake_up(struct sock *sk) +int +sdp_process_rx(struct sdp_sock *ssk) { - return sk->sk_sleep && waitqueue_active(sk->sk_sleep) && - (posts_handler(sdp_sk(sk)) || somebody_is_waiting(sk)); + int wc_processed = 0; + int credits_before; + + if (!rx_ring_trylock(&ssk->rx_ring)) { + sdp_dbg(ssk->socket, "ring destroyed. not polling it\n"); + return 0; + } + + credits_before = tx_credits(ssk); + + wc_processed = sdp_poll_rx_cq(ssk); + sdp_prf(ssk->socket, NULL, "processed %d", wc_processed); + + if (wc_processed) { + sdp_prf(ssk->socket, NULL, "credits: %d -> %d", + credits_before, tx_credits(ssk)); + queue_work(rx_comp_wq, &ssk->rx_comp_work); + } + sdp_arm_rx_cq(ssk); + + rx_ring_unlock(&ssk->rx_ring); + + return (wc_processed); } -static void sdp_rx_irq(struct ib_cq *cq, void *cq_context) +static void +sdp_rx_irq(struct ib_cq *cq, void *cq_context) { - struct sock *sk = cq_context; + struct socket *sk = cq_context; struct sdp_sock *ssk = sdp_sk(sk); - if (unlikely(cq != ssk->rx_ring.cq)) { - sdp_warn(sk, "cq = %p, ssk->cq = %p\n", cq, ssk->rx_ring.cq); + if (cq != ssk->rx_ring.cq) { + sdp_dbg(sk, "cq = %p, ssk->cq = %p\n", cq, ssk->rx_ring.cq); return; } @@ -825,71 +686,50 @@ sdp_prf(sk, NULL, "rx irq"); - if (should_wake_up(sk)) { - wake_up_interruptible(sk->sk_sleep); - SDPSTATS_COUNTER_INC(rx_int_wake_up); - } else { - if (queue_work_on(ssk->cpu, rx_comp_wq, &ssk->rx_comp_work)) - SDPSTATS_COUNTER_INC(rx_int_queue); - else - SDPSTATS_COUNTER_INC(rx_int_no_op); - } + sdp_process_rx(ssk); } -static void sdp_rx_ring_purge(struct sdp_sock *ssk) +static +void sdp_rx_ring_purge(struct sdp_sock *ssk) { - struct ib_device *dev = ssk->ib_device; - int id, i; - while (rx_ring_posted(ssk) > 0) { - struct sk_buff *skb; - skb = sdp_recv_completion(ssk, ring_tail(ssk->rx_ring), INT_MAX); - if (!skb) + struct mbuf *mb; + mb = sdp_recv_completion(ssk, ring_tail(ssk->rx_ring)); + if (!mb) break; - sdp_free_skb(skb); + m_freem(mb); } - - for (id = 0; id < SDP_RX_SIZE; id++) { - struct sdp_buf *sbuf = &ssk->rx_ring.buffer[id]; - - for (i = 1; i < SDP_MAX_SEND_SGES; i++) { - if (!sbuf->mapping[i]) - continue; - - ib_dma_unmap_page(dev, sbuf->mapping[i], - min(PAGE_SIZE, SDP_MAX_PAYLOAD), - DMA_FROM_DEVICE); - sbuf->mapping[i] = 0; - put_page(sbuf->pages[i - 1]); - atomic_dec(&sdp_current_mem_usage); - } - } } -static void sdp_rx_cq_event_handler(struct ib_event *event, void *data) +void +sdp_rx_ring_init(struct sdp_sock *ssk) { + ssk->rx_ring.buffer = NULL; + ssk->rx_ring.destroyed = 0; + rw_init(&ssk->rx_ring.destroyed_lock, "sdp rx lock"); } -static void sdp_arm_cq_timer(unsigned long data) +static void +sdp_rx_cq_event_handler(struct ib_event *event, void *data) { - struct sdp_sock *ssk = (struct sdp_sock *)data; - - SDPSTATS_COUNTER_INC(rx_cq_arm_timer); - sdp_arm_rx_cq(sk_ssk(ssk)); } -int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device) +int +sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device) { struct ib_cq *rx_cq; int rc = 0; + + sdp_dbg(ssk->socket, "rx ring created"); + INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work); atomic_set(&ssk->rx_ring.head, 1); atomic_set(&ssk->rx_ring.tail, 1); - ssk->rx_ring.buffer = kzalloc( + ssk->rx_ring.buffer = kmalloc( sizeof *ssk->rx_ring.buffer * SDP_RX_SIZE, GFP_KERNEL); if (!ssk->rx_ring.buffer) { - sdp_warn(sk_ssk(ssk), + sdp_warn(ssk->socket, "Unable to allocate RX Ring size %zd.\n", sizeof(*ssk->rx_ring.buffer) * SDP_RX_SIZE); @@ -897,21 
+737,17 @@ } rx_cq = ib_create_cq(device, sdp_rx_irq, sdp_rx_cq_event_handler, - sk_ssk(ssk), SDP_RX_SIZE, IB_CQ_VECTOR_LEAST_ATTACHED); + ssk->socket, SDP_RX_SIZE, IB_CQ_VECTOR_LEAST_ATTACHED); if (IS_ERR(rx_cq)) { rc = PTR_ERR(rx_cq); - sdp_warn(sk_ssk(ssk), "Unable to allocate RX CQ: %d.\n", rc); + sdp_warn(ssk->socket, "Unable to allocate RX CQ: %d.\n", rc); goto err_cq; } - ssk->rx_ring.cq = rx_cq; + sdp_sk(ssk->socket)->rx_ring.cq = rx_cq; + sdp_arm_rx_cq(ssk); - INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work); - setup_timer(&ssk->rx_ring.cq_arm_timer, sdp_arm_cq_timer, - (unsigned long)ssk); - sdp_arm_rx_cq(sk_ssk(ssk)); - return 0; err_cq: @@ -920,10 +756,13 @@ return rc; } -void sdp_rx_ring_destroy(struct sdp_sock *ssk) +void +sdp_rx_ring_destroy(struct sdp_sock *ssk) { - del_timer_sync(&ssk->rx_ring.cq_arm_timer); + cancel_work_sync(&ssk->rx_comp_work); + rx_ring_destroy_lock(&ssk->rx_ring); + if (ssk->rx_ring.buffer) { sdp_rx_ring_purge(ssk); @@ -933,12 +772,12 @@ if (ssk->rx_ring.cq) { if (ib_destroy_cq(ssk->rx_ring.cq)) { - sdp_warn(sk_ssk(ssk), "destroy cq(%p) failed\n", + sdp_warn(ssk->socket, "destroy cq(%p) failed\n", ssk->rx_ring.cq); } else { ssk->rx_ring.cq = NULL; } } - SDP_WARN_ON(ring_head(ssk->rx_ring) != ring_tail(ssk->rx_ring)); + WARN_ON(ring_head(ssk->rx_ring) != ring_tail(ssk->rx_ring)); } Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c (.../head) (revision 219811) @@ -29,10 +29,6 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include -#include -#include -#include #include "sdp.h" #define sdp_cnt(var) do { (var)++; } while (0) @@ -41,22 +37,22 @@ "Total number of keepalive probes sent."); static int sdp_process_tx_cq(struct sdp_sock *ssk); +static void sdp_poll_tx_timeout(void *data); -int sdp_xmit_poll(struct sdp_sock *ssk, int force) +int +sdp_xmit_poll(struct sdp_sock *ssk, int force) { int wc_processed = 0; - sdp_prf(sk_ssk(ssk), NULL, "%s", __func__); + SDP_WLOCK_ASSERT(ssk); + sdp_prf(ssk->socket, NULL, "%s", __func__); /* If we don't have a pending timer, set one up to catch our recent post in case the interface becomes idle */ - if (likely(ssk->qp_active && sk_ssk(ssk)->sk_state != TCP_CLOSE) && - !timer_pending(&ssk->tx_ring.timer)) { - mod_timer(&ssk->tx_ring.timer, jiffies + SDP_TX_POLL_TIMEOUT); - } + if (!callout_pending(&ssk->tx_ring.timer)) + callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT, + sdp_poll_tx_timeout, ssk); - ssk->tx_compl_pending = 0; - /* Poll the CQ every SDP_TX_POLL_MODER packets */ if (force || (++ssk->tx_ring.poll_cnt & (SDP_TX_POLL_MODER - 1)) == 0) wc_processed = sdp_process_tx_cq(ssk); @@ -64,114 +60,98 @@ return wc_processed; } -void sdp_post_send(struct sdp_sock *ssk, struct sk_buff *skb) +void +sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb) { struct sdp_buf *tx_req; - struct sdp_bsdh *h = (struct sdp_bsdh *)skb_transport_header(skb); - unsigned long mseq = ring_head(ssk->tx_ring); - int i, rc, frags; - u64 addr; + struct sdp_bsdh *h; + unsigned long mseq; struct ib_device *dev; struct ib_send_wr *bad_wr; - struct ib_sge ibsge[SDP_MAX_SEND_SGES]; - struct ib_sge *sge = ibsge; + struct ib_sge *sge; struct ib_send_wr tx_wr = { NULL }; - u32 send_flags = IB_SEND_SIGNALED; + int i, rc; + u64 addr; SDPSTATS_COUNTER_MID_INC(post_send, h->mid); - SDPSTATS_HIST(send_size, skb->len); + 
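
[Note: both rings in this patch address slots as buffer[seq & (SIZE - 1)] with free-running head/tail counters initialized to 1, which is why SDP_RX_SIZE and SDP_TX_SIZE must be powers of two and why "posted" is simply head - tail. A minimal sketch of that convention; the size and names are illustrative:]

	#include <assert.h>

	#define RING_SIZE	16	/* must be a power of two */

	struct ring {
		int buffer[RING_SIZE];
		unsigned int head;	/* next slot to produce into */
		unsigned int tail;	/* next slot to consume from */
	};

	static unsigned int
	ring_posted(const struct ring *r)
	{
		/* Wrap-safe even after the counters overflow. */
		return (r->head - r->tail);
	}

	static void
	ring_push(struct ring *r, int v)
	{
		assert(ring_posted(r) < RING_SIZE);
		r->buffer[r->head++ & (RING_SIZE - 1)] = v;
	}

	static int
	ring_pop(struct ring *r)
	{
		assert(ring_posted(r) > 0);
		return (r->buffer[r->tail++ & (RING_SIZE - 1)]);
	}

	int
	main(void)
	{
		struct ring r = { .head = 1, .tail = 1 }; /* as in the diff */

		ring_push(&r, 42);
		assert(ring_posted(&r) == 1);
		assert(ring_pop(&r) == 42);
		assert(ring_posted(&r) == 0);
		return (0);
	}
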
SDPSTATS_HIST(send_size, mb->len); - if (!ssk->qp_active) - goto err; + if (!ssk->qp_active) { + m_freem(mb); + return; + } + mseq = ring_head(ssk->tx_ring); + h = mtod(mb, struct sdp_bsdh *); ssk->tx_packets++; + ssk->tx_bytes += mb->m_pkthdr.len; +#ifdef SDP_ZCOPY if (unlikely(h->mid == SDP_MID_SRCAVAIL)) { - struct tx_srcavail_state *tx_sa = TX_SRCAVAIL_STATE(skb); + struct tx_srcavail_state *tx_sa = TX_SRCAVAIL_STATE(mb); if (ssk->tx_sa != tx_sa) { - sdp_dbg_data(sk_ssk(ssk), "SrcAvail cancelled " + sdp_dbg_data(ssk->socket, "SrcAvail cancelled " "before being sent!\n"); - SDP_WARN_ON(1); - sdp_free_skb(skb); + WARN_ON(1); + m_freem(mb); return; } - TX_SRCAVAIL_STATE(skb)->mseq = mseq; + TX_SRCAVAIL_STATE(mb)->mseq = mseq; } +#endif - if (unlikely(SDP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) + if (unlikely(mb->m_flags & M_URG)) h->flags = SDP_OOB_PRES | SDP_OOB_PEND; else h->flags = 0; + mb->m_flags |= M_RDONLY; /* Don't allow compression once sent. */ h->bufs = htons(rx_ring_posted(ssk)); - h->len = htonl(skb->len); + h->len = htonl(mb->m_pkthdr.len); h->mseq = htonl(mseq); h->mseq_ack = htonl(mseq_ack(ssk)); - sdp_prf(sk_ssk(ssk), skb, "TX: %s bufs: %d mseq:%ld ack:%d c: %d", + sdp_prf1(ssk->socket, mb, "TX: %s bufs: %d mseq:%ld ack:%d", mid2str(h->mid), rx_ring_posted(ssk), mseq, - ntohl(h->mseq_ack), tx_credits(ssk)); + ntohl(h->mseq_ack)); - SDP_DUMP_PACKET(sk_ssk(ssk), "TX", skb, h); + SDP_DUMP_PACKET(ssk->socket, "TX", mb, h); tx_req = &ssk->tx_ring.buffer[mseq & (SDP_TX_SIZE - 1)]; - tx_req->skb = skb; + tx_req->mb = mb; dev = ssk->ib_device; - - if (skb->len <= ssk->inline_thresh && !skb_shinfo(skb)->nr_frags) { - SDPSTATS_COUNTER_INC(inline_sends); - sge->addr = (u64) skb->data; - sge->length = skb->len; - sge->lkey = 0; - frags = 0; - tx_req->mapping[0] = 0; /* Nothing to be cleaned up by sdp_cleanup_sdp_buf() */ - send_flags |= IB_SEND_INLINE; - } else { - addr = ib_dma_map_single(dev, skb->data, skb->len - skb->data_len, - DMA_TO_DEVICE); - tx_req->mapping[0] = addr; - + sge = &ibsge[0]; + for (i = 0; mb != NULL; i++, mb = mb->m_next, sge++) { + addr = ib_dma_map_single(dev, mb->m_data, mb->m_len, + DMA_TO_DEVICE); /* TODO: proper error handling */ BUG_ON(ib_dma_mapping_error(dev, addr)); - + BUG_ON(i >= SDP_MAX_SEND_SGES); + tx_req->mapping[i] = addr; sge->addr = addr; - sge->length = skb->len - skb->data_len; + sge->length = mb->m_len; sge->lkey = ssk->sdp_dev->mr->lkey; - frags = skb_shinfo(skb)->nr_frags; - for (i = 0; i < frags; ++i) { - ++sge; - addr = ib_dma_map_page(dev, skb_shinfo(skb)->frags[i].page, - skb_shinfo(skb)->frags[i].page_offset, - skb_shinfo(skb)->frags[i].size, - DMA_TO_DEVICE); - BUG_ON(ib_dma_mapping_error(dev, addr)); - tx_req->mapping[i + 1] = addr; - sge->addr = addr; - sge->length = skb_shinfo(skb)->frags[i].size; - sge->lkey = ssk->sdp_dev->mr->lkey; - } } - tx_wr.next = NULL; - tx_wr.wr_id = ring_head(ssk->tx_ring) | SDP_OP_SEND; + tx_wr.wr_id = mseq | SDP_OP_SEND; tx_wr.sg_list = ibsge; - tx_wr.num_sge = frags + 1; + tx_wr.num_sge = i; tx_wr.opcode = IB_WR_SEND; - tx_wr.send_flags = send_flags; - if (unlikely(SDP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) + tx_wr.send_flags = IB_SEND_SIGNALED; + if (unlikely(tx_req->mb->m_flags & M_URG)) tx_wr.send_flags |= IB_SEND_SOLICITED; rc = ib_post_send(ssk->qp, &tx_wr, &bad_wr); if (unlikely(rc)) { - sdp_dbg(sk_ssk(ssk), + sdp_dbg(ssk->socket, "ib_post_send failed with status %d.\n", rc); - sdp_cleanup_sdp_buf(ssk, tx_req, skb->len - skb->data_len, DMA_TO_DEVICE); + sdp_cleanup_sdp_buf(ssk, tx_req, 
DMA_TO_DEVICE); - sdp_set_error(sk_ssk(ssk), -ECONNRESET); - - goto err; + sdp_notify(ssk, ECONNRESET); + m_freem(tx_req->mb); + return; } atomic_inc(&ssk->tx_ring.head); @@ -179,17 +159,16 @@ atomic_set(&ssk->remote_credits, rx_ring_posted(ssk)); return; - -err: - sdp_free_skb(skb); } -static struct sk_buff *sdp_send_completion(struct sdp_sock *ssk, int mseq) +static struct mbuf * +sdp_send_completion(struct sdp_sock *ssk, int mseq) { struct ib_device *dev; struct sdp_buf *tx_req; - struct sk_buff *skb = NULL; + struct mbuf *mb = NULL; struct sdp_tx_ring *tx_ring = &ssk->tx_ring; + if (unlikely(mseq != ring_tail(*tx_ring))) { printk(KERN_WARNING "Bogus send completion id %d tail %d\n", mseq, ring_tail(*tx_ring)); @@ -198,70 +177,114 @@ dev = ssk->ib_device; tx_req = &tx_ring->buffer[mseq & (SDP_TX_SIZE - 1)]; - skb = tx_req->skb; - if (!skb) - goto skip; /* This slot was used by RDMA WR */ + mb = tx_req->mb; + sdp_cleanup_sdp_buf(ssk, tx_req, DMA_TO_DEVICE); - sdp_cleanup_sdp_buf(ssk, tx_req, skb->len - skb->data_len, DMA_TO_DEVICE); - - tx_ring->una_seq += SDP_SKB_CB(skb)->end_seq; - +#ifdef SDP_ZCOPY /* TODO: AIO and real zcopy code; add their context support here */ - if (BZCOPY_STATE(skb)) - BZCOPY_STATE(skb)->busy--; + if (BZCOPY_STATE(mb)) + BZCOPY_STATE(mb)->busy--; +#endif -skip: atomic_inc(&tx_ring->tail); out: - return skb; + return mb; } -static inline void sdp_process_tx_wc(struct sdp_sock *ssk, struct ib_wc *wc) +static int +sdp_handle_send_comp(struct sdp_sock *ssk, struct ib_wc *wc) { - struct sock *sk = sk_ssk(ssk); + struct mbuf *mb = NULL; + struct sdp_bsdh *h; + if (unlikely(wc->status)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) { + sdp_prf(ssk->socket, mb, "Send completion with error. " + "Status %d", wc->status); + sdp_dbg_data(ssk->socket, "Send completion with error. " + "Status %d\n", wc->status); + sdp_notify(ssk, ECONNRESET); + } + } + + mb = sdp_send_completion(ssk, wc->wr_id); + if (unlikely(!mb)) + return -1; + + h = mtod(mb, struct sdp_bsdh *); + sdp_prf1(ssk->socket, mb, "tx completion. mseq:%d", ntohl(h->mseq)); + sdp_dbg(ssk->socket, "tx completion. %p %d mseq:%d", + mb, mb->m_pkthdr.len, ntohl(h->mseq)); + m_freem(mb); + + return 0; +} + +static inline void +sdp_process_tx_wc(struct sdp_sock *ssk, struct ib_wc *wc) +{ + if (likely(wc->wr_id & SDP_OP_SEND)) { - struct sk_buff *skb; + sdp_handle_send_comp(ssk, wc); + return; + } - skb = sdp_send_completion(ssk, wc->wr_id); - if (likely(skb)) - sk_wmem_free_skb(sk, skb); - } else if (wc->wr_id & SDP_OP_RDMA) { - if (ssk->tx_ring.rdma_inflight && - ssk->tx_ring.rdma_inflight->busy) { - /* Only last RDMA read WR is signalled. Order is guaranteed - - * therefore if Last RDMA read WR is completed - all other - * have, too */ - ssk->tx_ring.rdma_inflight->busy = 0; - } else { - sdp_warn(sk, "Unexpected RDMA read completion, " - "probably was canceled already\n"); +#ifdef SDP_ZCOPY + if (wc->wr_id & SDP_OP_RDMA) { + /* TODO: handle failed RDMA read cqe */ + + sdp_dbg_data(ssk->socket, + "TX comp: RDMA read. status: %d\n", wc->status); + sdp_prf1(sk, NULL, "TX comp: RDMA read"); + + if (!ssk->tx_ring.rdma_inflight) { + sdp_warn(ssk->socket, "ERROR: unexpected RDMA read\n"); + return; } - wake_up(sk->sk_sleep); - } else { - /* Keepalive probe sent cleanup */ - sdp_cnt(sdp_keepalive_probes_sent); + if (!ssk->tx_ring.rdma_inflight->busy) { + sdp_warn(ssk->socket, + "ERROR: too many RDMA read completions\n"); + return; + } + + /* Only last RDMA read WR is signalled. 
Order is guaranteed - + * therefore if Last RDMA read WR is completed - all other + * have, too */ + ssk->tx_ring.rdma_inflight->busy = 0; + sowwakeup(ssk->socket); + sdp_dbg_data(ssk->socket, "woke up sleepers\n"); + return; } +#endif - if (likely(!wc->status) || wc->status == IB_WC_WR_FLUSH_ERR) + /* Keepalive probe sent cleanup */ + sdp_cnt(sdp_keepalive_probes_sent); + + if (likely(!wc->status)) return; - sdp_warn(sk, "Send completion with error. wr_id 0x%llx Status %d\n", - wc->wr_id, wc->status); + sdp_dbg(ssk->socket, " %s consumes KEEPALIVE status %d\n", + __func__, wc->status); - sdp_set_error(sk, -ECONNRESET); + if (wc->status == IB_WC_WR_FLUSH_ERR) + return; + + sdp_notify(ssk, ECONNRESET); } -static int sdp_process_tx_cq(struct sdp_sock *ssk) +static int +sdp_process_tx_cq(struct sdp_sock *ssk) { struct ib_wc ibwc[SDP_NUM_WC]; int n, i; int wc_processed = 0; + SDP_WLOCK_ASSERT(ssk); + if (!ssk->tx_ring.cq) { - sdp_dbg(sk_ssk(ssk), "tx irq on destroyed tx_cq\n"); + sdp_dbg(ssk->socket, "tx irq on destroyed tx_cq\n"); return 0; } @@ -274,149 +297,100 @@ } while (n == SDP_NUM_WC); if (wc_processed) { - struct sock *sk = sk_ssk(ssk); - sdp_prf1(sk, NULL, "Waking sendmsg. inflight=%d", + sdp_post_sends(ssk, M_DONTWAIT); + sdp_prf1(sk, NULL, "Waking sendmsg. inflight=%d", (u32) tx_ring_posted(ssk)); - sk_stream_write_space(sk_ssk(ssk)); - if (sk->sk_write_pending && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && - tx_ring_posted(ssk)) { - /* a write is pending and still no room in tx queue, - * arm tx cq - */ - sdp_prf(sk_ssk(ssk), NULL, "pending tx - rearming"); - sdp_arm_tx_cq(sk); - } - + sowwakeup(ssk->socket); } return wc_processed; } -/* Select who will handle tx completion: - * - a write is pending - wake it up and let it do the poll + post - * - post handler is taken - taker will do the poll + post - * else return 1 and let the caller do it - */ -static int sdp_tx_handler_select(struct sdp_sock *ssk) +static void +sdp_poll_tx(struct sdp_sock *ssk) { - struct sock *sk = sk_ssk(ssk); - - if (sk->sk_write_pending) { - /* Do the TX posts from sender context */ - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { - sdp_prf1(sk, NULL, "Waking up pending sendmsg"); - wake_up_interruptible(sk->sk_sleep); - return 0; - } else - sdp_prf1(sk, NULL, "Unexpected: sk_sleep=%p, " - "waitqueue_active: %d\n", - sk->sk_sleep, waitqueue_active(sk->sk_sleep)); - } - - if (posts_handler(ssk)) { - /* Somebody else available to check for completion */ - sdp_prf1(sk, NULL, "Somebody else will call do_posts"); - return 0; - } - - return 1; -} - -static void sdp_poll_tx_timeout(unsigned long data) -{ - struct sdp_sock *ssk = (struct sdp_sock *)data; - struct sock *sk = sk_ssk(ssk); + struct socket *sk = ssk->socket; u32 inflight, wc_processed; - sdp_prf1(sk_ssk(ssk), NULL, "TX timeout: inflight=%d, head=%d tail=%d", + sdp_prf1(ssk->socket, NULL, "TX timeout: inflight=%d, head=%d tail=%d", (u32) tx_ring_posted(ssk), ring_head(ssk->tx_ring), ring_tail(ssk->tx_ring)); - /* Only process if the socket is not in use */ - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - sdp_prf(sk_ssk(ssk), NULL, "TX comp: socket is busy"); - - if (sdp_tx_handler_select(ssk) && sk->sk_state != TCP_CLOSE && - likely(ssk->qp_active)) { - sdp_prf1(sk, NULL, "schedule a timer"); - mod_timer(&ssk->tx_ring.timer, jiffies + SDP_TX_POLL_TIMEOUT); - } - - SDPSTATS_COUNTER_INC(tx_poll_busy); + if (unlikely(ssk->state == TCPS_CLOSED)) { + sdp_warn(sk, "Socket is closed\n"); goto out; } - if (unlikely(!ssk->qp || sk->sk_state 
== TCP_CLOSE)) { - SDPSTATS_COUNTER_INC(tx_poll_no_op); - goto out; - } - wc_processed = sdp_process_tx_cq(ssk); if (!wc_processed) SDPSTATS_COUNTER_INC(tx_poll_miss); - else { - sdp_post_sends(ssk, GFP_ATOMIC); + else SDPSTATS_COUNTER_INC(tx_poll_hit); - } inflight = (u32) tx_ring_posted(ssk); - sdp_prf1(sk_ssk(ssk), NULL, "finished tx proccessing. inflight = %d", - tx_ring_posted(ssk)); + sdp_prf1(ssk->socket, NULL, "finished tx proccessing. inflight = %d", + inflight); /* If there are still packets in flight and the timer has not already * been scheduled by the Tx routine then schedule it here to guarantee * completion processing of these packets */ - if (inflight && likely(ssk->qp_active)) - mod_timer(&ssk->tx_ring.timer, jiffies + SDP_TX_POLL_TIMEOUT); - + if (inflight) + callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT, + sdp_poll_tx_timeout, ssk); out: +#ifdef SDP_ZCOPY if (ssk->tx_ring.rdma_inflight && ssk->tx_ring.rdma_inflight->busy) { sdp_prf1(sk, NULL, "RDMA is inflight - arming irq"); - sdp_arm_tx_cq(sk); + sdp_arm_tx_cq(ssk); } - - bh_unlock_sock(sk); +#endif + return; } -static void sdp_tx_irq(struct ib_cq *cq, void *cq_context) +static void +sdp_poll_tx_timeout(void *data) { - struct sock *sk = cq_context; - struct sdp_sock *ssk = sdp_sk(sk); + struct sdp_sock *ssk = (struct sdp_sock *)data; - sdp_prf1(sk, NULL, "tx irq"); - sdp_dbg_data(sk, "Got tx comp interrupt\n"); + if (!callout_active(&ssk->tx_ring.timer)) + return; + callout_deactivate(&ssk->tx_ring.timer); + sdp_poll_tx(ssk); +} - SDPSTATS_COUNTER_INC(tx_int_count); +static void +sdp_tx_irq(struct ib_cq *cq, void *cq_context) +{ + struct sdp_sock *ssk; - ssk->tx_compl_pending = 1; - - if (sdp_tx_handler_select(ssk) && likely(ssk->qp_active && - sk->sk_state != TCP_CLOSE)) { - sdp_prf1(sk, NULL, "poll and post from tasklet"); - mod_timer(&ssk->tx_ring.timer, jiffies + SDP_TX_POLL_TIMEOUT); - tasklet_schedule(&ssk->tx_ring.tasklet); - } + ssk = cq_context; + sdp_prf1(ssk->socket, NULL, "tx irq"); + sdp_dbg_data(ssk->socket, "Got tx comp interrupt\n"); + SDPSTATS_COUNTER_INC(tx_int_count); + SDP_WLOCK(ssk); + sdp_poll_tx(ssk); + SDP_WUNLOCK(ssk); } -static void sdp_tx_ring_purge(struct sdp_sock *ssk) +static +void sdp_tx_ring_purge(struct sdp_sock *ssk) { - while (ring_posted(ssk->tx_ring)) { - struct sk_buff *skb; - skb = sdp_send_completion(ssk, ring_tail(ssk->tx_ring)); - if (!skb) + while (tx_ring_posted(ssk)) { + struct mbuf *mb; + mb = sdp_send_completion(ssk, ring_tail(ssk->tx_ring)); + if (!mb) break; - sdp_free_skb(skb); + m_freem(mb); } } -void sdp_post_keepalive(struct sdp_sock *ssk) +void +sdp_post_keepalive(struct sdp_sock *ssk) { int rc; struct ib_send_wr wr, *bad_wr; - sdp_dbg(sk_ssk(ssk), "%s\n", __func__); + sdp_dbg(ssk->socket, "%s\n", __func__); memset(&wr, 0, sizeof(wr)); @@ -428,56 +402,53 @@ rc = ib_post_send(ssk->qp, &wr, &bad_wr); if (rc) { - sdp_dbg(sk_ssk(ssk), + sdp_dbg(ssk->socket, "ib_post_keepalive failed with status %d.\n", rc); - sdp_set_error(sk_ssk(ssk), -ECONNRESET); + sdp_notify(ssk, ECONNRESET); } sdp_cnt(sdp_keepalive_probes_sent); } -static void sdp_tx_cq_event_handler(struct ib_event *event, void *data) +static void +sdp_tx_cq_event_handler(struct ib_event *event, void *data) { } -int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device) +int +sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device) { struct ib_cq *tx_cq; int rc = 0; + sdp_dbg(ssk->socket, "tx ring create\n"); + callout_init_rw(&ssk->tx_ring.timer, &ssk->lock, 0); + 
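
[Note: the conversion from Linux timers/tasklets to callout(9) in these hunks follows a fixed pattern: initialize against the socket rwlock, reset and stop with the lock held, drain without it, and check callout_active()/callout_deactivate() in the handler to close the race with callout_stop(). A kernel-side sketch of that pattern, not a standalone program; the struct and names are illustrative:]

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/lock.h>
	#include <sys/rwlock.h>
	#include <sys/callout.h>

	struct obj {
		struct rwlock	lock;
		struct callout	timer;
	};

	static void
	obj_timeout(void *arg)
	{
		struct obj *o = arg;

		/* Mimic sdp_poll_tx_timeout(): bail out if we lost a
		 * race with callout_stop(), then mark the callout
		 * consumed before doing the work. */
		if (!callout_active(&o->timer))
			return;
		callout_deactivate(&o->timer);
		/* ... periodic work, possibly callout_reset() again ... */
	}

	static void
	obj_start(struct obj *o)
	{
		rw_init(&o->lock, "obj lock");
		callout_init_rw(&o->timer, &o->lock, 0);
		rw_wlock(&o->lock);	/* reset requires the lock */
		callout_reset(&o->timer, hz / 10, obj_timeout, o);
		rw_wunlock(&o->lock);
	}

	static void
	obj_stop(struct obj *o)
	{
		/* As in sdp_tx_ring_destroy(): stop under the lock,
		 * then drain outside it so a running handler finishes. */
		rw_wlock(&o->lock);
		callout_stop(&o->timer);
		rw_wunlock(&o->lock);
		callout_drain(&o->timer);
	}
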
callout_init_rw(&ssk->nagle_timer, &ssk->lock, 0); atomic_set(&ssk->tx_ring.head, 1); atomic_set(&ssk->tx_ring.tail, 1); - ssk->tx_ring.buffer = kmalloc( + ssk->tx_ring.buffer = kzalloc( sizeof *ssk->tx_ring.buffer * SDP_TX_SIZE, GFP_KERNEL); if (!ssk->tx_ring.buffer) { rc = -ENOMEM; - sdp_warn(sk_ssk(ssk), "Can't allocate TX Ring size %zd.\n", + sdp_warn(ssk->socket, "Can't allocate TX Ring size %zd.\n", sizeof(*ssk->tx_ring.buffer) * SDP_TX_SIZE); goto out; } tx_cq = ib_create_cq(device, sdp_tx_irq, sdp_tx_cq_event_handler, - sk_ssk(ssk), SDP_TX_SIZE, IB_CQ_VECTOR_LEAST_ATTACHED); + ssk, SDP_TX_SIZE, IB_CQ_VECTOR_LEAST_ATTACHED); if (IS_ERR(tx_cq)) { rc = PTR_ERR(tx_cq); - sdp_warn(sk_ssk(ssk), "Unable to allocate TX CQ: %d.\n", rc); + sdp_warn(ssk->socket, "Unable to allocate TX CQ: %d.\n", rc); goto err_cq; } - ssk->tx_ring.cq = tx_cq; - - setup_timer(&ssk->tx_ring.timer, sdp_poll_tx_timeout, - (unsigned long)ssk); ssk->tx_ring.poll_cnt = 0; + sdp_arm_tx_cq(ssk); - tasklet_init(&ssk->tx_ring.tasklet, sdp_poll_tx_timeout, - (unsigned long) ssk); - - setup_timer(&ssk->nagle_timer, sdp_nagle_timeout, (unsigned long) ssk); - return 0; err_cq: @@ -487,12 +458,17 @@ return rc; } -void sdp_tx_ring_destroy(struct sdp_sock *ssk) +void +sdp_tx_ring_destroy(struct sdp_sock *ssk) { - del_timer_sync(&ssk->tx_ring.timer); - if (ssk->nagle_timer.function) - del_timer_sync(&ssk->nagle_timer); + sdp_dbg(ssk->socket, "tx ring destroy\n"); + SDP_WLOCK(ssk); + callout_stop(&ssk->tx_ring.timer); + callout_stop(&ssk->nagle_timer); + SDP_WUNLOCK(ssk); + callout_drain(&ssk->tx_ring.timer); + callout_drain(&ssk->nagle_timer); if (ssk->tx_ring.buffer) { sdp_tx_ring_purge(ssk); @@ -503,16 +479,12 @@ if (ssk->tx_ring.cq) { if (ib_destroy_cq(ssk->tx_ring.cq)) { - sdp_warn(sk_ssk(ssk), "destroy cq(%p) failed\n", + sdp_warn(ssk->socket, "destroy cq(%p) failed\n", ssk->tx_ring.cq); } else { ssk->tx_ring.cq = NULL; } } - tasklet_kill(&ssk->tx_ring.tasklet); - /* tx_cq is destroyed, so no more tx_irq, so no one will schedule this - * tasklet. 
*/ - - SDP_WARN_ON(ring_head(ssk->tx_ring) != ring_tail(ssk->tx_ring)); + WARN_ON(ring_head(ssk->tx_ring) != ring_tail(ssk->tx_ring)); } Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c (.../head) (revision 219811) @@ -33,9 +33,11 @@ */ #include "sdp.h" +static void sdp_nagle_timeout(void *data); + #ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA -void _dump_packet(const char *func, int line, struct sock *sk, char *str, - struct sk_buff *skb, const struct sdp_bsdh *h) +void _dump_packet(const char *func, int line, struct socket *sk, char *str, + struct mbuf *mb, const struct sdp_bsdh *h) { struct sdp_hh *hh; struct sdp_hah *hah; @@ -44,9 +46,9 @@ struct sdp_srcah *srcah; int len = 0; char buf[256]; - len += snprintf(buf, 255-len, "mid: %-20s flags: 0x%x " + len += snprintf(buf, 255-len, "%s mb: %p mid: %2x:%-20s flags: 0x%x " "bufs: 0x%x len: 0x%x mseq: 0x%x mseq_ack: 0x%x | ", - mid2str(h->mid), h->flags, + str, mb, h->mid, mid2str(h->mid), h->flags, ntohs(h->bufs), ntohl(h->len), ntohl(h->mseq), ntohl(h->mseq_ack)); @@ -72,7 +74,7 @@ ntohl(req_size->size)); break; case SDP_MID_DATA: - len += snprintf(buf + len, 255-len, "data_len: 0x%zx |", + len += snprintf(buf + len, 255-len, "data_len: 0x%lx |", ntohl(h->len) - sizeof(struct sdp_bsdh)); break; case SDP_MID_RDMARDCOMPL: @@ -84,9 +86,9 @@ case SDP_MID_SRCAVAIL: srcah = (struct sdp_srcah *)(h+1); - len += snprintf(buf + len, 255-len, " | payload: 0x%zx, " - "len: 0x%x, rkey: 0x%x, vaddr: 0x%llx |", - ntohl(h->len) - sizeof(struct sdp_bsdh) - + len += snprintf(buf + len, 255-len, " | payload: 0x%lx, " + "len: 0x%x, rkey: 0x%x, vaddr: 0x%jx |", + ntohl(h->len) - sizeof(struct sdp_bsdh) - sizeof(struct sdp_srcah), ntohl(srcah->len), ntohl(srcah->rkey), be64_to_cpu(srcah->vaddr)); @@ -96,165 +98,134 @@ } buf[len] = 0; _sdp_printk(func, line, KERN_WARNING, sk, "%s: %s\n", str, buf); - _sdp_prf(sk, skb, func, line, "%s: %s", str, buf); } #endif -static inline void update_send_head(struct sock *sk, struct sk_buff *skb) +static inline int +sdp_nagle_off(struct sdp_sock *ssk, struct mbuf *mb) { - struct page *page; - sk->sk_send_head = skb->next; - if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) { - sk->sk_send_head = NULL; - page = sk->sk_sndmsg_page; - if (page) { - put_page(page); - sk->sk_sndmsg_page = NULL; - } - } -} -static inline int sdp_nagle_off(struct sdp_sock *ssk, struct sk_buff *skb) -{ - struct sdp_bsdh *h = (struct sdp_bsdh *)skb_transport_header(skb); + struct sdp_bsdh *h; + + h = mtod(mb, struct sdp_bsdh *); int send_now = - BZCOPY_STATE(skb) || +#ifdef SDP_ZCOPY + BZCOPY_STATE(mb) || +#endif unlikely(h->mid != SDP_MID_DATA) || - (ssk->nonagle & TCP_NAGLE_OFF) || + (ssk->flags & SDP_NODELAY) || !ssk->nagle_last_unacked || - skb->next != (struct sk_buff *)&sk_ssk(ssk)->sk_write_queue || - skb->len + sizeof(struct sdp_bsdh) >= ssk->xmit_size_goal || - (SDP_SKB_CB(skb)->flags & TCPCB_FLAG_PSH) || - (SDP_SKB_CB(skb)->flags & TCPCB_FLAG_URG); + mb->m_pkthdr.len >= ssk->xmit_size_goal / 4 || + (mb->m_flags & M_PUSH); if (send_now) { unsigned long mseq = ring_head(ssk->tx_ring); ssk->nagle_last_unacked = mseq; } else { - if (!timer_pending(&ssk->nagle_timer) && ssk->qp_active) { - mod_timer(&ssk->nagle_timer, - jiffies + SDP_NAGLE_TIMEOUT); - sdp_dbg_data(sk_ssk(ssk), "Starting nagle timer\n"); + if 
(!callout_pending(&ssk->nagle_timer)) { + callout_reset(&ssk->nagle_timer, SDP_NAGLE_TIMEOUT, + sdp_nagle_timeout, ssk); + sdp_dbg_data(ssk->socket, "Starting nagle timer\n"); } } - sdp_dbg_data(sk_ssk(ssk), "send_now = %d last_unacked = %u\n", + sdp_dbg_data(ssk->socket, "send_now = %d last_unacked = %ld\n", send_now, ssk->nagle_last_unacked); return send_now; } -void sdp_nagle_timeout(unsigned long data) +static void +sdp_nagle_timeout(void *data) { struct sdp_sock *ssk = (struct sdp_sock *)data; - struct sock *sk = sk_ssk(ssk); + struct socket *sk = ssk->socket; - SDPSTATS_COUNTER_INC(nagle_timer); - sdp_dbg_data(sk, "last_unacked = %u\n", ssk->nagle_last_unacked); + sdp_dbg_data(sk, "last_unacked = %ld\n", ssk->nagle_last_unacked); - if (!ssk->nagle_last_unacked) - goto out2; + if (!callout_active(&ssk->nagle_timer)) + return; + callout_deactivate(&ssk->nagle_timer); - /* Only process if the socket is not in use */ - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - sdp_dbg_data(sk, "socket is busy - will try later\n"); + if (!ssk->nagle_last_unacked) goto out; - } - - if (sk->sk_state == TCP_CLOSE) { - bh_unlock_sock(sk); + if (ssk->state == TCPS_CLOSED) return; - } - ssk->nagle_last_unacked = 0; - sdp_post_sends(ssk, GFP_ATOMIC); + sdp_post_sends(ssk, M_DONTWAIT); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - sk_stream_write_space(sk); + sowwakeup(ssk->socket); out: - bh_unlock_sock(sk); -out2: - if (sk->sk_send_head && ssk->qp_active) { - /* If has pending sends - rearm */ - mod_timer(&ssk->nagle_timer, jiffies + SDP_NAGLE_TIMEOUT); - } + if (sk->so_snd.sb_sndptr) + callout_reset(&ssk->nagle_timer, SDP_NAGLE_TIMEOUT, + sdp_nagle_timeout, ssk); } -static inline int sdp_should_rearm(struct sock *sk) +void +sdp_post_sends(struct sdp_sock *ssk, int wait) { - return sk->sk_state != TCP_ESTABLISHED || sdp_sk(sk)->tx_sa || - somebody_is_waiting(sk); -} - -void sdp_post_sends(struct sdp_sock *ssk, gfp_t gfp) -{ - /* TODO: nonagle? */ - struct sk_buff *skb; + struct mbuf *mb; int post_count = 0; - struct sock *sk = sk_ssk(ssk); + struct socket *sk; + int low; + sk = ssk->socket; if (unlikely(!ssk->id)) { - if (sk->sk_send_head) { - sdp_dbg(sk, "Send on socket without cmid ECONNRESET\n"); - /* TODO: flush send queue? 
*/ - sdp_reset(sk); + if (sk->so_snd.sb_sndptr) { + sdp_dbg(ssk->socket, + "Send on socket without cmid ECONNRESET.\n"); + sdp_notify(ssk, ECONNRESET); } return; } again: if (sdp_tx_ring_slots_left(ssk) < SDP_TX_SIZE / 2) - sdp_xmit_poll(ssk, 1); + sdp_xmit_poll(ssk, 1); - /* Run out of credits, check if got a credit update */ - if (unlikely(tx_credits(ssk) <= SDP_MIN_TX_CREDITS)) { - sdp_poll_rx_cq(ssk); - - if (unlikely(sdp_should_rearm(sk) || !posts_handler(ssk))) - sdp_arm_rx_cq(sk); - } - if (ssk->recv_request && ring_tail(ssk->rx_ring) >= ssk->recv_request_head && tx_credits(ssk) >= SDP_MIN_TX_CREDITS && sdp_tx_ring_slots_left(ssk)) { - skb = sdp_alloc_skb_chrcvbuf_ack(sk, - ssk->recv_frags * PAGE_SIZE, gfp); - if (likely(skb)) { - ssk->recv_request = 0; - sdp_post_send(ssk, skb); - post_count++; - } + mb = sdp_alloc_mb_chrcvbuf_ack(sk, + ssk->recv_bytes - SDP_HEAD_SIZE, wait); + if (mb == NULL) + goto allocfail; + ssk->recv_request = 0; + sdp_post_send(ssk, mb); + post_count++; } if (tx_credits(ssk) <= SDP_MIN_TX_CREDITS && - sdp_tx_ring_slots_left(ssk) && - sk->sk_send_head && - sdp_nagle_off(ssk, sk->sk_send_head)) { + sdp_tx_ring_slots_left(ssk) && sk->so_snd.sb_sndptr && + sdp_nagle_off(ssk, sk->so_snd.sb_sndptr)) { SDPSTATS_COUNTER_INC(send_miss_no_credits); } while (tx_credits(ssk) > SDP_MIN_TX_CREDITS && - sdp_tx_ring_slots_left(ssk) && - (skb = sk->sk_send_head) && - sdp_nagle_off(ssk, skb)) { - update_send_head(sk, skb); - __skb_dequeue(&sk->sk_write_queue); + sdp_tx_ring_slots_left(ssk) && (mb = sk->so_snd.sb_sndptr) && + sdp_nagle_off(ssk, mb)) { + struct mbuf *n; - sdp_post_send(ssk, skb); - + SOCKBUF_LOCK(&sk->so_snd); + sk->so_snd.sb_sndptr = mb->m_nextpkt; + sk->so_snd.sb_mb = mb->m_nextpkt; + mb->m_nextpkt = NULL; + SB_EMPTY_FIXUP(&sk->so_snd); + for (n = mb; n != NULL; n = n->m_next) + sbfree(&sk->so_snd, n); + SOCKBUF_UNLOCK(&sk->so_snd); + sdp_post_send(ssk, mb); post_count++; } - if (credit_update_needed(ssk) && - likely((1 << sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_FIN_WAIT1))) { + if (credit_update_needed(ssk) && ssk->state >= TCPS_ESTABLISHED && + ssk->state < TCPS_FIN_WAIT_2) { + mb = sdp_alloc_mb_data(ssk->socket, wait); + if (mb == NULL) + goto allocfail; + sdp_post_send(ssk, mb); - skb = sdp_alloc_skb_data(sk, 0, gfp); - if (likely(skb)) { - sdp_post_send(ssk, skb); - SDPSTATS_COUNTER_INC(post_send_credits); - post_count++; - } + SDPSTATS_COUNTER_INC(post_send_credits); + post_count++; } /* send DisConn if needed @@ -262,19 +233,26 @@ * If one credit is available, an implementation shall only send SDP * messages that provide additional credits and also do not contain ULP * payload. 
*/ - if (unlikely(ssk->sdp_disconnect) && - !sk->sk_send_head && - tx_credits(ssk) > 1) { - skb = sdp_alloc_skb_disconnect(sk, gfp); - if (likely(skb)) { - ssk->sdp_disconnect = 0; - sdp_post_send(ssk, skb); - post_count++; - } + if ((ssk->flags & SDP_NEEDFIN) && !sk->so_snd.sb_sndptr && + tx_credits(ssk) > 1) { + mb = sdp_alloc_mb_disconnect(sk, wait); + if (mb == NULL) + goto allocfail; + ssk->flags &= ~SDP_NEEDFIN; + sdp_post_send(ssk, mb); + post_count++; } - - if (!sdp_tx_ring_slots_left(ssk) || post_count) { - if (sdp_xmit_poll(ssk, 1)) + low = (sdp_tx_ring_slots_left(ssk) <= SDP_MIN_TX_CREDITS); + if (post_count || low) { + if (low) + sdp_arm_tx_cq(ssk); + if (sdp_xmit_poll(ssk, low)) goto again; } + return; + +allocfail: + ssk->nagle_last_unacked = -1; + callout_reset(&ssk->nagle_timer, 1, sdp_nagle_timeout, ssk); + return; } Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c (.../head) (revision 219811) @@ -31,33 +31,25 @@ * * $Id$ */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include #include "sdp.h" #define SDP_MAJV_MINV 0x22 -SDP_MODPARAM_SINT(sdp_link_layer_ib_only, 0, "Support only link layer of " +SDP_MODPARAM_SINT(sdp_link_layer_ib_only, 1, "Support only link layer of " "type Infiniband"); -static void sdp_qp_event_handler(struct ib_event *event, void *data) +enum { + SDP_HH_SIZE = 76, + SDP_HAH_SIZE = 180, +}; + +static void +sdp_qp_event_handler(struct ib_event *event, void *data) { - sdp_warn(NULL, "unexpected invocation: event: %d, data=%p\n", - event->event, data); } -static int sdp_get_max_dev_sge(struct ib_device *dev) +static int +sdp_get_max_dev_sge(struct ib_device *dev) { struct ib_device_attr attr; static int max_sges = -1; @@ -73,84 +65,82 @@ return max_sges; } -static int sdp_init_qp(struct sock *sk, struct rdma_cm_id *id) +static int +sdp_init_qp(struct socket *sk, struct rdma_cm_id *id) { struct ib_qp_init_attr qp_init_attr = { .event_handler = sdp_qp_event_handler, .cap.max_send_wr = SDP_TX_SIZE, .cap.max_recv_wr = SDP_RX_SIZE, - .cap.max_inline_data = sdp_inline_thresh, .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, }; struct ib_device *device = id->device; + struct sdp_sock *ssk; int rc; sdp_dbg(sk, "%s\n", __func__); - sdp_sk(sk)->max_sge = sdp_get_max_dev_sge(device); - sdp_dbg(sk, "Max sges: %d\n", sdp_sk(sk)->max_sge); + ssk = sdp_sk(sk); + ssk->max_sge = sdp_get_max_dev_sge(device); + sdp_dbg(sk, "Max sges: %d\n", ssk->max_sge); - qp_init_attr.cap.max_send_sge = MIN(sdp_sk(sk)->max_sge, SDP_MAX_SEND_SGES); - sdp_dbg(sk, "Setting max send sge to: %d\n", qp_init_attr.cap.max_send_sge); - - qp_init_attr.cap.max_recv_sge = MIN(sdp_sk(sk)->max_sge, SDP_MAX_RECV_SGES); - sdp_dbg(sk, "Setting max recv sge to: %d\n", qp_init_attr.cap.max_recv_sge); - - sdp_sk(sk)->sdp_dev = ib_get_client_data(device, &sdp_client); - if (!sdp_sk(sk)->sdp_dev) { + qp_init_attr.cap.max_send_sge = MIN(ssk->max_sge, SDP_MAX_SEND_SGES); + sdp_dbg(sk, "Setting max send sge to: %d\n", + qp_init_attr.cap.max_send_sge); + + qp_init_attr.cap.max_recv_sge = MIN(ssk->max_sge, SDP_MAX_RECV_SGES); + sdp_dbg(sk, "Setting max recv sge to: %d\n", + qp_init_attr.cap.max_recv_sge); + + ssk->sdp_dev = ib_get_client_data(device, &sdp_client); + if (!ssk->sdp_dev) { sdp_warn(sk, "SDP not available on device %s\n", 
device->name); rc = -ENODEV; goto err_rx; } - rc = sdp_rx_ring_create(sdp_sk(sk), device); + rc = sdp_rx_ring_create(ssk, device); if (rc) goto err_rx; - rc = sdp_tx_ring_create(sdp_sk(sk), device); + rc = sdp_tx_ring_create(ssk, device); if (rc) goto err_tx; - qp_init_attr.recv_cq = sdp_sk(sk)->rx_ring.cq; - qp_init_attr.send_cq = sdp_sk(sk)->tx_ring.cq; + qp_init_attr.recv_cq = ssk->rx_ring.cq; + qp_init_attr.send_cq = ssk->tx_ring.cq; - rc = rdma_create_qp(id, sdp_sk(sk)->sdp_dev->pd, &qp_init_attr); + rc = rdma_create_qp(id, ssk->sdp_dev->pd, &qp_init_attr); if (rc) { sdp_warn(sk, "Unable to create QP: %d.\n", rc); goto err_qp; } - sdp_sk(sk)->qp = id->qp; - sdp_sk(sk)->ib_device = device; - sdp_sk(sk)->qp_active = 1; - sdp_sk(sk)->context.device = device; - sdp_sk(sk)->inline_thresh = qp_init_attr.cap.max_inline_data; + ssk->qp = id->qp; + ssk->ib_device = device; + ssk->qp_active = 1; + ssk->context.device = device; sdp_dbg(sk, "%s done\n", __func__); return 0; err_qp: - sdp_tx_ring_destroy(sdp_sk(sk)); + sdp_tx_ring_destroy(ssk); err_tx: - sdp_rx_ring_destroy(sdp_sk(sk)); + sdp_rx_ring_destroy(ssk); err_rx: return rc; } -static int sdp_get_max_send_frags(u32 buf_size) +static int +sdp_connect_handler(struct socket *sk, struct rdma_cm_id *id, + struct rdma_cm_event *event) { - return MIN( - /* +1 to conpensate on not aligned buffers */ - (PAGE_ALIGN(buf_size) >> PAGE_SHIFT) + 1, - SDP_MAX_SEND_SGES - 1); -} - -static int sdp_connect_handler(struct sock *sk, struct rdma_cm_id *id, - struct rdma_cm_event *event) -{ + struct sockaddr_in *src_addr; struct sockaddr_in *dst_addr; - struct sock *child; + struct socket *child; const struct sdp_hh *h; + struct sdp_sock *ssk; int rc; sdp_dbg(sk, "%s %p -> %p\n", __func__, sdp_sk(sk)->id, id); @@ -161,161 +151,106 @@ if (!h->max_adverts) return -EINVAL; - child = sk_clone(sk, GFP_KERNEL); + child = sonewconn(sk, SS_ISCONNECTED); if (!child) return -ENOMEM; - sdp_init_sock(child); - - dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr; - inet_sk(child)->dport = dst_addr->sin_port; - inet_sk(child)->daddr = dst_addr->sin_addr.s_addr; - -#ifdef SDP_SOCK_HISTORY - sdp_ssk_hist_rename(sk); -#endif - __sock_put(child, SOCK_REF_CLONE); - - down_read(&device_removal_lock); - + ssk = sdp_sk(child); rc = sdp_init_qp(child, id); - if (rc) { - bh_unlock_sock(child); - up_read(&device_removal_lock); - sdp_sk(child)->destructed_already = 1; -#ifdef SDP_SOCK_HISTORY - sdp_ssk_hist_close(child); -#endif - sk_free(child); + if (rc) return rc; - } + SDP_WLOCK(ssk); + id->context = ssk; + ssk->id = id; + ssk->socket = child; + ssk->cred = crhold(child->so_cred); + dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr; + src_addr = (struct sockaddr_in *)&id->route.addr.src_addr; + ssk->fport = dst_addr->sin_port; + ssk->faddr = dst_addr->sin_addr.s_addr; + ssk->lport = src_addr->sin_port; + ssk->max_bufs = ntohs(h->bsdh.bufs); + atomic_set(&ssk->tx_ring.credits, ssk->max_bufs); + ssk->min_bufs = tx_credits(ssk) / 4; + ssk->xmit_size_goal = ntohl(h->localrcvsz) - sizeof(struct sdp_bsdh); + sdp_init_buffers(ssk, rcvbuf_initial_size); + ssk->state = TCPS_SYN_RECEIVED; + SDP_WUNLOCK(ssk); - sdp_sk(child)->max_bufs = ntohs(h->bsdh.bufs); - atomic_set(&sdp_sk(child)->tx_ring.credits, sdp_sk(child)->max_bufs); - - sdp_sk(child)->min_bufs = tx_credits(sdp_sk(child)) / 4; - sdp_sk(child)->xmit_size_goal = ntohl(h->localrcvsz) - - sizeof(struct sdp_bsdh); - - sdp_sk(child)->send_frags = sdp_get_max_send_frags(sdp_sk(child)->xmit_size_goal); - 
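
[Note: the handshake code in sdp_connect_handler() and sdp_response_handler() derives the connection's starting state from the peer's hello header: max_bufs from bsdh.bufs, min_bufs (the credit-update threshold) as a quarter of that, and xmit_size_goal as the advertised receive size minus the BSDH. A small userland sketch of just that arithmetic; the struct is abbreviated to the 16-byte BSDH layout and the wire values are made up:]

	#include <stdio.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	struct sdp_bsdh_hdr {		/* abbreviated, for sizing only */
		uint8_t		mid, flags;
		uint16_t	bufs;
		uint32_t	len, mseq, mseq_ack;
	};

	int
	main(void)
	{
		uint16_t bufs_wire = htons(16);		/* peer's hello */
		uint32_t localrcvsz_wire = htonl(32768);

		int max_bufs = ntohs(bufs_wire);
		int min_bufs = max_bufs / 4;	/* credit-update threshold */
		uint32_t xmit_size_goal = ntohl(localrcvsz_wire) -
		    sizeof(struct sdp_bsdh_hdr);  /* payload per packet */

		printf("credits %d, refill below %d, goal %u bytes\n",
		    max_bufs, min_bufs, (unsigned)xmit_size_goal);
		return (0);
	}
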
sdp_init_buffers(sdp_sk(child), rcvbuf_initial_size); - - id->context = child; - sdp_sk(child)->id = id; - - list_add_tail(&sdp_sk(child)->backlog_queue, - &sdp_sk(sk)->backlog_queue); - sdp_sk(child)->parent = sk; - - bh_unlock_sock(child); - sdp_add_sock(sdp_sk(child)); - up_read(&device_removal_lock); - - sdp_exch_state(child, TCPF_LISTEN | TCPF_CLOSE, TCP_SYN_RECV); - - /* child->sk_write_space(child); */ - /* child->sk_data_ready(child, 0); */ - sk->sk_data_ready(sk, 0); - return 0; } -static int sdp_response_handler(struct sock *sk, struct rdma_cm_id *id, - struct rdma_cm_event *event) +static int +sdp_response_handler(struct socket *sk, struct rdma_cm_id *id, + struct rdma_cm_event *event) { const struct sdp_hah *h; struct sockaddr_in *dst_addr; + struct sdp_sock *ssk; sdp_dbg(sk, "%s\n", __func__); - sdp_exch_state(sk, TCPF_SYN_SENT, TCP_ESTABLISHED); - sdp_set_default_moderation(sdp_sk(sk)); - - if (sock_flag(sk, SOCK_KEEPOPEN)) - sdp_start_keepalive_timer(sk); - - if (sock_flag(sk, SOCK_DEAD)) + ssk = sdp_sk(sk); + SDP_WLOCK(ssk); + ssk->state = TCPS_ESTABLISHED; + sdp_set_default_moderation(ssk); + if (ssk->flags & SDP_DROPPED) { + SDP_WUNLOCK(ssk); return 0; - + } + if (sk->so_options & SO_KEEPALIVE) + sdp_start_keepalive_timer(sk); h = event->param.conn.private_data; SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh); - sdp_sk(sk)->max_bufs = ntohs(h->bsdh.bufs); - atomic_set(&sdp_sk(sk)->tx_ring.credits, sdp_sk(sk)->max_bufs); - sdp_sk(sk)->min_bufs = tx_credits(sdp_sk(sk)) / 4; - sdp_sk(sk)->xmit_size_goal = + ssk->max_bufs = ntohs(h->bsdh.bufs); + atomic_set(&ssk->tx_ring.credits, ssk->max_bufs); + ssk->min_bufs = tx_credits(ssk) / 4; + ssk->xmit_size_goal = ntohl(h->actrcvsz) - sizeof(struct sdp_bsdh); - sdp_sk(sk)->send_frags = sdp_get_max_send_frags(sdp_sk(sk)->xmit_size_goal); - sdp_sk(sk)->xmit_size_goal = MIN(sdp_sk(sk)->xmit_size_goal, - sdp_sk(sk)->send_frags * PAGE_SIZE); + ssk->poll_cq = 1; - sdp_sk(sk)->poll_cq = 1; - - sk->sk_state_change(sk); - sk_wake_async(sk, 0, POLL_OUT); - dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr; - inet_sk(sk)->dport = dst_addr->sin_port; - inet_sk(sk)->daddr = dst_addr->sin_addr.s_addr; + ssk->fport = dst_addr->sin_port; + ssk->faddr = dst_addr->sin_addr.s_addr; + soisconnected(sk); + SDP_WUNLOCK(ssk); -#ifdef SDP_SOCK_HISTORY - sdp_ssk_hist_rename(sk); -#endif return 0; } -static int sdp_connected_handler(struct sock *sk) +static int +sdp_connected_handler(struct socket *sk, struct rdma_cm_event *event) { - struct sock *parent; + struct sdp_sock *ssk; + sdp_dbg(sk, "%s\n", __func__); - parent = sdp_sk(sk)->parent; - BUG_ON(!parent); + ssk = sdp_sk(sk); + SDP_WLOCK(ssk); + ssk->state = TCPS_ESTABLISHED; - sdp_exch_state(sk, TCPF_SYN_RECV, TCP_ESTABLISHED); + sdp_set_default_moderation(ssk); -#ifdef SDP_SOCK_HISTORY - sdp_ssk_hist_rename(sk); -#endif - sdp_set_default_moderation(sdp_sk(sk)); - - if (sock_flag(sk, SOCK_KEEPOPEN)) + if (sk->so_options & SO_KEEPALIVE) sdp_start_keepalive_timer(sk); - if (sock_flag(sk, SOCK_DEAD)) - return 0; - - lock_sock(parent); - if (!sdp_sk(parent)->id) { /* TODO: look at SOCK_DEAD? 
*/ - sdp_dbg(sk, "parent is going away.\n"); - goto done; - } - - sk_acceptq_added(parent); - sdp_dbg(parent, "%s child connection established\n", __func__); - list_del_init(&sdp_sk(sk)->backlog_queue); - list_add_tail(&sdp_sk(sk)->accept_queue, - &sdp_sk(parent)->accept_queue); - - parent->sk_state_change(parent); - sk_wake_async(parent, 0, POLL_OUT); -done: - release_sock(parent); - + if ((ssk->flags & SDP_DROPPED) == 0) + soisconnected(sk); + SDP_WUNLOCK(ssk); return 0; } -static int sdp_disconnected_handler(struct sock *sk) +static int +sdp_disconnected_handler(struct socket *sk) { - struct sdp_sock *ssk = sdp_sk(sk); + struct sdp_sock *ssk; + ssk = sdp_sk(sk); sdp_dbg(sk, "%s\n", __func__); - if (ssk->tx_ring.cq) - if (sdp_xmit_poll(ssk, 1)) - sdp_post_sends(ssk, 0); + SDP_WLOCK_ASSERT(ssk); + if (sdp_sk(sk)->state == TCPS_SYN_RECEIVED) { + sdp_connected_handler(sk, NULL); - if (sk->sk_state == TCP_SYN_RECV) { - sdp_connected_handler(sk); - if (rcv_nxt(ssk)) return 0; } @@ -323,41 +258,36 @@ return -ECONNRESET; } -int sdp_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) +int +sdp_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_conn_param conn_param; - struct sock *parent = NULL; - struct sock *child = NULL; - struct sock *sk; + struct socket *sk; + struct sdp_sock *ssk; struct sdp_hah hah; struct sdp_hh hh; int rc = 0; - sk = id->context; - if (!sk) { - sdp_dbg(NULL, "cm_id is being torn down, event %s\n", - rdma_cm_event_str(event->event)); + ssk = id->context; + sk = NULL; + if (ssk) + sk = ssk->socket; + if (!ssk || !sk || !ssk->id) { + sdp_dbg(sk, + "cm_id is being torn down, event %d, ssk %p, sk %p, id %p\n", + event->event, ssk, sk, id); return event->event == RDMA_CM_EVENT_CONNECT_REQUEST ? -EINVAL : 0; } - sdp_add_to_history(sk, rdma_cm_event_str(event->event)); - - lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - sdp_dbg(sk, "event: %s\n", rdma_cm_event_str(event->event)); - if (!sdp_sk(sk)->id) { - sdp_dbg(sk, "socket is being torn down\n"); - rc = event->event == RDMA_CM_EVENT_CONNECT_REQUEST ? 
- -EINVAL : 0; - release_sock(sk); - return rc; - } - + sdp_dbg(sk, "%s event %d id %p\n", __func__, event->event, id); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: + sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_RESOLVED\n"); + if (sdp_link_layer_ib_only && - rdma_node_get_transport(id->device->node_type) == + rdma_node_get_transport(id->device->node_type) == RDMA_TRANSPORT_IB && rdma_port_get_link_layer(id->device, id->port_num) != IB_LINK_LAYER_INFINIBAND) { @@ -371,12 +301,16 @@ rc = rdma_resolve_route(id, SDP_ROUTE_TIMEOUT); break; case RDMA_CM_EVENT_ADDR_ERROR: + sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_ERROR\n"); rc = -ENETUNREACH; break; case RDMA_CM_EVENT_ROUTE_RESOLVED: + sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_RESOLVED : %p\n", id); rc = sdp_init_qp(sk, id); if (rc) break; + atomic_set(&sdp_sk(sk)->remote_credits, + rx_ring_posted(sdp_sk(sk))); memset(&hh, 0, sizeof hh); hh.bsdh.mid = SDP_MID_HELLO; hh.bsdh.len = htonl(sizeof(struct sdp_hh)); @@ -385,11 +319,9 @@ hh.majv_minv = SDP_MAJV_MINV; sdp_init_buffers(sdp_sk(sk), rcvbuf_initial_size); hh.bsdh.bufs = htons(rx_ring_posted(sdp_sk(sk))); - atomic_set(&sdp_sk(sk)->remote_credits, - rx_ring_posted(sdp_sk(sk))); - hh.localrcvsz = hh.desremrcvsz = htonl(sdp_sk(sk)->recv_frags * - PAGE_SIZE + sizeof(struct sdp_bsdh)); - inet_sk(sk)->saddr = inet_sk(sk)->rcv_saddr = + hh.localrcvsz = hh.desremrcvsz = htonl(sdp_sk(sk)->recv_bytes); + hh.max_adverts = 0x1; + sdp_sk(sk)->laddr = ((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr; memset(&conn_param, 0, sizeof conn_param); conn_param.private_data_len = sizeof hh; @@ -397,31 +329,31 @@ conn_param.responder_resources = 4 /* TODO */; conn_param.initiator_depth = 4 /* TODO */; conn_param.retry_count = SDP_RETRY_COUNT; - SDP_DUMP_PACKET(sk, "TX", NULL, &hh.bsdh); + SDP_DUMP_PACKET(NULL, "TX", NULL, &hh.bsdh); rc = rdma_connect(id, &conn_param); break; case RDMA_CM_EVENT_ROUTE_ERROR: + sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_ERROR : %p\n", id); rc = -ETIMEDOUT; break; case RDMA_CM_EVENT_CONNECT_REQUEST: + sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_REQUEST\n"); rc = sdp_connect_handler(sk, id, event); if (rc) { sdp_dbg(sk, "Destroying qp\n"); rdma_reject(id, NULL, 0); break; } - child = id->context; - atomic_set(&sdp_sk(child)->remote_credits, - rx_ring_posted(sdp_sk(child))); + ssk = id->context; + atomic_set(&ssk->remote_credits, rx_ring_posted(ssk)); memset(&hah, 0, sizeof hah); hah.bsdh.mid = SDP_MID_HELLO_ACK; - hah.bsdh.bufs = htons(rx_ring_posted(sdp_sk(child))); + hah.bsdh.bufs = htons(rx_ring_posted(ssk)); hah.bsdh.len = htonl(sizeof(struct sdp_hah)); hah.majv_minv = SDP_MAJV_MINV; hah.ext_max_adverts = 1; /* Doesn't seem to be mandated by spec, but just in case */ - hah.actrcvsz = htonl(sdp_sk(child)->recv_frags * PAGE_SIZE + - sizeof(struct sdp_bsdh)); + hah.actrcvsz = htonl(ssk->recv_bytes); memset(&conn_param, 0, sizeof conn_param); conn_param.private_data_len = sizeof hah; conn_param.private_data = &hah; @@ -431,13 +363,13 @@ SDP_DUMP_PACKET(sk, "TX", NULL, &hah.bsdh); rc = rdma_accept(id, &conn_param); if (rc) { - sdp_sk(child)->id = NULL; + ssk->id = NULL; id->qp = NULL; id->context = NULL; - parent = sdp_sk(child)->parent; /* TODO: hold ? 
*/ } break; case RDMA_CM_EVENT_CONNECT_RESPONSE: + sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_RESPONSE\n"); rc = sdp_response_handler(sk, id, event); if (rc) { sdp_dbg(sk, "Destroying qp\n"); @@ -446,50 +378,57 @@ rc = rdma_accept(id, NULL); break; case RDMA_CM_EVENT_CONNECT_ERROR: + sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_ERROR\n"); rc = -ETIMEDOUT; break; case RDMA_CM_EVENT_UNREACHABLE: + sdp_dbg(sk, "RDMA_CM_EVENT_UNREACHABLE\n"); rc = -ENETUNREACH; break; case RDMA_CM_EVENT_REJECTED: + sdp_dbg(sk, "RDMA_CM_EVENT_REJECTED\n"); rc = -ECONNREFUSED; break; case RDMA_CM_EVENT_ESTABLISHED: - inet_sk(sk)->saddr = inet_sk(sk)->rcv_saddr = + sdp_dbg(sk, "RDMA_CM_EVENT_ESTABLISHED\n"); + sdp_sk(sk)->laddr = ((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr; - rc = sdp_connected_handler(sk); + rc = sdp_connected_handler(sk, event); break; case RDMA_CM_EVENT_DISCONNECTED: /* This means DREQ/DREP received */ - if (sk->sk_state == TCP_LAST_ACK) { - sdp_cancel_dreq_wait_timeout(sdp_sk(sk)); + sdp_dbg(sk, "RDMA_CM_EVENT_DISCONNECTED\n"); - sdp_exch_state(sk, TCPF_LAST_ACK, TCP_TIME_WAIT); + SDP_WLOCK(ssk); + if (ssk->state == TCPS_LAST_ACK) { + sdp_cancel_dreq_wait_timeout(ssk); sdp_dbg(sk, "%s: waiting for Infiniband tear down\n", __func__); } - - sdp_sk(sk)->qp_active = 0; + ssk->qp_active = 0; + SDP_WUNLOCK(ssk); rdma_disconnect(id); - - if (sk->sk_state != TCP_TIME_WAIT) { - if (sk->sk_state == TCP_CLOSE_WAIT) { + SDP_WLOCK(ssk); + if (ssk->state != TCPS_TIME_WAIT) { + if (ssk->state == TCPS_CLOSE_WAIT) { sdp_dbg(sk, "IB teardown while in " - "TCP_CLOSE_WAIT taking reference to " + "TCPS_CLOSE_WAIT taking reference to " "let close() finish the work\n"); - sock_hold(sk, SOCK_REF_CMA); - sdp_start_cma_timewait_timeout(sdp_sk(sk), - SDP_CMA_TIMEWAIT_TIMEOUT); - } - sdp_set_error(sk, -EPIPE); rc = sdp_disconnected_handler(sk); + if (rc) + rc = -EPIPE; } + SDP_WUNLOCK(ssk); break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: + sdp_dbg(sk, "RDMA_CM_EVENT_TIMEWAIT_EXIT\n"); + SDP_WLOCK(ssk); rc = sdp_disconnected_handler(sk); + SDP_WUNLOCK(ssk); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: + sdp_dbg(sk, "RDMA_CM_EVENT_DEVICE_REMOVAL\n"); rc = -ENETRESET; break; default: @@ -499,37 +438,19 @@ break; } - sdp_dbg(sk, "event: %s handled\n", rdma_cm_event_str(event->event)); + sdp_dbg(sk, "event %d done. status %d\n", event->event, rc); - if (rc && sdp_sk(sk)->id == id) { - child = sk; - sdp_sk(sk)->id = NULL; - id->qp = NULL; - id->context = NULL; - parent = sdp_sk(sk)->parent; - sdp_reset_sk(sk, rc); + if (rc) { + SDP_WLOCK(ssk); + if (ssk->id == id) { + ssk->id = NULL; + id->qp = NULL; + id->context = NULL; + if (sdp_notify(ssk, -rc)) + SDP_WUNLOCK(ssk); + } else + SDP_WUNLOCK(ssk); } - release_sock(sk); - - sdp_dbg(sk, "event: %s done. status %d\n", - rdma_cm_event_str(event->event), rc); - - if (parent) { - lock_sock(parent); - if (!sdp_sk(parent)->id) { /* TODO: look at SOCK_DEAD? 
*/ - sdp_dbg(sk, "parent is going away.\n"); - child = NULL; - goto done; - } - if (!list_empty(&sdp_sk(child)->backlog_queue)) - list_del_init(&sdp_sk(child)->backlog_queue); - else - child = NULL; -done: - release_sock(parent); - if (child) - sdp_common_release(child); - } return rc; } Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp_dbg.h =================================================================== --- sys/ofed/drivers/infiniband/ulp/sdp/sdp_dbg.h (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/sdp/sdp_dbg.h (.../head) (revision 219811) @@ -3,33 +3,20 @@ #define SDPSTATS_ON -#ifdef CONFIG_INFINIBAND_SDP_DEBUG -#define SDP_SOCK_HISTORY -#endif +//#define GETNSTIMEODAY_SUPPORTED -#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA -#define SDP_PROFILING -#endif - -#define SDP_WARN_ON(x) WARN_ON(x) -static inline struct sdp_sock *sdp_sk(const struct sock *sk); - -#define _sdp_printk(func, line, level, sk, format, arg...) do { \ - preempt_disable(); \ - printk(level "%s:%d sdp_sock(%5d:%d %d:%d): " format, \ - func, line, \ - current->pid, smp_processor_id(), \ - (sk) ? inet_sk(sk)->num : -1, \ - (sk) ? ntohs(inet_sk(sk)->dport) : -1, ## arg); \ - preempt_enable(); \ +#define _sdp_printk(func, line, level, sk, format, arg...) \ +do { \ + printk(level "%s:%d %p sdp_sock(%d:%d %d:%d): " format "\n", \ + func, line, sk ? sdp_sk(sk) : NULL, \ + curproc->p_pid, PCPU_GET(cpuid), \ + (sk) && sdp_sk(sk) ? ntohs(sdp_sk(sk)->lport) : -1, \ + (sk) && sdp_sk(sk) ? ntohs(sdp_sk(sk)->fport) : -1, ## arg); \ } while (0) #define sdp_printk(level, sk, format, arg...) \ _sdp_printk(__func__, __LINE__, level, sk, format, ## arg) -#define sdp_warn(sk, format, arg...) \ - do { \ - sdp_printk(KERN_WARNING, sk, format, ## arg); \ - sdp_prf(sk, NULL, format , ## arg); \ - } while (0) +#define sdp_warn(sk, format, arg...) \ + sdp_printk(KERN_WARNING, sk, format , ## arg) #define SDP_MODPARAM_SINT(var, def_val, msg) \ static int var = def_val; \ @@ -42,17 +29,17 @@ MODULE_PARM_DESC(var, msg " [" #def_val "]"); \ #ifdef SDP_PROFILING -struct sk_buff; +struct mbuf; struct sdpprf_log { int idx; int pid; int cpu; int sk_num; int sk_dport; - struct sk_buff *skb; + struct mbuf *mb; char msg[256]; - cycles_t time; + unsigned long long time; const char *func; int line; @@ -61,32 +48,40 @@ #define SDPPRF_LOG_SIZE 0x20000 /* must be a power of 2 */ extern struct sdpprf_log sdpprf_log[SDPPRF_LOG_SIZE]; -extern atomic_t sdpprf_log_count; +extern int sdpprf_log_count; -#define _sdp_prf(sk, s, _func, _line, format, arg...) ({ \ - int idx = atomic_add_return(1, &sdpprf_log_count); \ +#ifdef GETNSTIMEODAY_SUPPORTED +static inline unsigned long long current_nsec(void) +{ + struct timespec tv; + getnstimeofday(&tv); + return tv.tv_sec * NSEC_PER_SEC + tv.tv_nsec; +} +#else +#define current_nsec() jiffies_to_usecs(jiffies) +#endif + +#define sdp_prf1(sk, s, format, arg...) ({ \ struct sdpprf_log *l = \ - &sdpprf_log[idx & (SDPPRF_LOG_SIZE - 1)]; \ + &sdpprf_log[sdpprf_log_count++ & (SDPPRF_LOG_SIZE - 1)]; \ preempt_disable(); \ - l->idx = idx; \ + l->idx = sdpprf_log_count - 1; \ l->pid = current->pid; \ l->sk_num = (sk) ? inet_sk(sk)->num : -1; \ l->sk_dport = (sk) ? 
ntohs(inet_sk(sk)->dport) : -1; \ l->cpu = smp_processor_id(); \ - l->skb = s; \ + l->mb = s; \ snprintf(l->msg, sizeof(l->msg) - 1, format, ## arg); \ - l->time = get_cycles(); \ - l->func = _func; \ - l->line = _line; \ + l->time = current_nsec(); \ + l->func = __func__; \ + l->line = __LINE__; \ preempt_enable(); \ 1; \ }) -#define sdp_prf1(sk, s, format, arg...) \ - _sdp_prf(sk, s, __func__, __LINE__, format, ## arg) +//#define sdp_prf(sk, s, format, arg...) #define sdp_prf(sk, s, format, arg...) sdp_prf1(sk, s, format, ## arg) #else -#define _sdp_prf(sk, s, _func, _line, format, arg...) #define sdp_prf1(sk, s, format, arg...) #define sdp_prf(sk, s, format, arg...) #endif @@ -94,26 +89,12 @@ #ifdef CONFIG_INFINIBAND_SDP_DEBUG extern int sdp_debug_level; -#define sdp_dbg(sk, format, arg...) \ - do { \ - if (sdp_debug_level > 0) \ - sdp_printk(KERN_WARNING, sk, format , ## arg); \ - sdp_prf(sk, NULL, format , ## arg); \ +#define sdp_dbg(sk, format, arg...) \ + do { \ + if (sdp_debug_level > 0) \ + sdp_printk(KERN_WARNING, sk, format , ## arg); \ } while (0) -#define sock_ref(sk, msg, sock_op) ({ \ - if (!atomic_read(&(sk)->sk_refcnt)) {\ - sdp_warn(sk, "%s:%d - %s (%s) ref = 0.\n", \ - __func__, __LINE__, #sock_op, msg); \ - sdp_print_history(sk); \ - SDP_WARN_ON(1); \ - } else { \ - sdp_dbg(sk, "%s:%d - %s (%s) ref = %d.\n", __func__, __LINE__, \ - #sock_op, msg, atomic_read(&(sk)->sk_refcnt)); \ - sock_op(sk); \ - }\ -}) - #else /* CONFIG_INFINIBAND_SDP_DEBUG */ #define sdp_dbg(priv, format, arg...) \ do { (void) (priv); } while (0) @@ -123,178 +104,64 @@ #ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA extern int sdp_data_debug_level; -#define sdp_dbg_data(sk, format, arg...) \ - do { \ - if (sdp_data_debug_level & 0x2) \ - sdp_printk(KERN_WARNING, sk, format , ## arg); \ - sdp_prf(sk, NULL, format , ## arg); \ +#define sdp_dbg_data(sk, format, arg...) \ + do { \ + if (sdp_data_debug_level & 0x2) \ + sdp_printk(KERN_WARNING, sk, format , ## arg); \ } while (0) -#define SDP_DUMP_PACKET(sk, str, skb, h) \ +#define SDP_DUMP_PACKET(sk, str, mb, h) \ do { \ - if (sdp_data_debug_level & 0x1) \ - dump_packet(sk, str, skb, h); \ + if (sdp_data_debug_level & 0x1) \ + dump_packet(sk, str, mb, h); \ } while (0) #else #define sdp_dbg_data(priv, format, arg...) 
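/*
 * All of the macros above follow one pattern: a level-gated, variadic
 * printf-style macro that captures __func__ and __LINE__ at the call
 * site.  A minimal stand-alone sketch of that pattern, assuming a
 * hypothetical my_debug_level knob (user-space, not part of the patch):
 */
#include <stdio.h>

static int my_debug_level = 1;	/* stands in for sdp_debug_level */

#define my_dbg(fmt, arg...)					\
do {								\
	if (my_debug_level > 0)					\
		fprintf(stderr, "%s:%d: " fmt "\n",		\
		    __func__, __LINE__, ## arg);		\
} while (0)

/* Usage: my_dbg("got event %d", ev) prints e.g. "handler:42: got event 7". */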
-#define SDP_DUMP_PACKET(sk, str, skb, h) +#define SDP_DUMP_PACKET(sk, str, mb, h) #endif -enum sdp_ref { - SOCK_REF_RESET, - SOCK_REF_ALIVE, /* sock_alloc -> destruct_sock */ - SOCK_REF_CLONE, - SOCK_REF_CMA, /* sdp_cma_handler is expected to be invoked */ - SOCK_REF_SEQ, /* during proc read */ - SOCK_REF_DREQ_TO, /* dreq timeout is pending */ - SOCK_REF_ZCOPY, /* zcopy send in process */ - SOCK_REF_RDMA_RD, /* RDMA read in process */ - SOCK_REF_KEEPALIVE /* socket is held by sk_reset_timer */ -}; +#define SOCK_REF_RESET "RESET" +#define SOCK_REF_ALIVE "ALIVE" /* sock_alloc -> destruct_sock */ +#define SOCK_REF_CLONE "CLONE" +#define SOCK_REF_CMA "CMA" /* sdp_cma_handler() is expected to be invoked */ +#define SOCK_REF_SEQ "SEQ" /* during proc read */ +#define SOCK_REF_DREQ_TO "DREQ_TO" /* dreq timeout is pending */ +#define SOCK_REF_ZCOPY "ZCOPY" /* zcopy send in process */ +#define SOCK_REF_RDMA_RD "RDMA_RD" /* RDMA read in process */ -#ifdef SDP_SOCK_HISTORY -#define SDP_SOCK_HISTORY_LEN 128 +#define sock_hold(sk, msg) sock_ref(sk, msg, sock_hold) +#define sock_put(sk, msg) sock_ref(sk, msg, sock_put) +#define __sock_put(sk, msg) sock_ref(sk, msg, __sock_put) -enum sdp_ref_type { - NOT_REF, - HOLD_REF, - PUT_REF, - __PUT_REF, - BOTH_REF -}; - -struct sdp_sock_hist { - char *str; - char *func; - int line; - int pid; - u8 cnt; - u8 ref_type; /* enum sdp_ref_type */ - u8 ref_enum; /* enum sdp_ref */ -}; - -static inline char *reftype2str(int reftype) -{ #define ENUM2STR(e) [e] = #e - static char *enum2str[] = { - ENUM2STR(NOT_REF), - ENUM2STR(HOLD_REF), - ENUM2STR(PUT_REF), - ENUM2STR(__PUT_REF), - ENUM2STR(BOTH_REF) - }; - if (reftype < 0 || reftype >= ARRAY_SIZE(enum2str)) { - printk(KERN_WARNING "reftype %d is illegal\n", reftype); - return NULL; - } - - return enum2str[reftype]; -} - -void _sdp_add_to_history(struct sock *sk, const char *str, - const char *func, int line, int ref_type, int ref_enum); -void sdp_print_history(struct sock *sk); - -#define sdp_add_to_history(sk, str) \ - _sdp_add_to_history(sk, str, __func__, __LINE__, 0, 0) - -#define sock_hold(sk, msg) \ - do { \ - _sdp_add_to_history(sk, #msg, __func__, __LINE__, \ - HOLD_REF, msg); \ - sock_ref(sk, #msg, sock_hold); \ - } while (0) - -#define sock_put(sk, msg) \ - do { \ - _sdp_add_to_history(sk, #msg, __func__, __LINE__, \ - PUT_REF, msg); \ - sock_ref(sk, #msg, sock_put); \ - } while (0) - -#define __sock_put(sk, msg) \ - do { \ - _sdp_add_to_history(sk, #msg, __func__, __LINE__, \ - __PUT_REF, msg); \ - sock_ref(sk, #msg, __sock_put); \ - } while (0) - -int sdp_ssk_hist_open(struct sock *sk); -int sdp_ssk_hist_close(struct sock *sk); -int sdp_ssk_hist_rename(struct sock *sk); - -#else -#define sock_hold(sk, msg) sock_ref(sk, #msg, sock_hold) -#define sock_put(sk, msg) sock_ref(sk, #msg, sock_put) -#define __sock_put(sk, msg) sock_ref(sk, #msg, __sock_put) - -#define _sdp_add_to_history(sk, str, func, line, ref_type, ref_enum) -#define sdp_add_to_history(sk, str) -#define sdp_print_history(sk) - -#endif /* SDP_SOCK_HISTORY */ - -#define ENUM2STR(e) [e] = #e - static inline char *sdp_state_str(int state) { static char *state2str[] = { - ENUM2STR(TCP_ESTABLISHED), - ENUM2STR(TCP_SYN_SENT), - ENUM2STR(TCP_SYN_RECV), - ENUM2STR(TCP_FIN_WAIT1), - ENUM2STR(TCP_FIN_WAIT2), - ENUM2STR(TCP_TIME_WAIT), - ENUM2STR(TCP_CLOSE), - ENUM2STR(TCP_CLOSE_WAIT), - ENUM2STR(TCP_LAST_ACK), - ENUM2STR(TCP_LISTEN), - ENUM2STR(TCP_CLOSING), + ENUM2STR(TCPS_ESTABLISHED), + ENUM2STR(TCPS_SYN_SENT), + ENUM2STR(TCPS_SYN_RECEIVED), + 
ENUM2STR(TCPS_FIN_WAIT_1), + ENUM2STR(TCPS_FIN_WAIT_2), + ENUM2STR(TCPS_TIME_WAIT), + ENUM2STR(TCPS_CLOSED), + ENUM2STR(TCPS_CLOSE_WAIT), + ENUM2STR(TCPS_LAST_ACK), + ENUM2STR(TCPS_LISTEN), + ENUM2STR(TCPS_CLOSING), }; - if (state < 0 || state >= ARRAY_SIZE(state2str)) { - printk(KERN_WARNING "state %d is illegal\n", state); - return NULL; - } + if (state < 0 || state >= ARRAY_SIZE(state2str)) + return "unknown"; return state2str[state]; } -static inline const char* rdma_cm_event_str(int event) -{ - static const char* state2str[] = { - ENUM2STR(RDMA_CM_EVENT_ADDR_RESOLVED), - ENUM2STR(RDMA_CM_EVENT_ADDR_ERROR), - ENUM2STR(RDMA_CM_EVENT_ROUTE_RESOLVED), - ENUM2STR(RDMA_CM_EVENT_ROUTE_ERROR), - ENUM2STR(RDMA_CM_EVENT_CONNECT_REQUEST), - ENUM2STR(RDMA_CM_EVENT_CONNECT_RESPONSE), - ENUM2STR(RDMA_CM_EVENT_CONNECT_ERROR), - ENUM2STR(RDMA_CM_EVENT_UNREACHABLE), - ENUM2STR(RDMA_CM_EVENT_REJECTED), - ENUM2STR(RDMA_CM_EVENT_ESTABLISHED), - ENUM2STR(RDMA_CM_EVENT_DISCONNECTED), - ENUM2STR(RDMA_CM_EVENT_DEVICE_REMOVAL), - ENUM2STR(RDMA_CM_EVENT_MULTICAST_JOIN), - ENUM2STR(RDMA_CM_EVENT_MULTICAST_ERROR), - ENUM2STR(RDMA_CM_EVENT_ADDR_CHANGE), - ENUM2STR(RDMA_CM_EVENT_TIMEWAIT_EXIT) - }; - - if (event < 0 || event >= ARRAY_SIZE(state2str)) { - printk(KERN_WARNING "event %d is illegal\n", event); - return NULL; - } - - return state2str[event]; -} - struct sdp_bsdh; #ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA -void _dump_packet(const char *func, int line, struct sock *sk, char *str, - struct sk_buff *skb, const struct sdp_bsdh *h); -#define dump_packet(sk, str, skb, h) \ - _dump_packet(__func__, __LINE__, sk, str, skb, h) +void _dump_packet(const char *func, int line, struct socket *sk, char *str, + struct mbuf *mb, const struct sdp_bsdh *h); +#define dump_packet(sk, str, mb, h) \ + _dump_packet(__func__, __LINE__, sk, str, mb, h) #endif #endif Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c (.../head) (revision 219811) @@ -31,24 +31,14 @@ */ #include -#include #include -#include #include "sdp.h" #ifdef CONFIG_PROC_FS -#define DEBUGFS_SDP_BASE "sdp" #define PROC_SDP_STATS "sdpstats" #define PROC_SDP_PERF "sdpprf" -#if defined(SDP_SOCK_HISTORY) || defined(SDP_PROFILING) -struct dentry *sdp_dbgfs_base; -#endif -#ifdef SDP_PROFILING -struct dentry *sdp_prof_file = NULL; -#endif - /* just like TCP fs */ struct sdp_seq_afinfo { struct module *owner; @@ -79,14 +69,6 @@ return NULL; } -#define sdp_sock_hold_return(sk, msg) \ - ({ \ - _sdp_add_to_history(sk, #msg, __func__, __LINE__, HOLD_REF, msg); \ - sdp_dbg(sk, "%s:%d - %s (%s) ref = %d.\n", __func__, __LINE__, \ - "sock_hold", #msg, atomic_read(&(sk)->sk_refcnt)); \ - atomic_inc_return(&(sk)->sk_refcnt); \ - }) - static void *sdp_seq_start(struct seq_file *seq, loff_t *pos) { void *start = NULL; @@ -99,12 +81,8 @@ spin_lock_irq(&sock_list_lock); start = sdp_get_idx(seq, *pos - 1); - if (!start) - goto out; - - if (sdp_sock_hold_return((struct sock *)start, SOCK_REF_SEQ) < 2) - start = NULL; -out: + if (start) + sock_hold((struct socket *)start, SOCK_REF_SEQ); spin_unlock_irq(&sock_list_lock); return start; @@ -120,13 +98,10 @@ next = sdp_get_idx(seq, 0); else next = sdp_get_idx(seq, *pos); - if (!next) - goto out; - - if (sdp_sock_hold_return((struct sock *)next, SOCK_REF_SEQ) < 2) - next = NULL; -out: + if (next) + sock_hold((struct socket *)next, 
SOCK_REF_SEQ); spin_unlock_irq(&sock_list_lock); + *pos += 1; st->num++; @@ -142,7 +117,7 @@ static int sdp_seq_show(struct seq_file *seq, void *v) { struct sdp_iter_state *st; - struct sock *sk = v; + struct socket *sk = v; char tmpbuf[TMPSZ + 1]; unsigned int dest; unsigned int src; @@ -195,7 +170,7 @@ #define _kzalloc(size,flags) kzalloc(size,flags) #undef kzalloc s = kzalloc(sizeof(*s), GFP_KERNEL); -#define kzalloc(s,f) _kzalloc(s,f) +#define kzalloc(s,f) _kzalloc(s,f) if (!s) return -ENOMEM; s->family = afinfo->family; @@ -234,19 +209,12 @@ { int i; u32 max = 0; - int first = -1, last = n - 1; seq_printf(seq, "%s:\n", str); for (i = 0; i < n; i++) { if (h[i] > max) max = h[i]; - - if (first == -1 && h[i]) - first = i; - - if (h[i]) - last = i; } if (max == 0) { @@ -254,14 +222,14 @@ return; } - for (i = first; i <= last; i++) { + for (i = 0; i < n; i++) { char s[51]; int j = 50 * h[i] / max; int val = is_log ? (i == n-1 ? 0 : 1<time - start_t); - usec_rem = do_div(t, USEC_PER_SEC); - remove_newline(l->msg); + t = l->time - start_t; + nsec_rem = do_div(t, 1000000000); + seq_printf(m, "%-6d: [%5lu.%06lu] %-50s - [%d{%d} %d:%d] " - "skb: %p %s:%d\n", - l->idx, t, usec_rem, + "mb: %p %s:%d\n", + l->idx, (unsigned long)t, nsec_rem/1000, l->msg, l->pid, l->cpu, l->sk_num, l->sk_dport, - l->skb, l->func, l->line); + l->mb, l->func, l->line); out: return 0; } @@ -463,15 +380,15 @@ int idx = *pos; if (!*pos) { - if (!atomic_read(&sdpprf_log_count)) + if (!sdpprf_log_count) return SEQ_START_TOKEN; } - if (*pos >= MIN(atomic_read(&sdpprf_log_count), SDPPRF_LOG_SIZE - 1)) + if (*pos >= MIN(sdpprf_log_count, SDPPRF_LOG_SIZE - 1)) return NULL; - if (atomic_read(&sdpprf_log_count) >= SDPPRF_LOG_SIZE - 1) { - int off = atomic_read(&sdpprf_log_count) & (SDPPRF_LOG_SIZE - 1); + if (sdpprf_log_count >= SDPPRF_LOG_SIZE - 1) { + int off = sdpprf_log_count & (SDPPRF_LOG_SIZE - 1); idx = (idx + off) & (SDPPRF_LOG_SIZE - 1); } @@ -485,7 +402,7 @@ { struct sdpprf_log *l = v; - if (++*pos >= MIN(atomic_read(&sdpprf_log_count), SDPPRF_LOG_SIZE - 1)) + if (++*pos >= MIN(sdpprf_log_count, SDPPRF_LOG_SIZE - 1)) return NULL; ++l; @@ -518,7 +435,7 @@ static ssize_t sdpprf_write(struct file *file, const char __user *buf, size_t count, loff_t *offs) { - atomic_set(&sdpprf_log_count, 0); + sdpprf_log_count = 0; printk(KERN_INFO "Cleared sdpprf statistics\n"); return count; @@ -533,222 +450,15 @@ }; #endif /* SDP_PROFILING */ -#ifdef SDP_SOCK_HISTORY - -void sdp_print_history(struct sock *sk) -{ - struct sdp_sock *ssk = sdp_sk(sk); - unsigned i; - unsigned long flags; - - spin_lock_irqsave(&ssk->hst_lock, flags); - - sdp_warn(sk, "############## %p %s %lu/%zu ##############\n", - sk, sdp_state_str(sk->sk_state), - ssk->hst_idx, ARRAY_SIZE(ssk->hst)); - - for (i = 0; i < ssk->hst_idx; ++i) { - struct sdp_sock_hist *hst = &ssk->hst[i]; - char *ref_str = reftype2str(hst->ref_type); - - if (hst->ref_type == NOT_REF) - ref_str = ""; - - if (hst->cnt != 1) { - sdp_warn(sk, "[%s:%d pid: %d] %s %s : %d\n", - hst->func, hst->line, hst->pid, - ref_str, hst->str, hst->cnt); - } else { - sdp_warn(sk, "[%s:%d pid: %d] %s %s\n", - hst->func, hst->line, hst->pid, - ref_str, hst->str); - } - } - - spin_unlock_irqrestore(&ssk->hst_lock, flags); -} - -void _sdp_add_to_history(struct sock *sk, const char *str, - const char *func, int line, int ref_type, int ref_enum) -{ - struct sdp_sock *ssk = sdp_sk(sk); - unsigned i; - unsigned long flags; - struct sdp_sock_hist *hst; - - spin_lock_irqsave(&ssk->hst_lock, flags); - - i = ssk->hst_idx; 
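/*
 * The sdpprf log above is a circular buffer: SDPPRF_LOG_SIZE is a power
 * of two, so "count & (SDPPRF_LOG_SIZE - 1)" is a cheap modulo, and once
 * the counter passes the ring size the oldest entry sits at the write
 * cursor -- the offset sdpprf_start() computes.  A minimal stand-alone
 * sketch of the same indexing, with hypothetical names (not part of the
 * patch):
 */
#define RING_SIZE	8		/* must be a power of two */
static int ring[RING_SIZE];
static unsigned int ring_count;

static void
ring_push(int v)
{
	ring[ring_count++ & (RING_SIZE - 1)] = v;
}

/* First valid slot: 0 until the ring wraps, then the write cursor. */
static unsigned int
ring_oldest(void)
{
	return (ring_count < RING_SIZE ? 0 : (ring_count & (RING_SIZE - 1)));
}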
- - if (i >= ARRAY_SIZE(ssk->hst)) { - //sdp_warn(sk, "overflow, drop: %s\n", s); - ++ssk->hst_idx; - goto out; - } - - if (ssk->hst[i].str) - sdp_warn(sk, "overwriting %s\n", ssk->hst[i].str); - - switch (ref_type) { - case NOT_REF: - case HOLD_REF: -simple_add: - hst = &ssk->hst[i]; - hst->str = (char *)str; - hst->func = (char *)func; - hst->line = line; - hst->ref_type = ref_type; - hst->ref_enum = ref_enum; - hst->cnt = 1; - hst->pid = current->pid; - ++ssk->hst_idx; - break; - case PUT_REF: - case __PUT_REF: - /* Try to shrink history by attaching HOLD+PUT - * together */ - hst = i > 0 ? &ssk->hst[i - 1] : NULL; - if (hst && hst->ref_type == HOLD_REF && - hst->ref_enum == ref_enum) { - hst->ref_type = BOTH_REF; - hst->func = (char *)func; - hst->line = line; - hst->pid = current->pid; - - /* try to shrink some more - by summing up */ - --i; - hst = i > 0 ? &ssk->hst[i - 1] : NULL; - if (hst && hst->ref_type == BOTH_REF && - hst->ref_enum == ref_enum) { - ++hst->cnt; - hst->func = (char *)func; - hst->line = line; - hst->pid = current->pid; - ssk->hst[i].str = NULL; - - --ssk->hst_idx; - } - } else - goto simple_add; - break; - default: - sdp_warn(sk, "error\n"); - } -out: - spin_unlock_irqrestore(&ssk->hst_lock, flags); -} -static int sdp_ssk_hist_seq_show(struct seq_file *seq, void *v) -{ - struct sock *sk = seq->private; - struct sdp_sock *ssk = sdp_sk(sk); - unsigned i; - unsigned long flags; - - spin_lock_irqsave(&ssk->hst_lock, flags); - - seq_printf(seq, "############## %p %s %lu/%zu ##############\n", - sk, sdp_state_str(sk->sk_state), - ssk->hst_idx, ARRAY_SIZE(ssk->hst)); - - for (i = 0; i < ssk->hst_idx; ++i) { - struct sdp_sock_hist *hst = &ssk->hst[i]; - char *ref_str = reftype2str(hst->ref_type); - - if (hst->ref_type == NOT_REF) - ref_str = ""; - - if (hst->cnt != 1) { - seq_printf(seq, "[%30s:%-5d pid: %-6d] %s %s : %d\n", - hst->func, hst->line, hst->pid, - ref_str, hst->str, hst->cnt); - } else { - seq_printf(seq, "[%30s:%-5d pid: %-6d] %s %s\n", - hst->func, hst->line, hst->pid, - ref_str, hst->str); - } - } - - spin_unlock_irqrestore(&ssk->hst_lock, flags); - return 0; -} - -static int sdp_ssk_hist_seq_open(struct inode *inode, struct file *file) -{ - struct sock *sk = inode->i_private; - - return single_open(file, sdp_ssk_hist_seq_show, sk); -} - -static struct file_operations ssk_hist_fops = { - .owner = THIS_MODULE, - .open = sdp_ssk_hist_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static void sdp_ssk_hist_name(char *sk_name, int len, struct sock *sk) -{ - int lport = inet_sk(sk)->num; - int rport = ntohs(inet_sk(sk)->dport); - - snprintf(sk_name, len, "%05x_%d:%d", - sdp_sk(sk)->sk_id, lport, rport); -} - -int sdp_ssk_hist_open(struct sock *sk) -{ - int ret = 0; - char sk_name[256]; - struct sdp_sock *ssk = sdp_sk(sk); - - if (!sdp_dbgfs_base) { - return 0; - } - - sdp_ssk_hist_name(sk_name, sizeof(sk_name), sk); - - ssk->hst_dentr = debugfs_create_file(sk_name, S_IRUGO | S_IWUGO, - sdp_dbgfs_base, sk, &ssk_hist_fops); - if (IS_ERR(ssk->hst_dentr)) { - ret = PTR_ERR(ssk->hst_dentr); - ssk->hst_dentr = NULL; - } - - return ret; -} - -int sdp_ssk_hist_close(struct sock *sk) -{ - if (sk && sdp_sk(sk)->hst_dentr) - debugfs_remove(sdp_sk(sk)->hst_dentr); - return 0; -} - -int sdp_ssk_hist_rename(struct sock *sk) -{ - char sk_name[256]; - struct dentry *d; - - if (!sk || !sdp_sk(sk)->hst_dentr) - return 0; - - sdp_ssk_hist_name(sk_name, sizeof(sk_name), sk); - - d = debugfs_rename(sdp_dbgfs_base, sdp_sk(sk)->hst_dentr, 
sdp_dbgfs_base, sk_name); - if (IS_ERR(d)) - return PTR_ERR(d); - - return 0; -} -#endif - int __init sdp_proc_init(void) { struct proc_dir_entry *p = NULL; #ifdef SDPSTATS_ON struct proc_dir_entry *stats = NULL; #endif +#ifdef SDP_PROFILING + struct proc_dir_entry *prof = NULL; +#endif sdp_seq_afinfo.seq_fops->owner = sdp_seq_afinfo.owner; sdp_seq_afinfo.seq_fops->open = sdp_seq_open; @@ -756,19 +466,6 @@ sdp_seq_afinfo.seq_fops->llseek = seq_lseek; sdp_seq_afinfo.seq_fops->release = seq_release_private; -#if defined(SDP_PROFILING) || defined(SDP_SOCK_HISTORY) - sdp_dbgfs_base = debugfs_create_dir(DEBUGFS_SDP_BASE, NULL); - if (!sdp_dbgfs_base || IS_ERR(sdp_dbgfs_base)) { - if (PTR_ERR(sdp_dbgfs_base) == -ENODEV) - printk(KERN_WARNING "sdp: debugfs is not supported.\n"); - else { - printk(KERN_ERR "sdp: error creating debugfs information %ld\n", - PTR_ERR(sdp_dbgfs_base)); - return -EINVAL; - } - } -#endif - p = proc_net_fops_create(&init_net, sdp_seq_afinfo.name, S_IRUGO, sdp_seq_afinfo.seq_fops); if (p) @@ -786,9 +483,9 @@ #endif #ifdef SDP_PROFILING - sdp_prof_file = debugfs_create_file(PROC_SDP_PERF, S_IRUGO | S_IWUGO, - sdp_dbgfs_base, NULL, &sdpprf_fops); - if (!sdp_prof_file) + prof = proc_net_fops_create(&init_net, PROC_SDP_PERF, + S_IRUGO | S_IWUGO, &sdpprf_fops); + if (!prof) goto no_mem_prof; #endif @@ -805,7 +502,7 @@ #endif proc_net_remove(&init_net, sdp_seq_afinfo.name); -no_mem: +no_mem: return -ENOMEM; } @@ -818,11 +515,8 @@ proc_net_remove(&init_net, PROC_SDP_STATS); #endif #ifdef SDP_PROFILING - debugfs_remove(sdp_prof_file); + proc_net_remove(&init_net, PROC_SDP_PERF); #endif -#if defined(SDP_PROFILING) || defined(SDP_SOCK_HISTORY) - debugfs_remove(sdp_dbgfs_base); -#endif } #else /* CONFIG_PROC_FS */ Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c (.../head) (revision 219811) @@ -1,3062 +1,1961 @@ -/* - * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. + +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved. + * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved. * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
* - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c */ + /* - * This file is based on net/ipv4/tcp.c - * under the following permission notice: * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or(at your option) any later version. + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iX Systems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * */ +#include +__FBSDID("$FreeBSD$"); -#if defined(__ia64__) -/* csum_partial_copy_from_user is not exported on ia64. - We don't really need it for SDP - skb_copy_to_page happens to call it - but for SDP HW checksum is always set, so ... */ - -#include -#include -#include - -static inline -unsigned int csum_partial_copy_from_user_new (const char *src, char *dst, - int len, unsigned int sum, - int *errp) -{ - *errp = -EINVAL; - return 0; -} - -#define csum_partial_copy_from_user csum_partial_copy_from_user_new -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include "sdp.h" -#include -MODULE_AUTHOR("Michael S. Tsirkin"); -MODULE_DESCRIPTION("InfiniBand SDP module"); -MODULE_LICENSE("Dual BSD/GPL"); +#include +#include +#include -#ifdef CONFIG_INFINIBAND_SDP_DEBUG -SDP_MODPARAM_INT(sdp_debug_level, 0, "Enable debug tracing if > 0."); -#endif -#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA -SDP_MODPARAM_INT(sdp_data_debug_level, 0, - "Enable data path debug tracing if > 0."); -#endif +uma_zone_t sdp_zone; +struct rwlock sdp_lock; +LIST_HEAD(, sdp_sock) sdp_list; -SDP_MODPARAM_SINT(sdp_fmr_pool_size, 20, "Number of FMRs to allocate for pool"); -SDP_MODPARAM_SINT(sdp_fmr_dirty_wm, 5, "Watermark to flush fmr pool"); +struct workqueue_struct *rx_comp_wq; -SDP_MODPARAM_SINT(recv_poll, 700, "usecs to poll recv before arming interrupt."); -SDP_MODPARAM_SINT(sdp_keepalive_time, SDP_KEEPALIVE_TIME, - "Default idle time in seconds before keepalive probe sent."); +RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock"); +#define SDP_LIST_WLOCK() rw_wlock(&sdp_lock) +#define SDP_LIST_RLOCK() rw_rlock(&sdp_lock) +#define SDP_LIST_WUNLOCK() rw_wunlock(&sdp_lock) +#define SDP_LIST_RUNLOCK() rw_runlock(&sdp_lock) +#define SDP_LIST_WLOCK_ASSERT() rw_assert(&sdp_lock, RW_WLOCKED) +#define SDP_LIST_RLOCK_ASSERT() rw_assert(&sdp_lock, RW_RLOCKED) +#define SDP_LIST_LOCK_ASSERT() rw_assert(&sdp_lock, RW_LOCKED) -SDP_MODPARAM_INT(sdp_inline_thresh, SDP_DEF_INLINE_THRESH, - "Inline copy threshold. 
effective to new sockets only; 0=Off."); +MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol"); -static int sdp_bzcopy_thresh = 0; -SDP_MODPARAM_INT(sdp_zcopy_thresh, SDP_DEF_ZCOPY_THRESH , - "Zero copy using RDMA threshold; 0=Off."); -#define SDP_RX_COAL_TIME_HIGH 128 -SDP_MODPARAM_SINT(sdp_rx_coal_target, 0x50000, - "Target number of bytes to coalesce with interrupt moderation."); -SDP_MODPARAM_SINT(sdp_rx_coal_time, 0x10, "rx coal time (jiffies)."); -SDP_MODPARAM_SINT(sdp_rx_rate_low, 80000, "rx_rate low (packets/sec)."); -SDP_MODPARAM_SINT(sdp_rx_coal_time_low, 0, "low moderation usec."); -SDP_MODPARAM_SINT(sdp_rx_rate_high, 100000, "rx_rate high (packets/sec)."); -SDP_MODPARAM_SINT(sdp_rx_coal_time_high, 128, "high moderation usec."); -SDP_MODPARAM_SINT(sdp_rx_rate_thresh, (200000 / SDP_RX_COAL_TIME_HIGH), - "rx rate thresh ()."); -SDP_MODPARAM_SINT(sdp_sample_interval, (HZ / 4), "sample interval (jiffies)."); +static void sdp_stop_keepalive_timer(struct socket *so); -SDP_MODPARAM_SINT(hw_int_mod_count, -1, - "forced hw int moderation val. -1 for auto (packets)."); -SDP_MODPARAM_SINT(hw_int_mod_usec, -1, - "forced hw int moderation val. -1 for auto (usec)."); +/* + * SDP protocol interface to socket abstraction. + */ +/* + * sdp_sendspace and sdp_recvspace are the default send and receive window + * sizes, respectively. + */ +u_long sdp_sendspace = 1024*32; +u_long sdp_recvspace = 1024*64; -struct workqueue_struct *sdp_wq; -struct workqueue_struct *rx_comp_wq; +static int sdp_count; -struct list_head sock_list; -spinlock_t sock_list_lock; - -DECLARE_RWSEM(device_removal_lock); - -static inline unsigned int sdp_keepalive_time_when(const struct sdp_sock *ssk) +/* + * Disable async. CMA events for sockets which are being torn down. + */ +static void +sdp_destroy_cma(struct sdp_sock *ssk) { - return ssk->keepalive_time ? : sdp_keepalive_time; -} -inline void sdp_add_sock(struct sdp_sock *ssk) -{ - spin_lock_irq(&sock_list_lock); - list_add_tail(&ssk->sock_list, &sock_list); - spin_unlock_irq(&sock_list_lock); + if (ssk->id == NULL) + return; + rdma_destroy_id(ssk->id); + ssk->id = NULL; } -inline void sdp_remove_sock(struct sdp_sock *ssk) +static int +sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred) { - spin_lock_irq(&sock_list_lock); - BUG_ON(list_empty(&sock_list)); - list_del_init(&(ssk->sock_list)); - spin_unlock_irq(&sock_list_lock); -} + struct sockaddr_in *sin; + struct sockaddr_in null; + int error; -static int sdp_get_port(struct sock *sk, unsigned short snum) -{ - struct sdp_sock *ssk = sdp_sk(sk); - struct sockaddr_in *src_addr; - int rc; + SDP_WLOCK_ASSERT(ssk); - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_port = htons(snum), - .sin_addr.s_addr = inet_sk(sk)->rcv_saddr, - }; - - sdp_add_to_history(sk, __func__); - sdp_dbg(sk, "%s: %u.%u.%u.%u:%hu\n", __func__, - NIPQUAD(addr.sin_addr.s_addr), ntohs(addr.sin_port)); - - if (!ssk->id) - ssk->id = rdma_create_id(sdp_cma_handler, sk, RDMA_PS_SDP); - - if (!ssk->id) - return -ENOMEM; - - /* IP core seems to bind many times to the same address */ - /* TODO: I don't really understand why. Find out. */ - if (!memcmp(&addr, &ssk->id->route.addr.src_addr, sizeof addr)) - return 0; - - rc = ssk->last_bind_err = rdma_bind_addr(ssk->id, (struct sockaddr *)&addr); - if (rc) { - sdp_dbg(sk, "Destroying rdma id\n"); - rdma_destroy_id(ssk->id); - ssk->id = NULL; - return rc; + if (ssk->lport != 0 || ssk->laddr != INADDR_ANY) + return (EINVAL); + /* rdma_bind_addr handles bind races. 
*/ + SDP_WUNLOCK(ssk); + if (ssk->id == NULL) + ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP); + if (ssk->id == NULL) { + SDP_WLOCK(ssk); + return (ENOMEM); } - - src_addr = (struct sockaddr_in *)&(ssk->id->route.addr.src_addr); - inet_sk(sk)->num = ntohs(src_addr->sin_port); -#ifdef SDP_SOCK_HISTORY - sdp_ssk_hist_rename(sk); -#endif - return 0; + if (nam == NULL) { + null.sin_family = AF_INET; + null.sin_len = sizeof(null); + null.sin_addr.s_addr = INADDR_ANY; + null.sin_port = 0; + bzero(&null.sin_zero, sizeof(null.sin_zero)); + nam = (struct sockaddr *)&null; + } + error = -rdma_bind_addr(ssk->id, nam); + SDP_WLOCK(ssk); + if (error == 0) { + sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr; + ssk->laddr = sin->sin_addr.s_addr; + ssk->lport = sin->sin_port; + } else + sdp_destroy_cma(ssk); + return (error); } -static void sdp_destroy_qp(struct sdp_sock *ssk) +static void +sdp_pcbfree(struct sdp_sock *ssk) { - sdp_dbg(sk_ssk(ssk), "destroying qp\n"); - sdp_prf(sk_ssk(ssk), NULL, "destroying qp"); + KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk)); - sdp_add_to_history(sk_ssk(ssk), __func__); + sdp_dbg(ssk->socket, "Freeing pcb"); + SDP_WLOCK_ASSERT(ssk); + ssk->flags |= SDP_DESTROY; + SDP_WUNLOCK(ssk); + SDP_LIST_WLOCK(); + sdp_count--; + LIST_REMOVE(ssk, list); + SDP_LIST_WUNLOCK(); + crfree(ssk->cred); + sdp_destroy_cma(ssk); ssk->qp_active = 0; - if (ssk->qp) { ib_destroy_qp(ssk->qp); ssk->qp = NULL; } - - sdp_rx_ring_destroy(ssk); sdp_tx_ring_destroy(ssk); - - sdp_remove_large_sock(ssk); + sdp_rx_ring_destroy(ssk); + rw_destroy(&ssk->rx_ring.destroyed_lock); + uma_zfree(sdp_zone, ssk); + rw_destroy(&ssk->lock); } -static void sdp_reset_keepalive_timer(struct sock *sk, unsigned long len) +/* + * Common routines to return a socket address. 
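+ * The sockaddr_in is allocated from M_SONAME with M_WAITOK, so these
+ * routines cannot fail; the caller owns the returned sockaddr and is
+ * expected to free it.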
+ */ +static struct sockaddr * +sdp_sockaddr(in_port_t port, struct in_addr *addr_p) { - struct sdp_sock *ssk = sdp_sk(sk); + struct sockaddr_in *sin; - sdp_dbg(sk, "%s\n", __func__); + sin = malloc(sizeof *sin, M_SONAME, + M_WAITOK | M_ZERO); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = *addr_p; + sin->sin_port = port; - ssk->keepalive_tx_head = ring_head(ssk->tx_ring); - ssk->keepalive_rx_head = ring_head(ssk->rx_ring); - - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); + return (struct sockaddr *)sin; } -static void sdp_delete_keepalive_timer(struct sock *sk) +static int +sdp_getsockaddr(struct socket *so, struct sockaddr **nam) { - struct sdp_sock *ssk = sdp_sk(sk); + struct sdp_sock *ssk; + struct in_addr addr; + in_port_t port; - sdp_dbg(sk, "%s\n", __func__); + ssk = sdp_sk(so); + SDP_RLOCK(ssk); + port = ssk->lport; + addr.s_addr = ssk->laddr; + SDP_RUNLOCK(ssk); - ssk->keepalive_tx_head = 0; - ssk->keepalive_rx_head = 0; - - sk_stop_timer(sk, &sk->sk_timer); + *nam = sdp_sockaddr(port, &addr); + return 0; } -static void sdp_keepalive_timer(unsigned long data) +static int +sdp_getpeeraddr(struct socket *so, struct sockaddr **nam) { - struct sock *sk = (struct sock *)data; - struct sdp_sock *ssk = sdp_sk(sk); + struct sdp_sock *ssk; + struct in_addr addr; + in_port_t port; - sdp_dbg(sk, "%s\n", __func__); - SDPSTATS_COUNTER_INC(keepalive_timer); + ssk = sdp_sk(so); + SDP_RLOCK(ssk); + port = ssk->fport; + addr.s_addr = ssk->faddr; + SDP_RUNLOCK(ssk); - /* Only process if the socket is not in use */ - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - sdp_reset_keepalive_timer(sk, HZ / 20); - goto out; - } - - if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_LISTEN || - sk->sk_state == TCP_CLOSE || !ssk->qp) - goto out; - - if (ssk->keepalive_tx_head == ring_head(ssk->tx_ring) && - ssk->keepalive_rx_head == ring_head(ssk->rx_ring)) - sdp_post_keepalive(ssk); - - sdp_reset_keepalive_timer(sk, sdp_keepalive_time_when(ssk)); - -out: - bh_unlock_sock(sk); - sock_put(sk, SOCK_REF_KEEPALIVE); + *nam = sdp_sockaddr(port, &addr); + return 0; } -static void sdp_set_keepalive(struct sock *sk, int val) +static void +sdp_pcbnotifyall(struct in_addr faddr, int errno, + struct sdp_sock *(*notify)(struct sdp_sock *, int)) { - sdp_dbg(sk, "%s %d\n", __func__, val); + struct sdp_sock *ssk, *ssk_temp; - if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) - return; - - if (val && !sock_flag(sk, SOCK_KEEPOPEN)) - sdp_start_keepalive_timer(sk); - else if (!val) - sdp_delete_keepalive_timer(sk); + SDP_LIST_WLOCK(); + LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) { + SDP_WLOCK(ssk); + if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) { + SDP_WUNLOCK(ssk); + continue; + } + if ((ssk->flags & SDP_DESTROY) == 0) + if ((*notify)(ssk, errno)) + SDP_WUNLOCK(ssk); + } + SDP_LIST_WUNLOCK(); } -void sdp_start_keepalive_timer(struct sock *sk) +#if 0 +static void +sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg) { - sdp_reset_keepalive_timer(sk, sdp_keepalive_time_when(sdp_sk(sk))); + struct sdp_sock *ssk; + + SDP_LIST_RLOCK(); + LIST_FOREACH(ssk, &sdp_list, list) { + SDP_WLOCK(ssk); + func(ssk, arg); + SDP_WUNLOCK(ssk); + } + SDP_LIST_RUNLOCK(); } +#endif -void sdp_set_default_moderation(struct sdp_sock *ssk) +static void +sdp_output_reset(struct sdp_sock *ssk) { - struct sock *sk = sk_ssk(ssk); - struct sdp_moderation *mod = &ssk->auto_mod; - int rx_buf_size; + struct rdma_cm_id *id; - if (hw_int_mod_count > -1 || hw_int_mod_usec > -1) { - int 
err; - - mod->adaptive_rx_coal = 0; - - if (hw_int_mod_count > 0 && hw_int_mod_usec > 0) { - err = ib_modify_cq(ssk->rx_ring.cq, hw_int_mod_count, - hw_int_mod_usec); - if (unlikely(err)) - sdp_warn(sk, - "Failed modifying moderation for cq\n"); - else - sdp_dbg(sk, - "Using fixed interrupt moderation\n"); - SDPSTATS_COUNTER_INC(rx_cq_modified); - } - return; + SDP_WLOCK_ASSERT(ssk); + if (ssk->id) { + id = ssk->id; + ssk->qp_active = 0; + SDP_WUNLOCK(ssk); + rdma_disconnect(id); + SDP_WLOCK(ssk); } - - mod->adaptive_rx_coal = 1; - sdp_dbg(sk, "Using adaptive interrupt moderation\n"); - - /* If we haven't received a specific coalescing setting - * (module param), we set the moderation paramters as follows: - * - moder_cnt is set to the number of mtu sized packets to - * satisfy our coelsing target. - * - moder_time is set to a fixed value. - */ - rx_buf_size = (ssk->recv_frags * PAGE_SIZE) + sizeof(struct sdp_bsdh); - mod->moder_cnt = sdp_rx_coal_target / rx_buf_size + 1; - mod->moder_time = sdp_rx_coal_time; - sdp_dbg(sk, "Default coalesing params for buf size:%d - " - "moder_cnt:%d moder_time:%d\n", - rx_buf_size, mod->moder_cnt, mod->moder_time); - - /* Reset auto-moderation params */ - mod->pkt_rate_low = sdp_rx_rate_low; - mod->rx_usecs_low = sdp_rx_coal_time_low; - mod->pkt_rate_high = sdp_rx_rate_high; - mod->rx_usecs_high = sdp_rx_coal_time_high; - mod->sample_interval = sdp_sample_interval; - - mod->last_moder_time = SDP_AUTO_CONF; - mod->last_moder_jiffies = 0; - mod->last_moder_packets = 0; - mod->last_moder_tx_packets = 0; - mod->last_moder_bytes = 0; + ssk->state = TCPS_CLOSED; } -/* If tx and rx packet rates are not balanced, assume that - * traffic is mainly BW bound and apply maximum moderation. - * Otherwise, moderate according to packet rate */ -static inline int calc_moder_time(int rate, struct sdp_moderation *mod, - int tx_pkt_diff, int rx_pkt_diff) +/* + * Attempt to close a SDP socket, marking it as dropped, and freeing + * the socket if we hold the only reference. + */ +static struct sdp_sock * +sdp_closed(struct sdp_sock *ssk) { - if (2 * tx_pkt_diff > 3 * rx_pkt_diff || - 2 * rx_pkt_diff > 3 * tx_pkt_diff) - return mod->rx_usecs_high; + struct socket *so; - if (rate < mod->pkt_rate_low) - return mod->rx_usecs_low; + SDP_WLOCK_ASSERT(ssk); - if (rate > mod->pkt_rate_high) - return mod->rx_usecs_high; - - return (rate - mod->pkt_rate_low) * - (mod->rx_usecs_high - mod->rx_usecs_low) / - (mod->pkt_rate_high - mod->pkt_rate_low) + - mod->rx_usecs_low; + ssk->flags |= SDP_DROPPED; + so = ssk->socket; + soisdisconnected(so); + if (ssk->flags & SDP_SOCKREF) { + KASSERT(so->so_state & SS_PROTOREF, + ("sdp_closed: !SS_PROTOREF")); + ssk->flags &= ~SDP_SOCKREF; + SDP_WUNLOCK(ssk); + ACCEPT_LOCK(); + SOCK_LOCK(so); + so->so_state &= ~SS_PROTOREF; + sofree(so); + return (NULL); + } + return (ssk); } -static void sdp_auto_moderation(struct sdp_sock *ssk) +/* + * Perform timer based shutdowns which can not operate in + * callout context. + */ +static void +sdp_shutdown_task(void *data, int pending) { - struct sdp_moderation *mod = &ssk->auto_mod; + struct sdp_sock *ssk; - unsigned long period = jiffies - mod->last_moder_jiffies; - unsigned long packets; - unsigned long rate; - unsigned long avg_pkt_size; - unsigned long tx_pkt_diff; - unsigned long rx_pkt_diff; - int moder_time; - int err; - - if (unlikely(!ssk->rx_ring.cq)) + ssk = data; + SDP_WLOCK(ssk); + /* + * I don't think this can race with another call to pcbfree() + * because SDP_TIMEWAIT protects it. 
SDP_DESTROY may be redundant. + */ + if (ssk->flags & SDP_DESTROY) + panic("sdp_shutdown_task: Racing with pcbfree for ssk %p", + ssk); + if (ssk->flags & SDP_DISCON) + sdp_output_reset(ssk); + /* We have to clear this so sdp_detach() will call pcbfree(). */ + ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT); + if ((ssk->flags & SDP_DROPPED) == 0 && + sdp_closed(ssk) == NULL) return; - - if (!mod->adaptive_rx_coal) + if (ssk->socket == NULL) { + sdp_pcbfree(ssk); return; - - if (period < mod->sample_interval) - return; - - if (!mod->last_moder_jiffies || !period) - goto out; - - tx_pkt_diff = ((unsigned long) (ssk->tx_packets - - mod->last_moder_tx_packets)); - rx_pkt_diff = ((unsigned long) (ssk->rx_packets - - mod->last_moder_packets)); - packets = max(tx_pkt_diff, rx_pkt_diff); - rate = packets * HZ / period; - avg_pkt_size = packets ? ((unsigned long) (ssk->rx_bytes - - mod->last_moder_bytes)) / packets : 0; - - /* Apply auto-moderation only when packet rate exceeds a rate that - * it matters */ - if (rate > sdp_rx_rate_thresh) { - moder_time = calc_moder_time(rate, mod, tx_pkt_diff, - rx_pkt_diff); - } else { - /* When packet rate is low, use default moderation rather - * than 0 to prevent interrupt storms if traffic suddenly - * increases */ - moder_time = mod->moder_time; } - - sdp_dbg_data(sk_ssk(ssk), "tx rate:%lu rx_rate:%lu\n", - tx_pkt_diff * HZ / period, rx_pkt_diff * HZ / period); - - sdp_dbg_data(sk_ssk(ssk), "Rx moder_time changed from:%d to %d " - "period:%lu [jiff] packets:%lu avg_pkt_size:%lu " - "rate:%lu [p/s])\n", - mod->last_moder_time, moder_time, period, packets, - avg_pkt_size, rate); - - if (moder_time != mod->last_moder_time) { - mod->last_moder_time = moder_time; - err = ib_modify_cq(ssk->rx_ring.cq, mod->moder_cnt, moder_time); - if (unlikely(err)) { - sdp_dbg_data(sk_ssk(ssk), - "Failed modifying moderation for cq"); - } - SDPSTATS_COUNTER_INC(rx_cq_modified); - } - -out: - mod->last_moder_packets = ssk->rx_packets; - mod->last_moder_tx_packets = ssk->tx_packets; - mod->last_moder_bytes = ssk->rx_bytes; - mod->last_moder_jiffies = jiffies; + SDP_WUNLOCK(ssk); } -void sdp_reset_sk(struct sock *sk, int rc) +/* + * 2msl has expired, schedule the shutdown task. + */ +static void +sdp_2msl_timeout(void *data) { - struct sdp_sock *ssk = sdp_sk(sk); + struct sdp_sock *ssk; - sdp_dbg(sk, "%s\n", __func__); - - if (ssk->tx_ring.cq) - if (sdp_xmit_poll(ssk, 1)) - sdp_post_sends(ssk, 0); - - sdp_abort_srcavail(sk); - - if (!(sk->sk_shutdown & RCV_SHUTDOWN) || !sk_stream_memory_free(sk)) { - sdp_dbg(sk, "setting state to error\n"); - sdp_set_error(sk, rc); - } - - sk->sk_state_change(sk); - - /* Don't destroy socket before destroy work does its job */ - sock_hold(sk, SOCK_REF_RESET); - queue_work(sdp_wq, &ssk->destroy_work); + ssk = data; + /* Callout canceled. */ + if (!callout_active(&ssk->keep2msl)) + goto out; + callout_deactivate(&ssk->keep2msl); + /* Should be impossible, defensive programming. */ + if ((ssk->flags & SDP_TIMEWAIT) == 0) + goto out; + taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task); +out: + SDP_WUNLOCK(ssk); + return; } -/* Like tcp_reset */ -/* When we get a reset (completion with error) we do this. */ -void sdp_reset(struct sock *sk) +/* + * Schedule the 2msl wait timer. + */ +static void +sdp_2msl_wait(struct sdp_sock *ssk) { - int err; - sdp_dbg(sk, "%s state=%s\n", __func__, sdp_state_str(sk->sk_state)); - - if (sk->sk_state != TCP_ESTABLISHED) - return; - - /* We want the right error as BSD sees it (and indeed as we do). 
*/ - - /* On fin we currently only set RCV_SHUTDOWN, so .. */ - err = (sk->sk_shutdown & RCV_SHUTDOWN) ? EPIPE : ECONNRESET; - - sdp_set_error(sk, -err); - sk->sk_state_change(sk); + SDP_WLOCK_ASSERT(ssk); + ssk->flags |= SDP_TIMEWAIT; + ssk->state = TCPS_TIME_WAIT; + soisdisconnected(ssk->socket); + callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk); } -/* TODO: linger? */ -static void sdp_destroy_resources(struct sock *sk) +/* + * Timed out waiting for the final fin/ack from rdma_disconnect(). + */ +static void +sdp_dreq_timeout(void *data) { - struct sdp_sock *ssk = sdp_sk(sk); - struct rdma_cm_id *id = NULL; - sdp_dbg(sk, "%s\n", __func__); + struct sdp_sock *ssk; - lock_sock(sk); - - sk->sk_send_head = NULL; - skb_queue_purge(&sk->sk_write_queue); - /* - * If sendmsg cached page exists, toss it. - */ - if (sk->sk_sndmsg_page) { - __free_page(sk->sk_sndmsg_page); - sk->sk_sndmsg_page = NULL; - atomic_dec(&sdp_current_mem_usage); - } - - id = ssk->id; - if (ssk->id) { - id->qp = NULL; - ssk->id = NULL; - release_sock(sk); - rdma_destroy_id(id); - lock_sock(sk); - } - - sdp_destroy_qp(ssk); - - /* QP is destroyed, so no one will queue skbs anymore. */ - if (ssk->rx_sa) - sdp_abort_rx_srcavail(sk); - - skb_queue_purge(&sk->sk_receive_queue); - skb_queue_purge(&ssk->rx_ctl_q); - - sdp_dbg(sk, "%s done; releasing sock\n", __func__); - release_sock(sk); + ssk = data; + /* Callout canceled. */ + if (!callout_active(&ssk->keep2msl)) + goto out; + /* Callout rescheduled, probably as a different timer. */ + if (callout_pending(&ssk->keep2msl)) + goto out; + callout_deactivate(&ssk->keep2msl); + if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK) + goto out; + if ((ssk->flags & SDP_DREQWAIT) == 0) + goto out; + ssk->flags &= ~SDP_DREQWAIT; + ssk->flags |= SDP_DISCON; + sdp_2msl_wait(ssk); + ssk->qp_active = 0; +out: + SDP_WUNLOCK(ssk); } -static inline void sdp_kill_id_and_release(struct sdp_sock *ssk) +/* + * Received the final fin/ack. Cancel the 2msl. 
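+ * The DREQWAIT flag is cleared and the keep2msl callout is rearmed as
+ * the ordinary TIME_WAIT timer by sdp_2msl_wait().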
+ */ +void +sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk) { - struct sock *sk = sk_ssk(ssk); - struct rdma_cm_id *id; - - lock_sock(sk); - id = ssk->id; - ssk->id = NULL; - release_sock(sk); - - if (id) - rdma_destroy_id(id); - sdp_common_release(sk); + sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n"); + ssk->flags &= ~SDP_DREQWAIT; + sdp_2msl_wait(ssk); } -static void sdp_destruct(struct sock *sk) +static int +sdp_init_sock(struct socket *sk) { struct sdp_sock *ssk = sdp_sk(sk); - struct sdp_sock *s, *t; sdp_dbg(sk, "%s\n", __func__); - if (ssk->destructed_already) { - sdp_warn(sk, "redestructing sk!\n"); - return; - } - sdp_add_to_history(sk, __func__); - percpu_counter_dec(sk->sk_prot->orphan_count); - percpu_counter_dec(sk->sk_prot->sockets_allocated); - ssk->destructed_already = 1; - - down_read(&device_removal_lock); - sdp_remove_sock(ssk); - sdp_destroy_resources(sk); - up_read(&device_removal_lock); - -#ifdef SDP_SOCK_HISTORY - sdp_add_to_history(sk, __func__); - sdp_ssk_hist_close(sk); + callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED); + TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk); +#ifdef SDP_ZCOPY + INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout); + ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */ + ssk->tx_ring.rdma_inflight = NULL; #endif + atomic_set(&ssk->mseq_ack, 0); + sdp_rx_ring_init(ssk); + ssk->tx_ring.buffer = NULL; - flush_workqueue(rx_comp_wq); - /* Consider use cancel_work_sync(&ssk->rx_comp_work) */ - - if (ssk->parent) - goto done; - - list_for_each_entry_safe(s, t, &ssk->backlog_queue, backlog_queue) { - sdp_kill_id_and_release(s); - } - list_for_each_entry_safe(s, t, &ssk->accept_queue, accept_queue) { - sdp_kill_id_and_release(s); - } - -done: - sdp_dbg(sk, "%s done\n", __func__); + return 0; } -static inline void sdp_start_dreq_wait_timeout(struct sdp_sock *ssk, int timeo) +/* + * Allocate an sdp_sock for the socket and reserve socket buffer space. + */ +static int +sdp_attach(struct socket *so, int proto, struct thread *td) { - sdp_dbg(sk_ssk(ssk), "Starting dreq wait timeout\n"); + struct sdp_sock *ssk; + int error; - queue_delayed_work(sdp_wq, &ssk->dreq_wait_work, timeo); - ssk->dreq_wait_timeout = 1; -} + ssk = sdp_sk(so); + KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so)); + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + error = soreserve(so, sdp_sendspace, sdp_recvspace); + if (error) + return (error); + } + so->so_rcv.sb_flags |= SB_AUTOSIZE; + so->so_snd.sb_flags |= SB_AUTOSIZE; + ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO); + if (ssk == NULL) + return (ENOBUFS); + rw_init(&ssk->lock, "sdpsock"); + ssk->socket = so; + ssk->cred = crhold(so->so_cred); + so->so_pcb = (caddr_t)ssk; + sdp_init_sock(so); + ssk->flags = 0; + ssk->qp_active = 0; + ssk->state = TCPS_CLOSED; + SDP_LIST_WLOCK(); + LIST_INSERT_HEAD(&sdp_list, ssk, list); + sdp_count++; + SDP_LIST_WUNLOCK(); + if ((so->so_options & SO_LINGER) && so->so_linger == 0) + so->so_linger = TCP_LINGERTIME; -static void sdp_send_disconnect(struct sock *sk) -{ - sock_hold(sk, SOCK_REF_DREQ_TO); - sdp_start_dreq_wait_timeout(sdp_sk(sk), SDP_FIN_WAIT_TIMEOUT); - - sdp_sk(sk)->sdp_disconnect = 1; - sdp_post_sends(sdp_sk(sk), 0); - - sdp_arm_rx_cq(sk); + return (0); } /* - * State processing on a close. - * TCP_ESTABLISHED -> TCP_FIN_WAIT1 -> TCP_CLOSE + * Detach SDP from the socket, potentially leaving it around for the + * timewait to expire. 
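+ * If no timewait or DREQ wait is pending, the dropped pcb is freed
+ * here; otherwise sdp_shutdown_task() frees it once the timers expire.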
*/ -static int sdp_close_state(struct sock *sk) +static void +sdp_detach(struct socket *so) { - switch (sk->sk_state) { - case TCP_ESTABLISHED: - sdp_exch_state(sk, TCPF_ESTABLISHED, TCP_FIN_WAIT1); - break; - case TCP_CLOSE_WAIT: - sdp_exch_state(sk, TCPF_CLOSE_WAIT, TCP_LAST_ACK); - break; - default: - return 0; - } + struct sdp_sock *ssk; - return 1; + ssk = sdp_sk(so); + SDP_WLOCK(ssk); + KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL")); + ssk->socket->so_pcb = NULL; + ssk->socket = NULL; + if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT)) + SDP_WUNLOCK(ssk); + else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT) + sdp_pcbfree(ssk); + else + panic("sdp_detach: Unexpected state, ssk %p.\n", ssk); } /* - * In order to prevent asynchronous-events handling after the last reference - * count removed, we destroy rdma_id so cma_handler() won't be invoked. - * This function should be called under lock_sock(sk). + * Allocate a local address for the socket. */ -static inline void disable_cma_handler(struct sock *sk) +static int +sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { - if (sdp_sk(sk)->id) { - struct rdma_cm_id *id = sdp_sk(sk)->id; - sdp_sk(sk)->id = NULL; - release_sock(sk); - rdma_destroy_id(id); - lock_sock(sk); - } -} + int error = 0; + struct sdp_sock *ssk; + struct sockaddr_in *sin; -static void sdp_cma_timewait_timeout_work(struct work_struct *work) -{ - struct sdp_sock *ssk = - container_of(work, struct sdp_sock, cma_timewait_work.work); - struct sock *sk = sk_ssk(ssk); + sin = (struct sockaddr_in *)nam; + if (nam->sa_len != sizeof (*sin)) + return (EINVAL); + if (sin->sin_family != AF_INET) + return (EINVAL); + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + return (EAFNOSUPPORT); - lock_sock(sk); - if (!ssk->cma_timewait_timeout) { - release_sock(sk); - return; + ssk = sdp_sk(so); + SDP_WLOCK(ssk); + if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { + error = EINVAL; + goto out; } + error = sdp_pcbbind(ssk, nam, td->td_ucred); +out: + SDP_WUNLOCK(ssk); - ssk->cma_timewait_timeout = 0; - release_sock(sk); - sock_put(sk, SOCK_REF_CMA); + return (error); } -static int sdp_cancel_cma_timewait_timeout(struct sdp_sock *ssk) +/* + * Prepare to accept connections. + */ +static int +sdp_listen(struct socket *so, int backlog, struct thread *td) { - if (!ssk->cma_timewait_timeout) - return 0; + int error = 0; + struct sdp_sock *ssk; - ssk->cma_timewait_timeout = 0; - return cancel_delayed_work(&ssk->cma_timewait_work); - /* No need to use the sync'ed function because the socket's refcnt is - * pre-taken and multiple invocations of sock_put() are self sync'ed - * (atomic operation). 
- */ -} - -void sdp_start_cma_timewait_timeout(struct sdp_sock *ssk, int timeo) -{ - queue_delayed_work(sdp_wq, &ssk->cma_timewait_work, timeo); - ssk->cma_timewait_timeout = 1; -} - -/* Like tcp_close */ -static void sdp_close(struct sock *sk, long timeout) -{ - struct sk_buff *skb; - int data_was_unread = 0; - - sdp_add_to_history(sk, __func__); - lock_sock(sk); - - sdp_dbg(sk, "%s\n", __func__); - sdp_prf(sk, NULL, __func__); - - sdp_sk(sk)->cpu = smp_processor_id(); - sdp_delete_keepalive_timer(sk); - - sk->sk_shutdown = SHUTDOWN_MASK; - - if ((1 << sk->sk_state) & (TCPF_TIME_WAIT | TCPF_CLOSE)) { - /* this could happen if socket was closed by a CM teardown - and after that the user called close() */ - disable_cma_handler(sk); + ssk = sdp_sk(so); + SDP_WLOCK(ssk); + if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { + error = EINVAL; goto out; } - - if (sk->sk_state == TCP_LISTEN || sk->sk_state == TCP_SYN_SENT) { - sdp_exch_state(sk, TCPF_LISTEN | TCPF_SYN_SENT, TCP_CLOSE); - disable_cma_handler(sk); - - /* Special case: stop listening. - This is done by sdp_destruct. */ - goto out; + if (error == 0 && ssk->lport == 0) + error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); + SOCK_LOCK(so); + if (error == 0) + error = solisten_proto_check(so); + if (error == 0) { + solisten_proto(so, backlog); + ssk->state = TCPS_LISTEN; } + SOCK_UNLOCK(so); - sock_hold(sk, SOCK_REF_CMA); - sdp_start_cma_timewait_timeout(sdp_sk(sk), SDP_CMA_TIMEWAIT_TIMEOUT); - - /* We need to flush the recv. buffs. We do this only on the - * descriptor close, not protocol-sourced closes, because the - * reader process may not have drained the data yet! - */ - while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { - struct sdp_bsdh *h = (struct sdp_bsdh *)skb_transport_header(skb); - if (h->mid == SDP_MID_DISCONN) { - sdp_handle_disconn(sk); - } else { - if (h->mid == SDP_MID_SRCAVAIL && sdp_sk(sk)->rx_sa) { - sdp_abort_rx_srcavail(sk); - sdp_post_sendsm(sk); - } - - sdp_dbg(sk, "Data was unread. skb: %p\n", skb); - data_was_unread = 1; - } - sdp_free_skb(skb); - } - - sk_mem_reclaim(sk); - - /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section - * 3.10, we send a RST here because data was lost. To - * witness the awful effects of the old behavior of always - * doing a FIN, run an older 2.1.x kernel or 2.0.x, start - * a bulk GET in an FTP client, suspend the process, wait - * for the client to advertise a zero window, then kill -9 - * the FTP client, wheee... Note: timeout is always zero - * in such a case. - */ - if (data_was_unread || - (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) { - /* Unread data was tossed, zap the connection. */ - NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); - sdp_exch_state(sk, TCPF_CLOSE_WAIT | TCPF_ESTABLISHED, - TCP_TIME_WAIT); - - /* Go into abortive close */ - sk->sk_prot->disconnect(sk, 0); - } else if (sdp_close_state(sk)) { - /* We FIN if the application ate all the data before - * zapping the connection. - */ - - sdp_send_disconnect(sk); - } - - /* TODO: state should move to CLOSE or CLOSE_WAIT etc on disconnect. - Since it currently doesn't, do it here to avoid blocking below. 
*/ - if (!sdp_sk(sk)->id) - sdp_exch_state(sk, TCPF_FIN_WAIT1 | TCPF_LAST_ACK | - TCPF_CLOSE_WAIT, TCP_CLOSE); - - sk_stream_wait_close(sk, timeout); out: - release_sock(sk); - - sdp_common_release(sk); + SDP_WUNLOCK(ssk); + if (error == 0) + error = -rdma_listen(ssk->id, backlog); + return (error); } -static int sdp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +/* + * Initiate a SDP connection to nam. + */ +static int +sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td) { - struct sdp_sock *ssk = sdp_sk(sk); - struct sockaddr_in src_addr = { - .sin_family = AF_INET, - .sin_port = htons(inet_sk(sk)->sport), - .sin_addr.s_addr = inet_sk(sk)->saddr, - }; - int rc; + struct sockaddr_in src; + struct socket *so; + int error; - sdp_add_to_history(sk, __func__); - ssk->cpu = smp_processor_id(); - release_sock(sk); - flush_workqueue(sdp_wq); - lock_sock(sk); - if (sk->sk_err) { - sdp_warn(sk, "Can't connect, socket marked with error: %d\n", - sk->sk_err); - return -sk->sk_err; - } + so = ssk->socket; - if (addr_len < sizeof(struct sockaddr_in)) - return -EINVAL; - - if (uaddr->sa_family == AF_INET_SDP) - uaddr->sa_family = AF_INET; - else if (uaddr->sa_family != AF_INET) - return -EAFNOSUPPORT; - - if (!ssk->id) { - rc = sdp_get_port(sk, 0); - if (rc) - return rc; - inet_sk(sk)->sport = htons(inet_sk(sk)->num); + SDP_WLOCK_ASSERT(ssk); + if (ssk->lport == 0) { + error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); + if (error) + return error; } + src.sin_family = AF_INET; + src.sin_len = sizeof(src); + bzero(&src.sin_zero, sizeof(src.sin_zero)); + src.sin_port = ssk->lport; + src.sin_addr.s_addr = ssk->laddr; + soisconnecting(so); + SDP_WUNLOCK(ssk); + error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam, + SDP_RESOLVE_TIMEOUT); + SDP_WLOCK(ssk); + if (error == 0) + ssk->state = TCPS_SYN_SENT; - sdp_dbg(sk, "%s %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n", __func__, - NIPQUAD(src_addr.sin_addr.s_addr), - ntohs(src_addr.sin_port), - NIPQUAD(((struct sockaddr_in *)uaddr)->sin_addr.s_addr), - ntohs(((struct sockaddr_in *)uaddr)->sin_port)); - - rc = rdma_resolve_addr(ssk->id, (struct sockaddr *)&src_addr, - uaddr, SDP_RESOLVE_TIMEOUT); - if (rc) { - sdp_dbg(sk, "rdma_resolve_addr failed: %d\n", rc); - return rc; - } - - sdp_exch_state(sk, TCPF_CLOSE, TCP_SYN_SENT); return 0; } -static int sdp_disconnect(struct sock *sk, int flags) +/* + * Initiate SDP connection. 
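+ * Address resolution is delegated to the RDMA CM via rdma_resolve_addr();
+ * the handshake then continues asynchronously in sdp_cma_handler().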
+ */ +static int +sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { - struct sdp_sock *ssk = sdp_sk(sk); - int rc = 0; - struct sdp_sock *s, *t; - struct rdma_cm_id *id; + int error = 0; + struct sdp_sock *ssk; + struct sockaddr_in *sin; - sdp_dbg(sk, "%s\n", __func__); - - ssk->cpu = smp_processor_id(); - if (sk->sk_state != TCP_LISTEN) { - if (ssk->id) { - sdp_sk(sk)->qp_active = 0; - rc = rdma_disconnect(ssk->id); - } - - return rc; - } - - sdp_exch_state(sk, TCPF_LISTEN, TCP_CLOSE); - id = ssk->id; - ssk->id = NULL; - release_sock(sk); /* release socket since locking semantics is parent - inside child */ - if (id) - rdma_destroy_id(id); - - list_for_each_entry_safe(s, t, &ssk->backlog_queue, backlog_queue) { - sdp_kill_id_and_release(s); - } - list_for_each_entry_safe(s, t, &ssk->accept_queue, accept_queue) { - sdp_kill_id_and_release(s); - } - - lock_sock(sk); - - return 0; + sin = (struct sockaddr_in *)nam; + if (nam->sa_len != sizeof (*sin)) + return (EINVAL); + if (sin->sin_family != AF_INET) + return (EINVAL); + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + return (EAFNOSUPPORT); + if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0) + return (error); + ssk = sdp_sk(so); + SDP_WLOCK(ssk); + if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) + error = EINVAL; + else + error = sdp_start_connect(ssk, nam, td); + SDP_WUNLOCK(ssk); + return (error); } -/* Like inet_csk_wait_for_connect */ -static int sdp_wait_for_connect(struct sock *sk, long timeo) +/* + * Drop a SDP socket, reporting + * the specified error. If connection is synchronized, + * then send a RST to peer. + */ +static struct sdp_sock * +sdp_drop(struct sdp_sock *ssk, int errno) { - struct sdp_sock *ssk = sdp_sk(sk); - DEFINE_WAIT(wait); - int err; + struct socket *so; - sdp_dbg(sk, "%s\n", __func__); - /* - * True wake-one mechanism for incoming connections: only - * one process gets woken up, not the 'whole herd'. - * Since we do not 'race & poll' for established sockets - * anymore, the common case will execute the loop only once. - * - * Subtle issue: "add_wait_queue_exclusive()" will be added - * after any current non-exclusive waiters, and we know that - * it will always _stay_ after any new non-exclusive waiters - * because all non-exclusive waiters are added at the - * beginning of the wait-queue. As such, it's ok to "drop" - * our exclusiveness temporarily when we get woken up without - * having to remove and re-insert us on the wait queue. - */ - for (;;) { - prepare_to_wait_exclusive(sk->sk_sleep, &wait, - TASK_INTERRUPTIBLE); - release_sock(sk); - if (list_empty(&ssk->accept_queue)) { - timeo = schedule_timeout(timeo); - } - lock_sock(sk); - err = 0; - if (!list_empty(&ssk->accept_queue)) - break; - err = -EINVAL; - if (sk->sk_state != TCP_LISTEN) - break; - err = sock_intr_errno(timeo); - if (signal_pending(current)) - break; - err = -EAGAIN; - if (!timeo) - break; - } - finish_wait(sk->sk_sleep, &wait); - sdp_dbg(sk, "%s returns %d\n", __func__, err); - return err; + SDP_WLOCK_ASSERT(ssk); + so = ssk->socket; + if (TCPS_HAVERCVDSYN(ssk->state)) + sdp_output_reset(ssk); + if (errno == ETIMEDOUT && ssk->softerror) + errno = ssk->softerror; + so->so_error = errno; + return (sdp_closed(ssk)); } -/* Consider using request_sock_queue instead of duplicating all this */ -/* Like inet_csk_accept */ -static struct sock *sdp_accept(struct sock *sk, int flags, int *err) +/* + * User issued close, and wish to trail through shutdown states: + * if never received SYN, just forget it. 
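sdp_connect() above insists on an AF_INET sockaddr of the right length, refuses multicast destinations, and then lets sdp_start_connect() do an implied bind and start asynchronous address resolution through rdma_resolve_addr(); the socket is marked TCPS_SYN_SENT and the generic socket layer performs the usual blocking wait, as with TCP. The client-side counterpart of the listener sketch, under the same AF_INET_SDP assumption:

/*
 * Sketch: SDP client.  connect(2) enters sdp_connect() above, which
 * starts rdma_resolve_addr() and returns; the wait for the handshake
 * to complete happens in the generic socket code, as with TCP.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

#ifndef AF_INET_SDP
#define AF_INET_SDP     27      /* assumed value, as in the listener sketch */
#endif

int
main(void)
{
        struct sockaddr_in sin;
        int s;

        if ((s = socket(AF_INET_SDP, SOCK_STREAM, 0)) == -1)
                err(1, "socket");
        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;       /* sdp_connect() requires AF_INET */
        sin.sin_len = sizeof(sin);
        sin.sin_port = htons(5000);
        if (inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr) != 1)
                errx(1, "inet_pton");
        if (connect(s, (struct sockaddr *)&sin, sizeof(sin)) == -1)
                err(1, "connect");      /* TCPS_SYN_SENT until CMA completes */
        close(s);
        return (0);
}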
If got a SYN from peer, + * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. + * If already got a FIN from peer, then almost done; go to LAST_ACK + * state. In all other cases, have already sent FIN to peer (e.g. + * after PRU_SHUTDOWN), and just have to play tedious game waiting + * for peer to send FIN or not respond to keep-alives, etc. + * We can let the user exit from the close as soon as the FIN is acked. + */ +static void +sdp_usrclosed(struct sdp_sock *ssk) { - struct sdp_sock *newssk = NULL, *ssk; - struct sock *newsk; - int error; - sdp_add_to_history(sk, __func__); - sdp_dbg(sk, "%s state %s expected %s *err %d\n", __func__, - sdp_state_str(sk->sk_state), "TCP_LISTEN", *err); + SDP_WLOCK_ASSERT(ssk); - ssk = sdp_sk(sk); - lock_sock(sk); - ssk->cpu = smp_processor_id(); + switch (ssk->state) { + case TCPS_LISTEN: + ssk->state = TCPS_CLOSED; + SDP_WUNLOCK(ssk); + sdp_destroy_cma(ssk); + SDP_WLOCK(ssk); + /* FALLTHROUGH */ + case TCPS_CLOSED: + ssk = sdp_closed(ssk); + /* + * sdp_closed() should never return NULL here as the socket is + * still open. + */ + KASSERT(ssk != NULL, + ("sdp_usrclosed: sdp_closed() returned NULL")); + break; - /* We need to make sure that this socket is listening, - * and that it has something pending. - */ - error = -EINVAL; - if (sk->sk_state != TCP_LISTEN) - goto out_err; + case TCPS_SYN_SENT: + /* FALLTHROUGH */ + case TCPS_SYN_RECEIVED: + ssk->flags |= SDP_NEEDFIN; + break; - /* Find already established connection */ - if (list_empty(&ssk->accept_queue)) { - long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + case TCPS_ESTABLISHED: + ssk->flags |= SDP_NEEDFIN; + ssk->state = TCPS_FIN_WAIT_1; + break; - /* If this is a non blocking socket don't sleep */ - error = -EAGAIN; - if (!timeo) - goto out_err; - - error = sdp_wait_for_connect(sk, timeo); - if (error) - goto out_err; + case TCPS_CLOSE_WAIT: + ssk->state = TCPS_LAST_ACK; + break; } - - newssk = list_entry(ssk->accept_queue.next, struct sdp_sock, - accept_queue); - list_del_init(&newssk->accept_queue); - newssk->parent = NULL; - sk_acceptq_removed(sk); - newsk = sk_ssk(newssk); -out: - release_sock(sk); - if (newsk) { - lock_sock(newsk); - if (newssk->rx_ring.cq) { - newssk->poll_cq = 1; - sdp_arm_rx_cq(sk_ssk(newssk)); - } - release_sock(newsk); + if (ssk->state >= TCPS_FIN_WAIT_2) { + /* Prevent the connection hanging in FIN_WAIT_2 forever. */ + if (ssk->state == TCPS_FIN_WAIT_2) + sdp_2msl_wait(ssk); + else + soisdisconnected(ssk->socket); } - sdp_dbg(sk, "%s: status %d sk %p newsk %p\n", __func__, - *err, sk, newsk); - return newsk; -out_err: - sdp_dbg(sk, "%s: error %d\n", __func__, error); - newsk = NULL; - *err = error; - goto out; } -/* Like tcp_ioctl */ -static int sdp_ioctl(struct sock *sk, int cmd, unsigned long arg) +static void +sdp_output_disconnect(struct sdp_sock *ssk) { - struct sdp_sock *ssk = sdp_sk(sk); - int answ; - sdp_add_to_history(sk, __func__); - sdp_dbg(sk, "%s\n", __func__); - - switch (cmd) { - case SIOCINQ: - if (sk->sk_state == TCP_LISTEN) - return -EINVAL; - - lock_sock(sk); - ssk->cpu = smp_processor_id(); - if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) - answ = 0; - else if (sock_flag(sk, SOCK_URGINLINE) || - !ssk->urg_data || - before(ssk->urg_seq, ssk->copied_seq) || - !before(ssk->urg_seq, rcv_nxt(ssk))) { - answ = rcv_nxt(ssk) - ssk->copied_seq; - - /* Subtract 1, if FIN is in queue. 
*/ - if (answ && !skb_queue_empty(&sk->sk_receive_queue)) - answ -= - (skb_transport_header(sk->sk_receive_queue.prev))[0] - == SDP_MID_DISCONN ? 1 : 0; - } else - answ = ssk->urg_seq - ssk->copied_seq; - release_sock(sk); - break; - case SIOCATMARK: - answ = ssk->urg_data && ssk->urg_seq == ssk->copied_seq; - break; - case SIOCOUTQ: - if (sk->sk_state == TCP_LISTEN) - return -EINVAL; - - if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) - answ = 0; - else - answ = ssk->write_seq - ssk->tx_ring.una_seq; - break; - default: - return -ENOIOCTLCMD; - } - /* TODO: Need to handle: - case SIOCOUTQ: - */ - return put_user(answ, (int __user *)arg); + SDP_WLOCK_ASSERT(ssk); + callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT, + sdp_dreq_timeout, ssk); + ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT; + sdp_post_sends(ssk, M_NOWAIT); } -void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk) +/* + * Initiate or continue a disconnect. + * If embryonic state, just send reset (once). + * If in ``let data drain'' option and linger null, just drop. + * Otherwise (hard), mark socket disconnecting and drop + * current input data; switch states based on user close, and + * send segment to peer (with FIN). + */ +static void +sdp_start_disconnect(struct sdp_sock *ssk) { - if (!ssk->dreq_wait_timeout) - return; + struct socket *so; + int unread; - sdp_dbg(sk_ssk(ssk), "cancelling dreq wait timeout\n"); - - ssk->dreq_wait_timeout = 0; - if (cancel_delayed_work_sync(&ssk->dreq_wait_work)) { - /* The timeout hasn't reached - need to clean ref count */ - sock_put(sk_ssk(ssk), SOCK_REF_DREQ_TO); + so = ssk->socket; + SDP_WLOCK_ASSERT(ssk); + sdp_stop_keepalive_timer(so); + /* + * Neither sdp_closed() nor sdp_drop() should return NULL, as the + * socket is still open. + */ + if (ssk->state < TCPS_ESTABLISHED) { + ssk = sdp_closed(ssk); + KASSERT(ssk != NULL, + ("sdp_start_disconnect: sdp_close() returned NULL")); + } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { + ssk = sdp_drop(ssk, 0); + KASSERT(ssk != NULL, + ("sdp_start_disconnect: sdp_drop() returned NULL")); + } else { + soisdisconnecting(so); + unread = so->so_rcv.sb_cc; + sbflush(&so->so_rcv); + sdp_usrclosed(ssk); + if (!(ssk->flags & SDP_DROPPED)) { + if (unread) + sdp_output_reset(ssk); + else + sdp_output_disconnect(ssk); + } } } -static void sdp_destroy_work(struct work_struct *work) +/* + * User initiated disconnect. 
+ */
+static int
+sdp_disconnect(struct socket *so)
 {
-        struct sdp_sock *ssk = container_of(work, struct sdp_sock,
-                destroy_work);
-        struct sock *sk = sk_ssk(ssk);
-        sdp_dbg(sk, "%s: refcnt %d\n", __func__, atomic_read(&sk->sk_refcnt));
+        struct sdp_sock *ssk;
+        int error = 0;
 
-        lock_sock(sk);
-        sdp_destroy_qp(ssk);
-        release_sock(sk);
-
-        /* Can be sure that rx_comp_work won't be queued from here cause
-         * ssk->rx_ring.cq is NULL from here
-         */
-        cancel_work_sync(&ssk->rx_comp_work);
-
-        lock_sock(sk);
-        memset((void *)&ssk->id, 0, sizeof(*ssk) - offsetof(typeof(*ssk), id));
-        release_sock(sk);
-
-        sdp_cancel_dreq_wait_timeout(ssk);
-
-        lock_sock(sk);
-        if (sk->sk_state == TCP_TIME_WAIT) {
-                if (sdp_cancel_cma_timewait_timeout(ssk))
-                        sock_put(sk, SOCK_REF_CMA);
+        ssk = sdp_sk(so);
+        SDP_WLOCK(ssk);
+        if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+                error = ECONNRESET;
+                goto out;
         }
-
-        /* In normal close current state is TCP_TIME_WAIT or TCP_CLOSE
-           but if a CM connection is dropped below our legs state could
-           be any state */
-        sdp_exch_state(sk, ~0, TCP_CLOSE);
-        release_sock(sk);
-
-        sock_put(sk, SOCK_REF_RESET);
+        sdp_start_disconnect(ssk);
+out:
+        SDP_WUNLOCK(ssk);
+        return (error);
 }
 
-static void sdp_dreq_wait_timeout_work(struct work_struct *work)
+/*
+ * Accept a connection.  Essentially all the work is done at higher levels;
+ * just return the address of the peer, storing through nam.
+ *
+ * XXX This is broken XXX
+ *
+ * The rationale for acquiring the sdp lock here is somewhat complicated,
+ * and is described in detail in the commit log entry for r175612.  Acquiring
+ * it delays an accept(2) racing with sonewconn(), which inserts the socket
+ * before the address/port fields are initialized.  A better fix would
+ * prevent the socket from being placed in the listen queue until all fields
+ * are fully initialized.
+ */
+static int
+sdp_accept(struct socket *so, struct sockaddr **nam)
 {
-        struct sdp_sock *ssk =
-                container_of(work, struct sdp_sock, dreq_wait_work.work);
-        struct sock *sk = sk_ssk(ssk);
+        struct sdp_sock *ssk = NULL;
+        struct in_addr addr;
+        in_port_t port;
+        int error;
 
-        if (!ssk->dreq_wait_timeout)
-                goto out;
+        if (so->so_state & SS_ISDISCONNECTED)
+                return (ECONNABORTED);
 
-        lock_sock(sk);
-
-        if (!ssk->dreq_wait_timeout ||
-            !((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_LAST_ACK))) {
-                release_sock(sk);
+        port = 0;
+        addr.s_addr = 0;
+        error = 0;
+        ssk = sdp_sk(so);
+        SDP_WLOCK(ssk);
+        if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+                error = ECONNABORTED;
                 goto out;
         }
-
-        sdp_dbg(sk, "timed out waiting for FIN/DREQ. "
-                "going into abortive close.\n");
-
-        ssk->dreq_wait_timeout = 0;
-        sdp_exch_state(sk, TCPF_LAST_ACK | TCPF_FIN_WAIT1, TCP_TIME_WAIT);
-
-        if (ssk->id) {
-                sdp_dbg(sk, "Destroyed QP\n");
-                ssk->qp_active = 0;
-                rdma_disconnect(ssk->id);
-                release_sock(sk);
-        } else {
-                release_sock(sk);
-                sock_put(sk, SOCK_REF_CMA);
-        }
-
+        port = ssk->fport;
+        addr.s_addr = ssk->faddr;
 out:
-        sock_put(sk, SOCK_REF_DREQ_TO);
+        SDP_WUNLOCK(ssk);
+        if (error == 0)
+                *nam = sdp_sockaddr(port, &addr);
+        return (error);
 }
 
 /*
- * Only SDP interact with this receive queue. Don't want
- * lockdep warnings that using spinlock irqsave
+ * Mark the connection as being incapable of further output.
 */
-static struct lock_class_key ib_sdp_sk_receive_queue_lock_key;
-
-static struct lock_class_key ib_sdp_sk_callback_lock_key;
-
-static void sdp_destroy_work(struct work_struct *work);
-static void sdp_dreq_wait_timeout_work(struct work_struct *work);
-static void sdp_cma_timewait_timeout_work(struct work_struct *work);
-
-atomic_t socket_idx = ATOMIC_INIT(0);
-
-int sdp_init_sock(struct sock *sk)
+static int
+sdp_shutdown(struct socket *so)
 {
-        struct sdp_sock *ssk = sdp_sk(sk);
+        int error = 0;
+        struct sdp_sock *ssk;
 
-        sdp_dbg(sk, "%s\n", __func__);
+        ssk = sdp_sk(so);
+        SDP_WLOCK(ssk);
+        if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+                error = ECONNRESET;
+                goto out;
+        }
+        socantsendmore(so);
+        sdp_usrclosed(ssk);
+        if (!(ssk->flags & SDP_DROPPED))
+                sdp_output_disconnect(ssk);
 
-        ssk->sk_id = atomic_inc_return(&socket_idx);
+out:
+        SDP_WUNLOCK(ssk);
 
-        INIT_LIST_HEAD(&ssk->accept_queue);
-        INIT_LIST_HEAD(&ssk->backlog_queue);
-        INIT_DELAYED_WORK(&ssk->dreq_wait_work, sdp_dreq_wait_timeout_work);
-        INIT_DELAYED_WORK(&ssk->cma_timewait_work, sdp_cma_timewait_timeout_work);
-        INIT_WORK(&ssk->destroy_work, sdp_destroy_work);
-
-        lockdep_set_class(&sk->sk_receive_queue.lock,
-                &ib_sdp_sk_receive_queue_lock_key);
-
-        lockdep_set_class(&sk->sk_callback_lock,
-                &ib_sdp_sk_callback_lock_key);
-
-        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_NO_CSUM;
-
-        skb_queue_head_init(&ssk->rx_ctl_q);
-
-        atomic_set(&ssk->mseq_ack, 0);
-
-        ssk->rx_ring.buffer = NULL;
-        ssk->tx_ring.buffer = NULL;
-        ssk->sdp_disconnect = 0;
-        ssk->destructed_already = 0;
-        ssk->id_destroyed_already = 0;
-        spin_lock_init(&ssk->lock);
-        spin_lock_init(&ssk->tx_sa_lock);
-        ssk->tx_compl_pending = 0;
-
-        atomic_set(&ssk->somebody_is_doing_posts, 0);
-        ssk->cpu = smp_processor_id();
-        ssk->tx_ring.rdma_inflight = NULL;
-
-        init_timer(&ssk->rx_ring.cq_arm_timer);
-        init_timer(&ssk->tx_ring.timer);
-        init_timer(&ssk->nagle_timer);
-        init_timer(&sk->sk_timer);
-        setup_timer(&sk->sk_timer, sdp_keepalive_timer, (unsigned long)sk);
-        ssk->sa_cancel_arrived = 0;
-        ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
-        ssk->last_bind_err = 0;
-
-#ifdef SDP_SOCK_HISTORY
-        memset(ssk->hst, 0, sizeof ssk->hst);
-        ssk->hst_idx = 0;
-        spin_lock_init(&ssk->hst_lock);
-        sdp_ssk_hist_open(sk);
-#endif
-
-        return 0;
+        return (error);
 }
 
-static void sdp_shutdown(struct sock *sk, int how)
+static void
+sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
 {
-        struct sdp_sock *ssk = sdp_sk(sk);
+        struct mbuf *n;
+        int ncnt;
 
-        sdp_add_to_history(sk, __func__);
-        sdp_dbg(sk, "%s\n", __func__);
-        if (!(how & SEND_SHUTDOWN))
+        SOCKBUF_LOCK_ASSERT(sb);
+        SBLASTRECORDCHK(sb);
+        KASSERT(mb->m_flags & M_PKTHDR,
+            ("sdp_append: %p Missing packet header.\n", mb));
+        n = sb->sb_lastrecord;
+        /*
+         * If the queue is empty just set all pointers and proceed.
+         */
+        if (n == NULL) {
+                sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
+                for (; mb; mb = mb->m_next) {
+                        sb->sb_mbtail = mb;
+                        sballoc(sb, mb);
+                }
                 return;
-
-        /* If we've already sent a FIN, or it's a closed state, skip this. */
-        if (!((1 << sk->sk_state) &
-                (TCPF_ESTABLISHED | TCPF_SYN_SENT |
-                 TCPF_SYN_RECV | TCPF_CLOSE_WAIT))) {
+        }
+        /*
+         * Count the number of mbufs in the current tail.
+         */
+        for (ncnt = 0; n->m_next; n = n->m_next)
+                ncnt++;
+        n = sb->sb_lastrecord;
+        /*
+         * If the two chains can fit in a single sdp packet and
+         * the last record has not been sent yet (WRITABLE) coalesce
+         * them.
The lastrecord remains the same but we must strip the + * packet header and then let sbcompress do the hard part. + */ + if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES && + n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE < + ssk->xmit_size_goal) { + m_adj(mb, SDP_HEAD_SIZE); + n->m_pkthdr.len += mb->m_pkthdr.len; + n->m_flags |= mb->m_flags & (M_PUSH | M_URG); + m_demote(mb, 1); + sbcompress(sb, mb, sb->sb_mbtail); return; } - - if (!sdp_close_state(sk)) - return; - /* - * Just turn off CORK here. - * We could check for socket shutting down in main data path, - * but this costs no extra cycles there. + * Not compressible, just append to the end and adjust counters. */ - ssk->nonagle &= ~TCP_NAGLE_CORK; - if (ssk->nonagle & TCP_NAGLE_OFF) - ssk->nonagle |= TCP_NAGLE_PUSH; - - sdp_send_disconnect(sk); -} - -static void sdp_mark_push(struct sdp_sock *ssk, struct sk_buff *skb) -{ - SDP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; - sdp_do_posts(ssk); -} - -static inline void sdp_push_pending_frames(struct sock *sk) -{ - struct sk_buff *skb = sk->sk_send_head; - if (skb) { - sdp_mark_push(sdp_sk(sk), skb); + sb->sb_lastrecord->m_flags |= M_PUSH; + sb->sb_lastrecord->m_nextpkt = mb; + sb->sb_lastrecord = mb; + if (sb->sb_sndptr == NULL) + sb->sb_sndptr = mb; + for (; mb; mb = mb->m_next) { + sb->sb_mbtail = mb; + sballoc(sb, mb); } } -/* SOL_SOCKET level options are handled by sock_setsockopt */ -static int sdp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) +/* + * Do a send by putting data in output queue and updating urgent + * marker if URG set. Possibly send more data. Unlike the other + * pru_*() routines, the mbuf chains are our responsibility. We + * must either enqueue them or free them. The other pru_* routines + * generally are caller-frees. + * + * This comes from sendfile, normal sends will come from sdp_sosend(). + */ +static int +sdp_send(struct socket *so, int flags, struct mbuf *m, + struct sockaddr *nam, struct mbuf *control, struct thread *td) { - struct sdp_sock *ssk = sdp_sk(sk); - int val; - int err = 0; + struct sdp_sock *ssk; + struct mbuf *n; + int error; + int cnt; - sdp_add_to_history(sk, __func__); - sdp_dbg(sk, "%s\n", __func__); - if (optlen < sizeof(int)) - return -EINVAL; - - if (get_user(val, (int __user *)optval)) - return -EFAULT; - - lock_sock(sk); - ssk->cpu = smp_processor_id(); - - /* SOCK_KEEPALIVE is really a SOL_SOCKET level option but there - * is a problem handling it at that level. In order to start - * the keepalive timer on an SDP socket, we must call an SDP - * specific routine. Since sock_setsockopt() can not be modifed - * to understand SDP, the application must pass that option - * through to us. 
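The coalescing rule sdp_append() applies above can be restated on its own: fold the new chain into the last record only if that record is still writable, the combined mbuf count fits one send scatter list, and the merged payload, minus the second SDP header, still fits one packet of xmit_size_goal bytes. A standalone sketch of just the test; the names and the SDP_HEAD_SIZE value are assumptions for illustration:

#include <stdbool.h>
#include <stddef.h>

#define SDP_HEAD_SIZE   16      /* assumed size of the BSDH header */

/*
 * Toy restatement of sdp_append()'s coalescing test, not kernel code:
 * coalesce when the tail record is still writable, the combined mbuf
 * count fits the send scatter list, and the merged payload (minus the
 * second header) still fits one SDP packet.
 */
static bool
can_coalesce(bool tail_writable, int tail_mbufs, int new_mbufs,
    size_t tail_len, size_t new_len, size_t xmit_size_goal, int max_sges)
{
        if (!tail_writable)
                return (false);
        if (tail_mbufs + new_mbufs >= max_sges)
                return (false);
        return (tail_len + new_len - SDP_HEAD_SIZE < xmit_size_goal);
}

When the test fails, sdp_append() instead links the chain in as a new record and flags the previous one M_PUSH, preserving the one-record-per-SDP-packet invariant.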
Since SO_KEEPALIVE and TCP_DEFER_ACCEPT both - * use the same optname, the level must not be SOL_TCP or SOL_SOCKET - */ - if (level == PF_INET_SDP && optname == SO_KEEPALIVE) { - sdp_set_keepalive(sk, val); - if (val) - sock_set_flag(sk, SOCK_KEEPOPEN); - else - sock_reset_flag(sk, SOCK_KEEPOPEN); - goto out; + error = 0; + ssk = sdp_sk(so); + KASSERT(m->m_flags & M_PKTHDR, + ("sdp_send: %p no packet header", m)); + M_PREPEND(m, SDP_HEAD_SIZE, M_WAIT); + mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA; + for (n = m, cnt = 0; n->m_next; n = n->m_next) + cnt++; + if (cnt > SDP_MAX_SEND_SGES) { + n = m_collapse(m, M_WAIT, SDP_MAX_SEND_SGES); + if (n == NULL) { + m_freem(m); + return (EMSGSIZE); + } + m = n; + for (cnt = 0; n->m_next; n = n->m_next) + cnt++; } - - if (level != SOL_TCP) { - err = -ENOPROTOOPT; + SDP_WLOCK(ssk); + if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { + if (control) + m_freem(control); + if (m) + m_freem(m); + error = ECONNRESET; goto out; } - - switch (optname) { - case TCP_NODELAY: - if (val) { - /* TCP_NODELAY is weaker than TCP_CORK, so that - * this option on corked socket is remembered, but - * it is not activated until cork is cleared. - * - * However, when TCP_NODELAY is set we make - * an explicit push, which overrides even TCP_CORK - * for currently queued segments. + if (control) { + /* SDP doesn't support control messages. */ + if (control->m_len) { + m_freem(control); + if (m) + m_freem(m); + error = EINVAL; + goto out; + } + m_freem(control); /* empty control, just free it */ + } + if (!(flags & PRUS_OOB)) { + SOCKBUF_LOCK(&so->so_snd); + sdp_append(ssk, &so->so_snd, m, cnt); + SOCKBUF_UNLOCK(&so->so_snd); + if (nam && ssk->state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected. */ - ssk->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; - sdp_push_pending_frames(sk); - } else { - ssk->nonagle &= ~TCP_NAGLE_OFF; + error = sdp_start_connect(ssk, nam, td); + if (error) + goto out; } - break; - case TCP_CORK: - /* When set indicates to always queue non-full frames. - * Later the user clears this option and we transmit - * any pending partial frames in the queue. This is - * meant to be used alongside sendfile() to get properly - * filled frames when the user (for example) must write - * out headers with a write() call first and then use - * sendfile to send out the data parts. - * - * TCP_CORK can be set together with TCP_NODELAY and it is - * stronger than TCP_NODELAY. + if (flags & PRUS_EOF) { + /* + * Close the send side of the connection after + * the data is sent. + */ + socantsendmore(so); + sdp_usrclosed(ssk); + if (!(ssk->flags & SDP_DROPPED)) + sdp_output_disconnect(ssk); + } else if (!(ssk->flags & SDP_DROPPED) && + !(flags & PRUS_MORETOCOME)) + sdp_post_sends(ssk, M_NOWAIT); + SDP_WUNLOCK(ssk); + return (0); + } else { + SOCKBUF_LOCK(&so->so_snd); + if (sbspace(&so->so_snd) < -512) { + SOCKBUF_UNLOCK(&so->so_snd); + m_freem(m); + error = ENOBUFS; + goto out; + } + /* + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section. + * Otherwise, snd_up should be one lower. 
*/ - if (val) { - ssk->nonagle |= TCP_NAGLE_CORK; - } else { - ssk->nonagle &= ~TCP_NAGLE_CORK; - if (ssk->nonagle&TCP_NAGLE_OFF) - ssk->nonagle |= TCP_NAGLE_PUSH; - sdp_push_pending_frames(sk); + m->m_flags |= M_URG | M_PUSH; + sdp_append(ssk, &so->so_snd, m, cnt); + SOCKBUF_UNLOCK(&so->so_snd); + if (nam && ssk->state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected. + */ + error = sdp_start_connect(ssk, nam, td); + if (error) + goto out; } - break; - case TCP_KEEPIDLE: - if (val < 1 || val > MAX_TCP_KEEPIDLE) - err = -EINVAL; - else { - ssk->keepalive_time = val * HZ; - - if (sock_flag(sk, SOCK_KEEPOPEN) && - !((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN))) { - sdp_reset_keepalive_timer(sk, - ssk->keepalive_time); - } - } - break; - case SDP_ZCOPY_THRESH: - if (val != 0 && (val < SDP_MIN_ZCOPY_THRESH || - val > SDP_MAX_ZCOPY_THRESH)) - err = -EINVAL; - else - ssk->zcopy_thresh = val; - break; - default: - err = -ENOPROTOOPT; - break; + sdp_post_sends(ssk, M_NOWAIT); + SDP_WUNLOCK(ssk); + return (0); } - out: - release_sock(sk); - return err; + SDP_WUNLOCK(ssk); + return (error); } -/* SOL_SOCKET level options are handled by sock_getsockopt */ -static int sdp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *option) +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) + +/* + * Send on a socket. If send must go all at once and message is larger than + * send buffering, then hard error. Lock against other senders. If must go + * all at once and not enough room now, then inform user that this would + * block and do nothing. Otherwise, if nonblocking, send as much as + * possible. The data to be sent is described by "uio" if nonzero, otherwise + * by the mbuf chain "top" (which must be null if uio is not). Data provided + * in mbuf chain must be small enough to send all at once. + * + * Returns nonzero on error, timeout or signal; callers must check for short + * counts if EINTR/ERESTART are returned. Data and control buffers are freed + * on return. + */ +static int +sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, + struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { - /* TODO */ - struct sdp_sock *ssk = sdp_sk(sk); - int val, len; + struct sdp_sock *ssk; + long space, resid; + int atomic; + int error; + int copy; - sdp_add_to_history(sk, __func__); - sdp_dbg(sk, "%s\n", __func__); - - if (level != SOL_TCP) - return -EOPNOTSUPP; - - if (get_user(len, option)) - return -EFAULT; - - len = min_t(unsigned int, len, sizeof(int)); - - if (len < 0) - return -EINVAL; - - switch (optname) { - case TCP_NODELAY: - val = !!(ssk->nonagle&TCP_NAGLE_OFF); - break; - case TCP_CORK: - val = !!(ssk->nonagle&TCP_NAGLE_CORK); - break; - case TCP_KEEPIDLE: - val = (ssk->keepalive_time ? : sdp_keepalive_time) / HZ; - break; - case TCP_MAXSEG: - val = ssk->xmit_size_goal; - break; - case SDP_ZCOPY_THRESH: - val = ssk->zcopy_thresh; - break; - case SDP_LAST_BIND_ERR: - val = ssk->last_bind_err; - break; - default: - return -ENOPROTOOPT; + if (uio != NULL) + resid = uio->uio_resid; + else + resid = top->m_pkthdr.len; + atomic = top != NULL; + if (control != NULL) { + if (control->m_len) { + m_freem(control); + if (top) + m_freem(top); + return (EINVAL); + } + m_freem(control); + control = NULL; } + /* + * In theory resid should be unsigned. However, space must be + * signed, as it might be less than 0 if we over-committed, and we + * must use a signed comparison of space and resid. 
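The signedness point above is easy to demonstrate outside the kernel: sbspace() can legitimately go negative once a sender has over-committed the buffer, and the space-versus-resid test misbehaves the moment the comparison turns unsigned. A small sketch of the pitfall:

#include <stdio.h>

/*
 * Sketch: why "space" must stay signed.  With space = -512 (an
 * over-committed socket buffer) the signed test correctly waits,
 * while an unsigned comparison would convert space to a huge value
 * and wrongly proceed.
 */
int
main(void)
{
        long space = -512;              /* sbspace() after over-commit */
        unsigned long resid = 100;      /* bytes left to send */

        if (space < (long)resid)
                printf("signed: would wait for buffer space\n");
        if ((unsigned long)space >= resid)
                printf("unsigned: would (wrongly) try to send\n");
        return (0);
}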
On the other + * hand, a negative resid causes us to loop sending 0-length + * segments to the protocol. + * + * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM + * type sockets since that's an error. + */ + if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { + error = EINVAL; + goto out; + } + if (td != NULL) + td->td_ru.ru_msgsnd++; - if (put_user(len, option)) - return -EFAULT; - if (copy_to_user(optval, &val, len)) - return -EFAULT; - return 0; -} + ssk = sdp_sk(so); + error = sblock(&so->so_snd, SBLOCKWAIT(flags)); + if (error) + goto out; -static inline int cycles_before(cycles_t a, cycles_t b) -{ - /* cycles_t is unsigned, but may be int/long/long long. */ - - if (sizeof(cycles_t) == 4) - return before(a, b); - else - return (s64)(a - b) < 0; -} +restart: + do { + SOCKBUF_LOCK(&so->so_snd); + if (so->so_snd.sb_state & SBS_CANTSENDMORE) { + SOCKBUF_UNLOCK(&so->so_snd); + error = EPIPE; + goto release; + } + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + SOCKBUF_UNLOCK(&so->so_snd); + goto release; + } + if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) { + SOCKBUF_UNLOCK(&so->so_snd); + error = ENOTCONN; + goto release; + } + space = sbspace(&so->so_snd); + if (flags & MSG_OOB) + space += 1024; + if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) { + SOCKBUF_UNLOCK(&so->so_snd); + error = EMSGSIZE; + goto release; + } + if (space < resid && + (atomic || space < so->so_snd.sb_lowat)) { + if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { + SOCKBUF_UNLOCK(&so->so_snd); + error = EWOULDBLOCK; + goto release; + } + error = sbwait(&so->so_snd); + SOCKBUF_UNLOCK(&so->so_snd); + if (error) + goto release; + goto restart; + } + SOCKBUF_UNLOCK(&so->so_snd); + do { + if (uio == NULL) { + resid = 0; + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + } else { + /* + * Copy the data from userland into a mbuf + * chain. If no data is to be copied in, + * a single empty mbuf is returned. + */ + copy = min(space, + ssk->xmit_size_goal - SDP_HEAD_SIZE); + top = m_uiotombuf(uio, M_WAITOK, copy, + 0, M_PKTHDR | + ((flags & MSG_EOR) ? M_EOR : 0)); + if (top == NULL) { + /* only possible error */ + error = EFAULT; + goto release; + } + space -= resid - uio->uio_resid; + resid = uio->uio_resid; + } + /* + * XXX all the SBS_CANTSENDMORE checks previously + * done could be out of date after dropping the + * socket lock. + */ + error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB : + /* + * Set EOF on the last send if the user specified + * MSG_EOF. + */ + ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF : + /* If there is more to send set PRUS_MORETOCOME. */ + (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, + top, addr, NULL, td); + top = NULL; + if (error) + goto release; + } while (resid && space > 0); + } while (resid); -static inline cycles_t sdp_usec_to_cycles(int usecs) -{ -#ifdef CONFIG_PPC - return usecs * tb_ticks_per_usec; -#elif defined(__ia64__) - return usecs * local_cpu_data->cyc_per_usec; -#else - return usecs * cpu_khz / 1000; -#endif +release: + sbunlock(&so->so_snd); +out: + if (top != NULL) + m_freem(top); + return (error); } -static inline int poll_recv_cq(struct sock *sk) +/* + * The part of soreceive() that implements reading non-inline out-of-band + * data from a socket. For more complete comments, see soreceive(), from + * which this code originated. + * + * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is + * unable to return an mbuf chain to the caller. 
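Both halves of urgent-data handling meet at MSG_OOB in userland: the send side runs the PRUS_OOB branch of sdp_send() above, which tags the chain M_URG | M_PUSH, and the receive side comes back through soreceive_rcvoob() together with the sdp_rcvoob() routine added later in this patch. A sketch of the round trip on an already connected descriptor, error handling mostly elided:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>

/* Sketch: send one urgent byte; receive it out of band. */
static void
oob_send(int fd)
{
        (void)send(fd, "!", 1, MSG_OOB);        /* PRUS_OOB path */
}

static int
oob_recv(int fd)
{
        char c;
        int atmark;

        if (ioctl(fd, SIOCATMARK, &atmark) == -1)       /* at so_oobmark? */
                return (-1);
        if (recv(fd, &c, 1, MSG_OOB) != 1)      /* soreceive_rcvoob() */
                return (-1);
        return ((unsigned char)c);
}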
+ */ +static int +soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { - cycles_t start = get_cycles(); - cycles_t end = start + sdp_usec_to_cycles(recv_poll); + struct protosw *pr = so->so_proto; + struct mbuf *m; + int error; - sdp_prf(sk, NULL, "polling recv"); + KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); - if (unlikely(!sdp_sk(sk)->rx_ring.cq)) - return 0; - + m = m_get(M_WAIT, MT_DATA); + error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); + if (error) + goto bad; do { - if (sdp_poll_rx_cq(sdp_sk(sk))) { - SDPSTATS_COUNTER_INC(rx_poll_hit); - SDPSTATS_HIST(poll_hit_usec, sdp_cycles_to_usecs( - (unsigned long)(get_cycles() - start))); - return 0; - } - } while (cycles_before(get_cycles(), end)); - - SDPSTATS_COUNTER_INC(rx_poll_miss); - return 1; + error = uiomove(mtod(m, void *), + (int) min(uio->uio_resid, m->m_len), uio); + m = m_free(m); + } while (uio->uio_resid && error == 0 && m); +bad: + if (m != NULL) + m_freem(m); + return (error); } -/* Like tcp_recv_urg */ /* - * Handle reading urgent data. BSD has very simple semantics for - * this, no blocking and very strange errors 8) + * Optimized version of soreceive() for stream (TCP) sockets. */ - -static int sdp_recv_urg(struct sock *sk, long timeo, - struct msghdr *msg, int len, int flags, - int *addr_len) +static int +sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio, + struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { - struct sdp_sock *ssk = sdp_sk(sk); + int len = 0, error = 0, flags, oresid; + struct sockbuf *sb; + struct mbuf *m, *n = NULL; + struct sdp_sock *ssk; - poll_recv_cq(sk); + /* We only do stream sockets. */ + if (so->so_type != SOCK_STREAM) + return (EINVAL); + if (psa != NULL) + *psa = NULL; + if (controlp != NULL) + return (EINVAL); + if (flagsp != NULL) + flags = *flagsp &~ MSG_EOR; + else + flags = 0; + if (flags & MSG_OOB) + return (soreceive_rcvoob(so, uio, flags)); + if (mp0 != NULL) + *mp0 = NULL; - /* No URG data to read. */ - if (sock_flag(sk, SOCK_URGINLINE) || !ssk->urg_data || - ssk->urg_data == TCP_URG_READ) - return -EINVAL; /* Yes this is right ! */ + sb = &so->so_rcv; + ssk = sdp_sk(so); - if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) - return -ENOTCONN; + /* Prevent other readers from entering the socket. */ + error = sblock(sb, SBLOCKWAIT(flags)); + if (error) + goto out; + SOCKBUF_LOCK(sb); - if (ssk->urg_data & TCP_URG_VALID) { - int err = 0; - char c = ssk->urg_data; - - if (!(flags & MSG_PEEK)) - ssk->urg_data = TCP_URG_READ; - - /* Read urgent data. */ - msg->msg_flags |= MSG_OOB; - - if (len > 0) { - if (!(flags & MSG_TRUNC)) - err = memcpy_toiovec(msg->msg_iov, &c, 1); - len = 1; - } else - msg->msg_flags |= MSG_TRUNC; - - return err ? -EFAULT : len; + /* Easy one, no space to copyout anything. */ + if (uio->uio_resid == 0) { + error = EINVAL; + goto out; } + oresid = uio->uio_resid; - if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) - return 0; - - /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and - * the available implementations agree in this case: - * this call should never block, independent of the - * blocking state of the socket. - * Mike - */ - return -EAGAIN; -} - -static inline void sdp_mark_urg(struct sock *sk, int flags) -{ - if (unlikely(flags & MSG_OOB)) { - struct sk_buff *skb = sk->sk_write_queue.prev; - SDP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG; + /* We will never ever get anything unless we are connected. 
*/ + if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { + /* When disconnecting there may be still some data left. */ + if (sb->sb_cc > 0) + goto deliver; + if (!(so->so_state & SS_ISDISCONNECTED)) + error = ENOTCONN; + goto out; } -} -static inline void sdp_push(struct sock *sk, int flags) -{ - if (sk->sk_send_head) - sdp_mark_urg(sk, flags); - sdp_do_posts(sdp_sk(sk)); -} - -void sdp_skb_entail(struct sock *sk, struct sk_buff *skb) -{ - __skb_queue_tail(&sk->sk_write_queue, skb); - sk->sk_wmem_queued += skb->truesize; - sk_mem_charge(sk, skb->truesize); - if (!sk->sk_send_head) - sk->sk_send_head = skb; - if (sdp_sk(sk)->nonagle & TCP_NAGLE_PUSH) - sdp_sk(sk)->nonagle &= ~TCP_NAGLE_PUSH; -} - -static inline struct bzcopy_state *sdp_bz_cleanup(struct bzcopy_state *bz) -{ - int i; - struct sdp_sock *ssk = (struct sdp_sock *)bz->ssk; - - /* Wait for in-flight sends; should be quick */ - if (bz->busy) { - struct sock *sk = sk_ssk(ssk); - unsigned long timeout = jiffies + SDP_BZCOPY_POLL_TIMEOUT; - - while (jiffies < timeout) { - if (sdp_xmit_poll(sdp_sk(sk), 1)) - sdp_post_sends(ssk, 0); - if (!bz->busy) - break; - SDPSTATS_COUNTER_INC(bzcopy_poll_miss); - } - - if (bz->busy) - sdp_warn(sk, "Could not reap %d in-flight sends\n", - bz->busy); + /* Socket buffer is empty and we shall not block. */ + if (sb->sb_cc == 0 && + ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { + error = EAGAIN; + goto out; } - if (bz->pages) { - for (i = 0; i < bz->page_cnt; i++) { - put_page(bz->pages[i]); - } +restart: + SOCKBUF_LOCK_ASSERT(&so->so_rcv); - kfree(bz->pages); + /* Abort if socket has reported problems. */ + if (so->so_error) { + if (sb->sb_cc > 0) + goto deliver; + if (oresid > uio->uio_resid) + goto out; + error = so->so_error; + if (!(flags & MSG_PEEK)) + so->so_error = 0; + goto out; } - kfree(bz); - - return NULL; -} - -static int sdp_get_user_pages(struct page **pages, const unsigned int nr_pages, - unsigned long uaddr, int rw) -{ - int res, i; - - /* Try to fault in all of the necessary pages */ - down_read(¤t->mm->mmap_sem); - /* rw==READ means read from drive, write into memory area */ - res = get_user_pages( - current, - current->mm, - uaddr, - nr_pages, - rw == READ, - 0, /* don't force */ - pages, - NULL); - up_read(¤t->mm->mmap_sem); - - /* Errors and no page mapped should return here */ - if (res < nr_pages) - return res; - - for (i=0; i < nr_pages; i++) { - /* FIXME: flush superflous for rw==READ, - * probably wrong function for rw==WRITE - */ - flush_dcache_page(pages[i]); - } - - return nr_pages; -} - -static int sdp_get_pages(struct sock *sk, struct page **pages, int page_cnt, - unsigned long addr) -{ - int done_pages = 0; - - sdp_dbg_data(sk, "count: 0x%x addr: 0x%lx\n", page_cnt, addr); - - addr &= PAGE_MASK; - if (segment_eq(get_fs(), KERNEL_DS)) { - for (done_pages = 0; done_pages < page_cnt; done_pages++) { - pages[done_pages] = virt_to_page(addr); - if (!pages[done_pages]) - break; - get_page(pages[done_pages]); - addr += PAGE_SIZE; - } - } else { - done_pages = sdp_get_user_pages(pages, page_cnt, addr, WRITE); + /* Door is closed. Deliver what is left, if any. */ + if (sb->sb_state & SBS_CANTRCVMORE) { + if (sb->sb_cc > 0) + goto deliver; + else + goto out; } - if (unlikely(done_pages != page_cnt)) - goto err; - - return 0; - -err: - sdp_warn(sk, "Error getting pages. 
done_pages: %d page_cnt: %d\n", - done_pages, page_cnt); - for (; done_pages > 0; done_pages--) - page_cache_release(pages[done_pages - 1]); - - return -1; -} - -static struct bzcopy_state *sdp_bz_setup(struct sdp_sock *ssk, - char __user *base, - int len, - int size_goal) -{ - struct bzcopy_state *bz; - unsigned long addr; - int thresh; - mm_segment_t cur_fs; - int rc = 0; - - thresh = sdp_bzcopy_thresh; - if (thresh == 0 || len < thresh || !capable(CAP_IPC_LOCK)) { - SDPSTATS_COUNTER_INC(sendmsg_bcopy_segment); - return NULL; + /* Socket buffer got some data that we shall deliver now. */ + if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) && + ((sb->sb_flags & SS_NBIO) || + (flags & (MSG_DONTWAIT|MSG_NBIO)) || + sb->sb_cc >= sb->sb_lowat || + sb->sb_cc >= uio->uio_resid || + sb->sb_cc >= sb->sb_hiwat) ) { + goto deliver; } - SDPSTATS_COUNTER_INC(sendmsg_bzcopy_segment); - cur_fs = get_fs(); + /* On MSG_WAITALL we must wait until all data or error arrives. */ + if ((flags & MSG_WAITALL) && + (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat)) + goto deliver; /* - * Since we use the TCP segmentation fields of the skb to map user - * pages, we must make sure that everything we send in a single chunk - * fits into the frags array in the skb. + * Wait and block until (more) data comes in. + * NB: Drops the sockbuf lock during wait. */ - size_goal = size_goal / PAGE_SIZE + 1; - if (size_goal >= MAX_SKB_FRAGS) - return NULL; + error = sbwait(sb); + if (error) + goto out; + goto restart; - bz = kzalloc(sizeof(*bz), GFP_KERNEL); - if (!bz) - return ERR_PTR(-ENOMEM); +deliver: + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__)); + KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); - addr = (unsigned long)base; + /* Statistics. */ + if (uio->uio_td) + uio->uio_td->td_ru.ru_msgrcv++; - bz->u_base = base; - bz->u_len = len; - bz->left = len; - bz->cur_offset = addr & ~PAGE_MASK; - bz->busy = 0; - bz->ssk = ssk; - bz->page_cnt = PAGE_ALIGN(len + bz->cur_offset) >> PAGE_SHIFT; - bz->pages = kcalloc(bz->page_cnt, sizeof(struct page *), - GFP_KERNEL); - - if (!bz->pages) { - kfree(bz); - return ERR_PTR(-ENOMEM); - } - - rc = sdp_get_pages(sk_ssk(ssk), bz->pages, bz->page_cnt, - (unsigned long)base); - - if (unlikely(rc)) - goto err; - - return bz; - -err: - kfree(bz->pages); - kfree(bz); - return ERR_PTR(-EFAULT); -} - -#define TCP_PAGE(sk) (sk->sk_sndmsg_page) -#define TCP_OFF(sk) (sk->sk_sndmsg_off) -static inline int sdp_bcopy_get(struct sock *sk, struct sk_buff *skb, - char __user *from, int copy) -{ - int err; - struct sdp_sock *ssk = sdp_sk(sk); - - /* Where to copy to? */ - if (skb_tailroom(skb) > 0) { - /* We have some space in skb head. Superb! */ - if (copy > skb_tailroom(skb)) - copy = skb_tailroom(skb); - if ((err = skb_add_data(skb, from, copy)) != 0) - return SDP_ERR_FAULT; - } else { - int merge = 0; - int i = skb_shinfo(skb)->nr_frags; - struct page *page = TCP_PAGE(sk); - int off = TCP_OFF(sk); - - if (skb_can_coalesce(skb, i, page, off) && - off != PAGE_SIZE) { - /* We can extend the last page - * fragment. */ - merge = 1; - } else if (i == ssk->send_frags) { - /* Need to add new fragment and cannot - * do this because all the page slots are - * busy. */ - sdp_mark_push(ssk, skb); - return SDP_NEW_SEG; - } else if (page) { - if (off == PAGE_SIZE) { - put_page(page); - TCP_PAGE(sk) = page = NULL; - off = 0; + /* Fill uio until full or current end of socket buffer is reached. 
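One practical consequence of the delivery tests above is that SO_RCVLOWAT is honored: a blocking read without MSG_WAITALL is held back until the receive buffer reaches the low-water mark, the request size, or the high-water mark, or until an error or EOF intervenes. A sketch:

#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

/*
 * Sketch: raise the receive low-water mark so sdp_sorecv() holds off
 * short reads until at least "bytes" have accumulated (or EOF/error).
 */
static ssize_t
read_batched(int fd, void *buf, size_t len, int bytes)
{
        (void)setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &bytes,
            sizeof(bytes));
        return (recv(fd, buf, len, 0)); /* blocks until sb_cc >= lowat */
}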
*/ + len = min(uio->uio_resid, sb->sb_cc); + if (mp0 != NULL) { + /* Dequeue as many mbufs as possible. */ + if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { + for (*mp0 = m = sb->sb_mb; + m != NULL && m->m_len <= len; + m = m->m_next) { + len -= m->m_len; + uio->uio_resid -= m->m_len; + sbfree(sb, m); + n = m; } - } else - off = 0; - - if (copy > PAGE_SIZE - off) - copy = PAGE_SIZE - off; - - if (!sk_wmem_schedule(sk, copy)) - return SDP_DO_WAIT_MEM; - - if (!page) { - /* Allocate new cache page. */ - if (sdp_has_free_mem()) { - page = sk_stream_alloc_page(sk); - if (!page) - return SDP_DO_WAIT_MEM; - atomic_inc(&sdp_current_mem_usage); - } else - return SDP_DO_WAIT_MEM; + sb->sb_mb = m; + if (sb->sb_mb == NULL) + SB_EMPTY_FIXUP(sb); + n->m_next = NULL; } + /* Copy the remainder. */ + if (len > 0) { + KASSERT(sb->sb_mb != NULL, + ("%s: len > 0 && sb->sb_mb empty", __func__)); - /* Time to copy data. We are close to - * the end! */ - SDPSTATS_COUNTER_ADD(memcpy_count, copy); - err = skb_copy_to_page(sk, from, skb, page, - off, copy); - if (err) { - /* If this page was new, give it to the - * socket so it does not get leaked. - */ - if (!TCP_PAGE(sk)) { - TCP_PAGE(sk) = page; - TCP_OFF(sk) = 0; + m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT); + if (m == NULL) + len = 0; /* Don't flush data from sockbuf. */ + else + uio->uio_resid -= m->m_len; + if (*mp0 != NULL) + n->m_next = m; + else + *mp0 = m; + if (*mp0 == NULL) { + error = ENOBUFS; + goto out; } - return SDP_ERR_ERROR; } + } else { + /* NB: Must unlock socket buffer as uiomove may sleep. */ + SOCKBUF_UNLOCK(sb); + error = m_mbuftouio(uio, sb->sb_mb, len); + SOCKBUF_LOCK(sb); + if (error) + goto out; + } + SBLASTRECORDCHK(sb); + SBLASTMBUFCHK(sb); - /* Update the skb. */ - if (merge) { - skb_shinfo(skb)->frags[i - 1].size += copy; - } else { - skb_fill_page_desc(skb, i, page, off, copy); - if (TCP_PAGE(sk)) { - get_page(page); - } else if (off + copy < PAGE_SIZE) { - get_page(page); - TCP_PAGE(sk) = page; - } - } + /* + * Remove the delivered data from the socket buffer unless we + * were only peeking. + */ + if (!(flags & MSG_PEEK)) { + if (len > 0) + sbdrop_locked(sb, len); - TCP_OFF(sk) = off + copy; + /* Notify protocol that we drained some data. */ + SOCKBUF_UNLOCK(sb); + SDP_WLOCK(ssk); + sdp_do_posts(ssk); + SDP_WUNLOCK(ssk); + SOCKBUF_LOCK(sb); } - return copy; + /* + * For MSG_WAITALL we may have to loop again and wait for + * more data to come in. + */ + if ((flags & MSG_WAITALL) && uio->uio_resid > 0) + goto restart; +out: + SOCKBUF_LOCK_ASSERT(sb); + SBLASTRECORDCHK(sb); + SBLASTMBUFCHK(sb); + SOCKBUF_UNLOCK(sb); + sbunlock(sb); + return (error); } -static inline int sdp_bzcopy_get(struct sock *sk, struct sk_buff *skb, - char __user *from, int copy, - struct bzcopy_state *bz) +/* + * Abort is used to teardown a connection typically while sitting in + * the accept queue. 
+ */ +void +sdp_abort(struct socket *so) { - int this_page, left; - struct sdp_sock *ssk = sdp_sk(sk); + struct sdp_sock *ssk; - /* Push the first chunk to page align all following - TODO: review */ - if (skb_shinfo(skb)->nr_frags == ssk->send_frags) { - sdp_mark_push(ssk, skb); - return SDP_NEW_SEG; - } - - left = copy; - BUG_ON(left > bz->left); - - while (left) { - if (skb_shinfo(skb)->nr_frags == ssk->send_frags) { - copy = copy - left; - break; - } - - this_page = PAGE_SIZE - bz->cur_offset; - - if (left <= this_page) - this_page = left; - - if (!sk_wmem_schedule(sk, copy)) - return SDP_DO_WAIT_MEM; - - /* put_page in skb_release_data() (called by __kfree_skb) */ - get_page(bz->pages[bz->cur_page]); - skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, - bz->pages[bz->cur_page], bz->cur_offset, - this_page); - - BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS); - BUG_ON(bz->cur_offset > PAGE_SIZE); - - bz->cur_offset += this_page; - if (bz->cur_offset == PAGE_SIZE) { - bz->cur_offset = 0; - bz->cur_page++; - - BUG_ON(bz->cur_page > bz->page_cnt); - } - - left -= this_page; - - skb->len += this_page; - skb->data_len += this_page; - skb->truesize += this_page; - sk->sk_wmem_queued += this_page; - sk->sk_forward_alloc -= this_page; - } - - bz->left -= copy; - bz->busy++; - return copy; + ssk = sdp_sk(so); + SDP_WLOCK(ssk); + /* + * If we have not yet dropped, do it now. + */ + if (!(ssk->flags & SDP_TIMEWAIT) && + !(ssk->flags & SDP_DROPPED)) + sdp_drop(ssk, ECONNABORTED); + KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X", + ssk, ssk->flags)); + SDP_WUNLOCK(ssk); } -/* like sk_stream_wait_memory - except: - * - if credits_needed provided - wait for enough credits - * - TX irq will use this (in sendmsg context) to do the actual tx - * comp poll and post +/* + * Close a SDP socket and initiate a friendly disconnect. */ -int sdp_tx_wait_memory(struct sdp_sock *ssk, long *timeo_p, int *credits_needed) +static void +sdp_close(struct socket *so) { - struct sock *sk = sk_ssk(ssk); - int err = 0; - long vm_wait = 0; - long current_timeo = *timeo_p; - DEFINE_WAIT(wait); + struct sdp_sock *ssk; - if (sk_stream_memory_free(sk)) - current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; + ssk = sdp_sk(so); + SDP_WLOCK(ssk); + /* + * If we have not yet dropped, do it now. + */ + if (!(ssk->flags & SDP_TIMEWAIT) && + !(ssk->flags & SDP_DROPPED)) + sdp_start_disconnect(ssk); - while (1) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); - - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto do_error; - if (!*timeo_p) - goto do_nonblock; - if (signal_pending(current)) - goto do_interrupted; - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - - sdp_do_posts(ssk); - - if (credits_needed) { - if (tx_slots_free(ssk) >= *credits_needed) - break; - } else { - if (sk_stream_memory_free(sk) && !vm_wait) - break; - } - - /* Before going to sleep, make sure no credit update is missed, - * rx_cq will be armed now. 
*/ - posts_handler_put(ssk, 0); - - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - sk->sk_write_pending++; - - sdp_prf1(sk, NULL, "Going to sleep"); - - if (tx_credits(ssk) > SDP_MIN_TX_CREDITS) - sdp_arm_tx_cq(sk); - - if (credits_needed) { - sk_wait_event(sk, ¤t_timeo, - !sk->sk_err && - !(sk->sk_shutdown & SEND_SHUTDOWN) && - !ssk->tx_compl_pending && - tx_slots_free(ssk) >= *credits_needed && - vm_wait); - } else { - sk_wait_event(sk, ¤t_timeo, - !sk->sk_err && - !(sk->sk_shutdown & SEND_SHUTDOWN) && - !ssk->tx_compl_pending && - sk_stream_memory_free(sk) && - tx_credits(ssk) > SDP_MIN_TX_CREDITS && - vm_wait); - } - - sdp_prf(sk, NULL, "Woke up. memfree: %d", sk_stream_memory_free(sk)); - sk->sk_write_pending--; - - posts_handler_get(ssk); - - if (!ssk->qp_active) - goto do_error; - - if (vm_wait) { - vm_wait -= current_timeo; - current_timeo = *timeo_p; - if (current_timeo != MAX_SCHEDULE_TIMEOUT && - (current_timeo -= vm_wait) < 0) - current_timeo = 0; - vm_wait = 0; - } - *timeo_p = current_timeo; + /* + * If we've still not dropped let the socket layer know we're + * holding on to the socket and pcb for a while. + */ + if (!(ssk->flags & SDP_DROPPED)) { + SOCK_LOCK(so); + so->so_state |= SS_PROTOREF; + SOCK_UNLOCK(so); + ssk->flags |= SDP_SOCKREF; } -out: - finish_wait(sk->sk_sleep, &wait); - return err; - -do_error: - err = -EPIPE; - goto out; -do_nonblock: - err = -EAGAIN; - goto out; -do_interrupted: - err = sock_intr_errno(*timeo_p); - goto out; + SDP_WUNLOCK(ssk); } -/* Like tcp_sendmsg */ -/* TODO: check locking */ -static int sdp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t size) +/* + * User requests out-of-band data. + */ +static int +sdp_rcvoob(struct socket *so, struct mbuf *m, int flags) { - int i; - struct sdp_sock *ssk = sdp_sk(sk); - struct sk_buff *skb; - int flags; - const int size_goal = MIN(ssk->xmit_size_goal, SDP_MAX_PAYLOAD); - int err, copied; - long timeo; - struct bzcopy_state *bz = NULL; - int zcopy_thresh = - -1 != ssk->zcopy_thresh ? ssk->zcopy_thresh : sdp_zcopy_thresh; + int error = 0; + struct sdp_sock *ssk; - SDPSTATS_COUNTER_INC(sendmsg); - - lock_sock(sk); - ssk->cpu = smp_processor_id(); - sdp_dbg_data(sk, "%s size = 0x%zx\n", __func__, size); - - posts_handler_get(ssk); - SDP_WARN_ON(ssk->tx_sa); - - flags = msg->msg_flags; - timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - - /* Wait for a connection to finish. */ - if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_err; - - /* This should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - - /* Ok commence sending. 
*/ - copied = 0; - - err = -EPIPE; - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto do_error; - - for (i = 0; i < msg->msg_iovlen; i++) { - struct iovec *iov = &msg->msg_iov[i]; - int seglen = iov->iov_len; - char __user *from = iov->iov_base; - - sdp_dbg_data(sk, "Sending iov: 0x%x/0x%zx %p\n", i, msg->msg_iovlen, from); - - SDPSTATS_HIST(sendmsg_seglen, seglen); - - if (zcopy_thresh && seglen > zcopy_thresh && - seglen > SDP_MIN_ZCOPY_THRESH && - tx_slots_free(ssk) && ssk->sdp_dev && - ssk->sdp_dev->fmr_pool && !(flags & MSG_OOB)) { - int zcopied = 0; - - zcopied = sdp_sendmsg_zcopy(iocb, sk, iov); - - if (zcopied < 0) { - sdp_dbg_data(sk, "ZCopy send err: %d\n", zcopied); - err = zcopied; - goto out_err; - } - - copied += zcopied; - seglen = iov->iov_len; - from = iov->iov_base; - - sdp_dbg_data(sk, "ZCopied: 0x%x/0x%x\n", zcopied, seglen); - } - - if (bz) - sdp_bz_cleanup(bz); - bz = sdp_bz_setup(ssk, from, seglen, size_goal); - if (IS_ERR(bz)) { - err = PTR_ERR(bz); - bz = NULL; - goto do_error; - } - - while (seglen > 0) { - int copy; - - skb = sk->sk_write_queue.prev; - - if (!sk->sk_send_head || - (copy = size_goal - (skb->len - sizeof(struct sdp_bsdh))) <= 0 || - bz != BZCOPY_STATE(skb)) { -new_segment: - /* - * Allocate a new segment - * For bcopy, we stop sending once we have - * SO_SENDBUF bytes in flight. For bzcopy - * we stop sending once we run out of remote - * receive credits. - */ -#define can_not_tx(__bz) (\ - ( __bz && tx_slots_free(ssk) < __bz->busy) || \ - (!__bz && !sk_stream_memory_free(sk))) - if (unlikely(can_not_tx(bz))) { - if (!poll_recv_cq(sk)) - sdp_do_posts(ssk); - if ((can_not_tx(bz))) - goto wait_for_sndbuf; - } - - skb = sdp_alloc_skb_data(sk, min(seglen, size_goal), 0); - if (!skb) { - err = -ENOMEM; - goto do_error; - } - - BZCOPY_STATE(skb) = bz; - - /* - * Check whether we can use HW checksum. - */ - if (sk->sk_route_caps & - (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM)) - skb->ip_summed = CHECKSUM_PARTIAL; - - sdp_skb_entail(sk, skb); - copy = size_goal; - - sdp_dbg_data(sk, "created new skb: %p" - " len = 0x%zx, sk_send_head: %p " - "copy: 0x%x size_goal: 0x%x\n", - skb, skb->len - sizeof(struct sdp_bsdh), - sk->sk_send_head, copy, size_goal); - - - } else { - sdp_dbg_data(sk, "adding to existing skb: %p" - " len = 0x%zx, sk_send_head: %p " - "copy: 0x%x\n", - skb, skb->len - sizeof(struct sdp_bsdh), - sk->sk_send_head, copy); - } - - /* Try to append data to the end of skb. */ - if (copy > seglen) - copy = seglen; - - copy = (bz) ? sdp_bzcopy_get(sk, skb, from, copy, bz) : - sdp_bcopy_get(sk, skb, from, copy); - if (unlikely(copy < 0)) { - switch (copy) { - case SDP_DO_WAIT_MEM: - err = -ENOMEM; - goto do_error; - case SDP_NEW_SEG: - goto new_segment; - case SDP_ERR_FAULT: - goto do_fault; - default: - goto do_error; - } - } - - if (!copied) - SDP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; - - ssk->write_seq += copy; - SDP_SKB_CB(skb)->end_seq += copy; - /*unused: skb_shinfo(skb)->gso_segs = 0;*/ - - from += copy; - copied += copy; - seglen -= copy; - continue; - -wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - sdp_prf(sk, skb, "wait for mem. credits: %d", tx_credits(ssk)); - SDPSTATS_COUNTER_INC(send_wait_for_mem); - if (copied) - sdp_push(sk, flags & ~MSG_MORE); - - err = sdp_tx_wait_memory(ssk, &timeo, - bz ? 
&bz->busy : NULL); - if (err) - goto do_error; - } + ssk = sdp_sk(so); + SDP_WLOCK(ssk); + if (!rx_ring_trylock(&ssk->rx_ring)) { + SDP_WUNLOCK(ssk); + return (ECONNRESET); } - -out: - if (copied) { - sdp_push(sk, flags); - - if (bz) - bz = sdp_bz_cleanup(bz); + if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { + error = ECONNRESET; + goto out; } - - sdp_auto_moderation(ssk); - - err = copied; - - sdp_dbg_data(sk, "copied: 0x%x\n", copied); - - goto fin; - -do_fault: - sdp_prf(sk, skb, "prepare fault"); - - if (skb->len <= sizeof(struct sdp_bsdh)) { - if (sk->sk_send_head == skb) - sk->sk_send_head = NULL; - __skb_unlink(skb, &sk->sk_write_queue); - sk_wmem_free_skb(sk, skb); + if ((so->so_oobmark == 0 && + (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || + so->so_options & SO_OOBINLINE || + ssk->oobflags & SDP_HADOOB) { + error = EINVAL; + goto out; } - -do_error: - if (copied) + if ((ssk->oobflags & SDP_HAVEOOB) == 0) { + error = EWOULDBLOCK; goto out; -out_err: - if (bz) - bz = sdp_bz_cleanup(bz); - err = sk_stream_error(sk, flags, err); - sdp_dbg_data(sk, "err: %d\n", err); - -fin: - posts_handler_put(ssk, SDP_RX_ARMING_DELAY); - - if (!err && !ssk->qp_active) { - err = -EPIPE; - sdp_set_error(sk, err); - sdp_dbg(sk, "can't send anymore\n"); } - - release_sock(sk); - - return err; + m->m_len = 1; + *mtod(m, caddr_t) = ssk->iobc; + if ((flags & MSG_PEEK) == 0) + ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB); +out: + rx_ring_unlock(&ssk->rx_ring); + SDP_WUNLOCK(ssk); + return (error); } -int sdp_abort_rx_srcavail(struct sock *sk) +void +sdp_urg(struct sdp_sock *ssk, struct mbuf *mb) { - struct sdp_sock *ssk = sdp_sk(sk); - struct sdp_bsdh *h = - (struct sdp_bsdh *)skb_transport_header(ssk->rx_sa->skb); + struct mbuf *m; + struct socket *so; - sdp_dbg_data(sk, "SrcAvail aborted\n"); + so = ssk->socket; + if (so == NULL) + return; - h->mid = SDP_MID_DATA; - - if (sdp_post_rdma_rd_compl(sk, ssk->rx_sa)) { - sdp_warn(sk, "Couldn't send RdmaRdComp - " - "data corruption might occur\n"); + so->so_oobmark = so->so_rcv.sb_cc + mb->m_pkthdr.len - 1; + sohasoutofband(so); + ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB); + if (!(so->so_options & SO_OOBINLINE)) { + for (m = mb; m->m_next != NULL; m = m->m_next); + ssk->iobc = *(mtod(m, char *) + m->m_len - 1); + ssk->oobflags |= SDP_HAVEOOB; + m->m_len--; + mb->m_pkthdr.len--; } - - RX_SRCAVAIL_STATE(ssk->rx_sa->skb) = NULL; - kfree(ssk->rx_sa); - ssk->rx_sa = NULL; - - return 0; } -/* Like tcp_recvmsg */ -/* Maybe use skb_recv_datagram here? */ -/* Note this does not seem to handle vectored messages. Relevant? */ -static int sdp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, - int *addr_len) +/* + * Notify a sdp socket of an asynchronous error. + * + * Do not wake up user since there currently is no mechanism for + * reporting soft errors (yet - a kqueue filter may be added). 
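sdp_urg() above places the mark at the last byte of the arriving chain, so_oobmark = sb_cc + m_pkthdr.len - 1, and, when SO_OOBINLINE is off, clips that byte out of the chain into ssk->iobc for sdp_rcvoob() to hand back. A toy model of the clipping step, built on a hypothetical minimal mbuf stand-in rather than the real struct mbuf:

#include <stddef.h>

/* Toy mbuf: just enough structure to show the OOB byte extraction. */
struct toy_mbuf {
        struct toy_mbuf *m_next;
        int              m_len;
        char            *m_data;
};

/*
 * Walk to the last mbuf in the chain (assumed non-empty), save its
 * final byte (the urgent byte), and shorten the chain by one; this
 * mirrors the trimming sdp_urg() performs when the data is not inline.
 */
static char
strip_oob_byte(struct toy_mbuf *mb, int *pkt_len)
{
        struct toy_mbuf *m;
        char c;

        for (m = mb; m->m_next != NULL; m = m->m_next)
                ;
        c = m->m_data[m->m_len - 1];
        m->m_len--;
        (*pkt_len)--;           /* mirrors mb->m_pkthdr.len-- */
        return (c);
}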
+ */ +struct sdp_sock * +sdp_notify(struct sdp_sock *ssk, int error) { - struct sk_buff *skb = NULL; - struct sdp_sock *ssk = sdp_sk(sk); - long timeo; - int target; - unsigned long used; - int err; - u32 peek_seq; - u32 *seq; - int copied = 0; - int rc; - int avail_bytes_count = 0; /* Could be inlined in skb */ - /* or advertised for RDMA */ - SDPSTATS_COUNTER_INC(recvmsg); - lock_sock(sk); - ssk->cpu = smp_processor_id(); - sdp_dbg_data(sk, "iovlen: %zd iov_len: 0x%zx flags: 0x%x peek: 0x%x\n", - msg->msg_iovlen, msg->msg_iov[0].iov_len, flags, - MSG_PEEK); + SDP_WLOCK_ASSERT(ssk); - posts_handler_get(ssk); + if ((ssk->flags & SDP_TIMEWAIT) || + (ssk->flags & SDP_DROPPED)) + return (ssk); - err = -ENOTCONN; - if (sk->sk_state == TCP_LISTEN) - goto out; - - timeo = sock_rcvtimeo(sk, noblock); - /* Urgent data needs to be handled specially. */ - if (flags & MSG_OOB) - goto recv_urg; - - seq = &ssk->copied_seq; - if (flags & MSG_PEEK) { - peek_seq = ssk->copied_seq; - seq = &peek_seq; - } - - target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - - do { - struct rx_srcavail_state *rx_sa = NULL; - u32 offset; - - /* Are we at urgent data? Stop if we have read anything or have - * SIGURG pending. */ - if (ssk->urg_data && ssk->urg_seq == *seq) { - if (copied) - break; - if (signal_pending(current)) { - copied = timeo ? sock_intr_errno(timeo) : - -EAGAIN; - break; - } - } - - skb = skb_peek(&sk->sk_receive_queue); - do { - struct sdp_bsdh *h; - if (!skb) - break; - - offset = *seq - SDP_SKB_CB(skb)->seq; - avail_bytes_count = 0; - - h = (struct sdp_bsdh *)skb_transport_header(skb); - - switch (h->mid) { - case SDP_MID_DISCONN: - if (flags & MSG_PEEK) { - /* There is no point of handling a - * remote disconnection request while - * MSG_PEEK. The remote disconnection - * request will be handled upon regular - * recv. */ - goto got_disconn_in_peek; - } - sdp_dbg(sk, "Handle RX SDP_MID_DISCONN\n"); - sdp_prf(sk, NULL, "Handle RX SDP_MID_DISCONN"); - sdp_handle_disconn(sk); - goto found_fin_ok; - - case SDP_MID_SRCAVAIL: - rx_sa = RX_SRCAVAIL_STATE(skb); - if (unlikely(!rx_sa)) { - /* SrcAvailCancel arrived and handled */ - h->mid = SDP_MID_DATA; - goto check_srcavail_skb; - } - - if (sdp_chk_sa_cancel(ssk, rx_sa) || - !ssk->sdp_dev || - !ssk->sdp_dev->fmr_pool) { - sdp_dbg_data(sk, "Aborting SA " - "due to SACancel or " - "no fmr pool\n"); - sdp_abort_rx_srcavail(sk); - sdp_post_sendsm(sk); - rx_sa = NULL; -check_srcavail_skb: - if (offset < skb->len) { - sdp_prf(sk, skb, "Converted SA to DATA"); - goto sdp_mid_data; - } else { - sdp_prf(sk, skb, "Cancelled SA with no payload left"); - goto skb_cleanup; - } - } - - /* if has payload - handle as if MID_DATA */ - if (offset < skb->len) { - sdp_dbg_data(sk, "SrcAvail has " - "payload: %d/%d\n", - offset, - skb->len); - avail_bytes_count = skb->len; - } else { - sdp_dbg_data(sk, "Finished payload. 
" - "RDMAing: %d/%d\n", - offset, rx_sa->len); - - if (flags & MSG_PEEK) { - u32 real_offset = - ssk->copied_seq - - SDP_SKB_CB(skb)->seq; - sdp_dbg_data(sk, "Peek on RDMA data - " - "fallback to BCopy\n"); - sdp_abort_rx_srcavail(sk); - sdp_post_sendsm(sk); - rx_sa = NULL; - if (real_offset >= skb->len) - goto force_skb_cleanup; - } else { - avail_bytes_count = rx_sa->len; - } - } - - break; - - case SDP_MID_DATA: -sdp_mid_data: - rx_sa = NULL; - avail_bytes_count = skb->len; - break; - default: - break; - } - - if (before(*seq, SDP_SKB_CB(skb)->seq)) { - sdp_warn(sk, "skb: %p recvmsg bug: copied %X seq %X\n", - skb, *seq, SDP_SKB_CB(skb)->seq); - sdp_reset(sk); - break; - } - - if (offset < avail_bytes_count) - goto found_ok_skb; - - if (unlikely(!(flags & MSG_PEEK))) { - /* Could happen when SrcAvail was canceled - * and transformed into DATA SKB */ - goto skb_cleanup; - } - - SDP_WARN_ON(h->mid == SDP_MID_SRCAVAIL); - - skb = skb->next; - } while (skb != (struct sk_buff *)&sk->sk_receive_queue); - - if (copied >= target) - break; - - if (copied) { - if (sk->sk_err || - sk->sk_state == TCP_CLOSE || - (sk->sk_shutdown & RCV_SHUTDOWN) || - !timeo || - signal_pending(current) || - (flags & MSG_PEEK)) - break; - } else { - if (sock_flag(sk, SOCK_DONE)) - break; - - if (sk->sk_err) { - copied = sock_error(sk); - break; - } - - if (sk->sk_shutdown & RCV_SHUTDOWN) - break; - - if (sk->sk_state == TCP_CLOSE) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - copied = -ENOTCONN; - break; - } - break; - } - - if (!timeo) { - copied = -EAGAIN; - break; - } - - if (signal_pending(current)) { - copied = sock_intr_errno(timeo); - break; - } - } - - if (poll_recv_cq(sk)) { - sdp_dbg_data(sk, "sk_wait_data %ld\n", timeo); - if (remote_credits(ssk) <= SDP_MIN_TX_CREDITS) { - /* Remote host can not send, so there is no - * point of waiting for data. - * This situation is possible if current host - * can not send credits-update due to lack of - * memory. - */ - if (!copied) - copied = -ENOMEM; - break; - } - - posts_handler_put(ssk, 0); - sk_wait_data(sk, &timeo); - posts_handler_get(ssk); - - sdp_dbg_data(sk, "got data/timeout\n"); - } - sdp_do_posts(ssk); - continue; - - found_ok_skb: - sdp_dbg_data(sk, "bytes avail: %d\n", avail_bytes_count); - sdp_dbg_data(sk, "buf len %Zd offset %d\n", len, offset); - sdp_dbg_data(sk, "copied %d target %d\n", copied, target); - used = avail_bytes_count - offset; - if (len < used) - used = len; - - sdp_dbg_data(sk, "%s: used %ld\n", __func__, used); - - if (ssk->urg_data) { - u32 urg_offset = ssk->urg_seq - *seq; - if (urg_offset < used) { - if (!urg_offset) { - if (!sock_flag(sk, SOCK_URGINLINE)) { - ++*seq; - offset++; - used--; - if (!used) - goto skip_copy; - } - } else - used = urg_offset; - } - } - if (!(flags & MSG_TRUNC)) { - if (rx_sa && offset >= skb->len) { - /* No more payload - start rdma copy */ - sdp_dbg_data(sk, "RDMA copy of 0x%lx bytes\n", used); - err = sdp_rdma_to_iovec(sk, msg->msg_iov, msg->msg_iovlen, skb, - &used, offset); - if (unlikely(err)) { - /* ssk->rx_sa might had been freed when - * we slept. 
*/ - if (ssk->rx_sa) { - sdp_abort_rx_srcavail(sk); - sdp_post_sendsm(sk); - } - rx_sa = NULL; - if (err == -EAGAIN || err == -ETIME) - goto skb_cleanup; - sdp_warn(sk, "err from rdma %d - sendSM\n", err); - skb_unlink(skb, &sk->sk_receive_queue); - sdp_free_skb(skb); - } - } else { - sdp_dbg_data(sk, "memcpy 0x%lx bytes +0x%x -> %p\n", - used, offset, msg->msg_iov[0].iov_base); - - err = skb_copy_datagram_iovec(skb, offset, - /* TODO: skip header? */ - msg->msg_iov, used); - if (rx_sa && !(flags & MSG_PEEK)) { - rx_sa->copied += used; - rx_sa->reported += used; - } - } - if (err) { - sdp_dbg(sk, "%s: data copy failed" - "offset %d size %ld status %d\n", - __func__, offset, used, err); - /* Exception. Bailout! */ - if (!copied) - copied = err; - break; - } - } - - copied += used; - len -= used; - *seq += used; - offset = *seq - SDP_SKB_CB(skb)->seq; - sdp_dbg_data(sk, "done copied %d target %d\n", copied, target); - - sdp_do_posts(sdp_sk(sk)); - if (rx_sa && !ssk->rx_sa) { - /* SrcAvail canceled. Must not access local rx_sa */ - rx_sa = NULL; - } -skip_copy: - if (ssk->urg_data && after(ssk->copied_seq, ssk->urg_seq)) - ssk->urg_data = 0; - - - if (rx_sa && !(flags & MSG_PEEK)) { - rc = sdp_post_rdma_rd_compl(sk, rx_sa); - if (unlikely(rc)) { - sdp_abort_rx_srcavail(sk); - rx_sa = NULL; - err = rc; - goto out; - } - } - - if (!rx_sa && offset < skb->len) - continue; - - if (rx_sa && offset < rx_sa->len) - continue; - - offset = 0; - -skb_cleanup: - if (!(flags & MSG_PEEK)) { - struct sdp_bsdh *h; - h = (struct sdp_bsdh *)skb_transport_header(skb); - sdp_prf1(sk, skb, "READ finished. mseq: %d mseq_ack:%d", - ntohl(h->mseq), ntohl(h->mseq_ack)); - - if (rx_sa) { - /* ssk->rx_sa might had been freed when we slept. - */ - if (ssk->rx_sa) - sdp_abort_rx_srcavail(sk); - rx_sa = NULL; - } -force_skb_cleanup: - sdp_dbg_data(sk, "unlinking skb %p\n", skb); - skb_unlink(skb, &sk->sk_receive_queue); - sdp_free_skb(skb); - } - continue; -found_fin_ok: - ++*seq; - if (!(flags & MSG_PEEK)) { - skb_unlink(skb, &sk->sk_receive_queue); - sdp_free_skb(skb); - } - break; - - } while (len > 0); - -got_disconn_in_peek: - err = copied; -out: - - posts_handler_put(ssk, SDP_RX_ARMING_DELAY); - - sdp_auto_moderation(ssk); - - if (!err && !ssk->qp_active) { - err = -EPIPE; - sdp_set_error(sk, err); - sdp_dbg(sk, "data won't be available anymore\n"); - } - - release_sock(sk); - sdp_dbg_data(sk, "recvmsg finished. ret = %d\n", err); - return err; - -recv_urg: - err = sdp_recv_urg(sk, timeo, msg, len, flags, addr_len); - goto out; + /* + * Ignore some errors if we are hooked up. 
+ */ + if (ssk->state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) + return (ssk); + ssk->softerror = error; + return sdp_drop(ssk, error); } -static int sdp_listen(struct sock *sk, int backlog) +static void +sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { - struct sdp_sock *ssk = sdp_sk(sk); - int rc; + struct in_addr faddr; - sdp_dbg(sk, "%s\n", __func__); - sdp_add_to_history(sk, __func__); + faddr = ((struct sockaddr_in *)sa)->sin_addr; + if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) + return; - if (!ssk->id) { - rc = sdp_get_port(sk, 0); - if (rc) - return rc; - inet_sk(sk)->sport = htons(inet_sk(sk)->num); - } + sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify); +} - rc = rdma_listen(ssk->id, backlog); - if (rc) { - sdp_warn(sk, "rdma_listen failed: %d\n", rc); - sdp_set_error(sk, rc); - } else - sdp_exch_state(sk, TCPF_CLOSE, TCP_LISTEN); - return rc; +static int +sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, + struct thread *td) +{ + return (EOPNOTSUPP); } -/* We almost could use inet_listen, but that calls - inet_csk_listen_start. Longer term we'll want to add - a listen callback to struct proto, similiar to bind. */ -static int sdp_inet_listen(struct socket *sock, int backlog) +static void +sdp_keepalive_timeout(void *data) { - struct sock *sk = sock->sk; - unsigned char old_state; - int err; + struct sdp_sock *ssk; - lock_sock(sk); - sdp_sk(sk)->cpu = smp_processor_id(); - - err = -EINVAL; - if (sock->state != SS_UNCONNECTED) + ssk = data; + /* Callout canceled. */ + if (!callout_active(&ssk->keep2msl)) + return; + /* Callout rescheduled as a different kind of timer. */ + if (callout_pending(&ssk->keep2msl)) goto out; - - old_state = sk->sk_state; - if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) + callout_deactivate(&ssk->keep2msl); + if (ssk->flags & SDP_DROPPED || + (ssk->socket->so_options & SO_KEEPALIVE) == 0) goto out; - - /* Really, if the socket is already in listen state - * we can only allow the backlog to be adjusted. - */ - if (old_state != TCP_LISTEN) { - err = sdp_listen(sk, backlog); - if (err) - goto out; - } - sk->sk_max_ack_backlog = backlog; - err = 0; - + sdp_post_keepalive(ssk); + callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, + sdp_keepalive_timeout, ssk); out: - release_sock(sk); - return err; + SDP_WUNLOCK(ssk); } -static void sdp_unhash(struct sock *sk) -{ - sdp_dbg(sk, "%s\n", __func__); -} -static inline unsigned int sdp_listen_poll(const struct sock *sk) +void +sdp_start_keepalive_timer(struct socket *so) { - return !list_empty(&sdp_sk(sk)->accept_queue) ? - (POLLIN | POLLRDNORM) : 0; -} + struct sdp_sock *ssk; -static unsigned int sdp_poll(struct file *file, struct socket *socket, - struct poll_table_struct *wait) -{ - unsigned int mask; - struct sock *sk = socket->sk; - - sdp_dbg_data(sk, "%s\n", __func__); - - lock_sock(sk); - sdp_sk(sk)->cpu = smp_processor_id(); - - if (sk->sk_state == TCP_ESTABLISHED) { - sdp_prf(sk, NULL, "posting\n"); - sdp_do_posts(sdp_sk(sk)); - } - mask = datagram_poll(file, socket, wait); - if (!(mask & POLLIN)) - sdp_arm_rx_cq(sk); - - /* - * Adjust for memory in later kernels - */ - if (!sk_stream_memory_free(sk)) - mask &= ~(POLLOUT | POLLWRNORM | POLLWRBAND); - - /* TODO: Slightly ugly: it would be nicer if there was function - * like datagram_poll that didn't include poll_wait, - * then we could reverse the order. 
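/*
 * The callout dance in sdp_keepalive_timeout() above is the standard
 * FreeBSD idiom for a self-rearming timer racing callout_stop() and
 * callout_reset(). A minimal sketch with hypothetical example_* names;
 * the real handler additionally runs under (and drops) the pcb lock.
 */
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/callout.h>

struct example_softc {
        struct callout timer;
};

static void
example_timeout(void *arg)
{
        struct example_softc *sc = arg;

        if (!callout_active(&sc->timer))
                return;         /* callout_stop() canceled this shot */
        if (callout_pending(&sc->timer))
                return;         /* re-armed while queued; newest wins */
        callout_deactivate(&sc->timer); /* claim the firing */
        /* ... periodic work, e.g. post a keepalive ... */
        callout_reset(&sc->timer, hz, example_timeout, sc);
}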
*/ - if (sk->sk_state == TCP_LISTEN) { - mask = sdp_listen_poll(sk); - goto out; - } - - if (sdp_sk(sk)->urg_data & TCP_URG_VALID) - mask |= POLLPRI; -out: - release_sock(sk); - return mask; + ssk = sdp_sk(so); + if (!callout_pending(&ssk->keep2msl)) + callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, + sdp_keepalive_timeout, ssk); } -static void sdp_enter_memory_pressure(struct sock *sk) +static void +sdp_stop_keepalive_timer(struct socket *so) { - sdp_dbg(sk, "%s\n", __func__); -} + struct sdp_sock *ssk; -void sdp_urg(struct sdp_sock *ssk, struct sk_buff *skb) -{ - struct sock *sk = sk_ssk(ssk); - u8 tmp; - u32 ptr = skb->len - 1; - - ssk->urg_seq = SDP_SKB_CB(skb)->seq + ptr; - - if (skb_copy_bits(skb, ptr, &tmp, 1)) - BUG(); - ssk->urg_data = TCP_URG_VALID | tmp; - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + ssk = sdp_sk(so); + callout_stop(&ssk->keep2msl); } -static struct percpu_counter *sockets_allocated; -static atomic_t memory_allocated; -static struct percpu_counter *orphan_count; -static int memory_pressure; -struct proto sdp_proto = { - .close = sdp_close, - .connect = sdp_connect, - .disconnect = sdp_disconnect, - .accept = sdp_accept, - .ioctl = sdp_ioctl, - .init = sdp_init_sock, - .shutdown = sdp_shutdown, - .setsockopt = sdp_setsockopt, - .getsockopt = sdp_getsockopt, - .sendmsg = sdp_sendmsg, - .recvmsg = sdp_recvmsg, - .unhash = sdp_unhash, - .get_port = sdp_get_port, - /* Wish we had this: .listen = sdp_listen */ - .enter_memory_pressure = sdp_enter_memory_pressure, - .memory_allocated = &memory_allocated, - .memory_pressure = &memory_pressure, - .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, - .max_header = sizeof(struct sdp_bsdh), - .obj_size = sizeof(struct sdp_sock), - .owner = THIS_MODULE, - .name = "SDP", -}; +/* + * sdp_ctloutput() must drop the inpcb lock before performing copyin on + * socket option arguments. When it re-acquires the lock after the copy, it + * has to revalidate that the connection is still valid for the socket + * option. + */ +#define SDP_WLOCK_RECHECK(inp) do { \ + SDP_WLOCK(ssk); \ + if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { \ + SDP_WUNLOCK(ssk); \ + return (ECONNRESET); \ + } \ +} while(0) -static struct proto_ops sdp_proto_ops = { - .family = PF_INET, - .owner = THIS_MODULE, - .release = inet_release, - .bind = inet_bind, - .connect = inet_stream_connect, /* TODO: inet_datagram connect would - autobind, but need to fix get_port - with port 0 first. 
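/*
 * Why SDP_WLOCK_RECHECK above exists: sooptcopyin()/sooptcopyout() may
 * fault on user memory and sleep, so the pcb lock has to be dropped
 * around the copy and the connection revalidated once it is re-taken.
 * A condensed sketch of the set-side pattern (hypothetical
 * example_setopt(); assumes the sdp.h lock macros and flags used here):
 */
static int
example_setopt(struct sdp_sock *ssk, struct sockopt *sopt)
{
        int error, optval;

        SDP_WUNLOCK(ssk);               /* copyin may sleep */
        error = sooptcopyin(sopt, &optval, sizeof(optval),
            sizeof(optval));
        if (error)
                return (error);
        SDP_WLOCK(ssk);                 /* re-take, then revalidate */
        if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
                SDP_WUNLOCK(ssk);
                return (ECONNRESET);    /* lost a race with teardown */
        }
        /* ... apply optval while the lock is held ... */
        SDP_WUNLOCK(ssk);
        return (0);
}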
*/ - .socketpair = sock_no_socketpair, - .accept = inet_accept, - .getname = inet_getname, - .poll = sdp_poll, - .ioctl = inet_ioctl, - .listen = sdp_inet_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, -}; - -static int sdp_create_socket(struct net *net, struct socket *sock, int protocol) +static int +sdp_ctloutput(struct socket *so, struct sockopt *sopt) { - struct sock *sk; - int rc; + int error, opt, optval; + struct sdp_sock *ssk; - sdp_dbg(NULL, "type %d protocol %d\n", sock->type, protocol); - - if (net != &init_net) - return -EAFNOSUPPORT; - - if (sock->type != SOCK_STREAM) { - sdp_warn(NULL, "SDP: unsupported type %d.\n", sock->type); - return -ESOCKTNOSUPPORT; + error = 0; + ssk = sdp_sk(so); + if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) { + SDP_WLOCK(ssk); + if (so->so_options & SO_KEEPALIVE) + sdp_start_keepalive_timer(so); + else + sdp_stop_keepalive_timer(so); + SDP_WUNLOCK(ssk); } + if (sopt->sopt_level != IPPROTO_TCP) + return (error); - /* IPPROTO_IP is a wildcard match */ - if (protocol != IPPROTO_TCP && protocol != IPPROTO_IP) { - sdp_warn(NULL, "SDP: unsupported protocol %d.\n", protocol); - return -EPROTONOSUPPORT; + SDP_WLOCK(ssk); + if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { + SDP_WUNLOCK(ssk); + return (ECONNRESET); } - sk = sk_alloc(net, PF_INET_SDP, GFP_KERNEL, &sdp_proto); - if (!sk) { - sdp_warn(NULL, "SDP: failed to allocate socket.\n"); - return -ENOMEM; - } - sock_init_data(sock, sk); - sk->sk_protocol = 0x0 /* TODO: inherit tcp socket to use IPPROTO_TCP */; - percpu_counter_inc(sk->sk_prot->sockets_allocated); + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { + case TCP_NODELAY: + SDP_WUNLOCK(ssk); + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); - memset((struct inet_sock *)sk + 1, 0, - sizeof(struct sdp_sock) - sizeof(struct inet_sock)); - rc = sdp_init_sock(sk); - if (rc) { - sdp_warn(sk, "SDP: failed to init sock.\n"); - sdp_common_release(sk); - return -ENOMEM; - } + SDP_WLOCK_RECHECK(ssk); + opt = SDP_NODELAY; + if (optval) + ssk->flags |= opt; + else + ssk->flags &= ~opt; + sdp_do_posts(ssk); + SDP_WUNLOCK(ssk); + break; - sdp_add_to_history(sk, __func__); - sk->sk_destruct = sdp_destruct; - sock->ops = &sdp_proto_ops; - sock->state = SS_UNCONNECTED; + default: + SDP_WUNLOCK(ssk); + error = ENOPROTOOPT; + break; + } + break; - sdp_add_sock(sdp_sk(sk)); + case SOPT_GET: + switch (sopt->sopt_name) { + case TCP_NODELAY: + optval = ssk->flags & SDP_NODELAY; + SDP_WUNLOCK(ssk); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + default: + SDP_WUNLOCK(ssk); + error = ENOPROTOOPT; + break; + } + break; + } + return (error); +} +#undef SDP_WLOCK_RECHECK - return 0; +int sdp_mod_count = 0; +int sdp_mod_usec = 0; + +void +sdp_set_default_moderation(struct sdp_sock *ssk) +{ + if (sdp_mod_count <= 0 || sdp_mod_usec <= 0) + return; + ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec); } -static void sdp_add_device(struct ib_device *device) + +static void +sdp_dev_add(struct ib_device *device) { + struct ib_fmr_pool_param param; struct sdp_device *sdp_dev; - struct ib_fmr_pool_param fmr_param; - sdp_dev = kmalloc(sizeof *sdp_dev, GFP_KERNEL); - if (!sdp_dev) - return; - + sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO); sdp_dev->pd = 
ib_alloc_pd(device); - if (IS_ERR(sdp_dev->pd)) { - printk(KERN_WARNING "Unable to allocate PD: %ld.\n", - PTR_ERR(sdp_dev->pd)); - goto err_pd_alloc; - } - + if (IS_ERR(sdp_dev->pd)) + goto out_pd; sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(sdp_dev->mr)) { - printk(KERN_WARNING "Unable to get dma MR: %ld.\n", - PTR_ERR(sdp_dev->mr)); - goto err_mr; - } - - memset(&fmr_param, 0, sizeof fmr_param); - fmr_param.pool_size = sdp_fmr_pool_size; - fmr_param.dirty_watermark = sdp_fmr_dirty_wm; - fmr_param.cache = 1; - fmr_param.max_pages_per_fmr = SDP_FMR_SIZE; - fmr_param.page_shift = PAGE_SHIFT; - fmr_param.access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ); - - sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &fmr_param); - if (IS_ERR(sdp_dev->fmr_pool)) { - printk(KERN_WARNING "Error creating fmr pool\n"); - sdp_dev->fmr_pool = NULL; - } - + if (IS_ERR(sdp_dev->mr)) + goto out_mr; + memset(&param, 0, sizeof param); + param.max_pages_per_fmr = SDP_FMR_SIZE; + param.page_shift = PAGE_SHIFT; + param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ); + param.pool_size = SDP_FMR_POOL_SIZE; + param.dirty_watermark = SDP_FMR_DIRTY_SIZE; + param.cache = 1; + sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param); + if (IS_ERR(sdp_dev->fmr_pool)) + goto out_fmr; ib_set_client_data(device, &sdp_client, sdp_dev); - return; -err_mr: +out_fmr: + ib_dereg_mr(sdp_dev->mr); +out_mr: ib_dealloc_pd(sdp_dev->pd); -err_pd_alloc: - kfree(sdp_dev); +out_pd: + free(sdp_dev, M_SDP); } -static void sdp_remove_device(struct ib_device *device) +static void +sdp_dev_rem(struct ib_device *device) { - struct sdp_sock *ssk; - struct sock *sk; - struct rdma_cm_id *id; struct sdp_device *sdp_dev; + struct sdp_sock *ssk; - sdp_dev = ib_get_client_data(device, &sdp_client); - ib_set_client_data(device, &sdp_client, NULL); - - /* destroy_ids: */ -do_next: - down_write(&device_removal_lock); - - spin_lock_irq(&sock_list_lock); - list_for_each_entry(ssk, &sock_list, sock_list) { - if (ssk->ib_device == device && !ssk->id_destroyed_already) { - spin_unlock_irq(&sock_list_lock); - sk = sk_ssk(ssk); - sdp_add_to_history(sk, __func__); - lock_sock(sk); - /* ssk->id must be lock-protected, - * to enable mutex with sdp_close() */ - id = ssk->id; - ssk->id = NULL; - ssk->id_destroyed_already = 1; - - release_sock(sk); - up_write(&device_removal_lock); - - if (id) - rdma_destroy_id(id); - schedule(); - goto do_next; - } + SDP_LIST_WLOCK(); + LIST_FOREACH(ssk, &sdp_list, list) { + if (ssk->ib_device != device) + continue; + SDP_WLOCK(ssk); + if ((ssk->flags & SDP_DESTROY) == 0) + ssk = sdp_notify(ssk, ECONNRESET); + if (ssk) + SDP_WUNLOCK(ssk); } - - /* destroy qps: */ -kill_socks: - list_for_each_entry(ssk, &sock_list, sock_list) { - if (ssk->ib_device == device) { - spin_unlock_irq(&sock_list_lock); - sk = sk_ssk(ssk); - lock_sock(sk); - - sdp_abort_srcavail(sk); - sdp_abort_rdma_read(sk); - sdp_destroy_qp(ssk); - sdp_set_error(sk, -ENODEV); - ssk->ib_device = NULL; - ssk->sdp_dev = NULL; - - release_sock(sk); - flush_workqueue(rx_comp_wq); - schedule(); - spin_lock_irq(&sock_list_lock); - - goto kill_socks; - } - } - - spin_unlock_irq(&sock_list_lock); - - up_write(&device_removal_lock); - + SDP_LIST_WUNLOCK(); + /* + * XXX Do I need to wait between these two?
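/*
 * Worked numbers behind the pool parameters set above, assuming 4 KB
 * pages: SDP_FMR_SIZE = MIN(0x1000, PAGE_SIZE) / sizeof(u64) = 512
 * page-list entries per FMR, so SDP_MAX_RDMA_READ_LEN = PAGE_SIZE *
 * (512 - 2) = 0x1fe000 (just under 2 MB) per RDMA read, and the pool
 * tolerates SDP_FMR_POOL_SIZE / 4 = 256 dirty FMRs before a flush. A
 * standalone check of that arithmetic:
 */
#include <stdio.h>

int
main(void)
{
        unsigned long page_size = 4096;         /* assumed PAGE_SIZE */
        unsigned long fmr_size = (0x1000UL < page_size ?
            0x1000UL : page_size) / sizeof(unsigned long long);
        unsigned long max_rdma = page_size * (fmr_size - 2);

        /* prints: entries/FMR: 512, max RDMA read: 0x1fe000 */
        printf("entries/FMR: %lu, max RDMA read: 0x%lx\n",
            fmr_size, max_rdma);
        return (0);
}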
+ */ + sdp_dev = ib_get_client_data(device, &sdp_client); if (!sdp_dev) return; - - if (sdp_dev->fmr_pool) { - ib_flush_fmr_pool(sdp_dev->fmr_pool); - ib_destroy_fmr_pool(sdp_dev->fmr_pool); - } - + ib_flush_fmr_pool(sdp_dev->fmr_pool); + ib_destroy_fmr_pool(sdp_dev->fmr_pool); ib_dereg_mr(sdp_dev->mr); - ib_dealloc_pd(sdp_dev->pd); - - kfree(sdp_dev); + free(sdp_dev, M_SDP); } -static struct net_proto_family sdp_net_proto = { - .family = AF_INET_SDP, - .create = sdp_create_socket, - .owner = THIS_MODULE, -}; +struct ib_client sdp_client = + { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem }; -struct ib_client sdp_client = { - .name = "sdp", - .add = sdp_add_device, - .remove = sdp_remove_device -}; -static int __init sdp_init(void) +static int +sdp_pcblist(SYSCTL_HANDLER_ARGS) { - int rc = -ENOMEM; + int error, n, i; + struct sdp_sock *ssk; + struct xinpgen xig; - INIT_LIST_HEAD(&sock_list); - spin_lock_init(&sock_list_lock); - spin_lock_init(&sdp_large_sockets_lock); + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == NULL) { + n = sdp_count; + n += imax(n / 8, 10); + req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); + return (0); + } - sockets_allocated = kzalloc(sizeof(*sockets_allocated), GFP_KERNEL); - if (!sockets_allocated) - goto no_mem_sockets_allocated; + if (req->newptr != NULL) + return (EPERM); - orphan_count = kzalloc(sizeof(*orphan_count), GFP_KERNEL); - if (!orphan_count) - goto no_mem_orphan_count; + /* + * OK, now we're committed to doing something. + */ + SDP_LIST_RLOCK(); + n = sdp_count; + SDP_LIST_RUNLOCK(); - percpu_counter_init(sockets_allocated, 0); - percpu_counter_init(orphan_count, 0); + error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + + n * sizeof(struct xtcpcb)); + if (error != 0) + return (error); - sdp_proto.sockets_allocated = sockets_allocated; - sdp_proto.orphan_count = orphan_count; + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = 0; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return (error); - rx_comp_wq = create_workqueue("rx_comp_wq"); - if (!rx_comp_wq) - goto no_mem_rx_wq; + SDP_LIST_RLOCK(); + for (ssk = LIST_FIRST(&sdp_list), i = 0; + ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) { + struct xtcpcb xt; - sdp_wq = create_singlethread_workqueue("sdp_wq"); - if (!sdp_wq) - goto no_mem_sdp_wq; + SDP_RLOCK(ssk); + if (ssk->flags & SDP_TIMEWAIT) { + if (ssk->cred != NULL) + error = cr_cansee(req->td->td_ucred, + ssk->cred); + else + error = EINVAL; /* Skip this inp. 
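/*
 * This sdp_pcblist() handler exports the same xinpgen/xtcpcb record
 * stream the TCP pcblist sysctl does, so netstat-style tooling can
 * walk it unchanged. A hedged userland sketch of a consumer (record
 * layout as in this era's <netinet/tcp_var.h>; error handling
 * trimmed):
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        struct xinpgen *xig;
        size_t len = 0;
        char *buf;

        if (sysctlbyname("net.inet.sdp.pcblist", NULL, &len, NULL, 0) < 0)
                return (1);
        if ((buf = malloc(len)) == NULL ||
            sysctlbyname("net.inet.sdp.pcblist", buf, &len, NULL, 0) < 0)
                return (1);
        /* First and last records are xinpgen markers; each record in
         * between is an xtcpcb whose first field is its length. */
        xig = (struct xinpgen *)buf;
        for (xig = (struct xinpgen *)((char *)xig + xig->xig_len);
            xig->xig_len > sizeof(*xig);
            xig = (struct xinpgen *)((char *)xig + xig->xig_len)) {
                struct xtcpcb *xt = (struct xtcpcb *)xig;

                printf("sdp pcb, tcp-like state %d\n", xt->xt_tp.t_state);
        }
        free(buf);
        return (0);
}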
*/ + } else if (ssk->socket) + error = cr_canseesocket(req->td->td_ucred, + ssk->socket); + else + error = EINVAL; + if (error) { + error = 0; + goto next; + } - rc = proto_register(&sdp_proto, 1); - if (rc) { - printk(KERN_WARNING "proto_register failed: %d\n", rc); - goto error_proto_reg; + bzero(&xt, sizeof(xt)); + xt.xt_len = sizeof xt; + xt.xt_inp.inp_gencnt = 0; + xt.xt_inp.inp_vflag = INP_IPV4; + memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr)); + xt.xt_inp.inp_lport = ssk->lport; + memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr)); + xt.xt_inp.inp_fport = ssk->fport; + xt.xt_tp.t_state = ssk->state; + if (ssk->socket != NULL) + sotoxsocket(ssk->socket, &xt.xt_socket); + else + bzero(&xt.xt_socket, sizeof xt.xt_socket); + xt.xt_socket.xso_protocol = IPPROTO_TCP; + SDP_RUNLOCK(ssk); + error = SYSCTL_OUT(req, &xt, sizeof xt); + if (error) + break; + i++; + continue; +next: + SDP_RUNLOCK(ssk); } - - rc = sock_register(&sdp_net_proto); - if (rc) { - printk(KERN_WARNING "sock_register failed: %d\n", rc); - goto error_sock_reg; + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + xig.xig_gen = 0; + xig.xig_sogen = so_gencnt; + xig.xig_count = sdp_count; + error = SYSCTL_OUT(req, &xig, sizeof xig); } + SDP_LIST_RUNLOCK(); + return (error); +} - sdp_proc_init(); +SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP"); - atomic_set(&sdp_current_mem_usage, 0); +SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + sdp_pcblist, "S,xtcpcb", "List of active SDP connections"); - ib_register_client(&sdp_client); +static void +sdp_zone_change(void *tag) +{ - return 0; - -error_sock_reg: - proto_unregister(&sdp_proto); -error_proto_reg: - destroy_workqueue(sdp_wq); -no_mem_sdp_wq: - destroy_workqueue(rx_comp_wq); -no_mem_rx_wq: - kfree(orphan_count); -no_mem_orphan_count: - kfree(sockets_allocated); -no_mem_sockets_allocated: - return rc; + uma_zone_set_max(sdp_zone, maxsockets); } -static void __exit sdp_exit(void) +static void +sdp_init(void) { - sock_unregister(PF_INET_SDP); - proto_unregister(&sdp_proto); - if (percpu_counter_sum(orphan_count)) - printk(KERN_WARNING "%s: orphan_count %lld\n", __func__, - percpu_counter_sum(orphan_count)); + LIST_INIT(&sdp_list); + sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(sdp_zone, maxsockets); + EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL, + EVENTHANDLER_PRI_ANY); + rx_comp_wq = create_singlethread_workqueue("rx_comp_wq"); + ib_register_client(&sdp_client); +} - destroy_workqueue(rx_comp_wq); - destroy_workqueue(sdp_wq); +extern struct domain sdpdomain; - BUG_ON(!list_empty(&sock_list)); +struct pr_usrreqs sdp_usrreqs = { + .pru_abort = sdp_abort, + .pru_accept = sdp_accept, + .pru_attach = sdp_attach, + .pru_bind = sdp_bind, + .pru_connect = sdp_connect, + .pru_control = sdp_control, + .pru_detach = sdp_detach, + .pru_disconnect = sdp_disconnect, + .pru_listen = sdp_listen, + .pru_peeraddr = sdp_getpeeraddr, + .pru_rcvoob = sdp_rcvoob, + .pru_send = sdp_send, + .pru_sosend = sdp_sosend, + .pru_soreceive = sdp_sorecv, + .pru_shutdown = sdp_shutdown, + .pru_sockaddr = sdp_getsockaddr, + .pru_close = sdp_close, +}; - if (atomic_read(&sdp_current_mem_usage)) - printk(KERN_WARNING "%s: current mem usage %d\n", 
__func__, - atomic_read(&sdp_current_mem_usage)); +struct protosw sdpsw[] = { +{ + .pr_type = SOCK_STREAM, + .pr_domain = &sdpdomain, + .pr_protocol = IPPROTO_IP, + .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, + .pr_ctlinput = sdp_ctlinput, + .pr_ctloutput = sdp_ctloutput, + .pr_usrreqs = &sdp_usrreqs +}, +{ + .pr_type = SOCK_STREAM, + .pr_domain = &sdpdomain, + .pr_protocol = IPPROTO_TCP, + .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, + .pr_ctlinput = sdp_ctlinput, + .pr_ctloutput = sdp_ctloutput, + .pr_usrreqs = &sdp_usrreqs +}, +}; - if (percpu_counter_sum(sockets_allocated)) - printk(KERN_WARNING "%s: sockets_allocated %lld\n", __func__, - percpu_counter_sum(sockets_allocated)); +struct domain sdpdomain = { + .dom_family = AF_INET_SDP, + .dom_name = "SDP", + .dom_init = sdp_init, + .dom_protosw = sdpsw, + .dom_protoswNPROTOSW = &sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])], +}; - sdp_proc_unregister(); +DOMAIN_SET(sdp); - ib_unregister_client(&sdp_client); - - percpu_counter_destroy(sockets_allocated); - percpu_counter_destroy(orphan_count); - - kfree(orphan_count); - kfree(sockets_allocated); -} - -module_init(sdp_init); -module_exit(sdp_exit); +int sdp_debug_level = 1; +int sdp_data_debug_level = 0; Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp_zcopy.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/sdp/sdp_zcopy.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/sdp/sdp_zcopy.c (.../head) (revision 219811) @@ -39,33 +39,23 @@ #include #include #include -#include +#include #include /* for memcpy_toiovec */ #include #include #include #include "sdp.h" -static int sdp_post_srcavail(struct sock *sk, struct tx_srcavail_state *tx_sa) +static int sdp_post_srcavail(struct socket *sk, struct tx_srcavail_state *tx_sa) { struct sdp_sock *ssk = sdp_sk(sk); - struct sk_buff *skb; + struct mbuf *mb; int payload_len; struct page *payload_pg; int off, len; struct ib_umem_chunk *chunk; - if (ssk->tx_sa) { - /* ssk->tx_sa might already be there in a case of - * multithreading: user thread initiated Zcopy and went to - * sleep, and now another user thread tries to ZCopy. - * Fallback to BCopy - data might be mixed. - * TODO: Fix it. fallback to BCopy is not enough because recv - * side has seq warnings. - */ - sdp_dbg_data(sk, "user already initiated ZCopy transmission\n"); - return -EAGAIN; - } + WARN_ON(ssk->tx_sa); BUG_ON(!tx_sa); BUG_ON(!tx_sa->fmr || !tx_sa->fmr->fmr->lkey); @@ -80,14 +70,14 @@ tx_sa->bytes_sent = tx_sa->bytes_acked = 0; - skb = sdp_alloc_skb_srcavail(sk, len, tx_sa->fmr->fmr->lkey, off, 0); - if (!skb) { + mb = sdp_alloc_mb_srcavail(sk, len, tx_sa->fmr->fmr->lkey, off, 0); + if (!mb) { return -ENOMEM; } sdp_dbg_data(sk, "sending SrcAvail\n"); - - TX_SRCAVAIL_STATE(skb) = tx_sa; /* tx_sa is hanged on the skb - * but continue to live after skb is freed */ + + TX_SRCAVAIL_STATE(mb) = tx_sa; /* tx_sa is hanged on the mb + * but continue to live after mb is freed */ ssk->tx_sa = tx_sa; /* must have payload inlined in SrcAvail packet in combined mode */ @@ -99,56 +89,68 @@ sdp_dbg_data(sk, "payload: off: 0x%x, pg: %p, len: 0x%x\n", off, payload_pg, payload_len); - skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, + mb_fill_page_desc(mb, mb_shinfo(mb)->nr_frags, payload_pg, off, payload_len); - /* Need to increase mem_usage counter even thought this page was not - * allocated. - * The reason is that when freeing this skb, we are decreasing the same - * counter according to nr_frags. 
we don't want to check h->mid since - * h->mid is not always a valid value. - */ - atomic_add(skb_shinfo(skb)->nr_frags, &sdp_current_mem_usage); - skb->len += payload_len; - skb->data_len = payload_len; - skb->truesize += payload_len; + mb->len += payload_len; + mb->data_len = payload_len; + mb->truesize += payload_len; +// sk->sk_wmem_queued += payload_len; +// sk->sk_forward_alloc -= payload_len; - sdp_skb_entail(sk, skb); - + mb_entail(sk, ssk, mb); + ssk->write_seq += payload_len; - SDP_SKB_CB(skb)->end_seq += payload_len; + SDP_SKB_CB(mb)->end_seq += payload_len; tx_sa->bytes_sent = tx_sa->umem->length; tx_sa->bytes_acked = payload_len; - /* TODO: pushing the skb into the tx_queue should be enough */ + /* TODO: pushing the mb into the tx_queue should be enough */ return 0; } -static int sdp_post_srcavail_cancel(struct sock *sk) +static int sdp_post_srcavail_cancel(struct socket *sk) { struct sdp_sock *ssk = sdp_sk(sk); - struct sk_buff *skb; + struct mbuf *mb; - sdp_dbg_data(sk_ssk(ssk), "Posting srcavail cancel\n"); + sdp_dbg_data(ssk->socket, "Posting srcavail cancel\n"); - skb = sdp_alloc_skb_srcavail_cancel(sk, 0); - if (unlikely(!skb)) - return -ENOMEM; + mb = sdp_alloc_mb_srcavail_cancel(sk, 0); + mb_entail(sk, ssk, mb); - sdp_skb_entail(sk, skb); - sdp_post_sends(ssk, 0); + schedule_delayed_work(&ssk->srcavail_cancel_work, + SDP_SRCAVAIL_CANCEL_TIMEOUT); + return 0; } +void srcavail_cancel_timeout(struct work_struct *work) +{ + struct sdp_sock *ssk = + container_of(work, struct sdp_sock, srcavail_cancel_work.work); + struct socket *sk = ssk->socket; + + lock_sock(sk); + + sdp_dbg_data(sk, "both SrcAvail and SrcAvailCancel timed out." + " closing connection\n"); + sdp_set_error(sk, -ECONNRESET); + wake_up(&ssk->wq); + + release_sock(sk); +} + static int sdp_wait_rdmardcompl(struct sdp_sock *ssk, long *timeo_p, int ignore_signals) { - struct sock *sk = sk_ssk(ssk); + struct socket *sk = ssk->socket; int err = 0; + long vm_wait = 0; long current_timeo = *timeo_p; struct tx_srcavail_state *tx_sa = ssk->tx_sa; DEFINE_WAIT(wait); @@ -197,19 +199,28 @@ } } - posts_handler_put(ssk, 0); + posts_handler_put(ssk); sk_wait_event(sk, &current_timeo, tx_sa->abort_flags && ssk->rx_sa && - (tx_sa->bytes_acked < tx_sa->bytes_sent)); - sdp_prf(sk_ssk(ssk), NULL, "woke up sleepers"); + (tx_sa->bytes_acked < tx_sa->bytes_sent) && + vm_wait); + sdp_dbg_data(ssk->socket, "woke up sleepers\n"); posts_handler_get(ssk); if (tx_sa->bytes_acked == tx_sa->bytes_sent) break; + if (vm_wait) { + vm_wait -= current_timeo; + current_timeo = *timeo_p; + if (current_timeo != MAX_SCHEDULE_TIMEOUT && + (current_timeo -= vm_wait) < 0) + current_timeo = 0; + vm_wait = 0; + } *timeo_p = current_timeo; } @@ -225,11 +236,10 @@ return err; } -static int sdp_wait_rdma_wr_finished(struct sdp_sock *ssk) +static void sdp_wait_rdma_wr_finished(struct sdp_sock *ssk) { - struct sock *sk = sk_ssk(ssk); - long timeo = SDP_RDMA_READ_TIMEOUT; - int rc = 0; + struct socket *sk = ssk->socket; + long timeo = HZ * 5; /* Timeout for RDMA read */ DEFINE_WAIT(wait); sdp_dbg_data(sk, "Sleep till RDMA wr finished.\n"); @@ -238,32 +248,27 @@ if (!ssk->tx_ring.rdma_inflight->busy) { sdp_dbg_data(sk, "got rdma cqe\n"); - if (sk->sk_err == ECONNRESET) - rc = -EPIPE; break; } if (!ssk->qp_active) { sdp_dbg_data(sk, "QP destroyed\n"); - rc = -EPIPE; break; } if (!timeo) { - sdp_warn(sk, "Fatal: no RDMA read completion\n"); - rc = -EIO; - sdp_set_error(sk, rc); + sdp_warn(sk, "Panic: Timed out waiting for RDMA read\n"); + WARN_ON(1); break; } -
posts_handler_put(ssk, 0); + posts_handler_put(ssk); sdp_prf1(sk, NULL, "Going to sleep"); - sk_wait_event(sk, &timeo, - !ssk->tx_ring.rdma_inflight->busy || - !ssk->qp_active); + sk_wait_event(sk, &timeo, + !ssk->tx_ring.rdma_inflight->busy); sdp_prf1(sk, NULL, "Woke up"); - sdp_dbg_data(sk_ssk(ssk), "woke up sleepers\n"); + sdp_dbg_data(ssk->socket, "woke up sleepers\n"); posts_handler_get(ssk); } @@ -271,45 +276,37 @@ finish_wait(sk->sk_sleep, &wait); sdp_dbg_data(sk, "Finished waiting\n"); - return rc; } -int sdp_post_rdma_rd_compl(struct sock *sk, struct rx_srcavail_state *rx_sa) +int sdp_post_rdma_rd_compl(struct sdp_sock *ssk, + struct rx_srcavail_state *rx_sa) { - struct sk_buff *skb; - int unreported = rx_sa->copied - rx_sa->reported; + struct mbuf *mb; + int copied = rx_sa->used - rx_sa->reported; - if (rx_sa->copied <= rx_sa->reported) + if (rx_sa->used <= rx_sa->reported) return 0; - skb = sdp_alloc_skb_rdmardcompl(sk, unreported, 0); - if (unlikely(!skb)) - return -ENOMEM; + mb = sdp_alloc_mb_rdmardcompl(ssk->socket, copied, 0); - sdp_skb_entail(sk, skb); + rx_sa->reported += copied; - rx_sa->reported += unreported; + /* TODO: What if no tx_credits available? */ + sdp_post_send(ssk, mb); - sdp_post_sends(sdp_sk(sk), 0); - return 0; } -int sdp_post_sendsm(struct sock *sk) +int sdp_post_sendsm(struct socket *sk) { - struct sk_buff *skb = sdp_alloc_skb_sendsm(sk, 0); + struct mbuf *mb = sdp_alloc_mb_sendsm(sk, 0); - if (unlikely(!skb)) - return -ENOMEM; + sdp_post_send(sdp_sk(sk), mb); - sdp_skb_entail(sk, skb); - - sdp_post_sends(sdp_sk(sk), 0); - return 0; } -static int sdp_update_iov_used(struct sock *sk, struct iovec *iov, int len) +static int sdp_update_iov_used(struct socket *sk, struct iovec *iov, int len) { sdp_dbg_data(sk, "updating consumed 0x%x bytes from iov\n", len); while (len > 0) { @@ -339,7 +336,7 @@ } void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack) { - struct sock *sk = sk_ssk(ssk); + struct socket *sk = ssk->socket; unsigned long flags; spin_lock_irqsave(&ssk->tx_sa_lock, flags); @@ -349,7 +346,7 @@ goto out; } - if (after(ssk->tx_sa->mseq, mseq_ack)) { + if (ssk->tx_sa->mseq > mseq_ack) { sdp_dbg_data(sk, "SendSM arrived for old SrcAvail. " "SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n", mseq_ack, ssk->tx_sa->mseq); @@ -359,6 +356,8 @@ sdp_dbg_data(sk, "Got SendSM - aborting SrcAvail\n"); ssk->tx_sa->abort_flags |= TX_SA_SENDSM; + cancel_delayed_work(&ssk->srcavail_cancel_work); + wake_up(sk->sk_sleep); sdp_dbg_data(sk, "woke up sleepers\n"); @@ -369,7 +368,7 @@ void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack, u32 bytes_completed) { - struct sock *sk = sk_ssk(ssk); + struct socket *sk = ssk->socket; unsigned long flags; sdp_prf1(sk, NULL, "RdmaRdCompl ssk=%p tx_sa=%p", ssk, ssk->tx_sa); @@ -377,12 +376,14 @@ spin_lock_irqsave(&ssk->tx_sa_lock, flags); + BUG_ON(!ssk); + if (!ssk->tx_sa) { sdp_dbg_data(sk, "Got RdmaRdCompl for aborted SrcAvail\n"); goto out; } - if (after(ssk->tx_sa->mseq, mseq_ack)) { + if (ssk->tx_sa->mseq > mseq_ack) { sdp_dbg_data(sk, "RdmaRdCompl arrived for old SrcAvail. " "SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n", mseq_ack, ssk->tx_sa->mseq); @@ -396,6 +397,7 @@ out: spin_unlock_irqrestore(&ssk->tx_sa_lock, flags); + return; } static unsigned long sdp_get_max_memlockable_bytes(unsigned long offset) @@ -409,85 +411,77 @@ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; avail = lock_limit - (current->mm->locked_vm << PAGE_SHIFT); - return avail < offset ? 
0 : avail - offset; + return avail - offset; } -static int sdp_alloc_fmr(struct sock *sk, void *uaddr, size_t len, - struct ib_pool_fmr **_fmr, struct ib_umem **_umem, int access, int min_len) +static int sdp_alloc_fmr(struct socket *sk, void *uaddr, size_t len, + struct ib_pool_fmr **_fmr, struct ib_umem **_umem) { struct ib_pool_fmr *fmr; struct ib_umem *umem; - struct ib_device *dev = sdp_sk(sk)->ib_device; + struct ib_device *dev; u64 *pages; struct ib_umem_chunk *chunk; - int n = 0, j, k; + int n, j, k; int rc = 0; unsigned long max_lockable_bytes; if (unlikely(len > SDP_MAX_RDMA_READ_LEN)) { - sdp_dbg_data(sk, "len:0x%zx > FMR_SIZE: 0x%lx\n", + sdp_dbg_data(sk, "len:0x%lx > FMR_SIZE: 0x%lx\n", len, SDP_MAX_RDMA_READ_LEN); len = SDP_MAX_RDMA_READ_LEN; } max_lockable_bytes = sdp_get_max_memlockable_bytes((unsigned long)uaddr & ~PAGE_MASK); if (unlikely(len > max_lockable_bytes)) { - sdp_dbg_data(sk, "len:0x%zx > RLIMIT_MEMLOCK available: 0x%lx\n", + sdp_dbg_data(sk, "len:0x%lx > RLIMIT_MEMLOCK available: 0x%lx\n", len, max_lockable_bytes); len = max_lockable_bytes; } - if (unlikely(len <= min_len)) - return -EAGAIN; - - sdp_dbg_data(sk, "user buf: %p, len:0x%zx max_lockable_bytes: 0x%lx\n", + sdp_dbg_data(sk, "user buf: %p, len:0x%lx max_lockable_bytes: 0x%lx\n", uaddr, len, max_lockable_bytes); umem = ib_umem_get(&sdp_sk(sk)->context, (unsigned long)uaddr, len, - access, 0); + IB_ACCESS_REMOTE_WRITE, 0); if (IS_ERR(umem)) { - rc = -EAGAIN; - sdp_dbg_data(sk, "Error doing umem_get 0x%zx bytes: %ld\n", len, PTR_ERR(umem)); - sdp_dbg_data(sk, "RLIMIT_MEMLOCK: 0x%lx[cur] 0x%lx[max] CAP_IPC_LOCK: %d\n", + rc = PTR_ERR(umem); + sdp_warn(sk, "Error doing umem_get 0x%lx bytes: %d\n", len, rc); + sdp_warn(sk, "RLIMIT_MEMLOCK: 0x%lx[cur] 0x%lx[max] CAP_IPC_LOCK: %d\n", current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur, current->signal->rlim[RLIMIT_MEMLOCK].rlim_max, capable(CAP_IPC_LOCK)); goto err_umem_get; } - sdp_dbg_data(sk, "umem->offset = 0x%x, length = 0x%zx\n", + sdp_dbg_data(sk, "umem->offset = 0x%x, length = 0x%lx\n", umem->offset, umem->length); pages = (u64 *) __get_free_page(GFP_KERNEL); - if (!pages) { - rc = -ENOMEM; + if (!pages) goto err_pages_alloc; - } + n = 0; + + dev = sdp_sk(sk)->ib_device; list_for_each_entry(chunk, &umem->chunk_list, list) { for (j = 0; j < chunk->nmap; ++j) { - unsigned len2; - len2 = ib_sg_dma_len(dev, + len = ib_sg_dma_len(dev, &chunk->page_list[j]) >> PAGE_SHIFT; - - SDP_WARN_ON(len2 > len); - len -= len2; - for (k = 0; k < len2; ++k) { + for (k = 0; k < len; ++k) { pages[n++] = ib_sg_dma_address(dev, &chunk->page_list[j]) + umem->page_size * k; - BUG_ON(n >= SDP_FMR_SIZE); + } } } fmr = ib_fmr_pool_map_phys(sdp_sk(sk)->sdp_dev->fmr_pool, pages, n, 0); if (IS_ERR(fmr)) { - sdp_dbg_data(sk, "Error allocating fmr: %ld\n", PTR_ERR(fmr)); - SDPSTATS_COUNTER_INC(fmr_alloc_error); - rc = PTR_ERR(fmr); + sdp_warn(sk, "Error allocating fmr: %ld\n", PTR_ERR(fmr)); goto err_fmr_alloc; } @@ -498,7 +492,7 @@ return 0; -err_fmr_alloc: +err_fmr_alloc: free_page((unsigned long) pages); err_pages_alloc: @@ -509,28 +503,24 @@ return rc; } -static inline void sdp_free_fmr(struct sock *sk, struct ib_pool_fmr **_fmr, - struct ib_umem **_umem) +void sdp_free_fmr(struct socket *sk, struct ib_pool_fmr **_fmr, struct ib_umem **_umem) { - if (*_fmr) { - ib_fmr_pool_unmap(*_fmr); - *_fmr = NULL; - } + if (!sdp_sk(sk)->qp_active) + return; - if (*_umem) { - ib_umem_release(*_umem); - *_umem = NULL; - } + ib_fmr_pool_unmap(*_fmr); + *_fmr = NULL; + + ib_umem_release(*_umem); + 
*_umem = NULL; } -static int sdp_post_rdma_read(struct sock *sk, struct rx_srcavail_state *rx_sa, - u32 offset) +static int sdp_post_rdma_read(struct socket *sk, struct rx_srcavail_state *rx_sa) { struct sdp_sock *ssk = sdp_sk(sk); struct ib_send_wr *bad_wr; struct ib_send_wr wr = { NULL }; struct ib_sge sge; - int rc; wr.opcode = IB_WR_RDMA_READ; wr.next = NULL; @@ -544,93 +534,90 @@ sge.length = rx_sa->umem->length; sge.lkey = rx_sa->fmr->fmr->lkey; - wr.wr.rdma.remote_addr = rx_sa->vaddr + offset; + wr.wr.rdma.remote_addr = rx_sa->vaddr + rx_sa->used; wr.num_sge = 1; wr.sg_list = &sge; rx_sa->busy++; wr.send_flags = IB_SEND_SIGNALED; - rc = ib_post_send(ssk->qp, &wr, &bad_wr); - if (unlikely(rc)) { - rx_sa->busy--; - ssk->tx_ring.rdma_inflight = NULL; - } - - return rc; + return ib_post_send(ssk->qp, &wr, &bad_wr); } -int sdp_rdma_to_iovec(struct sock *sk, struct iovec *iov, int msg_iovlen, - struct sk_buff *skb, unsigned long *used, u32 offset) +int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb, + unsigned long *used) { struct sdp_sock *ssk = sdp_sk(sk); - struct rx_srcavail_state *rx_sa = RX_SRCAVAIL_STATE(skb); + struct rx_srcavail_state *rx_sa = RX_SRCAVAIL_STATE(mb); + int got_srcavail_cancel; int rc = 0; int len = *used; int copied; - int i = 0; - if (unlikely(!ssk->ib_device)) - return -ENODEV; + sdp_dbg_data(ssk->socket, "preparing RDMA read." + " len: 0x%x. buffer len: 0x%lx\n", len, iov->iov_len); - while (!iov->iov_len) { - ++iov; - i++; - } - WARN_ON(i >= msg_iovlen); - - sdp_dbg_data(sk_ssk(ssk), "preparing RDMA read." - " len: 0x%x. buffer len: 0x%zx\n", len, iov->iov_len); - sock_hold(sk, SOCK_REF_RDMA_RD); if (len > rx_sa->len) { sdp_warn(sk, "len:0x%x > rx_sa->len: 0x%x\n", len, rx_sa->len); - SDP_WARN_ON(1); + WARN_ON(1); len = rx_sa->len; } - rc = sdp_alloc_fmr(sk, iov->iov_base, len, &rx_sa->fmr, &rx_sa->umem, - IB_ACCESS_LOCAL_WRITE, 0); + rc = sdp_alloc_fmr(sk, iov->iov_base, len, &rx_sa->fmr, &rx_sa->umem); if (rc) { - sdp_dbg_data(sk, "Error allocating fmr: %d\n", rc); + sdp_warn(sk, "Error allocating fmr: %d\n", rc); goto err_alloc_fmr; } - rc = sdp_post_rdma_read(sk, rx_sa, offset); + rc = sdp_post_rdma_read(sk, rx_sa); if (unlikely(rc)) { sdp_warn(sk, "ib_post_send failed with status %d.\n", rc); - sdp_set_error(sk_ssk(ssk), -ECONNRESET); + sdp_set_error(ssk->socket, -ECONNRESET); + wake_up(&ssk->wq); goto err_post_send; } - sdp_prf(sk, skb, "Finished posting, now to wait"); + sdp_prf(sk, mb, "Finished posting(rc=%d), now to wait", rc); + + got_srcavail_cancel = ssk->srcavail_cancel_mseq > rx_sa->mseq; + sdp_arm_tx_cq(sk); - rc = sdp_wait_rdma_wr_finished(ssk); - if (unlikely(rc)) - goto err_wait; + sdp_wait_rdma_wr_finished(ssk); + sdp_prf(sk, mb, "Finished waiting(rc=%d)", rc); + if (!ssk->qp_active) { + sdp_dbg_data(sk, "QP destroyed during RDMA read\n"); + rc = -EPIPE; + goto err_post_send; + } + copied = rx_sa->umem->length; sdp_update_iov_used(sk, iov, copied); + rx_sa->used += copied; atomic_add(copied, &ssk->rcv_nxt); *used = copied; - rx_sa->copied += copied; -err_wait: ssk->tx_ring.rdma_inflight = NULL; err_post_send: sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem); err_alloc_fmr: + if (rc && ssk->qp_active) { + sdp_warn(sk, "Couldn't do RDMA - post sendsm\n"); + rx_sa->flags |= RX_SA_ABORTED; + } + sock_put(sk, SOCK_REF_RDMA_RD); return rc; } -static inline int wait_for_sndbuf(struct sock *sk, long *timeo_p) +static inline int wait_for_sndbuf(struct socket *sk, long *timeo_p) { struct sdp_sock *ssk = sdp_sk(sk); int ret = 0; @@ 
-644,15 +631,14 @@ sdp_do_posts(ssk); - if (sdp_xmit_poll(ssk, 1)) - sdp_post_sends(ssk, 0); + sdp_xmit_poll(ssk, 1); ret = sdp_tx_wait_memory(ssk, timeo_p, &credits_needed); return ret; } -static int do_sdp_sendmsg_zcopy(struct sock *sk, struct tx_srcavail_state *tx_sa, +static int do_sdp_sendmsg_zcopy(struct socket *sk, struct tx_srcavail_state *tx_sa, struct iovec *iov, long *timeo) { struct sdp_sock *ssk = sdp_sk(sk); @@ -660,23 +646,23 @@ unsigned long lock_flags; rc = sdp_alloc_fmr(sk, iov->iov_base, iov->iov_len, - &tx_sa->fmr, &tx_sa->umem, IB_ACCESS_REMOTE_READ, sdp_zcopy_thresh); - if (unlikely(rc)) { - sdp_dbg_data(sk, "Error allocating fmr: %d\n", rc); + &tx_sa->fmr, &tx_sa->umem); + if (rc) { + sdp_warn(sk, "Error allocating fmr: %d\n", rc); goto err_alloc_fmr; } if (tx_slots_free(ssk) == 0) { rc = wait_for_sndbuf(sk, timeo); - if (unlikely(rc)) { + if (rc) { sdp_warn(sk, "Couldn't get send buffer\n"); goto err_no_tx_slots; } } rc = sdp_post_srcavail(sk, tx_sa); - if (unlikely(rc)) { - sdp_dbg(sk, "Error posting SrcAvail: %d\n", rc); + if (rc) { + sdp_dbg(sk, "Error posting SrcAvail\n"); goto err_abort_send; } @@ -695,15 +681,12 @@ /* Wait for RdmaRdCompl/SendSM to * finish the transaction */ - *timeo = SDP_SRCAVAIL_CANCEL_TIMEOUT; - rc = sdp_wait_rdmardcompl(ssk, timeo, 1); - if (unlikely(rc == -ETIME || rc == -EINVAL)) { - /* didn't get RdmaRdCompl/SendSM after sending - * SrcAvailCancel - There is a connection - * problem. */ - sdp_reset(sk); - rc = -sk->sk_err; - } + *timeo = 2 * HZ; + sdp_dbg_data(sk, "Waiting for SendSM\n"); + sdp_wait_rdmardcompl(ssk, timeo, 1); + sdp_dbg_data(sk, "finished waiting\n"); + + cancel_delayed_work(&ssk->srcavail_cancel_work); } else { sdp_dbg_data(sk, "QP was destroyed while waiting\n"); } @@ -722,32 +705,35 @@ sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem); err_alloc_fmr: - return rc; + return rc; } -int sdp_sendmsg_zcopy(struct kiocb *iocb, struct sock *sk, struct iovec *iov) +int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov) { struct sdp_sock *ssk = sdp_sk(sk); int rc = 0; - long timeo = SDP_SRCAVAIL_ADV_TIMEOUT; + long timeo; struct tx_srcavail_state *tx_sa; - size_t bytes_to_copy = iov->iov_len; + int offset; + size_t bytes_to_copy = 0; int copied = 0; - sdp_dbg_data(sk, "Sending ZCopy iov: %p, iov_len: 0x%zx\n", + sdp_dbg_data(sk, "Sending iov: %p, iov_len: 0x%lx\n", iov->iov_base, iov->iov_len); + sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy start"); if (ssk->rx_sa) { - /* Don't want both sides to send SrcAvail because both of them - * will wait on sendmsg() until timeout. - */ sdp_dbg_data(sk, "Deadlock prevent: crossing SrcAvail\n"); return 0; } - sock_hold(sk_ssk(ssk), SOCK_REF_ZCOPY); + sock_hold(ssk->socket, SOCK_REF_ZCOPY); + SDPSTATS_COUNTER_INC(sendmsg_zcopy_segment); + timeo = SDP_SRCAVAIL_ADV_TIMEOUT ; + /* Ok commence sending. 
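/*
 * The sdp_sendmsg_zcopy() loop in this hunk reduces to the dispatch
 * below: iovec payload at or above sdp_zcopy_thresh goes through the
 * SrcAvail/RDMA-read handshake, and any remainder smaller than the
 * threshold falls back to the buffered-copy send path. A sketch with
 * hypothetical helpers (example_zcopy_once, example_bcopy) standing in
 * for do_sdp_sendmsg_zcopy() and the bcopy path:
 */
static int example_zcopy_once(struct socket *sk, struct iovec *iov);
static int example_bcopy(struct socket *sk, struct iovec *iov);

static int
example_send(struct socket *sk, struct iovec *iov)
{
        int copied = 0;
        int rc;

        while (iov->iov_len >= sdp_zcopy_thresh) {
                /* Post SrcAvail; wait for RdmaRdCompl or SendSM. */
                rc = example_zcopy_once(sk, iov);
                if (rc <= 0)
                        return (copied ? copied : rc);
                copied += rc;
        }
        if (iov->iov_len > 0)
                copied += example_bcopy(sk, iov); /* small tail */
        return (copied);
}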
*/ + offset = (unsigned long)iov->iov_base & (PAGE_SIZE - 1); tx_sa = kmalloc(sizeof(struct tx_srcavail_state), GFP_KERNEL); if (!tx_sa) { @@ -756,13 +742,14 @@ goto err_alloc_tx_sa; } + bytes_to_copy = iov->iov_len; do { tx_sa_reset(tx_sa); rc = do_sdp_sendmsg_zcopy(sk, tx_sa, iov, &timeo); if (iov->iov_len && iov->iov_len < sdp_zcopy_thresh) { - sdp_dbg_data(sk, "0x%zx bytes left, switching to bcopy\n", + sdp_dbg_data(sk, "0x%lx bytes left, switching to bcopy\n", iov->iov_len); break; } @@ -774,7 +761,7 @@ sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy end rc: %d copied: %d", rc, copied); - sock_put(sk_ssk(ssk), SOCK_REF_ZCOPY); + sock_put(ssk->socket, SOCK_REF_ZCOPY); if (rc < 0 && rc != -EAGAIN && rc != -ETIME) return rc; @@ -782,7 +769,7 @@ return copied; } -void sdp_abort_srcavail(struct sock *sk) +void sdp_abort_srcavail(struct socket *sk) { struct sdp_sock *ssk = sdp_sk(sk); struct tx_srcavail_state *tx_sa = ssk->tx_sa; @@ -791,6 +778,9 @@ if (!tx_sa) return; + cancel_delayed_work(&ssk->srcavail_cancel_work); + flush_scheduled_work(); + spin_lock_irqsave(&ssk->tx_sa_lock, flags); sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem); @@ -800,18 +790,15 @@ spin_unlock_irqrestore(&ssk->tx_sa_lock, flags); } -void sdp_abort_rdma_read(struct sock *sk) +void sdp_abort_rdma_read(struct socket *sk) { struct sdp_sock *ssk = sdp_sk(sk); - struct rx_srcavail_state *rx_sa; + struct rx_srcavail_state *rx_sa = ssk->rx_sa; - rx_sa = ssk->rx_sa; if (!rx_sa) return; sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem); - /* kfree(rx_sa) and posting SendSM will be handled in the nornal - * flows. - */ + ssk->rx_sa = NULL; } Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp.h =================================================================== --- sys/ofed/drivers/infiniband/ulp/sdp/sdp.h (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/sdp/sdp.h (.../head) (revision 219811) @@ -1,18 +1,69 @@ #ifndef _SDP_H_ #define _SDP_H_ +#include "opt_ddb.h" +#include "opt_inet.h" +#include "opt_ofed.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include #include #include -#include -#include -#include /* For urgent data flags */ +#include +#include +#include + #include -#include #include #include +#include +#include + +#ifdef SDP_DEBUG +#define CONFIG_INFINIBAND_SDP_DEBUG +#endif + #include "sdp_dbg.h" -#define sk_ssk(ssk) ((struct sock *)ssk) +#undef LIST_HEAD +/* From sys/queue.h */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} /* Interval between sucessive polls in the Tx routine when polling is used instead of interrupts (in per-core Tx rings) - should be power of 2 */ @@ -20,38 +71,37 @@ #define SDP_TX_POLL_TIMEOUT (HZ / 20) #define SDP_NAGLE_TIMEOUT (HZ / 10) -#define SDP_RX_ARMING_DELAY (msecs_to_jiffies(10)) -#define SDP_RDMA_READ_TIMEOUT (60 * HZ) /* timeout - fatal hw error */ - #define SDP_SRCAVAIL_CANCEL_TIMEOUT (HZ * 5) #define SDP_SRCAVAIL_ADV_TIMEOUT (1 * HZ) +#define SDP_SRCAVAIL_PAYLOAD_LEN 1 #define SDP_RESOLVE_TIMEOUT 1000 #define SDP_ROUTE_TIMEOUT 1000 #define SDP_RETRY_COUNT 5 #define SDP_KEEPALIVE_TIME (120 * 60 * HZ) #define SDP_FIN_WAIT_TIMEOUT (60 * HZ) /* like TCP_FIN_TIMEOUT */ -#define SDP_CMA_TIMEWAIT_TIMEOUT (150 * HZ) #define SDP_TX_SIZE 0x40 #define SDP_RX_SIZE 0x40 -#define 
SDP_DEF_INLINE_THRESH 256 #define SDP_FMR_SIZE (MIN(0x1000, PAGE_SIZE) / sizeof(u64)) +#define SDP_FMR_POOL_SIZE 1024 +#define SDP_FMR_DIRTY_SIZE ( SDP_FMR_POOL_SIZE / 4 ) #define SDP_MAX_RDMA_READ_LEN (PAGE_SIZE * (SDP_FMR_SIZE - 2)) -#define SDP_MAX_RECV_SGES 9 /* 1 for sdp header + 8 for payload */ -#define SDP_MAX_SEND_SGES 9 /* same as above */ +/* mb inlined data len - rest will be rx'ed into frags */ +#define SDP_HEAD_SIZE (sizeof(struct sdp_bsdh)) -/* skb inlined data len - rest will be rx'ed into frags */ -#define SDP_SKB_HEAD_SIZE (0x500 + sizeof(struct sdp_bsdh)) - /* limit tx payload len, if the sink supports bigger buffers than the source * can handle. * or rx fragment size (limited by sge->length size) */ -#define SDP_MAX_PAYLOAD ((1UL << 16) - SDP_SKB_HEAD_SIZE) +#define SDP_MAX_PACKET (1 << 16) +#define SDP_MAX_PAYLOAD (SDP_MAX_PACKET - SDP_HEAD_SIZE) +#define SDP_MAX_RECV_SGES (SDP_MAX_PACKET / MCLBYTES) +#define SDP_MAX_SEND_SGES (SDP_MAX_PACKET / MCLBYTES) + 2 + #define SDP_NUM_WC 4 #define SDP_DEF_ZCOPY_THRESH 64*1024 @@ -67,21 +117,23 @@ #define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10) #define SDP_AUTO_CONF 0xffff +#define AUTO_MOD_DELAY (HZ / 4) -struct sdp_skb_cb { +struct sdp_mb_cb { __u32 seq; /* Starting sequence number */ - __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - __u8 flags; /* TCP header flags. */ struct bzcopy_state *bz; struct rx_srcavail_state *rx_sa; struct tx_srcavail_state *tx_sa; }; -#define SDP_SKB_CB(__skb) ((struct sdp_skb_cb *)&((__skb)->cb[0])) -#define BZCOPY_STATE(skb) (SDP_SKB_CB(skb)->bz) -#define RX_SRCAVAIL_STATE(skb) (SDP_SKB_CB(skb)->rx_sa) -#define TX_SRCAVAIL_STATE(skb) (SDP_SKB_CB(skb)->tx_sa) +#define M_PUSH M_PROTO1 /* Do a 'push'. */ +#define M_URG M_PROTO2 /* Mark as urgent (oob). */ +#define SDP_SKB_CB(__mb) ((struct sdp_mb_cb *)&((__mb)->cb[0])) +#define BZCOPY_STATE(mb) (SDP_SKB_CB(mb)->bz) +#define RX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->rx_sa) +#define TX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->tx_sa) + #ifndef MIN #define MIN(a, b) (a < b ? a : b) #endif @@ -91,50 +143,17 @@ #define ring_posted(ring) (ring_head(ring) - ring_tail(ring)) #define rx_ring_posted(ssk) ring_posted(ssk->rx_ring) +#ifdef SDP_ZCOPY #define tx_ring_posted(ssk) (ring_posted(ssk->tx_ring) + \ (ssk->tx_ring.rdma_inflight ? 
ssk->tx_ring.rdma_inflight->busy : 0)) +#else +#define tx_ring_posted(ssk) ring_posted(ssk->tx_ring) +#endif -#define posts_handler(ssk) atomic_read(&ssk->somebody_is_doing_posts) -#define posts_handler_get(ssk) \ - do { \ - atomic_inc(&ssk->somebody_is_doing_posts); \ - sdp_postpone_rx_timer(ssk); \ - } while (0) - -#define posts_handler_put(ssk, intr_delay) \ - do { \ - sdp_do_posts(ssk); \ - if (atomic_dec_and_test(&ssk->somebody_is_doing_posts) && \ - likely(ssk->qp_active)) \ - sdp_schedule_arm_rx_cq(ssk, intr_delay);\ - } while (0) - -#define sdp_common_release(sk) do { \ - sdp_dbg(sk, "%s:%d - sock_put(SOCK_REF_ALIVE" \ - ") - refcount = %d from withing sk_common_release\n",\ - __func__, __LINE__, atomic_read(&(sk)->sk_refcnt));\ - percpu_counter_inc((sk)->sk_prot->orphan_count);\ - sdp_add_to_history(sk, "sdp_common_release"); \ - _sdp_add_to_history(sk, "SOCK_REF_ALIVE", __func__, __LINE__, \ - 2, SOCK_REF_ALIVE); \ - sk_common_release(sk); \ -} while (0) - -extern int sdp_inline_thresh; extern int sdp_zcopy_thresh; -extern struct workqueue_struct *sdp_wq; -extern struct list_head sock_list; -extern spinlock_t sock_list_lock; extern int rcvbuf_initial_size; -extern struct proto sdp_proto; extern struct workqueue_struct *rx_comp_wq; -extern atomic_t sdp_current_mem_usage; -extern int top_mem_usage; -extern spinlock_t sdp_large_sockets_lock; extern struct ib_client sdp_client; -#ifdef SDPSTATS_ON -DECLARE_PER_CPU(struct sdpstats, sdpstats); -#endif enum sdp_mid { SDP_MID_HELLO = 0x0, @@ -221,13 +240,8 @@ } __attribute__((__packed__)); struct sdp_buf { - struct sk_buff *skb; - /* The relation of mapping <-> pages is like this: - * mapping[0] doesn't have a correspondent page. - * mapping[i + 1] <-> pages[i] - */ + struct mbuf *mb; u64 mapping[SDP_MAX_SEND_SGES]; - struct page *pages[SDP_MAX_SEND_SGES - 1]; } __attribute__((__packed__)); struct sdp_chrecvbuf { @@ -247,6 +261,10 @@ struct page **pages; }; +enum rx_sa_flag { + RX_SA_ABORTED = 2, +}; + enum tx_sa_flag { TX_SA_SENDSM = 0x01, TX_SA_CROSS_SEND = 0x02, @@ -258,8 +276,8 @@ struct rx_srcavail_state { /* Advertised buffer stuff */ u32 mseq; + u32 used; u32 reported; - u32 copied; u32 len; u32 rkey; u64 vaddr; @@ -270,7 +288,7 @@ /* Utility */ u8 busy; - struct sk_buff *skb; /* SrcAvail skb */ + enum rx_sa_flag flags; }; struct tx_srcavail_state { @@ -290,18 +308,18 @@ }; struct sdp_tx_ring { +#ifdef SDP_ZCOPY struct rx_srcavail_state *rdma_inflight; +#endif struct sdp_buf *buffer; atomic_t head; atomic_t tail; struct ib_cq *cq; - u32 una_seq; atomic_t credits; #define tx_credits(ssk) (atomic_read(&ssk->tx_ring.credits)) - struct timer_list timer; - struct tasklet_struct tasklet; + struct callout timer; u16 poll_cnt; }; @@ -311,7 +329,8 @@ atomic_t tail; struct ib_cq *cq; - struct timer_list cq_arm_timer; + int destroyed; + struct rwlock destroyed_lock; }; struct sdp_device { @@ -341,184 +360,142 @@ int moder_time; }; -struct sdp_sock { - /* sk has to be the first member of inet_sock */ - struct inet_sock isk; - struct list_head sock_list; - struct list_head accept_queue; - struct list_head backlog_queue; - struct sk_buff_head rx_ctl_q; - struct sock *parent; - struct sdp_device *sdp_dev; - int cpu; +/* These are flags fields. */ +#define SDP_TIMEWAIT 0x0001 /* In ssk timewait state. */ +#define SDP_DROPPED 0x0002 /* Socket has been dropped. */ +#define SDP_SOCKREF 0x0004 /* Holding a sockref for close. */ +#define SDP_NODELAY 0x0008 /* Disble nagle. */ +#define SDP_NEEDFIN 0x0010 /* Send a fin on the next tx. 
*/ +#define SDP_DREQWAIT 0x0020 /* Waiting on DREQ. */ +#define SDP_DESTROY 0x0040 /* Being destroyed. */ +#define SDP_DISCON 0x0080 /* rdma_disconnect is owed. */ - unsigned int sk_id; +/* These are oobflags */ +#define SDP_HADOOB 0x0001 /* Had OOB data. */ +#define SDP_HAVEOOB 0x0002 /* Have OOB data. */ -#ifdef SDP_SOCK_HISTORY - struct sdp_sock_hist hst[SDP_SOCK_HISTORY_LEN]; - unsigned long hst_idx; /* next free slot */ - spinlock_t hst_lock; - struct dentry *hst_dentr; -#endif /* SDP_SOCK_HISTORY */ - - int qp_active; - spinlock_t tx_sa_lock; - struct tx_srcavail_state *tx_sa; - - /* set when SrcAvail received, reset when SendSM/RdmaRdCompl sent */ - struct rx_srcavail_state *rx_sa; - - u32 sa_cancel_mseq; - int sa_cancel_arrived; /* is 'sa_cancel_mseq' relevant or not, sticky */ - +struct sdp_sock { + LIST_ENTRY(sdp_sock) list; + struct socket *socket; + struct rdma_cm_id *id; + struct ib_device *ib_device; + struct sdp_device *sdp_dev; + struct ib_qp *qp; + struct ucred *cred; + struct callout keep2msl; /* 2msl and keepalive timer. */ + struct callout nagle_timer; /* timeout waiting for ack */ struct ib_ucontext context; + in_port_t lport; + in_addr_t laddr; + in_port_t fport; + in_addr_t faddr; + int flags; + int oobflags; /* protected by rx lock. */ + int state; + int softerror; + int recv_bytes; /* Bytes per recv. buf including header */ + int xmit_size_goal; + char iobc; - int max_sge; + struct sdp_rx_ring rx_ring; + struct sdp_tx_ring tx_ring; + struct rwlock lock; + struct mbuf *rx_ctl_q; + struct mbuf *rx_ctl_tail; + int qp_active; /* XXX Flag. */ + int max_sge; struct work_struct rx_comp_work; - - struct delayed_work dreq_wait_work; - struct delayed_work cma_timewait_work; - struct work_struct destroy_work; - - int tx_compl_pending; - atomic_t somebody_is_doing_posts; - - /* Like tcp_sock */ - u16 urg_data; - u32 urg_seq; - u32 copied_seq; #define rcv_nxt(ssk) atomic_read(&(ssk->rcv_nxt)) atomic_t rcv_nxt; - u32 write_seq; - int xmit_size_goal; - int nonagle; - - int dreq_wait_timeout; - int cma_timewait_timeout; - - unsigned keepalive_time; - - spinlock_t lock; - - /* tx_head/rx_head when keepalive timer started */ - unsigned keepalive_tx_head; - unsigned keepalive_rx_head; - - int destructed_already; - int sdp_disconnect; /* Need to send SDP_MID_DISCONNECT */ - int id_destroyed_already; /* for sdp_remove_device() only */ - - struct sdp_rx_ring rx_ring; - struct sdp_tx_ring tx_ring; - - /* Data below will be reset on error */ - struct rdma_cm_id *id; - struct ib_device *ib_device; - /* SDP specific */ atomic_t mseq_ack; #define mseq_ack(ssk) (atomic_read(&ssk->mseq_ack)) unsigned max_bufs; /* Initial buffers offered by other side */ unsigned min_bufs; /* Low water mark to wake senders */ - u32 nagle_last_unacked; /* mseq of lastest unacked packet */ - struct timer_list nagle_timer; /* timeout waiting for ack */ + unsigned long nagle_last_unacked; /* mseq of lastest unacked packet */ atomic_t remote_credits; #define remote_credits(ssk) (atomic_read(&ssk->remote_credits)) int poll_cq; - /* rdma specific */ - struct ib_qp *qp; - /* SDP slow start */ - int rcvbuf_scale; /* local recv buf scale for each socket */ - int sent_request_head; /* mark the tx_head of the last send resize - request */ - int sent_request; /* 0 - not sent yet, 1 - request pending - -1 - resize done succesfully */ int recv_request_head; /* mark the rx_head when the resize request was recieved */ - int recv_request; /* flag if request to resize was recieved */ - int recv_frags; /* max skb frags in recv 
+#define sdp_sk(so) ((struct sdp_sock *)(so->so_pcb)) + +#define SDP_RLOCK(ssk) rw_rlock(&(ssk)->lock) +#define SDP_WLOCK(ssk) rw_wlock(&(ssk)->lock) +#define SDP_RUNLOCK(ssk) rw_runlock(&(ssk)->lock) +#define SDP_WUNLOCK(ssk) rw_wunlock(&(ssk)->lock) +#define SDP_WLOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_WLOCKED) +#define SDP_RLOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_RLOCKED) +#define SDP_LOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_LOCKED) + static inline void tx_sa_reset(struct tx_srcavail_state *tx_sa) { memset((void *)&tx_sa->busy, 0, sizeof(*tx_sa) - offsetof(typeof(*tx_sa), busy)); } -static inline int sdp_chk_sa_cancel(struct sdp_sock *ssk, struct rx_srcavail_state *rx_sa) +static inline void rx_ring_unlock(struct sdp_rx_ring *rx_ring) { - return ssk->sa_cancel_arrived && - before(rx_sa->mseq, ssk->sa_cancel_mseq); + rw_runlock(&rx_ring->destroyed_lock); } -static inline struct sdp_sock *sdp_sk(const struct sock *sk) +static inline int rx_ring_trylock(struct sdp_rx_ring *rx_ring) { - return (struct sdp_sock *)sk; + rw_rlock(&rx_ring->destroyed_lock); + if (rx_ring->destroyed) { + rx_ring_unlock(rx_ring); + return 0; + } + return 1; } -static inline int _sdp_exch_state(const char *func, int line, struct sock *sk, - int from_states, int state) +static inline void rx_ring_destroy_lock(struct sdp_rx_ring *rx_ring) { - unsigned long flags; - int old; + rw_wlock(&rx_ring->destroyed_lock); + rx_ring->destroyed = 1; + rw_wunlock(&rx_ring->destroyed_lock); +} - spin_lock_irqsave(&sdp_sk(sk)->lock, flags); +static inline void sdp_arm_rx_cq(struct sdp_sock *ssk) +{ + sdp_prf(ssk->socket, NULL, "Arming RX cq"); + sdp_dbg_data(ssk->socket, "Arming RX cq\n"); - sdp_dbg(sk, "%s:%d - set state: %s -> %s 0x%x\n", func, line, - sdp_state_str(sk->sk_state), - sdp_state_str(state), from_states); - - if ((1 << sk->sk_state) & ~from_states) { - sdp_warn(sk, "%s:%d: trying to exchange state from unexpected " - "state %s to state %s. expected states: 0x%x\n", - func, line, sdp_state_str(sk->sk_state), - sdp_state_str(state), from_states); - } - - old = sk->sk_state; - sk->sk_state = state; - - spin_unlock_irqrestore(&sdp_sk(sk)->lock, flags); - - sdp_add_to_history(sk, sdp_state_str(state)); - - return old; + ib_req_notify_cq(ssk->rx_ring.cq, IB_CQ_NEXT_COMP); } -#define sdp_exch_state(sk, from_states, state) \ - _sdp_exch_state(__func__, __LINE__, sk, from_states, state) -static inline void sdp_set_error(struct sock *sk, int err) +static inline void sdp_arm_tx_cq(struct sdp_sock *ssk) { - int ib_teardown_states = TCPF_FIN_WAIT1 | TCPF_CLOSE_WAIT - | TCPF_LAST_ACK; - sk->sk_err = -err; - if (sk->sk_socket) - sk->sk_socket->state = SS_DISCONNECTING; + sdp_prf(ssk->socket, NULL, "Arming TX cq"); + sdp_dbg_data(ssk->socket, "Arming TX cq. 
credits: %d, posted: %d\n", + tx_credits(ssk), tx_ring_posted(ssk)); - if ((1 << sk->sk_state) & ib_teardown_states) - sdp_exch_state(sk, ib_teardown_states, TCP_TIME_WAIT); - else if (TCP_TIME_WAIT != sk->sk_state) - sdp_exch_state(sk, ~0, TCP_CLOSE); - - sk->sk_error_report(sk); + ib_req_notify_cq(ssk->tx_ring.cq, IB_CQ_NEXT_COMP); } /* return the min of: @@ -537,25 +514,6 @@ return min_free - SDP_MIN_TX_CREDITS; }; -static inline unsigned sdp_cycles_to_usecs(unsigned long c) -{ -#ifdef CONFIG_PPC - return c / tb_ticks_per_usec; -#elif defined(__ia64__) - return c / local_cpu_data->cyc_per_usec; -#else - return c * 1000 / cpu_khz; -#endif -} - -static inline int sdp_has_free_mem(void) -{ - /* TODO: count also kmalloc's and skb's allocations. */ - - return !top_mem_usage || atomic_read(&sdp_current_mem_usage) < - top_mem_usage << (20 - PAGE_SHIFT); -} - /* utilities */ static inline char *mid2str(int mid) { @@ -575,159 +533,113 @@ ENUM2STR(SDP_MID_SINKAVAIL), }; - if (mid < 0 || mid >= ARRAY_SIZE(mid2str)) { - printk(KERN_WARNING "mid %d is illegal\n", mid); + if (mid >= ARRAY_SIZE(mid2str)) return NULL; - } return mid2str[mid]; } -static inline void sdp_free_skb(struct sk_buff *skb) +static inline struct mbuf * +sdp_alloc_mb(struct socket *sk, u8 mid, int size, int wait) { - if (unlikely(skb_shinfo(skb)->nr_frags)) - atomic_sub(skb_shinfo(skb)->nr_frags, &sdp_current_mem_usage); - - __kfree_skb(skb); -} - -static inline struct sk_buff *sdp_stream_alloc_skb(struct sock *sk, int size, - gfp_t gfp) -{ - struct sk_buff *skb; - - /* The TCP header must be at least 32-bit aligned. */ - size = ALIGN(size, 4); - - skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); - if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { - /* - * Make sure that we have exactly size bytes - * available to the caller, no more, no less. 
- */ - skb_reserve(skb, skb_tailroom(skb) - size); - return skb; - } - __kfree_skb(skb); - } else { - sk->sk_prot->enter_memory_pressure(sk); - sk_stream_moderate_sndbuf(sk); - } - return NULL; -} - -static inline struct sk_buff *sdp_alloc_skb(struct sock *sk, u8 mid, int size, - gfp_t gfp) -{ struct sdp_bsdh *h; - struct sk_buff *skb; + struct mbuf *mb; - if (!gfp) { - if (unlikely(sk->sk_allocation)) - gfp = sk->sk_allocation; - else - gfp = GFP_KERNEL; - } - - skb = sdp_stream_alloc_skb(sk, size, gfp); - if (unlikely(!skb)) - return NULL; - - skb_header_release(skb); - - h = (struct sdp_bsdh *)skb_push(skb, sizeof *h); + MGETHDR(mb, wait, MT_DATA); + if (mb == NULL) + return (NULL); + mb->m_pkthdr.len = mb->m_len = sizeof(struct sdp_bsdh); + h = mtod(mb, struct sdp_bsdh *); h->mid = mid; - skb_reset_transport_header(skb); - - return skb; + return mb; } -static inline struct sk_buff *sdp_alloc_skb_data(struct sock *sk, int size, gfp_t gfp) +static inline struct mbuf * +sdp_alloc_mb_data(struct socket *sk, int wait) { - return sdp_alloc_skb(sk, SDP_MID_DATA, size, gfp); + return sdp_alloc_mb(sk, SDP_MID_DATA, 0, wait); } -static inline struct sk_buff *sdp_alloc_skb_disconnect(struct sock *sk, - gfp_t gfp) +static inline struct mbuf * +sdp_alloc_mb_disconnect(struct socket *sk, int wait) { - return sdp_alloc_skb(sk, SDP_MID_DISCONN, 0, gfp); + return sdp_alloc_mb(sk, SDP_MID_DISCONN, 0, wait); } -static inline struct sk_buff *sdp_alloc_skb_chrcvbuf_ack(struct sock *sk, - int size, gfp_t gfp) +static inline void * +mb_put(struct mbuf *mb, int len) { - struct sk_buff *skb; - struct sdp_chrecvbuf *resp_size; + uint8_t *data; - skb = sdp_alloc_skb(sk, SDP_MID_CHRCVBUF_ACK, sizeof(*resp_size), gfp); - if (unlikely(!skb)) - return NULL; + data = mb->m_data; + data += mb->m_len; + mb->m_len += len; + return (void *)data; +} - resp_size = (struct sdp_chrecvbuf *)skb_put(skb, sizeof *resp_size); +static inline struct mbuf * +sdp_alloc_mb_chrcvbuf_ack(struct socket *sk, int size, int wait) +{ + struct mbuf *mb; + struct sdp_chrecvbuf *resp_size; + + mb = sdp_alloc_mb(sk, SDP_MID_CHRCVBUF_ACK, sizeof(*resp_size), wait); + if (mb == NULL) + return (NULL); + resp_size = (struct sdp_chrecvbuf *)mb_put(mb, sizeof *resp_size); resp_size->size = htonl(size); - return skb; + return mb; } -static inline struct sk_buff *sdp_alloc_skb_srcavail(struct sock *sk, - u32 len, u32 rkey, u64 vaddr, gfp_t gfp) +static inline struct mbuf * +sdp_alloc_mb_srcavail(struct socket *sk, u32 len, u32 rkey, u64 vaddr, int wait) { - struct sk_buff *skb; + struct mbuf *mb; struct sdp_srcah *srcah; - skb = sdp_alloc_skb(sk, SDP_MID_SRCAVAIL, sizeof(*srcah), gfp); - if (unlikely(!skb)) - return NULL; - - srcah = (struct sdp_srcah *)skb_put(skb, sizeof(*srcah)); + mb = sdp_alloc_mb(sk, SDP_MID_SRCAVAIL, sizeof(*srcah), wait); + if (mb == NULL) + return (NULL); + srcah = (struct sdp_srcah *)mb_put(mb, sizeof(*srcah)); srcah->len = htonl(len); srcah->rkey = htonl(rkey); srcah->vaddr = cpu_to_be64(vaddr); - return skb; + return mb; } -static inline struct sk_buff *sdp_alloc_skb_srcavail_cancel(struct sock *sk, - gfp_t gfp) +static inline struct mbuf * +sdp_alloc_mb_srcavail_cancel(struct socket *sk, int wait) { - return sdp_alloc_skb(sk, SDP_MID_SRCAVAIL_CANCEL, 0, gfp); + return sdp_alloc_mb(sk, SDP_MID_SRCAVAIL_CANCEL, 0, wait); } -static inline struct sk_buff *sdp_alloc_skb_rdmardcompl(struct sock *sk, - u32 len, gfp_t gfp) +static inline struct mbuf * +sdp_alloc_mb_rdmardcompl(struct socket *sk, u32 len, int wait) { - struct 
sk_buff *skb; + struct mbuf *mb; struct sdp_rrch *rrch; - skb = sdp_alloc_skb(sk, SDP_MID_RDMARDCOMPL, sizeof(*rrch), gfp); - if (unlikely(!skb)) - return NULL; - - rrch = (struct sdp_rrch *)skb_put(skb, sizeof(*rrch)); + mb = sdp_alloc_mb(sk, SDP_MID_RDMARDCOMPL, sizeof(*rrch), wait); + if (mb == NULL) + return (NULL); + rrch = (struct sdp_rrch *)mb_put(mb, sizeof(*rrch)); rrch->len = htonl(len); - return skb; + return mb; } -static inline struct sk_buff *sdp_alloc_skb_sendsm(struct sock *sk, gfp_t gfp) +static inline struct mbuf * +sdp_alloc_mb_sendsm(struct socket *sk, int wait) { - return sdp_alloc_skb(sk, SDP_MID_SENDSM, 0, gfp); + return sdp_alloc_mb(sk, SDP_MID_SENDSM, 0, wait); } static inline int sdp_tx_ring_slots_left(struct sdp_sock *ssk) { return SDP_TX_SIZE - tx_ring_posted(ssk); } -/* Return true if need to send credit update. Rules are: - * - at least half of the RX buffer is available - * - 1.5 * c < p - * - has TX credits - * - has room in tx Q - * - * p = number of posted buffers - * c = current credits count at the peer - */ static inline int credit_update_needed(struct sdp_sock *ssk) { int c; @@ -741,181 +653,33 @@ } -#ifdef SDPSTATS_ON - -#define SDPSTATS_MAX_HIST_SIZE 256 -struct sdpstats { - u32 post_send[256]; - u32 inline_sends; - u32 sendmsg_bcopy_segment; - u32 sendmsg_bzcopy_segment; - u32 sendmsg_zcopy_segment; - u32 sendmsg; - u32 recvmsg; - u32 post_send_credits; - u32 sendmsg_seglen[25]; - u32 send_size[25]; - u32 post_recv; - u32 rx_int_arm; - u32 tx_int_arm; - u32 rx_int_count; - u32 tx_int_count; - u32 rx_int_wake_up; - u32 rx_int_queue; - u32 rx_int_no_op; - u32 rx_cq_modified; - u32 rx_cq_arm_timer; - u32 rx_wq; - u32 bzcopy_poll_miss; - u32 send_wait_for_mem; - u32 send_miss_no_credits; - u32 rx_poll_miss; - u32 rx_poll_hit; - u32 poll_hit_usec[16]; - u32 tx_poll_miss; - u32 tx_poll_hit; - u32 tx_poll_busy; - u32 tx_poll_no_op; - u32 memcpy_count; - u32 credits_before_update[64]; - u32 zcopy_tx_timeout; - u32 zcopy_cross_send; - u32 zcopy_tx_aborted; - u32 zcopy_tx_error; - u32 fmr_alloc_error; - u32 keepalive_timer; - u32 nagle_timer; -}; - -static inline void sdpstats_hist(u32 *h, u32 val, u32 maxidx, int is_log) -{ - int idx = is_log ? 
ilog2(val) : val; - - /* ilog2(0) == -1 */ - if (idx < 0) - idx = 0; - else if (unlikely(idx > maxidx)) - idx = maxidx; - - h[idx]++; -} - -#define SDPSTATS_COUNTER_INC(stat) do { __get_cpu_var(sdpstats).stat++; } while (0) -#define SDPSTATS_COUNTER_ADD(stat, val) do { __get_cpu_var(sdpstats).stat += val; } while (0) -#define SDPSTATS_COUNTER_MID_INC(stat, mid) do { __get_cpu_var(sdpstats).stat[mid]++; } \ - while (0) -#define SDPSTATS_HIST(stat, size) \ - sdpstats_hist(__get_cpu_var(sdpstats).stat, size, ARRAY_SIZE(__get_cpu_var(sdpstats).stat) - 1, 1) - -#define SDPSTATS_HIST_LINEAR(stat, size) \ - sdpstats_hist(__get_cpu_var(sdpstats).stat, size, ARRAY_SIZE(__get_cpu_var(sdpstats).stat) - 1, 0) - -#else #define SDPSTATS_COUNTER_INC(stat) #define SDPSTATS_COUNTER_ADD(stat, val) #define SDPSTATS_COUNTER_MID_INC(stat, mid) #define SDPSTATS_HIST_LINEAR(stat, size) #define SDPSTATS_HIST(stat, size) -#endif -static inline void sdp_cleanup_sdp_buf(struct sdp_sock *ssk, struct sdp_buf *sbuf, - size_t head_size, enum dma_data_direction dir) +static inline void +sdp_cleanup_sdp_buf(struct sdp_sock *ssk, struct sdp_buf *sbuf, + enum dma_data_direction dir) { + struct ib_device *dev; + struct mbuf *mb; int i; - struct sk_buff *skb; - struct ib_device *dev = ssk->ib_device; - skb = sbuf->skb; - sbuf->skb = NULL; - - if (!sbuf->mapping[0]) - return; /* Inlined send - nothing to cleanup */ - - ib_dma_unmap_single(dev, sbuf->mapping[0], head_size, dir); - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - ib_dma_unmap_page(dev, sbuf->mapping[i + 1], - skb_shinfo(skb)->frags[i].size, - dir); - sbuf->mapping[i + 1] = 0; - } + dev = ssk->ib_device; + for (i = 0, mb = sbuf->mb; mb != NULL; mb = mb->m_next, i++) + ib_dma_unmap_single(dev, sbuf->mapping[i], mb->m_len, dir); } -static inline void sdp_postpone_rx_timer(struct sdp_sock *ssk) -{ - if (timer_pending(&ssk->rx_ring.cq_arm_timer) && ssk->qp_active) - mod_timer(&ssk->rx_ring.cq_arm_timer, MAX_JIFFY_OFFSET); -} - -static inline void sdp_arm_rx_cq(struct sock *sk) -{ - if (unlikely(!sdp_sk(sk)->rx_ring.cq)) - return; - - SDPSTATS_COUNTER_INC(rx_int_arm); - sdp_dbg_data(sk, "Arming RX cq\n"); - - sdp_postpone_rx_timer(sdp_sk(sk)); - - if (unlikely(0 > ib_req_notify_cq(sdp_sk(sk)->rx_ring.cq, - IB_CQ_NEXT_COMP))) - sdp_warn(sk, "error arming rx cq\n"); -} - -static inline void sdp_arm_tx_cq(struct sock *sk) -{ - if (unlikely(!sdp_sk(sk)->tx_ring.cq)) - return; - - SDPSTATS_COUNTER_INC(tx_int_arm); - sdp_dbg_data(sk, "Arming TX cq. credits: %d, posted: %d\n", - tx_credits(sdp_sk(sk)), tx_ring_posted(sdp_sk(sk))); - - if (unlikely(0 > ib_req_notify_cq(sdp_sk(sk)->tx_ring.cq, - IB_CQ_NEXT_COMP))) - sdp_warn(sk, "error arming tx cq\n"); -} - -static inline void sdp_schedule_arm_rx_cq(struct sdp_sock *ssk, - unsigned long delay) -{ - if (unlikely(!ssk->rx_ring.cq)) - return; - - if (delay && ssk->qp_active) - mod_timer(&ssk->rx_ring.cq_arm_timer, jiffies + delay); - else { - /* There is no point of setting up a timer for an immediate - * cq-arming, better arm it now. 
*/ - sdp_arm_rx_cq(sk_ssk(ssk)); - } -} - -static inline int somebody_is_waiting(struct sock *sk) -{ - return sk->sk_socket && - test_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); -} - /* sdp_main.c */ void sdp_set_default_moderation(struct sdp_sock *ssk); -int sdp_init_sock(struct sock *sk); -void sdp_start_keepalive_timer(struct sock *sk); -void sdp_remove_sock(struct sdp_sock *ssk); -void sdp_add_sock(struct sdp_sock *ssk); -void sdp_urg(struct sdp_sock *ssk, struct sk_buff *skb); +void sdp_start_keepalive_timer(struct socket *sk); +void sdp_urg(struct sdp_sock *ssk, struct mbuf *mb); void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk); -void sdp_reset_sk(struct sock *sk, int rc); -void sdp_reset(struct sock *sk); -int sdp_tx_wait_memory(struct sdp_sock *ssk, long *timeo_p, int *credits_needed); -void sdp_skb_entail(struct sock *sk, struct sk_buff *skb); -void sdp_start_cma_timewait_timeout(struct sdp_sock *ssk, int timeo); -int sdp_abort_rx_srcavail(struct sock *sk); -extern struct rw_semaphore device_removal_lock; +void sdp_abort(struct socket *sk); +struct sdp_sock *sdp_notify(struct sdp_sock *ssk, int error); -/* sdp_proc.c */ -int __init sdp_proc_init(void); -void sdp_proc_unregister(void); /* sdp_cma.c */ int sdp_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *); @@ -924,35 +688,34 @@ int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device); void sdp_tx_ring_destroy(struct sdp_sock *ssk); int sdp_xmit_poll(struct sdp_sock *ssk, int force); -void sdp_post_send(struct sdp_sock *ssk, struct sk_buff *skb); -void sdp_post_sends(struct sdp_sock *ssk, gfp_t gfp); -void sdp_nagle_timeout(unsigned long data); +void sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb); +void sdp_post_sends(struct sdp_sock *ssk, int wait); void sdp_post_keepalive(struct sdp_sock *ssk); /* sdp_rx.c */ +void sdp_rx_ring_init(struct sdp_sock *ssk); int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device); void sdp_rx_ring_destroy(struct sdp_sock *ssk); int sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size); int sdp_init_buffers(struct sdp_sock *ssk, u32 new_size); void sdp_do_posts(struct sdp_sock *ssk); void sdp_rx_comp_full(struct sdp_sock *ssk); -void sdp_remove_large_sock(const struct sdp_sock *ssk); -void sdp_handle_disconn(struct sock *sk); -int sdp_poll_rx_cq(struct sdp_sock *ssk); /* sdp_zcopy.c */ -int sdp_sendmsg_zcopy(struct kiocb *iocb, struct sock *sk, struct iovec *iov); +int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov); int sdp_handle_srcavail(struct sdp_sock *ssk, struct sdp_srcah *srcah); void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack); void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack, u32 bytes_completed); int sdp_handle_rdma_read_cqe(struct sdp_sock *ssk); -int sdp_rdma_to_iovec(struct sock *sk, struct iovec *iov, int msg_iovlen, - struct sk_buff *skb, unsigned long *used, u32 offset); -int sdp_post_rdma_rd_compl(struct sock *sk, +int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb, + unsigned long *used); +int sdp_post_rdma_rd_compl(struct sdp_sock *ssk, struct rx_srcavail_state *rx_sa); -int sdp_post_sendsm(struct sock *sk); -void sdp_abort_srcavail(struct sock *sk); -void sdp_abort_rdma_read(struct sock *sk); +int sdp_post_sendsm(struct socket *sk); +void srcavail_cancel_timeout(struct work_struct *work); +void sdp_abort_srcavail(struct socket *sk); +void sdp_abort_rdma_read(struct socket *sk); +int sdp_process_rx(struct sdp_sock *ssk); #endif Index: 
sys/ofed/drivers/infiniband/ulp/srpt/ib_srpt.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/srpt/ib_srpt.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/ulp/srpt/ib_srpt.c (.../head) (revision 219811) @@ -2597,7 +2597,7 @@ static void srpt_wait_for_cred(struct srpt_rdma_ch *ch, int req_lim_min) { while (unlikely(srpt_must_wait_for_cred(ch, req_lim_min))) - schedule(); + sched_yield(); } /** Index: sys/ofed/drivers/infiniband/core/cm.c =================================================================== --- sys/ofed/drivers/infiniband/core/cm.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/cm.c (.../head) (revision 219811) @@ -3889,6 +3889,6 @@ idr_destroy(&cm.local_id_table); } -module_init(ib_cm_init); +module_init_order(ib_cm_init, SI_ORDER_SECOND); module_exit(ib_cm_cleanup); Index: sys/ofed/drivers/infiniband/core/umem.c =================================================================== --- sys/ofed/drivers/infiniband/core/umem.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/umem.c (.../head) (revision 219811) @@ -35,9 +35,20 @@ #include #include #include +#ifdef __linux__ #include +#endif #include +#include +#include +#include + +#include +#include +#include +#include + #include "uverbs.h" static int allow_weak_ordering; @@ -101,6 +112,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { +#ifdef __linux__ struct ib_umem_chunk *chunk, *tmp; int i; @@ -109,14 +121,39 @@ chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs); for (i = 0; i < chunk->nents; ++i) { struct page *page = sg_page(&chunk->page_list[i]); - if (umem->writable && dirty) set_page_dirty_lock(page); put_page(page); } + kfree(chunk); + } +#else + struct ib_umem_chunk *chunk, *tmp; + vm_object_t object; + int i; + object = NULL; + list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) { + ib_dma_unmap_sg_attrs(dev, chunk->page_list, + chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs); + for (i = 0; i < chunk->nents; ++i) { + struct page *page = sg_page(&chunk->page_list[i]); + if (umem->writable && dirty) { + if (object && object != page->object) + VM_OBJECT_UNLOCK(object); + if (object != page->object) { + object = page->object; + VM_OBJECT_LOCK(object); + } + vm_page_dirty(page); + } + } kfree(chunk); } + if (object) + VM_OBJECT_UNLOCK(object); + +#endif } /** @@ -130,6 +167,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync) { +#ifdef __linux__ struct ib_umem *umem; struct page **page_list; struct vm_area_struct **vma_list; @@ -148,7 +186,6 @@ else if (allow_weak_ordering) dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs); - if (!can_do_mlock()) return ERR_PTR(-EPERM); @@ -203,6 +240,7 @@ cur_base = addr & PAGE_MASK; ret = 0; + while (npages) { ret = get_user_pages(current, current->mm, cur_base, min_t(unsigned long, npages, @@ -271,9 +309,128 @@ free_page((unsigned long) page_list); return ret < 0 ? ERR_PTR(ret) : umem; +#else + struct ib_umem *umem; + struct ib_umem_chunk *chunk; + struct proc *proc; + pmap_t pmap; + vm_offset_t end, last, start; + vm_size_t npages; + int error; + int ents; + int ret; + int i; + DEFINE_DMA_ATTRS(attrs); + + error = priv_check(curthread, PRIV_VM_MLOCK); + if (error) + return ERR_PTR(-error); + + last = addr + size; + start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ + end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. 
*/ + if (last < addr || end < addr) + return ERR_PTR(-EINVAL); + npages = atop(end - start); + if (npages > vm_page_max_wired) + return ERR_PTR(-ENOMEM); + umem = kzalloc(sizeof *umem, GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + proc = curthread->td_proc; + PROC_LOCK(proc); + if (ptoa(npages + + pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > + lim_cur(proc, RLIMIT_MEMLOCK)) { + PROC_UNLOCK(proc); + kfree(umem); + return ERR_PTR(-ENOMEM); + } + PROC_UNLOCK(proc); + if (npages + cnt.v_wire_count > vm_page_max_wired) { + kfree(umem); + return ERR_PTR(-EAGAIN); + } + error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES | + (umem->writable ? VM_MAP_WIRE_WRITE : 0)); + if (error != KERN_SUCCESS) { + kfree(umem); + return ERR_PTR(-ENOMEM); + } + + umem->context = context; + umem->length = size; + umem->offset = addr & ~PAGE_MASK; + umem->page_size = PAGE_SIZE; + umem->start = addr; + /* + * We ask for writable memory if any access flags other than + * "remote read" are set. "Local write" and "remote write" + * obviously require write access. "Remote atomic" can do + * things like fetch and add, which will modify memory, and + * "MW bind" can change permissions by binding a window. + */ + umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); + umem->hugetlb = 0; + INIT_LIST_HEAD(&umem->chunk_list); + + pmap = vm_map_pmap(&proc->p_vmspace->vm_map); + ret = 0; + while (npages) { + ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK); + chunk = kmalloc(sizeof(*chunk) + + (sizeof(struct scatterlist) * ents), + GFP_KERNEL); + if (!chunk) { + ret = -ENOMEM; + goto out; + } + + chunk->attrs = attrs; + chunk->nents = ents; + sg_init_table(&chunk->page_list[0], ents); + for (i = 0; i < chunk->nents; ++i) { + vm_paddr_t pa; + + pa = pmap_extract(pmap, start); + if (pa == 0) { + ret = -ENOMEM; + kfree(chunk); + goto out; + } + sg_set_page(&chunk->page_list[i], PHYS_TO_VM_PAGE(pa), + PAGE_SIZE, 0); + npages--; + start += PAGE_SIZE; + } + + chunk->nmap = ib_dma_map_sg_attrs(context->device, + &chunk->page_list[0], + chunk->nents, + DMA_BIDIRECTIONAL, + &attrs); + if (chunk->nmap != chunk->nents) { + kfree(chunk); + ret = -ENOMEM; + goto out; + } + + list_add_tail(&chunk->list, &umem->chunk_list); + } + +out: + if (ret < 0) { + __ib_umem_release(context->device, umem, 0); + kfree(umem); + } + + return ret < 0 ? ERR_PTR(ret) : umem; +#endif } EXPORT_SYMBOL(ib_umem_get); +#ifdef __linux__ static void ib_umem_account(struct work_struct *work) { struct ib_umem *umem = container_of(work, struct ib_umem, work); @@ -284,6 +441,7 @@ mmput(umem->mm); kfree(umem); } +#endif /** * ib_umem_release - release memory pinned with ib_umem_get @@ -291,6 +449,7 @@ */ void ib_umem_release(struct ib_umem *umem) { +#ifdef __linux__ struct ib_ucontext *context = umem->context; struct mm_struct *mm; unsigned long diff; @@ -328,6 +487,28 @@ current->mm->locked_vm -= diff; up_write(&mm->mmap_sem); mmput(mm); +#else + vm_offset_t addr, end, last, start; + vm_size_t size; + int error; + + __ib_umem_release(umem->context->device, umem, 1); + if (umem->context->closing) { + kfree(umem); + return; + } + error = priv_check(curthread, PRIV_VM_MUNLOCK); + if (error) + return; + addr = umem->start; + size = umem->length; + last = addr + size; + start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ + end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. 
*/ + vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + +#endif kfree(umem); } EXPORT_SYMBOL(ib_umem_release); Index: sys/ofed/drivers/infiniband/core/addr.c =================================================================== --- sys/ofed/drivers/infiniband/core/addr.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/addr.c (.../head) (revision 219811) @@ -65,7 +65,7 @@ static DEFINE_MUTEX(lock); static LIST_HEAD(req_list); -static DECLARE_DELAYED_WORK(work, process_req); +static struct delayed_work work; static struct workqueue_struct *addr_wq; void rdma_addr_register_client(struct rdma_addr_client *client) @@ -88,6 +88,7 @@ } EXPORT_SYMBOL(rdma_addr_unregister_client); +#ifdef __linux__ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, const unsigned char *dst_dev_addr) { @@ -99,6 +100,25 @@ dev_addr->bound_dev_if = dev->ifindex; return 0; } +#else +int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, + const unsigned char *dst_dev_addr) +{ + if (dev->if_type == IFT_INFINIBAND) + dev_addr->dev_type = ARPHRD_INFINIBAND; + else if (dev->if_type == IFT_ETHER) + dev_addr->dev_type = ARPHRD_ETHER; + else + dev_addr->dev_type = 0; + memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen); + memcpy(dev_addr->broadcast, __DECONST(char *, dev->if_broadcastaddr), + dev->if_addrlen); + if (dst_dev_addr) + memcpy(dev_addr->dst_dev_addr, dst_dev_addr, dev->if_addrlen); + dev_addr->bound_dev_if = dev->if_index; + return 0; +} +#endif EXPORT_SYMBOL(rdma_copy_addr); int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) @@ -117,7 +137,7 @@ switch (addr->sa_family) { case AF_INET: - dev = ip_dev_find(&init_net, + dev = ip_dev_find(NULL, ((struct sockaddr_in *) addr)->sin_addr.s_addr); if (!dev) @@ -127,8 +147,9 @@ dev_put(dev); break; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if defined(INET6) case AF_INET6: +#ifdef __linux__ read_lock(&dev_base_lock); for_each_netdev(&init_net, dev) { if (ipv6_chk_addr(&init_net, @@ -139,6 +160,26 @@ } } read_unlock(&dev_base_lock); +#else + { + struct sockaddr_in6 *sin6; + struct ifaddr *ifa; + in_port_t port; + + sin6 = (struct sockaddr_in6 *)addr; + port = sin6->sin6_port; + sin6->sin6_port = 0; + ifa = ifa_ifwithaddr(addr); + sin6->sin6_port = port; + if (ifa == NULL) { + ret = -ENODEV; + break; + } + ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL); + ifa_free(ifa); + break; + } +#endif break; #endif } @@ -176,6 +217,7 @@ mutex_unlock(&lock); } +#ifdef __linux__ static int addr4_resolve(struct sockaddr_in *src_in, struct sockaddr_in *dst_in, struct rdma_dev_addr *addr) @@ -230,7 +272,7 @@ return ret; } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if defined(INET6) static int addr6_resolve(struct sockaddr_in6 *src_in, struct sockaddr_in6 *dst_in, struct rdma_dev_addr *addr) @@ -293,18 +335,146 @@ } #endif +#else +#include + static int addr_resolve(struct sockaddr *src_in, struct sockaddr *dst_in, struct rdma_dev_addr *addr) { - if (src_in->sa_family == AF_INET) { - return addr4_resolve((struct sockaddr_in *) src_in, - (struct sockaddr_in *) dst_in, addr); - } else - return addr6_resolve((struct sockaddr_in6 *) src_in, - (struct sockaddr_in6 *) dst_in, addr); + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + struct ifaddr *ifa; + struct ifnet *ifp; + struct llentry *lle; + struct rtentry *rte; + in_port_t port; + u_char edst[MAX_ADDR_LEN]; + int multi; + int bcast; + int error; + 
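/*
 * Roadmap for the FreeBSD-native addr_resolve() that follows: first
 * classify dst_in as unicast, multicast, or broadcast, and strip the
 * port so sockaddr comparisons work; if a source address was bound,
 * require ifa_ifwithaddr() to map it to a local interface; for
 * unicast, validate the route with rtalloc1() and RT_LINK_IS_UP();
 * finally resolve the link-layer address via if_resolvemulti()
 * (multicast), arpresolve() (IPv4), or nd6_storelladdr() (IPv6),
 * mapping EWOULDBLOCK to -ENODATA so the caller can retry once
 * ARP/ND completes.
 */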
+ /* + * Determine whether the address is unicast, multicast, or broadcast + * and whether the source interface is valid. + */ + multi = 0; + bcast = 0; + sin = NULL; + sin6 = NULL; + ifp = NULL; + rte = NULL; + switch (dst_in->sa_family) { + case AF_INET: + sin = (struct sockaddr_in *)dst_in; + if (sin->sin_addr.s_addr == INADDR_BROADCAST) + bcast = 1; + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + multi = 1; + sin = (struct sockaddr_in *)src_in; + if (sin->sin_addr.s_addr != INADDR_ANY) { + /* + * Address comparison fails if the port is set, so + * cache it here to be restored later. + */ + port = sin->sin_port; + sin->sin_port = 0; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + } else + src_in = NULL; + break; +#ifdef INET6 + case AF_INET6: + sin6 = (struct sockaddr_in6 *)dst_in; + if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) + multi = 1; + sin6 = (struct sockaddr_in6 *)src_in; + if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + port = sin6->sin6_port; + sin6->sin6_port = 0; + } else + src_in = NULL; + break; +#endif + default: + return -EINVAL; + } + /* + * If we have a source address to use, look it up first and verify + * that it is a local interface. + */ + if (src_in) { + ifa = ifa_ifwithaddr(src_in); + if (sin) + sin->sin_port = port; + if (sin6) + sin6->sin6_port = port; + if (ifa == NULL) + return -ENETUNREACH; + ifp = ifa->ifa_ifp; + ifa_free(ifa); + if (bcast || multi) + goto mcast; + } + /* + * Make sure the route exists and has a valid link. + */ + rte = rtalloc1(dst_in, 1, 0); + if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) { + if (rte) + RTFREE_LOCKED(rte); + return -EHOSTUNREACH; + } + /* + * If it's not multicast or broadcast and the route doesn't match the + * requested interface, return unreachable. Otherwise fetch the + * correct interface pointer and unlock the route. + */ + if (multi || bcast) { + if (ifp == NULL) + ifp = rte->rt_ifp; + RTFREE_LOCKED(rte); + } else if (ifp && ifp != rte->rt_ifp) { + RTFREE_LOCKED(rte); + return -ENETUNREACH; + } else { + if (ifp == NULL) + ifp = rte->rt_ifp; + RT_UNLOCK(rte); + } +mcast: + if (bcast) + return rdma_copy_addr(addr, ifp, ifp->if_broadcastaddr); + if (multi) { + struct sockaddr *llsa; + + error = ifp->if_resolvemulti(ifp, &llsa, dst_in); + if (error) + return -error; + error = rdma_copy_addr(addr, ifp, + LLADDR((struct sockaddr_dl *)llsa)); + free(llsa, M_IFMADDR); + return error; + } + /* + * Resolve the link-layer address. 
+ */ + if (dst_in->sa_family == AF_INET) + error = arpresolve(ifp, rte, NULL, dst_in, edst, &lle); +#ifdef INET6 + else + error = nd6_storelladdr(ifp, NULL, dst_in, (u_char *)edst, &lle); +#endif + RTFREE(rte); + if (error == 0) + return rdma_copy_addr(addr, ifp, edst); + if (error == EWOULDBLOCK) + return -ENODATA; + return -error; } +#endif + static void process_req(struct work_struct *work) { struct addr_req *req, *temp_req; @@ -422,11 +592,15 @@ void *ctx) { if (event == NETEVENT_NEIGH_UPDATE) { +#ifdef __linux__ struct neighbour *neigh = ctx; if (neigh->nud_state & NUD_VALID) { set_timeout(jiffies); } +#else + set_timeout(jiffies); +#endif } return 0; } @@ -437,6 +611,7 @@ static int addr_init(void) { + INIT_DELAYED_WORK(&work, process_req); addr_wq = create_singlethread_workqueue("ib_addr"); if (!addr_wq) return -ENOMEM; Index: sys/ofed/drivers/infiniband/core/uverbs_main.c =================================================================== --- sys/ofed/drivers/infiniband/core/uverbs_main.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/uverbs_main.c (.../head) (revision 219811) @@ -119,7 +119,10 @@ [IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP] = ib_uverbs_unreg_xrc_rcv_qp, }; +#ifdef __linux__ +/* BSD Does not require a fake mountpoint for all files. */ static struct vfsmount *uverbs_event_mnt; +#endif static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device); @@ -358,6 +361,7 @@ unsigned int pollflags = 0; struct ib_uverbs_event_file *file = filp->private_data; + file->filp = filp; poll_wait(filp, &file->poll_wait, wait); spin_lock_irq(&file->lock); @@ -438,6 +442,8 @@ spin_unlock_irqrestore(&file->lock, flags); wake_up_interruptible(&file->poll_wait); + if (file->filp) + selwakeup(&file->filp->f_selinfo); kill_fasync(&file->async_queue, SIGIO, POLL_IN); } @@ -471,6 +477,8 @@ spin_unlock_irqrestore(&file->async_file->lock, flags); wake_up_interruptible(&file->async_file->poll_wait); + if (file->async_file->filp) + selwakeup(&file->async_file->filp->f_selinfo); kill_fasync(&file->async_file->async_queue, SIGIO, POLL_IN); } @@ -544,6 +552,7 @@ ev_file->async_queue = NULL; ev_file->is_async = is_async; ev_file->is_closed = 0; + ev_file->filp = NULL; *fd = get_unused_fd(); if (*fd < 0) { @@ -765,6 +774,46 @@ } static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); +#include + +static ssize_t +show_dev_device(struct device *device, struct device_attribute *attr, char *buf) +{ + struct ib_uverbs_device *dev = dev_get_drvdata(device); + + if (!dev) + return -ENODEV; + + return sprintf(buf, "0x%04x\n", + ((struct pci_dev *)dev->ib_dev->dma_device)->device); +} +static DEVICE_ATTR(device, S_IRUGO, show_dev_device, NULL); + +static ssize_t +show_dev_vendor(struct device *device, struct device_attribute *attr, char *buf) +{ + struct ib_uverbs_device *dev = dev_get_drvdata(device); + + if (!dev) + return -ENODEV; + + return sprintf(buf, "0x%04x\n", + ((struct pci_dev *)dev->ib_dev->dma_device)->vendor); +} +static DEVICE_ATTR(vendor, S_IRUGO, show_dev_vendor, NULL); + +struct attribute *device_attrs[] = +{ + &dev_attr_device.attr, + &dev_attr_vendor.attr, + NULL +}; + +static struct attribute_group device_group = { + .name = "device", + .attrs = device_attrs +}; + static void ib_uverbs_add_one(struct ib_device *device) { struct ib_uverbs_device *uverbs_dev; @@ -810,6 +859,8 @@ goto err_class; if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; + if 
(sysfs_create_group(&uverbs_dev->dev->kobj, &device_group)) + goto err_class; spin_lock(&map_lock); dev_table[uverbs_dev->devnum] = uverbs_dev; @@ -840,6 +891,7 @@ if (!uverbs_dev) return; + sysfs_remove_group(&uverbs_dev->dev->kobj, &device_group); dev_set_drvdata(uverbs_dev->dev, NULL); device_destroy(uverbs_class, uverbs_dev->cdev->dev); cdev_del(uverbs_dev->cdev); @@ -854,7 +906,7 @@ wait_for_completion(&uverbs_dev->comp); kfree(uverbs_dev); } - +#ifdef __linux__ static int uverbs_event_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) @@ -869,6 +921,7 @@ .get_sb = uverbs_event_get_sb, .kill_sb = kill_litter_super }; +#endif static int __init ib_uverbs_init(void) { @@ -896,6 +949,7 @@ goto out_class; } +#ifdef __linux__ ret = register_filesystem(&uverbs_event_fs); if (ret) { printk(KERN_ERR "user_verbs: couldn't register infinibandeventfs\n"); @@ -908,6 +962,7 @@ printk(KERN_ERR "user_verbs: couldn't mount infinibandeventfs\n"); goto out_fs; } +#endif ret = ib_register_client(&uverbs_client); if (ret) { @@ -918,10 +973,12 @@ return 0; out_mnt: +#ifdef __linux__ mntput(uverbs_event_mnt); out_fs: unregister_filesystem(&uverbs_event_fs); +#endif out_class: class_destroy(uverbs_class); @@ -936,8 +993,10 @@ static void __exit ib_uverbs_cleanup(void) { ib_unregister_client(&uverbs_client); +#ifdef __linux__ mntput(uverbs_event_mnt); unregister_filesystem(&uverbs_event_fs); +#endif class_destroy(uverbs_class); unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); idr_destroy(&ib_uverbs_pd_idr); Index: sys/ofed/drivers/infiniband/core/sa_query.c =================================================================== --- sys/ofed/drivers/infiniband/core/sa_query.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/sa_query.c (.../head) (revision 219811) @@ -1476,5 +1476,5 @@ idr_destroy(&query_idr); } -module_init(ib_sa_init); +module_init_order(ib_sa_init, SI_ORDER_SECOND); module_exit(ib_sa_cleanup); Index: sys/ofed/drivers/infiniband/core/device.c =================================================================== --- sys/ofed/drivers/infiniband/core/device.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/device.c (.../head) (revision 219811) @@ -100,7 +100,7 @@ int i; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { - if (!*(void **) ((void *) device + mandatory_table[i].offset)) { + if (!*(void **) ((u_char *) device + mandatory_table[i].offset)) { printk(KERN_WARNING "Device %s is missing mandatory function %s\n", device->name, mandatory_table[i].name); return -EINVAL; Index: sys/ofed/drivers/infiniband/core/user_mad.c =================================================================== --- sys/ofed/drivers/infiniband/core/user_mad.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/user_mad.c (.../head) (revision 219811) @@ -111,6 +111,7 @@ struct ib_umad_file { struct mutex mutex; struct ib_umad_port *port; + struct file *filp; struct list_head recv_list; struct list_head send_list; struct list_head port_list; @@ -174,6 +175,7 @@ packet->mad.hdr.id++) if (agent == __get_agent(file, packet->mad.hdr.id)) { list_add_tail(&packet->list, &file->recv_list); + selwakeup(&file->filp->f_selinfo); wake_up_interruptible(&file->recv_wait); ret = 0; break; @@ -678,7 +680,7 @@ file->already_used = 1; if (!file->use_pkey_index) { printk(KERN_WARNING "user_mad: process %s did not enable " - "P_Key index support.\n", current->comm); + "P_Key index support.\n", curproc->p_comm); 
printk(KERN_WARNING "user_mad: Documentation/infiniband/user_mad.txt " "has info on the new ABI.\n"); } @@ -824,6 +826,7 @@ init_waitqueue_head(&file->recv_wait); file->port = port; + file->filp = filp; filp->private_data = file; list_add_tail(&file->port_list, &port->file_list); Index: sys/ofed/drivers/infiniband/core/cma.c =================================================================== --- sys/ofed/drivers/infiniband/core/cma.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/cma.c (.../head) (revision 219811) @@ -1265,6 +1265,7 @@ cma_mask->dst_addr.ip4.addr = htonl(~0); } break; +#ifdef INET6 case AF_INET6: ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; if (ps == RDMA_PS_SDP) { @@ -1281,6 +1282,7 @@ sizeof cma_mask->dst_addr.ip6); } break; +#endif default: break; } @@ -1373,7 +1375,7 @@ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = CMA_CONNECT; - dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr); + dev = ip_dev_find(NULL, iw_event->local_addr.sin_addr.s_addr); if (!dev) { ret = -EADDRNOTAVAIL; mutex_unlock(&conn_id->handler_mutex); @@ -1831,7 +1833,11 @@ route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = tos_to_sl(id_priv->tos); +#ifdef __linux__ route->path_rec->mtu = iboe_get_mtu(ndev->mtu); +#else + route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu); +#endif route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); dev_put(ndev); @@ -2182,8 +2188,10 @@ sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; snum = ntohs(sin->sin_port); +#ifdef __linux__ if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; +#endif bind_list = idr_find(ps, snum); if (!bind_list) @@ -2218,15 +2226,21 @@ ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret) return ret; +#ifdef __linux__ ret = sock->ops->bind(sock, (struct sockaddr *) &id_priv->id.route.addr.src_addr, ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); +#else + ret = -sobind(sock, + (struct sockaddr *)&id_priv->id.route.addr.src_addr, + curthread); +#endif if (ret) { sock_release(sock); return ret; } size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); - ret = sock->ops->getname(sock, + ret = sock_getname(sock, (struct sockaddr *) &id_priv->id.route.addr.src_addr, &size, 0); if (ret) { @@ -2277,14 +2291,18 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { -#if defined(CONFIG_IPv6) || defined(CONFIG_IPV6_MODULE) +#if defined(INET6) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) return 0; sin6 = (struct sockaddr_in6 *) addr; +#ifdef __linux__ if ((ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) && +#else + if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) && +#endif !sin6->sin6_scope_id) return -EINVAL; @@ -3041,7 +3059,11 @@ mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; +#ifdef __linux__ mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); +#else + mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu); +#endif dev_put(ndev); if (!mc->multicast.ib->rec.mtu) { err = -EINVAL; @@ -3160,10 +3182,17 @@ dev_addr = &id_priv->id.route.addr.dev_addr; +#ifdef __linux__ if ((dev_addr->bound_dev_if == ndev->ifindex) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); +#else + if ((dev_addr->bound_dev_if == ndev->if_index) && + 
memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) { + printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", + ndev->if_xname, &id_priv->id); +#endif work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; @@ -3186,6 +3215,7 @@ struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; +#ifdef __linux__ if (dev_net(ndev) != &init_net) return NOTIFY_DONE; @@ -3194,6 +3224,10 @@ if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) return NOTIFY_DONE; +#else + if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER) + return NOTIFY_DONE; +#endif mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) Index: sys/ofed/drivers/infiniband/core/sysfs.c =================================================================== --- sys/ofed/drivers/infiniband/core/sysfs.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/sysfs.c (.../head) (revision 219811) @@ -265,12 +265,16 @@ container_of(attr, struct port_table_attribute, attr); union ib_gid gid; ssize_t ret; + u16 *raw; ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid); if (ret) return ret; - return sprintf(buf, "%pI6\n", gid.raw); + raw = (u16 *)gid.raw; + return sprintf(buf, "%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x\n", + htons(raw[0]), htons(raw[1]), htons(raw[2]), htons(raw[3]), + htons(raw[4]), htons(raw[5]), htons(raw[6]), htons(raw[7])); } static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, @@ -439,6 +443,8 @@ kfree(dev); } +#ifdef __linux__ +/* BSD supports this through devfs(5) and devd(8). */ static int ib_device_uevent(struct device *device, struct kobj_uevent_env *env) { @@ -453,6 +459,7 @@ return 0; } +#endif static struct attribute ** alloc_group_attrs(ssize_t (*show)(struct ib_port *, @@ -515,7 +522,7 @@ p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - kobject_get(device->ports_parent), + device->ports_parent, "%d", port_num); if (ret) goto err_put; @@ -545,7 +552,9 @@ list_add_tail(&p->kobj.entry, &device->port_list); +#ifdef __linux__ kobject_uevent(&p->kobj, KOBJ_ADD); +#endif return 0; err_free_pkey: @@ -658,7 +667,9 @@ static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, +#ifdef __linux__ .dev_uevent = ib_device_uevent, +#endif }; /* Show a given an attribute in the statistics group */ @@ -666,7 +677,7 @@ struct device_attribute *attr, char *buf, unsigned offset) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = container_of(__DECONST(struct device *, device), struct ib_device, dev); union rdma_protocol_stats stats; ssize_t ret; @@ -799,7 +810,7 @@ } device->ports_parent = kobject_create_and_add("ports", - kobject_get(&class_dev->kobj)); + &class_dev->kobj); if (!device->ports_parent) { ret = -ENOMEM; goto err_put; Index: sys/ofed/drivers/infiniband/core/uverbs.h =================================================================== --- sys/ofed/drivers/infiniband/core/uverbs.h (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/uverbs.h (.../head) (revision 219811) @@ -79,6 +79,7 @@ struct ib_uverbs_event_file { struct kref ref; + struct file *filp; struct ib_uverbs_file *uverbs_file; spinlock_t lock; wait_queue_head_t poll_wait; Index: sys/ofed/drivers/infiniband/core/ucm.c =================================================================== --- sys/ofed/drivers/infiniband/core/ucm.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/ucm.c (.../head) (revision 219811) @@ -380,6 
+380,8 @@ list_add_tail(&uevent->file_list, &ctx->file->events); list_add_tail(&uevent->ctx_list, &ctx->events); wake_up_interruptible(&ctx->file->poll_wait); + if (ctx->file->filp) + selwakeup(&ctx->file->filp->f_selinfo); mutex_unlock(&ctx->file->file_mutex); return 0; @@ -1165,7 +1167,7 @@ { struct ib_ucm_file *file; - file = kmalloc(sizeof(*file), GFP_KERNEL); + file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) return -ENOMEM; @@ -1177,7 +1179,7 @@ filp->private_data = file; file->filp = filp; - file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev); + file->device = container_of(inode->i_cdev->si_drv1, struct ib_ucm_device, cdev); return 0; } @@ -1342,5 +1344,5 @@ idr_destroy(&ctx_id_table); } -module_init(ib_ucm_init); +module_init_order(ib_ucm_init, SI_ORDER_THIRD); module_exit(ib_ucm_cleanup); Index: sys/ofed/drivers/infiniband/core/ucma.c =================================================================== --- sys/ofed/drivers/infiniband/core/ucma.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/core/ucma.c (.../head) (revision 219811) @@ -284,6 +284,8 @@ list_add_tail(&uevent->list, &ctx->file->event_list); wake_up_interruptible(&ctx->file->poll_wait); + if (ctx->file->filp) + selwakeup(&ctx->file->filp->f_selinfo); out: mutex_unlock(&ctx->file->mut); return ret; @@ -598,7 +600,6 @@ dev_put(dev); } - iboe_mac_vlan_to_ll((union ib_gid *) &resp->ib_route[0].dgid, dev_addr->dst_dev_addr, vid); iboe_addr_get_sgid(dev_addr, Index: sys/ofed/drivers/infiniband/hw/mlx4/qp.c =================================================================== --- sys/ofed/drivers/infiniband/hw/mlx4/qp.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/hw/mlx4/qp.c (.../head) (revision 219811) @@ -104,11 +104,11 @@ #ifndef wc_wmb #if defined(__i386__) - #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") + #define wc_wmb() __asm volatile("lock; addl $0,0(%%esp) " ::: "memory") #elif defined(__x86_64__) - #define wc_wmb() asm volatile("sfence" ::: "memory") + #define wc_wmb() __asm volatile("sfence" ::: "memory") #elif defined(__ia64__) - #define wc_wmb() asm volatile("fwb" ::: "memory") + #define wc_wmb() __asm volatile("fwb" ::: "memory") #else #define wc_wmb() wmb() #endif @@ -1515,6 +1515,7 @@ int err; u16 vlan; + vlan = 0; send_size = 0; for (i = 0; i < wr->num_sge; ++i) send_size += wr->sg_list[i].length; @@ -1578,7 +1579,11 @@ u8 *smac; memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); +#ifdef __linux__ smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; /* fixme: cache this value */ +#else + smac = IF_LLADDR(to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]); /* fixme: cache this value */ +#endif memcpy(sqp->ud_header.eth.smac_h, smac, 6); if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); @@ -1945,6 +1950,7 @@ __be16 vlan = 0; int inl = 0; + ctrl = NULL; spin_lock_irqsave(&qp->sq.lock, flags); ind = qp->sq_next_wqe; Index: sys/ofed/drivers/infiniband/hw/mlx4/main.c =================================================================== --- sys/ofed/drivers/infiniband/hw/mlx4/main.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/hw/mlx4/main.c (.../head) (revision 219811) @@ -223,7 +223,8 @@ props->link_layer = IB_LINK_LAYER_INFINIBAND; } -int eth_to_ib_width(int w) +#ifdef notyet +static int eth_to_ib_width(int w) { switch (w) { case 4: @@ -238,7 +239,7 @@ } } -int eth_to_ib_speed(int s) +static int 
eth_to_ib_speed(int s) { switch (s) { case 256: @@ -251,6 +252,7 @@ return 1; } } +#endif static u8 state_to_phys_state(enum ib_port_state state) { @@ -286,7 +288,11 @@ if (!ndev) goto out; +#ifdef __linux__ tmp = iboe_get_mtu(ndev->mtu); +#else + tmp = iboe_get_mtu(ndev->if_mtu); +#endif props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; props->state = netif_carrier_ok(ndev) && netif_oper_up(ndev) ? IB_PORT_ACTIVE : IB_PORT_DOWN; @@ -524,7 +530,7 @@ resp.bf_regs_per_page = 0; } - context = kmalloc(sizeof *context, GFP_KERNEL); + context = kzalloc(sizeof *context, GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); @@ -652,7 +658,6 @@ if (!mqp->port) return 0; - spin_lock(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; if (ndev) @@ -696,7 +701,7 @@ return err; } -struct gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) +static struct gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) { struct gid_entry *ge; struct gid_entry *tmp; @@ -1033,15 +1038,20 @@ NULL }; -static struct attribute_group diag_counters_group = { +struct attribute_group diag_counters_group = { .name = "diag_counters", .attrs = diag_rprt_attrs }; static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev) { +#ifdef __linux__ memcpy(eui, dev->dev_addr, 3); memcpy(eui + 5, dev->dev_addr + 3, 3); +#else + memcpy(eui, IF_LLADDR(dev), 3); + memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); +#endif if (vlan_id < 0x1000) { eui[3] = vlan_id >> 8; eui[4] = vlan_id & 0xff; @@ -1099,7 +1109,7 @@ u8 *hits; int ret; union ib_gid gid; - int free; + int tofree; int found; int need_update = 0; u16 vid; @@ -1114,18 +1124,23 @@ goto out; } +#ifdef __linux__ read_lock(&dev_base_lock); for_each_netdev(&init_net, tmp) { +#else + IFNET_RLOCK(); + TAILQ_FOREACH(tmp, &V_ifnet, if_link) { +#endif if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) { gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); vid = rdma_vlan_dev_vlan_id(tmp); mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev); found = 0; - free = -1; + tofree = -1; for (i = 0; i < MLX4_MAX_EFF_VLANS + 1; ++i) { - if (free < 0 && + if (tofree < 0 && !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) - free = i; + tofree = i; if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) { hits[i] = 1; found = 1; @@ -1138,15 +1153,20 @@ dev->iboe.gid_table[port - 1][0] = gid; ++need_update; hits[0] = 1; - } else if (free >= 0) { - dev->iboe.gid_table[port - 1][free] = gid; - hits[free] = 1; + } else if (tofree >= 0) { + dev->iboe.gid_table[port - 1][tofree] = gid; + hits[tofree] = 1; ++need_update; } } } +#ifdef __linux__ } read_unlock(&dev_base_lock); +#else + } + IFNET_RUNLOCK(); +#endif for (i = 0; i < MLX4_MAX_EFF_VLANS + 1; ++i) if (!hits[i]) { @@ -1177,7 +1197,9 @@ { switch (event) { case NETDEV_UP: +#ifdef __linux__ case NETDEV_CHANGEADDR: +#endif update_ipv6_gids(dev, port, 0); break; @@ -1206,8 +1228,10 @@ struct mlx4_ib_iboe *iboe; int port; +#ifdef __linux__ if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; +#endif ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); iboe = &ibdev->iboe; @@ -1400,7 +1424,6 @@ if (mlx4_ib_mad_init(ibdev)) goto err_reg; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); @@ -1539,5 +1562,19 @@ destroy_workqueue(wq); } -module_init(mlx4_ib_init); +module_init_order(mlx4_ib_init, SI_ORDER_MIDDLE); module_exit(mlx4_ib_cleanup); + 
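For reference, mlx4_addrconf_ifid_eui48() above builds the IPv6 interface identifier from the port's 48-bit MAC, splicing the VLAN id into bytes 3 and 4 when one is present. The hunk cuts off after the VLAN case, so the sketch below assumes the conventional EUI-64 tail (the 0xff/0xfe filler and the universal/local bit flip) for the remainder:

#include <stdio.h>
#include <string.h>

typedef unsigned char u8;
typedef unsigned short u16;

/* User-space rendering of the ifid construction; the else branch and
 * the u/l-bit flip are assumed, as the hunk above ends early. */
static void
addrconf_ifid_eui48(u8 *eui, u16 vlan_id, const u8 *lladdr)
{
	memcpy(eui, lladdr, 3);
	memcpy(eui + 5, lladdr + 3, 3);
	if (vlan_id < 0x1000) {
		eui[3] = vlan_id >> 8;
		eui[4] = vlan_id & 0xff;
	} else {
		eui[3] = 0xff;
		eui[4] = 0xfe;
	}
	eui[0] ^= 2;	/* toggle the universal/local bit */
}

int
main(void)
{
	u8 mac[6] = { 0x00, 0x02, 0xc9, 0x01, 0x02, 0x03 };
	u8 eui[8];
	int i;

	addrconf_ifid_eui48(eui, 0xffff, mac);	/* no VLAN tag */
	for (i = 0; i < 8; i++)
		printf("%02x%c", eui[i], i < 7 ? ':' : '\n');
	return (0);
}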
+#undef MODULE_VERSION +#include +static int +mlx4ib_evhand(module_t mod, int event, void *arg) +{ + return (0); +} +static moduledata_t mlx4ib_mod = { + .name = "mlx4ib", + .evhand = mlx4ib_evhand, +}; +DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_SMP, SI_ORDER_ANY); +MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1); Index: sys/ofed/drivers/infiniband/hw/mlx4/srq.c =================================================================== --- sys/ofed/drivers/infiniband/hw/mlx4/srq.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/hw/mlx4/srq.c (.../head) (revision 219811) @@ -92,7 +92,7 @@ return ERR_PTR(-EINVAL); } - srq = kmalloc(sizeof *srq, GFP_KERNEL); + srq = kzalloc(sizeof *srq, GFP_KERNEL); if (!srq) return ERR_PTR(-ENOMEM); Index: sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c =================================================================== --- sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c (.../head) (revision 219811) @@ -70,6 +70,7 @@ if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) { /* Look for the bridge -- its device ID will be 2 more than HCA's device ID. */ +#ifdef __linux__ while ((bridge = pci_get_device(mdev->pdev->vendor, mdev->pdev->device + 2, bridge)) != NULL) { @@ -90,7 +91,11 @@ mthca_warn(mdev, "No bridge found for %s\n", pci_name(mdev->pdev)); } +#else + mthca_warn(mdev, "Reset on PCI-X is not supported.\n"); + goto out; +#endif } /* For Arbel do we need to save off the full 4K PCI Express header?? */ @@ -116,6 +121,7 @@ hca_pcix_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX); hca_pcie_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP); +#ifdef __linux__ if (bridge) { bridge_header = kmalloc(256, GFP_KERNEL); if (!bridge_header) { @@ -143,6 +149,7 @@ goto out; } } +#endif /* actually hit reset */ { @@ -280,8 +287,10 @@ } out: +#ifdef __linux__ if (bridge) pci_dev_put(bridge); +#endif kfree(bridge_header); kfree(hca_header); Index: sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c =================================================================== --- sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c (.../head) (revision 219811) @@ -125,7 +125,7 @@ MODULE_PARM_DESC(fmr_reserved_mtts, "number of memory translation table segments reserved for FMR"); -static int log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); +static int log_mtts_per_seg; module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)"); @@ -179,7 +179,7 @@ } if (dev_lim->min_page_sz > PAGE_SIZE) { mthca_err(mdev, "HCA minimum page size of %d bigger than " - "kernel PAGE_SIZE of %ld, aborting.\n", + "kernel PAGE_SIZE of %d, aborting.\n", dev_lim->min_page_sz, PAGE_SIZE); return -ENODEV; } @@ -1322,9 +1322,10 @@ printk(KERN_WARNING PFX "Corrected fmr_reserved_mtts to %d.\n", hca_profile.fmr_reserved_mtts); } - + if (log_mtts_per_seg == 0) + log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) { - printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %d\n", + printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). 
Using default - %ld\n", log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8)); log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); } @@ -1355,5 +1356,5 @@ mthca_catas_cleanup(); } -module_init(mthca_init); +module_init_order(mthca_init, SI_ORDER_MIDDLE); module_exit(mthca_cleanup); Index: sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c =================================================================== --- sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c (.../head) (revision 219811) @@ -1021,7 +1021,7 @@ if (udata->inlen - sizeof (struct ib_uverbs_cmd_hdr) < sizeof ucmd) { if (!to_mucontext(pd->uobject->context)->reg_mr_warned) { mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n", - current->comm); + curproc->p_comm); mthca_warn(dev, " Update libmthca to fix this.\n"); } ++to_mucontext(pd->uobject->context)->reg_mr_warned; Index: sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c =================================================================== --- sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c (.../head) (revision 219811) @@ -37,14 +37,14 @@ #include "mthca_dev.h" enum { - MTHCA_CATAS_POLL_INTERVAL = 5 * HZ, - MTHCA_CATAS_TYPE_INTERNAL = 0, MTHCA_CATAS_TYPE_UPLINK = 3, MTHCA_CATAS_TYPE_DDR = 4, MTHCA_CATAS_TYPE_PARITY = 5, }; +#define MTHCA_CATAS_POLL_INTERVAL (5 * HZ) + static DEFINE_SPINLOCK(catas_lock); static LIST_HEAD(catas_list); Index: sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c =================================================================== --- sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c (.../head) (revision 219811) @@ -161,17 +161,13 @@ CMD_TIME_CLASS_D = 60 * HZ }; #else -enum { - CMD_TIME_CLASS_A = 60 * HZ, - CMD_TIME_CLASS_B = 60 * HZ, - CMD_TIME_CLASS_C = 60 * HZ, - CMD_TIME_CLASS_D = 60 * HZ -}; +#define CMD_TIME_CLASS_A (60 * HZ) +#define CMD_TIME_CLASS_B (60 * HZ) +#define CMD_TIME_CLASS_C (60 * HZ) +#define CMD_TIME_CLASS_D (60 * HZ) #endif -enum { - GO_BIT_TIMEOUT = HZ * 10 -}; +#define GO_BIT_TIMEOUT (HZ * 10) struct mthca_cmd_context { struct completion done; @@ -237,10 +233,8 @@ if (event) { unsigned long end = jiffies + GO_BIT_TIMEOUT; - while (go_bit(dev) && time_before(jiffies, end)) { - set_current_state(TASK_RUNNING); - schedule(); - } + while (go_bit(dev) && time_before(jiffies, end)) + sched_yield(); } if (go_bit(dev)) @@ -323,10 +317,8 @@ goto out; end = timeout + jiffies; - while (go_bit(dev) && time_before(jiffies, end)) { - set_current_state(TASK_RUNNING); - schedule(); - } + while (go_bit(dev) && time_before(jiffies, end)) + sched_yield(); if (go_bit(dev)) { err = -EBUSY; Index: sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c =================================================================== --- sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c (.../base) (revision 219811) +++ sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c (.../head) (revision 219811) @@ -448,9 +448,17 @@ page * MTHCA_ICM_PAGE_SIZE; } +#include +#include +#include + +#include +#include + int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, struct mthca_user_db_table *db_tab, int index, u64 uaddr) { +#ifdef __linux__ struct page *pages[1]; int ret = 0; u8 status; @@ -508,6 +516,94 @@ out: mutex_unlock(&db_tab->mutex); return ret; +#else + struct proc *proc; + vm_offset_t start; + 
vm_paddr_t paddr; + pmap_t pmap; + vm_page_t m; + int ret = 0; + u8 status; + int i; + + if (!mthca_is_memfree(dev)) + return 0; + + if (index < 0 || index > dev->uar_table.uarc_size / 8) + return -EINVAL; + + mutex_lock(&db_tab->mutex); + + i = index / MTHCA_DB_REC_PER_PAGE; + start = 0; + + if ((db_tab->page[i].refcount >= MTHCA_DB_REC_PER_PAGE) || + (db_tab->page[i].uvirt && db_tab->page[i].uvirt != uaddr) || + (uaddr & 4095)) { + ret = -EINVAL; + goto out; + } + + if (db_tab->page[i].refcount) { + ++db_tab->page[i].refcount; + goto out; + } + + proc = curproc; + pmap = vm_map_pmap(&proc->p_vmspace->vm_map); + PROC_LOCK(proc); + if (ptoa(pmap_wired_count(pmap) + 1) > lim_cur(proc, RLIMIT_MEMLOCK)) { + PROC_UNLOCK(proc); + ret = -ENOMEM; + goto out; + } + PROC_UNLOCK(proc); + if (cnt.v_wire_count + 1 > vm_page_max_wired) { + ret = -EAGAIN; + goto out; + } + start = uaddr & PAGE_MASK; + ret = vm_map_wire(&proc->p_vmspace->vm_map, start, start + PAGE_SIZE, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES | VM_MAP_WIRE_WRITE); + if (ret != KERN_SUCCESS) { + start = 0; + ret = -ENOMEM; + goto out; + } + paddr = pmap_extract(pmap, uaddr); + if (paddr == 0) { + ret = -EFAULT; + goto out; + } + m = PHYS_TO_VM_PAGE(paddr); + + sg_set_page(&db_tab->page[i].mem, m, MTHCA_ICM_PAGE_SIZE, + uaddr & ~PAGE_MASK); + + ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + if (ret < 0) + goto out; + + ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem), + mthca_uarc_virt(dev, uar, i), &status); + if (!ret && status) + ret = -EINVAL; + if (ret) { + pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + goto out; + } + + db_tab->page[i].uvirt = uaddr; + db_tab->page[i].refcount = 1; + +out: + if (ret < 0 && start) + vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, + start, start + PAGE_SIZE, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + mutex_unlock(&db_tab->mutex); + return ret; +#endif } void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar, @@ -565,7 +661,16 @@ if (db_tab->page[i].uvirt) { mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1, &status); pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); +#ifdef __linux__ put_page(sg_page(&db_tab->page[i].mem)); +#else + vm_offset_t start; + + start = db_tab->page[i].uvirt & PAGE_MASK; + vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, + start, start + PAGE_SIZE, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); +#endif } } Index: sys/ofed/drivers/net/mlx4/en_frag.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_frag.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/en_frag.c (.../head) (revision 219811) @@ -31,17 +31,14 @@ * */ -#include -#include -#include -#include -#include - #include "mlx4_en.h" +#include +#include +#include static struct mlx4_en_ipfrag *find_session(struct mlx4_en_rx_ring *ring, - struct iphdr *iph) + struct ip *iph) { struct mlx4_en_ipfrag *session; int i; @@ -50,10 +47,10 @@ session = &ring->ipfrag[i]; if (session->fragments == NULL) continue; - if (session->daddr == iph->daddr && - session->saddr == iph->saddr && - session->id == iph->id && - session->protocol == iph->protocol) { + if (session->daddr == iph->ip_dst.s_addr && + session->saddr == iph->ip_src.s_addr && + session->id == iph->ip_id && + session->protocol == iph->ip_p) { return session; } } @@ -61,7 +58,7 @@ } static struct mlx4_en_ipfrag *start_session(struct mlx4_en_rx_ring *ring, - struct iphdr *iph) + struct ip *iph) { struct 
mlx4_en_ipfrag *session; int index = -1; @@ -86,22 +83,18 @@ struct mlx4_en_ipfrag *session, u16 more) { - struct sk_buff *skb = session->fragments; - struct iphdr *iph = ip_hdr(skb); - struct net_device *dev = skb->dev; + struct mbuf *mb = session->fragments; + struct ip *iph = mb->m_pkthdr.header; + struct net_device *dev = mb->m_pkthdr.rcvif; /* Update IP length and checksum */ - iph->tot_len = htons(session->total_len); - iph->frag_off = htons(more | (session->offset >> 3)); - iph->check = 0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + iph->ip_len = htons(session->total_len); + iph->ip_off = htons(more | (session->offset >> 3)); + iph->ip_sum = 0; + iph->ip_sum = in_cksum_skip(mb, iph->ip_hl * 4, + (char *)iph - mb->m_data); - if (session->vlan) - vlan_hwaccel_receive_skb(skb, priv->vlgrp, - be16_to_cpu(session->sl_vid)); - else - netif_receive_skb(skb); - dev->last_rx = jiffies; + dev->if_input(dev, mb); session->fragments = NULL; session->last = NULL; } @@ -109,89 +102,73 @@ static inline void frag_append(struct mlx4_en_priv *priv, struct mlx4_en_ipfrag *session, - struct sk_buff *skb, + struct mbuf *mb, unsigned int data_len) { - struct sk_buff *parent = session->fragments; + struct mbuf *parent = session->fragments; - /* Update skb bookkeeping */ - parent->len += data_len; - parent->data_len += data_len; + /* Update mb bookkeeping */ + parent->m_pkthdr.len += data_len; session->total_len += data_len; - skb_pull(skb, skb->len - data_len); - parent->truesize += skb->truesize; + m_adj(mb, mb->m_pkthdr.len - data_len); - if (session->last) - session->last->next = skb; - else - skb_shinfo(parent)->frag_list = skb; - - session->last = skb; + session->last->m_next = mb; + for (; mb->m_next != NULL; mb = mb->m_next); + session->last = mb; } int mlx4_en_rx_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, - struct sk_buff *skb, struct mlx4_cqe *cqe) + struct mbuf *mb, struct mlx4_cqe *cqe) { struct mlx4_en_ipfrag *session; - struct iphdr *iph; + struct ip *iph; u16 ip_len; u16 ip_hlen; int data_len; u16 offset; - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - iph = ip_hdr(skb); - ip_len = ntohs(iph->tot_len); - ip_hlen = iph->ihl * 4; + iph = (struct ip *)(mtod(mb, char *) + ETHER_HDR_LEN); + mb->m_pkthdr.header = iph; + ip_len = ntohs(iph->ip_len); + ip_hlen = iph->ip_hl * 4; data_len = ip_len - ip_hlen; - offset = ntohs(iph->frag_off); - offset &= IP_OFFSET; + offset = ntohs(iph->ip_off); + offset &= IP_OFFMASK; offset <<= 3; session = find_session(ring, iph); - if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) { + if (unlikely(in_cksum_skip(mb, ip_hlen, (char *)iph - mb->m_data))) { if (session) flush_session(priv, session, IP_MF); return -EINVAL; } if (session) { if (unlikely(session->offset + session->total_len != - offset + ip_hlen)) { + offset + ip_hlen || + session->total_len + mb->m_pkthdr.len > 65536)) { flush_session(priv, session, IP_MF); goto new_session; } - /* Packets smaller then 60 bytes are padded to that size - * Need to fix len field of the skb to fit the actual data size - * Since ethernet header already removed, the IP total length - * is exactly the data size (the skb is linear) - */ - skb->len = ip_len; - - frag_append(priv, session, skb, data_len); + frag_append(priv, session, mb, data_len); } else { new_session: session = start_session(ring, iph); if (unlikely(!session)) return -ENOSPC; - session->fragments = skb; - session->daddr = iph->daddr; - session->saddr = iph->saddr; - session->id = iph->id; - 
session->protocol = iph->protocol; + session->fragments = mb; + session->daddr = iph->ip_dst.s_addr; + session->saddr = iph->ip_src.s_addr; + session->id = iph->ip_id; + session->protocol = iph->ip_p; session->total_len = ip_len; session->offset = offset; - session->vlan = (priv->vlgrp && - (be32_to_cpu(cqe->vlan_my_qpn) & - MLX4_CQE_VLAN_PRESENT_MASK)) ? 1 : 0; - session->sl_vid = cqe->sl_vid; + for (; mb->m_next != NULL; mb = mb->m_next); + session->last = mb; } - if (!(ntohs(iph->frag_off) & IP_MF)) + if (!(ntohs(iph->ip_off) & IP_MF)) flush_session(priv, session, 0); - else if (session->fragments->len + priv->dev->mtu > 65536) - flush_session(priv, session, IP_MF); return 0; } Index: sys/ofed/drivers/net/mlx4/en_port.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_port.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/en_port.c (.../head) (revision 219811) @@ -32,15 +32,14 @@ */ +#include "mlx4_en.h" + #include #include #include -#include "en_port.h" -#include "mlx4_en.h" - int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode) { @@ -48,14 +47,11 @@ MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B); } -int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, struct vlan_group *grp) +int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, u32 *vlans) { struct mlx4_cmd_mailbox *mailbox; struct mlx4_set_vlan_fltr_mbox *filter; int i; - int j; - int index = 0; - u32 entry; int err = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); @@ -63,19 +59,10 @@ return PTR_ERR(mailbox); filter = mailbox->buf; - if (grp) { - memset(filter, 0, sizeof *filter); - for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) { - entry = 0; - for (j = 0; j < 32; j++) - if (vlan_group_get_device(grp, index++)) - entry |= 1 << j; - filter->entry[i] = cpu_to_be32(entry); - } - } else { - /* When no vlans are configured we block all vlans */ - memset(filter, 0, sizeof(*filter)); - } + memset(filter, 0, sizeof *filter); + if (vlans) + for (i = 0; i < VLAN_FLTR_SIZE; i ++) + filter->entry[i] = cpu_to_be32(vlans[i]); err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_VLAN_FLTR, MLX4_CMD_TIME_CLASS_B); mlx4_free_cmd_mailbox(dev, mailbox); @@ -223,15 +210,19 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) { struct mlx4_en_stat_out_mbox *mlx4_en_stats; - struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]); - struct net_device_stats *stats = &priv->stats; + struct net_device *dev; + struct mlx4_en_priv *priv; struct mlx4_cmd_mailbox *mailbox; u64 in_mod = reset << 8 | port; + unsigned long oerror; + unsigned long ierror; int err; int i; int counter; u64 counters[4]; + dev = mdev->pndev[port]; + priv = netdev_priv(dev); memset(counters, 0, sizeof counters); counter = mlx4_get_iboe_counter(priv->mdev->dev, port); if (counter >= 0) @@ -248,48 +239,45 @@ mlx4_en_stats = mailbox->buf; - spin_lock_bh(&priv->stats_lock); + spin_lock(&priv->stats_lock); - stats->rx_packets = counters[0]; - stats->rx_bytes = counters[2]; + oerror = ierror = 0; + dev->if_ipackets = counters[0]; + dev->if_ibytes = counters[2]; for (i = 0; i < priv->rx_ring_num; i++) { - stats->rx_packets += priv->rx_ring[i].packets; - stats->rx_bytes += priv->rx_ring[i].bytes; + dev->if_ipackets += priv->rx_ring[i].packets; + dev->if_ibytes += priv->rx_ring[i].bytes; + ierror += priv->rx_ring[i].errors; } - stats->tx_packets = counters[1]; - stats->tx_bytes = counters[3]; + dev->if_opackets = counters[1]; + dev->if_obytes = counters[3]; for (i = 0; i <= 
priv->tx_ring_num; i++) { - stats->tx_packets += priv->tx_ring[i].packets; - stats->tx_bytes += priv->tx_ring[i].bytes; + dev->if_opackets += priv->tx_ring[i].packets; + dev->if_obytes += priv->tx_ring[i].bytes; + oerror += priv->tx_ring[i].errors; } - stats->rx_errors = be64_to_cpu(mlx4_en_stats->PCS) + - be32_to_cpu(mlx4_en_stats->RdropLength) + - be32_to_cpu(mlx4_en_stats->RJBBR) + - be32_to_cpu(mlx4_en_stats->RCRC) + - be32_to_cpu(mlx4_en_stats->RRUNT); - stats->tx_errors = be32_to_cpu(mlx4_en_stats->TDROP); - stats->multicast = be64_to_cpu(mlx4_en_stats->MCAST_prio_0) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_1) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_2) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_3) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_4) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_5) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_6) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_7) + - be64_to_cpu(mlx4_en_stats->MCAST_novlan); - stats->collisions = 0; - stats->rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength); - stats->rx_over_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); - stats->rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC); - stats->rx_frame_errors = 0; - stats->rx_fifo_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); - stats->rx_missed_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); - stats->tx_aborted_errors = 0; - stats->tx_carrier_errors = 0; - stats->tx_fifo_errors = 0; - stats->tx_heartbeat_errors = 0; - stats->tx_window_errors = 0; + dev->if_ierrors = be32_to_cpu(mlx4_en_stats->RDROP) + ierror; + dev->if_oerrors = be32_to_cpu(mlx4_en_stats->TDROP) + oerror; + dev->if_imcasts = be64_to_cpu(mlx4_en_stats->MCAST_prio_0) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_1) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_2) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_3) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_4) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_5) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_6) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_7) + + be64_to_cpu(mlx4_en_stats->MCAST_novlan); + dev->if_omcasts = be64_to_cpu(mlx4_en_stats->TMCAST_prio_0) + + be64_to_cpu(mlx4_en_stats->TMCAST_prio_1) + + be64_to_cpu(mlx4_en_stats->TMCAST_prio_2) + + be64_to_cpu(mlx4_en_stats->TMCAST_prio_3) + + be64_to_cpu(mlx4_en_stats->TMCAST_prio_4) + + be64_to_cpu(mlx4_en_stats->TMCAST_prio_5) + + be64_to_cpu(mlx4_en_stats->TMCAST_prio_6) + + be64_to_cpu(mlx4_en_stats->TMCAST_prio_7) + + be64_to_cpu(mlx4_en_stats->TMCAST_novlan); + dev->if_collisions = 0; priv->pkstats.broadcast = be64_to_cpu(mlx4_en_stats->RBCAST_prio_0) + @@ -317,7 +305,7 @@ priv->pkstats.tx_prio[5] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5); priv->pkstats.tx_prio[6] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6); priv->pkstats.tx_prio[7] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7); - spin_unlock_bh(&priv->stats_lock); + spin_unlock(&priv->stats_lock); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); Index: sys/ofed/drivers/net/mlx4/en_port.h =================================================================== --- sys/ofed/drivers/net/mlx4/en_port.h (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/en_port.h (.../head) (revision 219811) @@ -586,5 +586,6 @@ __be32 TDROP; }; +enum mlx4_query_reply mlx4_en_query(void *endev_ptr, void *int_dev); #endif Index: sys/ofed/drivers/net/mlx4/mr.c =================================================================== --- sys/ofed/drivers/net/mlx4/mr.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/mr.c (.../head) (revision 219811) @@ -484,7 +484,7 @@ return -ENOMEM; for (i = 0; i < 
buf->npages; ++i) - if (buf->nbufs == 1) + if (buf->direct.map) page_list[i] = buf->direct.map + (i << buf->page_shift); else page_list[i] = buf->page_list[i].map; Index: sys/ofed/drivers/net/mlx4/catas.c =================================================================== --- sys/ofed/drivers/net/mlx4/catas.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/catas.c (.../head) (revision 219811) @@ -35,9 +35,7 @@ #include "mlx4.h" -enum { - MLX4_CATAS_POLL_INTERVAL = 5 * HZ, -}; +#define MLX4_CATAS_POLL_INTERVAL (5 * HZ) static DEFINE_SPINLOCK(catas_lock); Index: sys/ofed/drivers/net/mlx4/en_ethtool.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_ethtool.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/en_ethtool.c (.../head) (revision 219811) @@ -390,7 +390,7 @@ priv->prof->tx_pause = pause->tx_pause != 0; priv->prof->rx_pause = pause->rx_pause != 0; err = mlx4_SET_PORT_general(mdev->dev, priv->port, - priv->rx_skb_size + ETH_FCS_LEN, + priv->rx_mb_size + ETH_FCS_LEN, priv->prof->tx_pause, priv->prof->tx_ppp, priv->prof->rx_pause, Index: sys/ofed/drivers/net/mlx4/cmd.c =================================================================== --- sys/ofed/drivers/net/mlx4/cmd.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/cmd.c (.../head) (revision 219811) @@ -322,7 +322,7 @@ int out_is_imm, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout) { - if (mlx4_priv(dev)->cmd.use_events) + if (mlx4_priv(dev)->cmd.use_events && !cold) return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm, in_modifier, op_modifier, op, timeout); else Index: sys/ofed/drivers/net/mlx4/srq.c =================================================================== --- sys/ofed/drivers/net/mlx4/srq.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/srq.c (.../head) (revision 219811) @@ -34,6 +34,7 @@ #include #include +#include #include "mlx4.h" #include "icm.h" Index: sys/ofed/drivers/net/mlx4/en_main.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_main.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/en_main.c (.../head) (revision 219811) @@ -31,7 +31,6 @@ * */ -#include #include #include #include @@ -162,6 +161,8 @@ mlx4_mr_free(dev, &mdev->mr); mlx4_uar_free(dev, &mdev->priv_uar); mlx4_pd_free(dev, mdev->priv_pdn); + sx_destroy(&mdev->state_lock.sx); + mtx_destroy(&mdev->uar_lock.m); kfree(mdev); } @@ -191,10 +192,10 @@ if (mlx4_uar_alloc(dev, &mdev->priv_uar)) goto err_pd; + mtx_init(&mdev->uar_lock.m, "mlx4 uar", NULL, MTX_DEF); mdev->uar_map = ioremap(mdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!mdev->uar_map) goto err_uar; - spin_lock_init(&mdev->uar_lock); mdev->dev = dev; mdev->dma_device = &(dev->pdev->dev); @@ -253,7 +254,7 @@ /* At this stage all non-port specific tasks are complete: * mark the card state as up */ - mutex_init(&mdev->state_lock); + sx_init(&mdev->state_lock.sx, "mlxen state"); mdev->device_up = true; /* Setup ports */ @@ -286,6 +287,7 @@ err_mr: mlx4_mr_free(dev, &mdev->mr); err_uar: + mtx_destroy(&mdev->uar_lock.m); mlx4_uar_free(dev, &mdev->priv_uar); err_pd: mlx4_pd_free(dev, mdev->priv_pdn); @@ -308,6 +310,7 @@ return MLX4_QUERY_NOT_MINE; } +#if 0 static struct pci_device_id mlx4_en_pci_table[] = { { PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */ { PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */ @@ -342,6 +345,7 @@ }; MODULE_DEVICE_TABLE(pci, mlx4_en_pci_table); +#endif static struct 
mlx4_interface mlx4_en_interface = { .add = mlx4_en_add, @@ -365,3 +369,16 @@ module_init(mlx4_en_init); module_exit(mlx4_en_cleanup); +#undef MODULE_VERSION +#include <sys/module.h> +static int +mlxen_evhand(module_t mod, int event, void *arg) +{ + return (0); +} +static moduledata_t mlxen_mod = { + .name = "mlxen", + .evhand = mlxen_evhand, +}; +DECLARE_MODULE(mlxen, mlxen_mod, SI_SUB_SMP, SI_ORDER_ANY); +MODULE_DEPEND(mlxen, mlx4, 1, 1, 1); Index: sys/ofed/drivers/net/mlx4/en_netdev.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_netdev.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/en_netdev.c (.../head) (revision 219811) @@ -31,85 +31,62 @@ * */ -#include -#include -#include -#include +#include "mlx4_en.h" #include #include #include #include -#include "mlx4_en.h" -#include "en_port.h" +#include +#include +#include +#include +static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv); -static void mlx4_en_vlan_rx_register(struct net_device *dev, struct vlan_group *grp) +static void mlx4_en_vlan_rx_add_vid(void *arg, struct net_device *dev, u16 vid) { struct mlx4_en_priv *priv = netdev_priv(dev); - - en_dbg(HW, priv, "Registering VLAN group:%p\n", grp); - - spin_lock_bh(&priv->vlan_lock); - priv->vlgrp = grp; - priv->vlgrp_modified = true; - spin_unlock_bh(&priv->vlan_lock); -} - -static void mlx4_en_vlan_rx_add_vid(struct net_device *dev, unsigned short vid) -{ - struct mlx4_en_priv *priv = netdev_priv(dev); int idx; u8 field; -#ifndef HAVE_NETDEV_VLAN_FEATURES - struct net_device *vdev; -#endif - if (!priv->vlgrp) + if ((vid == 0) || (vid > 4095)) /* Invalid */ return; - en_dbg(HW, priv, "adding VLAN:%d (vlgrp entry:%p)\n", - vid, vlan_group_get_device(priv->vlgrp, vid)); + en_dbg(HW, priv, "adding VLAN:%d\n", vid); - spin_lock_bh(&priv->vlan_lock); + spin_lock(&priv->vlan_lock); priv->vlgrp_modified = true; - idx = vid >> 3; - field = 1 << (vid & 0x7); + idx = vid >> 5; + field = 1 << (vid & 0x1f); if (priv->vlan_unregister[idx] & field) priv->vlan_unregister[idx] &= ~field; else priv->vlan_register[idx] |= field; - spin_unlock_bh(&priv->vlan_lock); -#ifndef HAVE_NETDEV_VLAN_FEATURES - vdev = vlan_group_get_device(priv->vlgrp, vid); - vdev->features |= dev->features; - vdev->features |= NETIF_F_LLTX; - vlan_group_set_device(priv->vlgrp, vid, vdev); -#endif + priv->vlans[idx] |= field; + spin_unlock(&priv->vlan_lock); } -static void mlx4_en_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid) +static void mlx4_en_vlan_rx_kill_vid(void *arg, struct net_device *dev, u16 vid) { struct mlx4_en_priv *priv = netdev_priv(dev); int idx; u8 field; - if (!priv->vlgrp) + if ((vid == 0) || (vid > 4095)) /* Invalid */ return; - - en_dbg(HW, priv, "Killing VID:%d (vlgrp:%p vlgrp entry:%p)\n", - vid, priv->vlgrp, vlan_group_get_device(priv->vlgrp, vid)); - spin_lock_bh(&priv->vlan_lock); + en_dbg(HW, priv, "Killing VID:%d\n", vid); + spin_lock(&priv->vlan_lock); priv->vlgrp_modified = true; - vlan_group_set_device(priv->vlgrp, vid, NULL); - idx = vid >> 3; - field = 1 << (vid & 0x7); + idx = vid >> 5; + field = 1 << (vid & 0x1f); if (priv->vlan_register[idx] & field) priv->vlan_register[idx] &= ~field; else priv->vlan_unregister[idx] |= field; - spin_unlock_bh(&priv->vlan_lock); + priv->vlans[idx] &= ~field; + spin_unlock(&priv->vlan_lock); } u64 mlx4_en_mac_to_u64(u8 *addr) @@ -117,86 +94,58 @@ u64 mac = 0; int i; - for (i = 0; i < ETH_ALEN; i++) { + for (i = 0; i < ETHER_ADDR_LEN; i++) { mac <<= 8; mac |= addr[i]; } return mac; } 
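/*
 * Aside: the reworked VLAN filter above replaces the Linux vlan_group
 * lookups with a plain bitmap, one bit per VLAN ID packed into 32-bit
 * words, so a VID maps to word (vid >> 5) and mask (1 << (vid & 0x1f))
 * rather than the old 8-bit (vid >> 3, vid & 0x7) layout. A minimal
 * sketch of that indexing (illustrative only, not part of the patch;
 * vlan_set/vlan_test are hypothetical names, and a 4096-entry table is
 * assumed):
 *
 *	static void vlan_set(u32 *vlans, u16 vid)
 *	{
 *		vlans[vid >> 5] |= 1U << (vid & 0x1f);
 *	}
 *
 *	static int vlan_test(const u32 *vlans, u16 vid)
 *	{
 *		return ((vlans[vid >> 5] >> (vid & 0x1f)) & 1);
 *	}
 */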
-static int mlx4_en_set_mac(struct net_device *dev, void *addr) +static int mlx4_en_cache_mclist(struct net_device *dev, u64 **mcaddrp) { - struct mlx4_en_priv *priv = netdev_priv(dev); - struct mlx4_en_dev *mdev = priv->mdev; - struct sockaddr *saddr = addr; + struct ifmultiaddr *ifma; + u64 *mcaddr; + int cnt; + int i; - if (!is_valid_ether_addr(saddr->sa_data)) - return -EADDRNOTAVAIL; - - memcpy(dev->dev_addr, saddr->sa_data, ETH_ALEN); - priv->mac = mlx4_en_mac_to_u64(dev->dev_addr); - queue_work(mdev->workqueue, &priv->mac_task); - return 0; -} - -static void mlx4_en_do_set_mac(struct work_struct *work) -{ - struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, - mac_task); - struct mlx4_en_dev *mdev = priv->mdev; - int err = 0; - - mutex_lock(&mdev->state_lock); - if (priv->port_up) { - /* Remove old MAC and insert the new one */ - mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index); - err = mlx4_register_mac(mdev->dev, priv->port, - priv->mac, &priv->mac_index); - if (err) - en_err(priv, "Failed changing HW MAC address\n"); - } else - en_dbg(HW, priv, "Port is down while " - "registering mac, exiting...\n"); - - mutex_unlock(&mdev->state_lock); -} - -static void mlx4_en_clear_list(struct net_device *dev) -{ - struct mlx4_en_priv *priv = netdev_priv(dev); - struct dev_mc_list *plist = priv->mc_list; - struct dev_mc_list *next; - - while (plist) { - next = plist->next; - kfree(plist); - plist = next; + *mcaddrp = NULL; +restart: + cnt = 0; + if_maddr_rlock(dev); + TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_LINK) + continue; + if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen != + ETHER_ADDR_LEN) + continue; + cnt++; } - priv->mc_list = NULL; -} - -static void mlx4_en_cache_mclist(struct net_device *dev) -{ - struct mlx4_en_priv *priv = netdev_priv(dev); - struct dev_mc_list *mclist; - struct dev_mc_list *tmp; - struct dev_mc_list *plist = NULL; - - for (mclist = dev->mc_list; mclist; mclist = mclist->next) { - tmp = kmalloc(sizeof(struct dev_mc_list), GFP_ATOMIC); - if (!tmp) { - en_err(priv, "failed to allocate multicast list\n"); - mlx4_en_clear_list(dev); - return; + if_maddr_runlock(dev); + if (cnt == 0) + return (0); + mcaddr = kmalloc(sizeof(u64) * cnt, GFP_KERNEL); + if (mcaddr == NULL) + return (0); + i = 0; + if_maddr_rlock(dev); + TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_LINK) + continue; + if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen != + ETHER_ADDR_LEN) + continue; + /* Make sure the list didn't grow. 
*/ + if (i == cnt) { + if_maddr_runlock(dev); + kfree(mcaddr); + goto restart; } - memcpy(tmp, mclist, sizeof(struct dev_mc_list)); - tmp->next = NULL; - if (plist) - plist->next = tmp; - else - priv->mc_list = tmp; - plist = tmp; + mcaddr[i++] = mlx4_en_mac_to_u64( + LLADDR((struct sockaddr_dl *)ifma->ifma_addr)); } + if_maddr_runlock(dev); + *mcaddrp = mcaddr; + return (i); } @@ -214,10 +163,8 @@ { struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, mcast_task); + struct net_device *dev = priv->dev; struct mlx4_en_dev *mdev = priv->mdev; - struct net_device *dev = priv->dev; - struct dev_mc_list *mclist; - u64 mcast_addr = 0; int err; mutex_lock(&mdev->state_lock); @@ -236,10 +183,8 @@ * Promsicuous mode: disable all filters */ - if (dev->flags & IFF_PROMISC) { + if (dev->if_flags & IFF_PROMISC) { if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) { - if (netif_msg_rx_status(priv)) - en_warn(priv, "Entering promiscuous mode\n"); priv->flags |= MLX4_EN_FLAG_PROMISC; /* Enable promiscouos mode */ @@ -269,8 +214,6 @@ */ if (priv->flags & MLX4_EN_FLAG_PROMISC) { - if (netif_msg_rx_status(priv)) - en_warn(priv, "Leaving promiscuous mode\n"); priv->flags &= ~MLX4_EN_FLAG_PROMISC; /* Disable promiscouos mode */ @@ -280,18 +223,22 @@ en_err(priv, "Failed disabling promiscous mode\n"); /* Enable port VLAN filter */ - err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlgrp); + err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlans); if (err) en_err(priv, "Failed enabling VLAN filter\n"); } /* Enable/disable the multicast filter according to IFF_ALLMULTI */ - if (dev->flags & IFF_ALLMULTI) { + if (dev->if_flags & IFF_ALLMULTI) { err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_DISABLE); if (err) en_err(priv, "Failed disabling multicast filter\n"); } else { + u64 *mcaddr; + int mccount; + int i; + err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_DISABLE); if (err) @@ -303,20 +250,16 @@ /* Update multicast list - we cache all addresses so they won't * change while HW is updated holding the command semaphor */ - netif_tx_lock_bh(dev); - mlx4_en_cache_mclist(dev); - netif_tx_unlock_bh(dev); - for (mclist = priv->mc_list; mclist; mclist = mclist->next) { - mcast_addr = mlx4_en_mac_to_u64(mclist->dmi_addr); + mccount = mlx4_en_cache_mclist(dev, &mcaddr); + for (i = 0; i < mccount; i++) mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, - mcast_addr, 0, MLX4_MCAST_CONFIG); - } + mcaddr[i], 0, MLX4_MCAST_CONFIG); err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_ENABLE); if (err) en_err(priv, "Failed enabling multicast filter\n"); - mlx4_en_clear_list(dev); + kfree(mcaddr); } out: mutex_unlock(&mdev->state_lock); @@ -337,37 +280,26 @@ if (priv->rx_ring[i].use_frags) mlx4_en_process_rx_cq(dev, cq, 0); else - mlx4_en_process_rx_cq_skb(dev, cq, 0); + mlx4_en_process_rx_cq_mb(dev, cq, 0); spin_unlock_irqrestore(&cq->lock, flags); } } #endif -static void mlx4_en_tx_timeout(struct net_device *dev) +static void mlx4_en_watchdog_timeout(void *arg) { - struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_priv *priv = arg; struct mlx4_en_dev *mdev = priv->mdev; - if (netif_msg_timer(priv)) - en_warn(priv, "Tx timeout called on port:%d\n", priv->port); - - priv->port_stats.tx_timeout++; en_dbg(DRV, priv, "Scheduling watchdog\n"); queue_work(mdev->workqueue, &priv->watchdog_task); + if (priv->port_up) + callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT, + mlx4_en_watchdog_timeout, priv); } -static struct net_device_stats 
*mlx4_en_get_stats(struct net_device *dev) -{ - struct mlx4_en_priv *priv = netdev_priv(dev); - - spin_lock_bh(&priv->stats_lock); - memcpy(&priv->ret_stats, &priv->stats, sizeof(priv->stats)); - spin_unlock_bh(&priv->stats_lock); - - return &priv->ret_stats; -} - +/* XXX This clears user settings in too many cases. */ static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv) { struct mlx4_en_cq *cq; @@ -379,11 +311,11 @@ * satisfy our coelsing target. * - moder_time is set to a fixed value. */ - priv->rx_frames = MLX4_EN_RX_COAL_TARGET / priv->dev->mtu + 1; + priv->rx_frames = MLX4_EN_RX_COAL_TARGET / priv->dev->if_mtu + 1; priv->rx_usecs = MLX4_EN_RX_COAL_TIME; - en_dbg(INTR, priv, "Default coalesing params for mtu:%d - " + en_dbg(INTR, priv, "Default coalesing params for mtu:%ld - " "rx_frames:%d rx_usecs:%d\n", - priv->dev->mtu, priv->rx_frames, priv->rx_usecs); + priv->dev->if_mtu, priv->rx_frames, priv->rx_usecs); /* Setup cq moderation params */ for (i = 0; i < priv->rx_ring_num; i++) { @@ -430,11 +362,11 @@ if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ) return; - spin_lock_bh(&priv->stats_lock); - rx_packets = priv->stats.rx_packets; - rx_bytes = priv->stats.rx_bytes; - tx_packets = priv->stats.tx_packets; - spin_unlock_bh(&priv->stats_lock); + spin_lock(&priv->stats_lock); + rx_packets = priv->dev->if_ipackets; + rx_bytes = priv->dev->if_ibytes; + tx_packets = priv->dev->if_opackets; + spin_unlock(&priv->stats_lock); if (!priv->last_moder_jiffies || !period) goto out; @@ -505,32 +437,32 @@ static void mlx4_en_handle_vlans(struct mlx4_en_priv *priv) { - u8 vlan_register[MLX4_VLREG_SIZE]; - u8 vlan_unregister[MLX4_VLREG_SIZE]; + u8 vlan_register[VLAN_FLTR_SIZE]; + u8 vlan_unregister[VLAN_FLTR_SIZE]; int i, j, idx; u16 vid; /* cache the vlan data for processing * done under lock to avoid changes during work */ - spin_lock_bh(&priv->vlan_lock); - for (i = 0; i < MLX4_VLREG_SIZE; i++) { + spin_lock(&priv->vlan_lock); + for (i = 0; i < VLAN_FLTR_SIZE; i++) { vlan_register[i] = priv->vlan_register[i]; priv->vlan_register[i] = 0; vlan_unregister[i] = priv->vlan_unregister[i]; priv->vlan_unregister[i] = 0; } priv->vlgrp_modified = false; - spin_unlock_bh(&priv->vlan_lock); + spin_unlock(&priv->vlan_lock); /* Configure the vlan filter * The vlgrp is updated with all the vids that need to be allowed */ - if (mlx4_SET_VLAN_FLTR(priv->mdev->dev, priv->port, priv->vlgrp)) + if (mlx4_SET_VLAN_FLTR(priv->mdev->dev, priv->port, priv->vlans)) en_err(priv, "Failed configuring VLAN filter\n"); /* Configure the VLAN table */ - for (i = 0; i < MLX4_VLREG_SIZE; i++) { - for (j = 0; j < 8; j++) { - vid = (i << 3) + j; + for (i = 0; i < VLAN_FLTR_SIZE; i++) { + for (j = 0; j < 32; j++) { + vid = (i << 5) + j; if (vlan_register[i] & (1 << j)) if (mlx4_register_vlan(priv->mdev->dev, priv->port, vid, &idx)) en_dbg(HW, priv, "failed registering vlan %d\n", vid); @@ -569,7 +501,8 @@ queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); } if (mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port]) { - queue_work(mdev->workqueue, &priv->mac_task); + panic("mlx4_en_do_get_stats: Unexpected mac removed for %d\n", + priv->port); mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port] = 0; } mutex_unlock(&mdev->state_lock); @@ -587,11 +520,10 @@ * report to system log */ if (priv->last_link_state != linkstate) { if (linkstate == MLX4_DEV_EVENT_PORT_DOWN) { - en_info(priv, "Link Down\n"); - netif_carrier_off(priv->dev); + if_link_state_change(priv->dev, LINK_STATE_DOWN); } else 
{ en_info(priv, "Link Up\n"); - netif_carrier_on(priv->dev); + if_link_state_change(priv->dev, LINK_STATE_UP); } } priv->last_link_state = linkstate; @@ -617,9 +549,9 @@ } /* Calculate Rx buf size */ - dev->mtu = min(dev->mtu, priv->max_mtu); + dev->if_mtu = min(dev->if_mtu, priv->max_mtu); mlx4_en_calc_rx_buf(dev); - en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_skb_size); + en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_mb_size); /* Configure rx cq's and rings */ err = mlx4_en_activate_rx_rings(priv); @@ -689,7 +621,7 @@ /* Configure port */ err = mlx4_SET_PORT_general(mdev->dev, priv->port, - priv->rx_skb_size + ETH_FCS_LEN, + priv->rx_mb_size + ETHER_CRC_LEN, priv->prof->tx_pause, priv->prof->tx_ppp, priv->prof->rx_pause, @@ -708,7 +640,8 @@ /* Set port mac number */ en_dbg(DRV, priv, "Setting mac for port %d\n", priv->port); err = mlx4_register_mac(mdev->dev, priv->port, - priv->mac, &priv->mac_index); + mlx4_en_mac_to_u64(IF_LLADDR(dev)), + &priv->mac_index); if (err) { en_err(priv, "Failed setting port mac\n"); goto tx_err; @@ -723,11 +656,29 @@ goto mac_err; } - /* Schedule multicast task to populate multicast list */ - queue_work(mdev->workqueue, &priv->mcast_task); + /* Set the various hardware offload abilities */ + dev->if_hwassist = 0; + if (dev->if_capenable & IFCAP_TSO4) + dev->if_hwassist |= CSUM_TSO; + if (dev->if_capenable & IFCAP_TXCSUM) + dev->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP); + if (dev->if_capenable & IFCAP_RXCSUM) + priv->rx_csum = 1; + else + priv->rx_csum = 0; priv->port_up = true; - netif_tx_start_all_queues(dev); + + /* Populate multicast list */ + mlx4_en_set_multicast(dev); + + /* Enable the queues. */ + atomic_clear_int(&dev->if_drv_flags, IFF_DRV_OACTIVE); + atomic_set_int(&dev->if_drv_flags, IFF_DRV_RUNNING); + + callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT, + mlx4_en_watchdog_timeout, priv); + return 0; mac_err: @@ -760,11 +711,6 @@ return; } - /* Synchronize with tx routine */ - netif_tx_lock_bh(dev); - netif_tx_stop_all_queues(dev); - netif_tx_unlock_bh(dev); - /* Set port as not active */ priv->port_up = false; @@ -788,13 +734,15 @@ /* Free RX Rings */ for (i = 0; i < priv->rx_ring_num; i++) { mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]); - while (test_bit(NAPI_STATE_SCHED, &priv->rx_cq[i].napi.state)) - msleep(1); mlx4_en_deactivate_cq(priv, &priv->rx_cq[i]); } /* close port*/ mlx4_CLOSE_PORT(mdev->dev, priv->port); + + callout_stop(&priv->watchdog_timer); + + atomic_clear_int(&dev->if_drv_flags, IFF_DRV_RUNNING); } static void mlx4_en_restart(struct work_struct *work) @@ -803,7 +751,21 @@ watchdog_task); struct mlx4_en_dev *mdev = priv->mdev; struct net_device *dev = priv->dev; + struct mlx4_en_tx_ring *ring; + int i; + if (priv->blocked == 0 || priv->port_up == 0) + return; + for (i = 0; i < priv->tx_ring_num; i++) { + ring = &priv->tx_ring[i]; + if (ring->blocked && + ring->watchdog_time + MLX4_EN_WATCHDOG_TIMEOUT < ticks) + goto reset; + } + return; + +reset: + priv->port_stats.tx_timeout++; en_dbg(DRV, priv, "Watchdog task called for port %d\n", priv->port); mutex_lock(&mdev->state_lock); @@ -816,18 +778,23 @@ } -static int mlx4_en_open(struct net_device *dev) +static void +mlx4_en_init(void *arg) { - struct mlx4_en_priv *priv = netdev_priv(dev); - struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_priv *priv; + struct mlx4_en_dev *mdev; + struct ifnet *dev; int i; - int err = 0; + priv = arg; + dev = priv->dev; + mdev = priv->mdev; mutex_lock(&mdev->state_lock); + if (dev->if_drv_flags & IFF_DRV_RUNNING) 
+ mlx4_en_stop_port(dev); if (!mdev->device_up) { en_err(priv, "Cannot open - device down/disabled\n"); - err = -EBUSY; goto out; } @@ -835,7 +802,6 @@ if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1)) en_dbg(HW, priv, "Failed dumping statistics\n"); - memset(&priv->stats, 0, sizeof(priv->stats)); memset(&priv->pstats, 0, sizeof(priv->pstats)); for (i = 0; i < priv->tx_ring_num; i++) { @@ -848,32 +814,13 @@ } mlx4_en_set_default_moderation(priv); - err = mlx4_en_start_port(dev); - if (err) + if (mlx4_en_start_port(dev)) en_err(priv, "Failed starting port:%d\n", priv->port); out: mutex_unlock(&mdev->state_lock); - return err; } - -static int mlx4_en_close(struct net_device *dev) -{ - struct mlx4_en_priv *priv = netdev_priv(dev); - struct mlx4_en_dev *mdev = priv->mdev; - - en_dbg(IFDOWN, priv, "Close port called\n"); - - mutex_lock(&mdev->state_lock); - - mlx4_en_stop_port(dev); - netif_carrier_off(dev); - - mutex_unlock(&mdev->state_lock); - return 0; -} - void mlx4_en_free_resources(struct mlx4_en_priv *priv) { int i; @@ -891,6 +838,10 @@ if (priv->rx_cq[i].buf) mlx4_en_destroy_cq(priv, &priv->rx_cq[i]); } + /* Free the stats tree when we resize the rings. */ + if (priv->sysctl) + sysctl_ctx_free(&priv->stat_ctx); + } int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) @@ -924,6 +875,13 @@ goto err; } + /* Re-create stat sysctls in case the number of rings changed. */ + mlx4_en_sysctl_stat(priv); + + /* Populate Tx priority mappings */ + mlx4_en_set_prio_map(priv, priv->tx_prio_map, + prof->tx_ring_num - MLX4_EN_NUM_HASH_RINGS); + return 0; err: @@ -939,13 +897,21 @@ en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port); + if (priv->vlan_attach != NULL) + EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach); + if (priv->vlan_detach != NULL) + EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach); + /* Unregister device - this will close the port if it was up */ if (priv->registered) - unregister_netdev(dev); + ether_ifdetach(dev); if (priv->allocated) mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE); + if (priv->sysctl) + sysctl_ctx_free(&priv->conf_ctx); + cancel_delayed_work(&priv->stats_task); /* flush any pending task for this netdev */ flush_workqueue(mdev->workqueue); @@ -956,7 +922,10 @@ mutex_unlock(&mdev->state_lock); mlx4_en_free_resources(priv); - free_netdev(dev); + mtx_destroy(&priv->stats_lock.m); + mtx_destroy(&priv->vlan_lock.m); + kfree(priv); + if_free(dev); } static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) @@ -965,17 +934,16 @@ struct mlx4_en_dev *mdev = priv->mdev; int err = 0; - en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n", - dev->mtu, new_mtu); + en_dbg(DRV, priv, "Change MTU called - current:%ld new:%d\n", + dev->if_mtu, new_mtu); if ((new_mtu < MLX4_EN_MIN_MTU) || (new_mtu > priv->max_mtu)) { en_err(priv, "Bad MTU size:%d.\n", new_mtu); return -EPERM; } - dev->mtu = new_mtu; - - if (netif_running(dev)) { - mutex_lock(&mdev->state_lock); + mutex_lock(&mdev->state_lock); + dev->if_mtu = new_mtu; + if (dev->if_drv_flags & IFF_DRV_RUNNING) { if (!mdev->device_up) { /* NIC is probably restarting - let watchdog task reset * the port */ @@ -990,53 +958,455 @@ queue_work(mdev->workqueue, &priv->watchdog_task); } } - mutex_unlock(&mdev->state_lock); } + mutex_unlock(&mdev->state_lock); return 0; } -static const struct net_device_ops mlx4_netdev_ops = { - .ndo_open = mlx4_en_open, - .ndo_stop = mlx4_en_close, - .ndo_start_xmit = mlx4_en_xmit, - .ndo_select_queue = mlx4_en_select_queue, - .ndo_get_stats = 
mlx4_en_get_stats, - .ndo_set_multicast_list = mlx4_en_set_multicast, - .ndo_set_mac_address = mlx4_en_set_mac, - .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = mlx4_en_change_mtu, - .ndo_tx_timeout = mlx4_en_tx_timeout, - .ndo_vlan_rx_register = mlx4_en_vlan_rx_register, - .ndo_vlan_rx_add_vid = mlx4_en_vlan_rx_add_vid, - .ndo_vlan_rx_kill_vid = mlx4_en_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = mlx4_en_netpoll, +static int mlx4_en_calc_media(struct mlx4_en_priv *priv) +{ + int trans_type; + int active; + + active = IFM_ETHER; + if (priv->last_link_state == MLX4_DEV_EVENT_PORT_DOWN) + return (active); + if (mlx4_en_QUERY_PORT(priv->mdev, priv->port)) + return (active); + active |= IFM_FDX; + trans_type = priv->port_state.transciver; + /* XXX I don't know all of the transceiver values. */ + if (priv->port_state.link_speed == 1000) + active |= IFM_1000_T; + else if (trans_type > 0 && trans_type <= 0xC) + active |= IFM_10G_SR; + else if (trans_type == 0x80 || trans_type == 0) + active |= IFM_10G_CX4; + if (priv->prof->tx_pause) + active |= IFM_ETH_TXPAUSE; + if (priv->prof->rx_pause) + active |= IFM_ETH_RXPAUSE; + + return (active); +} + + +static void mlx4_en_media_status(struct ifnet *dev, struct ifmediareq *ifmr) +{ + struct mlx4_en_priv *priv; + + priv = dev->if_softc; + ifmr->ifm_status = IFM_AVALID; + if (priv->last_link_state != MLX4_DEV_EVENT_PORT_DOWN) + ifmr->ifm_status |= IFM_ACTIVE; + ifmr->ifm_active = mlx4_en_calc_media(priv); + + return; +} + +static int mlx4_en_media_change(struct ifnet *dev) +{ + struct mlx4_en_priv *priv; + struct ifmedia *ifm; + int rxpause; + int txpause; + int error; + + priv = dev->if_softc; + ifm = &priv->media; + rxpause = txpause = 0; + error = 0; + + if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) + return (EINVAL); + switch (IFM_SUBTYPE(ifm->ifm_media)) { + case IFM_AUTO: + break; + case IFM_10G_SR: + case IFM_10G_CX4: + case IFM_1000_T: + if (IFM_SUBTYPE(ifm->ifm_media) == + IFM_SUBTYPE(mlx4_en_calc_media(priv)) && + (ifm->ifm_media & IFM_FDX)) + break; + /* Fallthrough */ + default: + printf("%s: Only auto media type\n", if_name(dev)); + return (EINVAL); + } + /* Allow user to set/clear pause */ + if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_RXPAUSE) + rxpause = 1; + if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE) + txpause = 1; + if (priv->prof->tx_pause != txpause || priv->prof->rx_pause != rxpause) { + priv->prof->tx_pause = txpause; + priv->prof->rx_pause = rxpause; + error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port, + priv->rx_mb_size + ETHER_CRC_LEN, priv->prof->tx_pause, + priv->prof->tx_ppp, priv->prof->rx_pause, + priv->prof->rx_ppp); + } + return (error); +} + +static int mlx4_en_ioctl(struct ifnet *dev, u_long command, caddr_t data) +{ + struct mlx4_en_priv *priv; + struct mlx4_en_dev *mdev; + struct ifreq *ifr; + int error; + int mask; + + error = 0; + mask = 0; + priv = dev->if_softc; + mdev = priv->mdev; + ifr = (struct ifreq *) data; + switch (command) { + case SIOCSIFMTU: + error = -mlx4_en_change_mtu(dev, ifr->ifr_mtu); + break; + case SIOCSIFFLAGS: + if (dev->if_flags & IFF_UP) { + if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) { + mutex_lock(&mdev->state_lock); + mlx4_en_start_port(dev); + mutex_unlock(&mdev->state_lock); + } else + mlx4_en_set_multicast(dev); + } else { + mutex_lock(&mdev->state_lock); + if (dev->if_drv_flags & IFF_DRV_RUNNING) { + mlx4_en_stop_port(dev); + if_link_state_change(dev, LINK_STATE_DOWN); + } + mutex_unlock(&mdev->state_lock); + } + break; + 
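/*
 * Aside: FreeBSD drivers have no separate open/close entry points like
 * the Linux ndo_open/ndo_stop pair being removed here; IFF_UP
 * transitions are handled in the SIOCSIFFLAGS ioctl case above, which
 * starts or stops the port as needed. The canonical shape of that case
 * (illustrative only, not part of the patch; the foo_* names are
 * hypothetical):
 *
 *	case SIOCSIFFLAGS:
 *		if (ifp->if_flags & IFF_UP) {
 *			if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 *				foo_init(sc);	-- bring the port up
 *		} else if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 *			foo_stop(sc);		-- bring the port down
 *		break;
 */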
case SIOCADDMULTI: + case SIOCDELMULTI: + mlx4_en_set_multicast(dev); + break; + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + error = ifmedia_ioctl(dev, ifr, &priv->media, command); + break; + case SIOCSIFCAP: + mask = ifr->ifr_reqcap ^ dev->if_capenable; + if (mask & IFCAP_HWCSUM) + dev->if_capenable ^= IFCAP_HWCSUM; + if (mask & IFCAP_TSO4) + dev->if_capenable ^= IFCAP_TSO4; + if (mask & IFCAP_LRO) + dev->if_capenable ^= IFCAP_LRO; + if (mask & IFCAP_VLAN_HWTAGGING) + dev->if_capenable ^= IFCAP_VLAN_HWTAGGING; + if (mask & IFCAP_VLAN_HWFILTER) + dev->if_capenable ^= IFCAP_VLAN_HWFILTER; + if (dev->if_drv_flags & IFF_DRV_RUNNING) + mlx4_en_init(priv); + VLAN_CAPABILITIES(dev); + break; + default: + error = ether_ioctl(dev, command, data); + break; + } + + return (error); +} + +static int mlx4_en_set_ring_size(struct net_device *dev, + int rx_size, int tx_size) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int port_up = 0; + int err = 0; + + rx_size = roundup_pow_of_two(rx_size); + rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE); + rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE); + tx_size = roundup_pow_of_two(tx_size); + tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE); + tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE); + + if (rx_size == (priv->port_up ? + priv->rx_ring[0].actual_size : priv->rx_ring[0].size) && + tx_size == priv->tx_ring[0].size) + return 0; + + mutex_lock(&mdev->state_lock); + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev); + } + mlx4_en_free_resources(priv); + priv->prof->tx_ring_size = tx_size; + priv->prof->rx_ring_size = rx_size; + err = mlx4_en_alloc_resources(priv); + if (err) { + en_err(priv, "Failed reallocating port resources\n"); + goto out; + } + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) + en_err(priv, "Failed starting port\n"); + } +out: + mutex_unlock(&mdev->state_lock); + return err; +} + +static int mlx4_en_set_rx_ring_size(SYSCTL_HANDLER_ARGS) +{ + struct mlx4_en_priv *priv; + int size; + int error; + + priv = arg1; + size = priv->prof->rx_ring_size; + error = sysctl_handle_int(oidp, &size, 0, req); + if (error || !req->newptr) + return (error); + error = -mlx4_en_set_ring_size(priv->dev, size, + priv->prof->tx_ring_size); + + return (error); +} + +static int mlx4_en_set_tx_ring_size(SYSCTL_HANDLER_ARGS) +{ + struct mlx4_en_priv *priv; + int size; + int error; + + priv = arg1; + size = priv->prof->tx_ring_size; + error = sysctl_handle_int(oidp, &size, 0, req); + if (error || !req->newptr) + return (error); + error = -mlx4_en_set_ring_size(priv->dev, priv->prof->rx_ring_size, + size); + + return (error); +} + +static void mlx4_en_sysctl_conf(struct mlx4_en_priv *priv) +{ + struct net_device *dev; + struct sysctl_ctx_list *ctx; + struct sysctl_oid *node; + struct sysctl_oid_list *node_list; + struct sysctl_oid *coal; + struct sysctl_oid_list *coal_list; + + dev = priv->dev; + ctx = &priv->conf_ctx; + + sysctl_ctx_init(ctx); + priv->sysctl = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw), + OID_AUTO, dev->if_xname, CTLFLAG_RD, 0, "mlx4 10gig ethernet"); + node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO, + "conf", CTLFLAG_RD, NULL, "Configuration"); + node_list = SYSCTL_CHILDREN(node); + + SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "msg_enable", + CTLFLAG_RW, &priv->msg_enable, 0, + "Driver message enable bitfield"); + SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_rings", + CTLTYPE_INT | CTLFLAG_RD, &priv->rx_ring_num, 0, + "Number of receive rings"); 
+ SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_rings", + CTLTYPE_INT | CTLFLAG_RD, &priv->tx_ring_num, 0, + "Number of transmit rings"); + SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_size", + CTLTYPE_INT | CTLFLAG_RW, priv, 0, mlx4_en_set_rx_ring_size, "I", + "Receive ring size"); + SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "tx_size", + CTLTYPE_INT | CTLFLAG_RW, priv, 0, mlx4_en_set_tx_ring_size, "I", + "Transmit ring size"); + SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "ip_reasm", + CTLFLAG_RD, &priv->mdev->profile.ip_reasm, 0, + "Allow reassembly of IP fragments."); + + /* Add coalescer configuration. */ + coal = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, + "coalesce", CTLFLAG_RD, NULL, "Interrupt coalesce configuration"); + coal_list = SYSCTL_CHILDREN(node); + SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_low", + CTLFLAG_RW, &priv->pkt_rate_low, 0, + "Packets per-second for minimum delay"); + SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_low", + CTLFLAG_RW, &priv->rx_usecs_low, 0, + "Minimum RX delay in micro-seconds"); + SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_high", + CTLFLAG_RW, &priv->pkt_rate_high, 0, + "Packets per-second for maximum delay"); + SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_high", + CTLFLAG_RW, &priv->rx_usecs_high, 0, + "Maximum RX delay in micro-seconds"); + SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "sample_interval", + CTLFLAG_RW, &priv->sample_interval, 0, + "adaptive frequency in units of HZ ticks"); + SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "adaptive_rx_coal", + CTLFLAG_RW, &priv->adaptive_rx_coal, 0, + "Enable adaptive rx coalescing"); +} + +static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv) +{ + struct net_device *dev; + struct sysctl_ctx_list *ctx; + struct sysctl_oid *node; + struct sysctl_oid_list *node_list; + struct sysctl_oid *ring_node; + struct sysctl_oid_list *ring_list; + struct mlx4_en_tx_ring *tx_ring; + struct mlx4_en_rx_ring *rx_ring; + char namebuf[128]; + int i; + + dev = priv->dev; + + ctx = &priv->stat_ctx; + sysctl_ctx_init(ctx); + node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO, + "stat", CTLFLAG_RD, NULL, "Statistics"); + node_list = SYSCTL_CHILDREN(node); + +#ifdef MLX4_EN_PERF_STAT + SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_poll", CTLFLAG_RD, + &priv->pstats.tx_poll, "TX Poll calls"); + SYSCTL_ADD_QUAD(ctx, node_list, OID_AUTO, "tx_pktsz_avg", CTLFLAG_RD, + &priv->pstats.tx_pktsz_avg, "TX average packet size"); + SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "inflight_avg", CTLFLAG_RD, + &priv->pstats.inflight_avg, "TX average packets in-flight"); + SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_coal_avg", CTLFLAG_RD, + &priv->pstats.tx_coal_avg, "TX average coalesced completions"); + SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_coal_avg", CTLFLAG_RD, + &priv->pstats.rx_coal_avg, "RX average coalesced completions"); #endif -}; + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tso_packets", CTLFLAG_RD, + &priv->port_stats.tso_packets, "TSO packets sent"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "queue_stopped", CTLFLAG_RD, + &priv->port_stats.queue_stopped, "Queue full"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "wake_queue", CTLFLAG_RD, + &priv->port_stats.wake_queue, "Queue resumed after full"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_timeout", CTLFLAG_RD, + &priv->port_stats.tx_timeout, "Transmit timeouts"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_alloc_failed", CTLFLAG_RD, + &priv->port_stats.rx_alloc_failed, "RX failed to allocate mbuf"); + 
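/*
 * Aside: the rx_size/tx_size nodes registered above use the standard
 * SYSCTL_ADD_PROC handler pattern: sysctl_handle_int() copies the
 * current value out to userland and, on a write, copies the new value
 * back in; req->newptr is NULL for a pure read. A minimal sketch
 * (illustrative only, not part of the patch; foo_softc and foo_apply
 * are hypothetical):
 *
 *	static int
 *	foo_sysctl_size(SYSCTL_HANDLER_ARGS)
 *	{
 *		struct foo_softc *sc = arg1;
 *		int size, error;
 *
 *		size = sc->size;
 *		error = sysctl_handle_int(oidp, &size, 0, req);
 *		if (error != 0 || req->newptr == NULL)
 *			return (error);
 *		return (foo_apply(sc, size));
 *	}
 */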
SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_chksum_good", CTLFLAG_RD, + &priv->port_stats.rx_chksum_good, "RX checksum offload success"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_chksum_none", CTLFLAG_RD, + &priv->port_stats.rx_chksum_none, "RX without checksum offload"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_chksum_offload", + CTLFLAG_RD, &priv->port_stats.tx_chksum_offload, + "TX checksum offloads"); + + /* Could strdup the names and add in a loop. This is simpler. */ + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "broadcast", CTLFLAG_RD, + &priv->pkstats.broadcast, "Broadcast packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio0", CTLFLAG_RD, + &priv->pkstats.tx_prio[0], "TX Priority 0 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio1", CTLFLAG_RD, + &priv->pkstats.tx_prio[1], "TX Priority 1 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio2", CTLFLAG_RD, + &priv->pkstats.tx_prio[2], "TX Priority 2 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio3", CTLFLAG_RD, + &priv->pkstats.tx_prio[3], "TX Priority 3 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio4", CTLFLAG_RD, + &priv->pkstats.tx_prio[4], "TX Priority 4 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio5", CTLFLAG_RD, + &priv->pkstats.tx_prio[5], "TX Priority 5 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio6", CTLFLAG_RD, + &priv->pkstats.tx_prio[6], "TX Priority 6 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio7", CTLFLAG_RD, + &priv->pkstats.tx_prio[7], "TX Priority 7 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio0", CTLFLAG_RD, + &priv->pkstats.rx_prio[0], "RX Priority 0 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio1", CTLFLAG_RD, + &priv->pkstats.rx_prio[1], "RX Priority 1 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio2", CTLFLAG_RD, + &priv->pkstats.rx_prio[2], "RX Priority 2 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio3", CTLFLAG_RD, + &priv->pkstats.rx_prio[3], "RX Priority 3 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio4", CTLFLAG_RD, + &priv->pkstats.rx_prio[4], "RX Priority 4 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio5", CTLFLAG_RD, + &priv->pkstats.rx_prio[5], "RX Priority 5 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio6", CTLFLAG_RD, + &priv->pkstats.rx_prio[6], "RX Priority 6 packets"); + SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio7", CTLFLAG_RD, + &priv->pkstats.rx_prio[7], "RX Priority 7 packets"); + + for (i = 0; i < priv->tx_ring_num; i++) { + tx_ring = &priv->tx_ring[i]; + snprintf(namebuf, sizeof(namebuf), "tx_ring%d", i); + ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf, + CTLFLAG_RD, NULL, "TX Ring"); + ring_list = SYSCTL_CHILDREN(ring_node); + SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "packets", + CTLFLAG_RD, &tx_ring->packets, "TX packets"); + SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes", + CTLFLAG_RD, &tx_ring->bytes, "TX bytes"); + SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "error", + CTLFLAG_RD, &tx_ring->errors, "TX soft errors"); + + } + for (i = 0; i < priv->rx_ring_num; i++) { + rx_ring = &priv->rx_ring[i]; + snprintf(namebuf, sizeof(namebuf), "rx_ring%d", i); + ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf, + CTLFLAG_RD, NULL, "RX Ring"); + ring_list = SYSCTL_CHILDREN(ring_node); + SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "packets", + CTLFLAG_RD, &rx_ring->packets, "RX 
packets"); + SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes", + CTLFLAG_RD, &rx_ring->bytes, "RX bytes"); + SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "error", + CTLFLAG_RD, &rx_ring->errors, "RX soft errors"); + SYSCTL_ADD_UINT(ctx, ring_list, OID_AUTO, "lro_queued", + CTLFLAG_RD, &rx_ring->lro.lro_queued, 0, "LRO Queued"); + SYSCTL_ADD_UINT(ctx, ring_list, OID_AUTO, "lro_flushed", + CTLFLAG_RD, &rx_ring->lro.lro_flushed, 0, "LRO Flushed"); + } +} + int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, struct mlx4_en_port_profile *prof) { + static volatile int mlx4_en_unit; struct net_device *dev; struct mlx4_en_priv *priv; + uint8_t dev_addr[ETHER_ADDR_LEN]; + int err; int i; - int err; - dev = alloc_etherdev_mq(sizeof(struct mlx4_en_priv), prof->tx_ring_num); + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + dev = priv->dev = if_alloc(IFT_ETHER); if (dev == NULL) { mlx4_err(mdev, "Net device allocation failed\n"); + kfree(priv); return -ENOMEM; } + dev->if_softc = priv; + if_initname(dev, "mlxen", atomic_fetchadd_int(&mlx4_en_unit, 1)); + dev->if_mtu = ETHERMTU; + dev->if_baudrate = 1000000000; + dev->if_init = mlx4_en_init; + dev->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + dev->if_ioctl = mlx4_en_ioctl; + dev->if_transmit = mlx4_en_transmit; + dev->if_qflush = mlx4_en_qflush; + dev->if_snd.ifq_maxlen = prof->tx_ring_size; - SET_NETDEV_DEV(dev, &mdev->dev->pdev->dev); - dev->dev_id = port - 1; - /* * Initialize driver private data */ - - priv = netdev_priv(dev); - memset(priv, 0, sizeof(struct mlx4_en_priv)); priv->dev = dev; priv->mdev = mdev; priv->prof = prof; @@ -1047,20 +1417,20 @@ priv->tx_ring_num = prof->tx_ring_num; priv->rx_ring_num = prof->rx_ring_num; priv->udp_rings = mdev->profile.udp_rss ? prof->rx_ring_num / 2 : 1; - priv->mc_list = NULL; priv->mac_index = -1; priv->msg_enable = MLX4_EN_MSG_LEVEL; - spin_lock_init(&priv->stats_lock); - spin_lock_init(&priv->vlan_lock); + mtx_init(&priv->stats_lock.m, "mlx4 stats", NULL, MTX_DEF); + mtx_init(&priv->vlan_lock.m, "mlx4 vlan", NULL, MTX_DEF); INIT_WORK(&priv->mcast_task, mlx4_en_do_set_multicast); - INIT_WORK(&priv->mac_task, mlx4_en_do_set_mac); INIT_WORK(&priv->watchdog_task, mlx4_en_restart); INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate); INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats); + callout_init(&priv->watchdog_timer, 1); /* Query for default mac and max mtu */ priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port]; priv->mac = mdev->dev->caps.def_mac[priv->port]; + if (ILLEGAL_MAC(priv->mac)) { en_err(priv, "Port: %d, invalid mac burned: 0x%llx, quiting\n", priv->port, priv->mac); @@ -1068,6 +1438,8 @@ goto out; } + mlx4_en_sysctl_conf(priv); + err = mlx4_en_alloc_resources(priv); if (err) goto out; @@ -1081,63 +1453,55 @@ } priv->allocated = 1; - /* Populate Tx priority mappings */ - mlx4_en_set_prio_map(priv, priv->tx_prio_map, - prof->tx_ring_num - MLX4_EN_NUM_HASH_RINGS); - /* - * Initialize netdev entry points + * Set driver features */ - dev->netdev_ops = &mlx4_netdev_ops; - dev->watchdog_timeo = MLX4_EN_WATCHDOG_TIMEOUT; + dev->if_capabilities |= IFCAP_RXCSUM | IFCAP_TXCSUM; + dev->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING; + dev->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER; + dev->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; +#if 0 /* Not yet */ + dev->if_capabilities |= IFCAP_WOL; +#endif + if (mdev->LSO_support) + dev->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO; - SET_ETHTOOL_OPS(dev, &mlx4_en_ethtool_ops); + /* Don't 
enable LOR unless the user requests. */ + dev->if_capenable = dev->if_capabilities; - /* Set defualt MAC */ - dev->addr_len = ETH_ALEN; - for (i = 0; i < ETH_ALEN; i++) { - dev->dev_addr[ETH_ALEN - 1 - i] = (u8) (priv->mac >> (8 * i)); - dev->perm_addr[ETH_ALEN - 1 - i] = (u8) (priv->mac >> (8 * i)); - } - - /* - * Set driver features - */ - dev->features |= NETIF_F_SG; - dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; -#ifdef HAVE_NETDEV_VLAN_FEATURES - dev->vlan_features |= NETIF_F_SG; - dev->vlan_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; -#endif - dev->features |= NETIF_F_HIGHDMA; - dev->features |= NETIF_F_HW_VLAN_TX | - NETIF_F_HW_VLAN_RX | - NETIF_F_HW_VLAN_FILTER; if (mdev->profile.num_lro) - dev->features |= NETIF_F_LRO; - if (mdev->LSO_support) { - dev->features |= NETIF_F_TSO; - dev->features |= NETIF_F_TSO6; -#ifdef HAVE_NETDEV_VLAN_FEATURES - dev->vlan_features |= NETIF_F_TSO; - dev->vlan_features |= NETIF_F_TSO6; -#endif - } + dev->if_capabilities |= IFCAP_LRO; - mdev->pndev[port] = dev; + /* Register for VLAN events */ + priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, + mlx4_en_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST); + priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, + mlx4_en_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST); - netif_carrier_off(dev); - err = register_netdev(dev); - if (err) { - mlx4_err(mdev, "Netdev registration failed for port %d\n", port); - goto out; - } + mdev->pndev[priv->port] = dev; + priv->last_link_state = MLX4_DEV_EVENT_PORT_DOWN; + if_link_state_change(dev, LINK_STATE_DOWN); + + /* Set default MAC */ + for (i = 0; i < ETHER_ADDR_LEN; i++) + dev_addr[ETHER_ADDR_LEN - 1 - i] = (u8) (priv->mac >> (8 * i)); + + ether_ifattach(dev, dev_addr); + ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK, + mlx4_en_media_change, mlx4_en_media_status); + ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_1000_T, 0, NULL); + ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_SR, 0, NULL); + ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_CX4, 0, NULL); + ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO); + en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num); en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num); priv->registered = 1; queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); + return 0; out: Index: sys/ofed/drivers/net/mlx4/en_selftest.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_selftest.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/en_selftest.c (.../head) (revision 219811) @@ -31,15 +31,15 @@ * */ +#include "mlx4_en.h" + #include #include #include #include #include -#include "mlx4_en.h" - static int mlx4_en_test_registers(struct mlx4_en_priv *priv) { return mlx4_cmd(priv->mdev->dev, 0, 0, 0, MLX4_CMD_HW_HEALTH_CHECK, @@ -48,7 +48,7 @@ static int mlx4_en_test_loopback_xmit(struct mlx4_en_priv *priv) { - struct sk_buff *skb; + struct mbuf *mb; struct ethhdr *ethh; unsigned char *packet; unsigned int packet_size = MLX4_LOOPBACK_TEST_PAYLOAD; @@ -57,24 +57,24 @@ /* build the pkt before xmit */ - skb = netdev_alloc_skb(priv->dev, MLX4_LOOPBACK_TEST_PAYLOAD + ETH_HLEN + NET_IP_ALIGN); - if (!skb) { - en_err(priv, "-LOOPBACK_TEST_XMIT- failed to create skb for xmit\n"); + mb = netdev_alloc_mb(priv->dev, MLX4_LOOPBACK_TEST_PAYLOAD + ETH_HLEN + NET_IP_ALIGN); + if (!mb) { + en_err(priv, "-LOOPBACK_TEST_XMIT- failed to create mb for xmit\n"); return -ENOMEM; 
} - skb_reserve(skb, NET_IP_ALIGN); + mb_reserve(mb, NET_IP_ALIGN); - ethh = (struct ethhdr *)skb_put(skb, sizeof(struct ethhdr)); - packet = (unsigned char *)skb_put(skb, packet_size); + ethh = (struct ethhdr *)mb_put(mb, sizeof(struct ethhdr)); + packet = (unsigned char *)mb_put(mb, packet_size); memcpy(ethh->h_dest, priv->dev->dev_addr, ETH_ALEN); memset(ethh->h_source, 0, ETH_ALEN); ethh->h_proto = htons(ETH_P_ARP); - skb_set_mac_header(skb, 0); + mb_set_mac_header(mb, 0); for (i = 0; i < packet_size; ++i) /* fill our packet */ packet[i] = (unsigned char)(i & 0xff); /* xmit the pkt */ - err = mlx4_en_xmit(skb, priv->dev); + err = mlx4_en_xmit(mb, priv->dev); return err; } Index: sys/ofed/drivers/net/mlx4/en_rx.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_rx.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/en_rx.c (.../head) (revision 219811) @@ -31,120 +31,51 @@ * */ +#include "mlx4_en.h" + #include #include -#include -#include -#include -#include -#include "mlx4_en.h" +#include +#include +#include - -static int mlx4_en_get_frag_header(struct skb_frag_struct *frags, void **mac_hdr, - void **ip_hdr, void **tcpudp_hdr, - u64 *hdr_flags, void *priv) -{ - *mac_hdr = page_address(frags->page) + frags->page_offset; - *ip_hdr = *mac_hdr + ETH_HLEN; - *tcpudp_hdr = (struct tcphdr *)(*ip_hdr + sizeof(struct iphdr)); - *hdr_flags = LRO_IPV4 | LRO_TCP; - - return 0; -} - enum { MIN_RX_ARM = 1024, }; -static int mlx4_en_alloc_frag(struct mlx4_en_priv *priv, - struct mlx4_en_rx_desc *rx_desc, - struct skb_frag_struct *skb_frags, - struct mlx4_en_rx_alloc *ring_alloc, - int i) +static int mlx4_en_alloc_buf(struct mlx4_en_priv *priv, + struct mlx4_en_rx_desc *rx_desc, + struct mbuf **mb_list, + int i) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_frag_info *frag_info = &priv->frag_info[i]; - struct mlx4_en_rx_alloc *page_alloc = &ring_alloc[i]; - struct page *page; + struct mbuf *mb; dma_addr_t dma; - if (page_alloc->offset == frag_info->last_offset) { - /* Allocate new page */ - page = alloc_pages(GFP_ATOMIC | __GFP_COMP, MLX4_EN_ALLOC_ORDER); - if (!page) - return -ENOMEM; - - skb_frags[i].page = page_alloc->page; - skb_frags[i].page_offset = page_alloc->offset; - page_alloc->page = page; - page_alloc->offset = frag_info->frag_align; - } else { - page = page_alloc->page; - get_page(page); - - skb_frags[i].page = page; - skb_frags[i].page_offset = page_alloc->offset; - page_alloc->offset += frag_info->frag_stride; + if (i == 0) + mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, frag_info->frag_size); + else + mb = m_getjcl(M_NOWAIT, MT_DATA, 0, frag_info->frag_size); + if (mb == NULL) { + priv->port_stats.rx_alloc_failed++; + return -ENOMEM; } - dma = pci_map_single(mdev->pdev, page_address(skb_frags[i].page) + - skb_frags[i].page_offset, frag_info->frag_size, + dma = pci_map_single(mdev->pdev, mb->m_data, frag_info->frag_size, PCI_DMA_FROMDEVICE); rx_desc->data[i].addr = cpu_to_be64(dma); + mb_list[i] = mb; return 0; } -static int mlx4_en_init_allocator(struct mlx4_en_priv *priv, - struct mlx4_en_rx_ring *ring) -{ - struct mlx4_en_rx_alloc *page_alloc; - int i; - - for (i = 0; i < priv->num_frags; i++) { - page_alloc = &ring->page_alloc[i]; - page_alloc->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, - MLX4_EN_ALLOC_ORDER); - if (!page_alloc->page) - goto out; - - page_alloc->offset = priv->frag_info[i].frag_align; - en_dbg(DRV, priv, "Initialized allocator:%d with page:%p\n", - i, page_alloc->page); - } - return 0; - -out: - 
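
The mbuf-based allocator introduced above (mlx4_en_alloc_buf()) replaces the Linux page allocator with m_getjcl(9), which returns an mbuf backed by a cluster of the requested size. A self-contained sketch of the call as the new code uses it; the size and error handling here are illustrative:

	#include <sys/param.h>
	#include <sys/mbuf.h>

	struct mbuf *mb;

	/* Only the first fragment of a packet carries M_PKTHDR. */
	mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
	if (mb == NULL)
		return (-ENOMEM);	/* the driver counts this as rx_alloc_failed */
	/* mb->m_data is then DMA-mapped for the fragment size. */
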
while (i--) { - page_alloc = &ring->page_alloc[i]; - put_page(page_alloc->page); - page_alloc->page = NULL; - } - return -ENOMEM; -} - -static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv, - struct mlx4_en_rx_ring *ring) -{ - struct mlx4_en_rx_alloc *page_alloc; - int i; - - for (i = 0; i < priv->num_frags; i++) { - page_alloc = &ring->page_alloc[i]; - en_dbg(DRV, priv, "Freeing allocator:%d count:%d\n", - i, page_count(page_alloc->page)); - - put_page(page_alloc->page); - page_alloc->page = NULL; - } -} - static void -mlx4_en_init_rx_desc_skb(struct mlx4_en_priv *priv, +mlx4_en_init_rx_desc_mb(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, int index) { struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index; - rx_desc->data->byte_count = cpu_to_be32(priv->rx_skb_size); + rx_desc->data->byte_count = cpu_to_be32(priv->rx_mb_size); rx_desc->data->lkey = cpu_to_be32(priv->mdev->mr.key); } @@ -152,14 +83,11 @@ struct mlx4_en_rx_ring *ring, int index) { struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index; - struct skb_frag_struct *skb_frags = ring->rx_info + - (index << priv->log_rx_info); int possible_frags; int i; /* Set size and memtype fields */ for (i = 0; i < priv->num_frags; i++) { - skb_frags[i].size = priv->frag_info[i].frag_size; rx_desc->data[i].byte_count = cpu_to_be32(priv->frag_info[i].frag_size); rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key); @@ -177,57 +105,57 @@ } static int -mlx4_en_alloc_rx_skb(struct mlx4_en_priv *priv, +mlx4_en_alloc_rx_mb(struct mlx4_en_priv *priv, struct mlx4_en_rx_desc *rx_desc, - struct sk_buff **pskb, int unmap) + struct mbuf **pmb, int unmap) { struct mlx4_en_dev *mdev = priv->mdev; dma_addr_t dma; - int size = priv->rx_skb_size + NET_IP_ALIGN; - struct sk_buff *new_skb = dev_alloc_skb(size); + int size = priv->rx_mb_size; + struct mbuf *new_mb; - if (unlikely(new_skb == NULL)) + new_mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); + if (unlikely(new_mb == NULL)) { + priv->port_stats.rx_alloc_failed++; return -ENOMEM; + } if (unmap) pci_unmap_single(mdev->pdev, be64_to_cpu(rx_desc->data->addr), be32_to_cpu(rx_desc->data->byte_count), PCI_DMA_FROMDEVICE); - new_skb->dev = priv->dev; - skb_reserve(new_skb, NET_IP_ALIGN); - dma = pci_map_single(priv->mdev->pdev, new_skb->data, size, DMA_FROM_DEVICE); - *pskb = new_skb; + dma = pci_map_single(priv->mdev->pdev, new_mb->m_data, size, DMA_FROM_DEVICE); + *pmb = new_mb; rx_desc->data->addr = cpu_to_be64(dma); return 0; } static int -mlx4_en_prepare_rx_desc_skb(struct mlx4_en_priv *priv, +mlx4_en_prepare_rx_desc_mb(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, int index) { struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride); - struct sk_buff **pskb = (struct sk_buff **) ring->rx_info + index; + struct mbuf **pmb = (struct mbuf **) ring->rx_info + index; - return mlx4_en_alloc_rx_skb(priv, rx_desc, pskb, 0); + return mlx4_en_alloc_rx_mb(priv, rx_desc, pmb, 0); } static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, int index) { struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride); - struct skb_frag_struct *skb_frags = ring->rx_info + - (index << priv->log_rx_info); + struct mbuf **mb_list = ring->rx_info + (index << priv->log_rx_info); int i; for (i = 0; i < priv->num_frags; i++) - if (mlx4_en_alloc_frag(priv, rx_desc, skb_frags, ring->page_alloc, i)) + if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, i)) goto err; return 0; err: while (i--) - put_page(skb_frags[i].page); + 
m_free(mb_list[i]); return -ENOMEM; } @@ -240,31 +168,33 @@ struct mlx4_en_rx_ring *ring, int index) { + struct mlx4_en_frag_info *frag_info; struct mlx4_en_dev *mdev = priv->mdev; - struct skb_frag_struct *skb_frags; - struct sk_buff *skb; + struct mbuf **mb_list; + struct mbuf *mb; struct mlx4_en_rx_desc *rx_desc = ring->buf + (index << ring->log_stride); dma_addr_t dma; int nr; if (ring->use_frags) { - skb_frags = ring->rx_info + (index << priv->log_rx_info); + mb_list = ring->rx_info + (index << priv->log_rx_info); for (nr = 0; nr < priv->num_frags; nr++) { en_dbg(DRV, priv, "Freeing fragment:%d\n", nr); + frag_info = &priv->frag_info[nr]; dma = be64_to_cpu(rx_desc->data[nr].addr); en_dbg(DRV, priv, "Unmaping buffer at dma:0x%llx\n", (u64) dma); - pci_unmap_single(mdev->pdev, dma, skb_frags[nr].size, + pci_unmap_single(mdev->pdev, dma, frag_info->frag_size, PCI_DMA_FROMDEVICE); - put_page(skb_frags[nr].page); + m_free(mb_list[nr]); } } else { - skb = *((struct sk_buff **) ring->rx_info + index); + mb = *((struct mbuf **) ring->rx_info + index); dma = be64_to_cpu(rx_desc->data->addr); pci_unmap_single(mdev->pdev, dma, - priv->rx_skb_size + NET_IP_ALIGN, + priv->rx_mb_size, PCI_DMA_FROMDEVICE); - kfree_skb(skb); + m_free(mb); } } @@ -284,10 +214,10 @@ err = mlx4_en_prepare_rx_desc(priv, ring, ring->actual_size); else - err = mlx4_en_prepare_rx_desc_skb(priv, ring, + err = mlx4_en_prepare_rx_desc_mb(priv, ring, ring->actual_size); if (err) { - if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { + if (ring->actual_size == 0) { en_err(priv, "Failed to allocate " "enough rx buffers\n"); return -ENOMEM; @@ -357,17 +287,17 @@ if (ring->use_frags) tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS * - sizeof(struct skb_frag_struct)); + sizeof(struct mbuf *)); else - tmp = size * sizeof(struct sk_buff *); + tmp = size * sizeof(struct mbuf *); - ring->rx_info = vmalloc(tmp); + ring->rx_info = kmalloc(tmp, GFP_KERNEL); if (!ring->rx_info) { en_err(priv, "Failed allocating rx_info ring\n"); return -ENOMEM; } - en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d\n", - ring->rx_info, tmp); + en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d stride:%d (%d)\n", + ring->rx_info, tmp, ring->stride, ring->log_stride); err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, 2 * PAGE_SIZE); @@ -381,32 +311,13 @@ } ring->buf = ring->wqres.buf.direct.buf; - /* Configure lro mngr */ - memset(&ring->lro, 0, sizeof(struct net_lro_mgr)); - ring->lro.dev = priv->dev; - ring->lro.features = LRO_F_NAPI; - ring->lro.frag_align_pad = NET_IP_ALIGN; - ring->lro.ip_summed = CHECKSUM_UNNECESSARY; - ring->lro.ip_summed_aggr = CHECKSUM_UNNECESSARY; - ring->lro.max_desc = mdev->profile.num_lro; - ring->lro.max_aggr = MAX_SKB_FRAGS; - ring->lro.lro_arr = kzalloc(mdev->profile.num_lro * - sizeof(struct net_lro_desc), - GFP_KERNEL); - if (!ring->lro.lro_arr) { - en_err(priv, "Failed to allocate lro array\n"); - goto err_map; - } - ring->lro.get_frag_header = mlx4_en_get_frag_header; - return 0; -err_map: mlx4_en_unmap_buffer(&ring->wqres.buf); err_hwq: mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); err_ring: - vfree(ring->rx_info); + kfree(ring->rx_info); ring->rx_info = NULL; return err; } @@ -443,18 +354,17 @@ /* Initailize all descriptors */ for (i = 0; i < ring->size; i++) mlx4_en_init_rx_desc(priv, ring, i); - - /* Initialize page allocators */ - err = mlx4_en_init_allocator(priv, ring); - if (err) { - en_err(priv, "Failed initializing ring allocator\n"); - ring_ind--; - goto 
err_allocator; - } } else { for (i = 0; i < ring->size; i++) - mlx4_en_init_rx_desc_skb(priv, ring, i); + mlx4_en_init_rx_desc_mb(priv, ring, i); } + /* Configure the LRO manager */ + if (priv->dev->if_capenable & IFCAP_LRO) { + if (tcp_lro_init(&ring->lro)) + priv->dev->if_capenable &= ~IFCAP_LRO; + else + ring->lro.ifp = priv->dev; + } } err = mlx4_en_fill_rx_buffers(priv); if (err) @@ -467,6 +377,7 @@ mlx4_en_update_rx_prod_db(ring); } + return 0; err_buffers: @@ -474,12 +385,6 @@ mlx4_en_free_rx_buf(priv, &priv->rx_ring[ring_ind]); ring_ind = priv->rx_ring_num - 1; -err_allocator: - while (ring_ind >= 0) { - if (priv->rx_ring[ring_ind].use_frags) - mlx4_en_destroy_allocator(priv, &priv->rx_ring[ring_ind]); - ring_ind--; - } return err; } @@ -488,133 +393,84 @@ { struct mlx4_en_dev *mdev = priv->mdev; - kfree(ring->lro.lro_arr); mlx4_en_unmap_buffer(&ring->wqres.buf); mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size + TXBB_SIZE); - vfree(ring->rx_info); + kfree(ring->rx_info); ring->rx_info = NULL; } void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring) { + tcp_lro_free(&ring->lro); mlx4_en_free_rx_buf(priv, ring); if (ring->stride <= TXBB_SIZE) ring->buf -= TXBB_SIZE; - if (ring->use_frags) - mlx4_en_destroy_allocator(priv, ring); } /* Unmap a completed descriptor and free unused pages */ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_desc *rx_desc, - struct skb_frag_struct *skb_frags, - struct skb_frag_struct *skb_frags_rx, - struct mlx4_en_rx_alloc *page_alloc, + struct mbuf **mb_list, int length) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_frag_info *frag_info; - int nr; dma_addr_t dma; + struct mbuf *mb; + int nr; + mb = mb_list[0]; + mb->m_pkthdr.len = length; /* Collect used fragments while replacing them in the HW descriptors */ for (nr = 0; nr < priv->num_frags; nr++) { frag_info = &priv->frag_info[nr]; if (length <= frag_info->frag_prefix_size) break; - - /* Save page reference in skb */ - skb_frags_rx[nr].page = skb_frags[nr].page; - skb_frags_rx[nr].size = skb_frags[nr].size; - skb_frags_rx[nr].page_offset = skb_frags[nr].page_offset; + if (nr) + mb->m_next = mb_list[nr]; + mb = mb_list[nr]; + mb->m_len = frag_info->frag_size; dma = be64_to_cpu(rx_desc->data[nr].addr); /* Allocate a replacement page */ - if (mlx4_en_alloc_frag(priv, rx_desc, skb_frags, page_alloc, nr)) + if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, nr)) goto fail; /* Unmap buffer */ - pci_unmap_single(mdev->pdev, dma, skb_frags_rx[nr].size, + pci_unmap_single(mdev->pdev, dma, frag_info->frag_size, PCI_DMA_FROMDEVICE); } /* Adjust size of last fragment to match actual length */ - skb_frags_rx[nr - 1].size = length - - priv->frag_info[nr - 1].frag_prefix_size; - return nr; + mb->m_len = length - priv->frag_info[nr - 1].frag_prefix_size; + mb->m_next = NULL; + return 0; fail: /* Drop all accumulated fragments (which have already been replaced in * the descriptor) of this packet; remaining fragments are reused...
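
The tcp_lro_init() call above swaps Linux's net_lro_mgr for FreeBSD's software LRO. Gathered in one place, the lifecycle the driver now follows looks like this (a sketch of the tcp_lro API of this era, not verbatim driver code):

	#include <netinet/tcp_lro.h>

	struct lro_ctrl lro;
	struct lro_entry *queued;

	if (tcp_lro_init(&lro) == 0)	/* else clear IFCAP_LRO, skip LRO */
		lro.ifp = ifp;

	/* Per received TCP/IPv4 mbuf: 0 means the mbuf was consumed. */
	if (lro.lro_cnt == 0 || tcp_lro_rx(&lro, mb, 0) != 0)
		(*ifp->if_input)(ifp, mb);

	/* After each poll pass, push merged segments up the stack. */
	while ((queued = SLIST_FIRST(&lro.lro_active)) != NULL) {
		SLIST_REMOVE_HEAD(&lro.lro_active, next);
		tcp_lro_flush(&lro, queued);
	}

	tcp_lro_free(&lro);	/* at ring teardown */
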
*/ while (nr > 0) { nr--; - put_page(skb_frags_rx[nr].page); + m_free(mb_list[nr]); } - return 0; + return -ENOMEM; } -static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv, +static struct mbuf *mlx4_en_rx_mb(struct mlx4_en_priv *priv, struct mlx4_en_rx_desc *rx_desc, - struct skb_frag_struct *skb_frags, - struct mlx4_en_rx_alloc *page_alloc, + struct mbuf **mb_list, unsigned int length) { - struct mlx4_en_dev *mdev = priv->mdev; - struct sk_buff *skb; - void *va; - int used_frags; - dma_addr_t dma; + struct mbuf *mb; - skb = dev_alloc_skb(SMALL_PACKET_SIZE + NET_IP_ALIGN); - if (!skb) { - en_dbg(RX_ERR, priv, "Failed allocating skb\n"); + mb = mb_list[0]; + /* Move relevant fragments to mb */ + if (unlikely(mlx4_en_complete_rx_desc(priv, rx_desc, mb_list, length))) return NULL; - } - skb->dev = priv->dev; - skb_reserve(skb, NET_IP_ALIGN); - skb->len = length; - skb->truesize = length + sizeof(struct sk_buff); - /* Get pointer to first fragment so we could copy the headers into the - * (linear part of the) skb */ - va = page_address(skb_frags[0].page) + skb_frags[0].page_offset; - - if (length <= SMALL_PACKET_SIZE) { - /* We are copying all relevant data to the skb - temporarily - * synch buffers for the copy */ - dma = be64_to_cpu(rx_desc->data[0].addr); - dma_sync_single_range_for_cpu(&mdev->pdev->dev, dma, 0, - length, DMA_FROM_DEVICE); - skb_copy_to_linear_data(skb, va, length); - dma_sync_single_range_for_device(&mdev->pdev->dev, dma, 0, - length, DMA_FROM_DEVICE); - skb->tail += length; - } else { - - /* Move relevant fragments to skb */ - used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, skb_frags, - skb_shinfo(skb)->frags, - page_alloc, length); - if (unlikely(!used_frags)) { - kfree_skb(skb); - return NULL; - } - skb_shinfo(skb)->nr_frags = used_frags; - - /* Copy headers into the skb linear buffer */ - memcpy(skb->data, va, HEADER_COPY_SIZE); - skb->tail += HEADER_COPY_SIZE; - - /* Skip headers in first fragment */ - skb_shinfo(skb)->frags[0].page_offset += HEADER_COPY_SIZE; - - /* Adjust size of first fragment */ - skb_shinfo(skb)->frags[0].size -= HEADER_COPY_SIZE; - skb->data_len = length - HEADER_COPY_SIZE; - } - return skb; + return mb; } static inline int invalid_cqe(struct mlx4_en_priv *priv, @@ -637,69 +493,65 @@ return 0; } -static struct sk_buff * -mlx4_en_get_rx_skb(struct mlx4_en_priv *priv, +static struct mbuf * +mlx4_en_get_rx_mb(struct mlx4_en_priv *priv, struct mlx4_en_rx_desc *rx_desc, - struct sk_buff **pskb, + struct mbuf **pmb, unsigned int length) { struct mlx4_en_dev *mdev = priv->mdev; - struct sk_buff *skb; + struct mbuf *mb; dma_addr_t dma; if (length <= SMALL_PACKET_SIZE) { - skb = dev_alloc_skb(length + NET_IP_ALIGN); - if (unlikely(!skb)) + mb = m_gethdr(M_WAITOK, MT_DATA); + if (unlikely(mb == NULL)) return NULL; - - skb->dev = priv->dev; - skb_reserve(skb, NET_IP_ALIGN); - /* We are copying all relevant data to the skb - temporarily + /* We are copying all relevant data to the mb - temporarily * synch buffers for the copy */ dma = be64_to_cpu(rx_desc->data->addr); dma_sync_single_range_for_cpu(&mdev->pdev->dev, dma, 0, length, DMA_FROM_DEVICE); - skb_copy_to_linear_data(skb, (*pskb)->data, length); + memcpy(mb->m_data, (*pmb)->m_data, length); dma_sync_single_range_for_device(&mdev->pdev->dev, dma, 0, length, DMA_FROM_DEVICE); } else { - skb = *pskb; - if (unlikely(mlx4_en_alloc_rx_skb(priv, rx_desc, pskb, 1))) + mb = *pmb; + if (unlikely(mlx4_en_alloc_rx_mb(priv, rx_desc, pmb, 1))) return NULL; } - skb->tail += length; - skb->len = 
length; - skb->truesize = length + sizeof(struct sk_buff); - return skb; + mb->m_len = length; + mb->m_pkthdr.len = length; + return mb; } -static void validate_loopback(struct mlx4_en_priv *priv, struct sk_buff *skb) +static void validate_loopback(struct mlx4_en_priv *priv, struct mbuf *mb) { int i; - int offset = ETH_HLEN; + int offset = ETHER_HDR_LEN; for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++, offset++) { - if (*(skb->data + offset) != (unsigned char) (i & 0xff)) + if (*(mb->m_data + offset) != (unsigned char) (i & 0xff)) goto out_loopback; } /* Loopback found */ priv->loopback_ok = 1; out_loopback: - dev_kfree_skb_any(skb); + m_freem(mb); } -int mlx4_en_process_rx_cq_skb(struct net_device *dev, +int mlx4_en_process_rx_cq_mb(struct net_device *dev, struct mlx4_en_cq *cq, int budget) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_cqe *cqe; struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring]; struct mlx4_en_rx_desc *rx_desc; - struct sk_buff **pskb; - struct sk_buff *skb; + struct mbuf **pmb; + struct mbuf *mb; int index; unsigned int length; int polled = 0; @@ -717,7 +569,7 @@ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, cq->mcq.cons_index & cq->size)) { - pskb = (struct sk_buff **) ring->rx_info + index; + pmb = (struct mbuf **) ring->rx_info + index; rx_desc = ring->buf + (index << ring->log_stride); /* @@ -732,55 +584,57 @@ * Packet is OK - process it. */ length = be32_to_cpu(cqe->byte_cnt); - ring->bytes += length; - ring->packets++; - skb = mlx4_en_get_rx_skb(priv, rx_desc, pskb, length); - if (unlikely(!skb)){ - priv->stats.rx_dropped++; + mb = mlx4_en_get_rx_mb(priv, rx_desc, pmb, length); + if (unlikely(!mb)){ + ring->errors++; goto next; } + ring->bytes += length; + ring->packets++; + if (unlikely(priv->validate_loopback)) { - validate_loopback(priv, skb); + validate_loopback(priv, mb); goto next; } - skb->protocol = eth_type_trans(skb, dev); + mb->m_pkthdr.flowid = cq->ring; + mb->m_flags |= M_FLOWID; + mb->m_pkthdr.rcvif = dev; + if (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK) { + mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->sl_vid); + mb->m_flags |= M_VLANTAG; + } if (likely(priv->rx_csum && cqe->checksum == 0xffff)) { priv->port_stats.rx_chksum_good++; - skb->ip_summed = CHECKSUM_UNNECESSARY; + mb->m_pkthdr.csum_flags = + CSUM_IP_CHECKED | CSUM_IP_VALID | + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + mb->m_pkthdr.csum_data = htons(0xffff); } else { priv->port_stats.rx_chksum_none++; - skb->ip_summed = CHECKSUM_NONE; + mb->m_pkthdr.csum_flags = 0; if (priv->mdev->profile.ip_reasm && cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4) && - !mlx4_en_rx_frags(priv, ring, skb, cqe)) + !mlx4_en_rx_frags(priv, ring, mb, cqe)) goto next; } - /* Push it up the stack */ - if (priv->vlgrp && (be32_to_cpu(cqe->vlan_my_qpn) & - MLX4_CQE_VLAN_PRESENT_MASK)) { - vlan_hwaccel_receive_skb(skb, priv->vlgrp, - be16_to_cpu(cqe->sl_vid)); - } else - netif_receive_skb(skb); + dev->if_input(dev, mb); - dev->last_rx = jiffies; - next: ++cq->mcq.cons_index; index = (cq->mcq.cons_index) & ring->size_mask; cqe = &cq->buf[index]; if (++polled == budget) - goto out; + break; } /* If CQ is empty, flush all pending IP reassembly sessions */ mlx4_en_flush_frags(priv, ring); -out: AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled); mlx4_cq_set_ci(&cq->mcq); wmb(); /* ensure HW sees CQ consumer before we post new buffers */ @@ -795,15 +649,13 @@ struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_cqe *cqe; struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring]; - 
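
The receive-completion conversion above (and its twin in the fragmented-RX path that follows) uses the standard FreeBSD idiom for handing hardware offload results to the stack. Condensed, with generic names; note the driver stores htons(0xffff) in csum_data, which is the same bit pattern as 0xffff:

	/* Hardware validated both the IP and the TCP/UDP checksum. */
	mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID |
	    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	mb->m_pkthdr.csum_data = 0xffff;

	/* A hardware-stripped VLAN tag travels in the packet header. */
	mb->m_pkthdr.ether_vtag = vtag;
	mb->m_flags |= M_VLANTAG;

	/* Record the receive ring so the stack keeps flow affinity. */
	mb->m_pkthdr.flowid = ring_index;
	mb->m_flags |= M_FLOWID;

	(*ifp->if_input)(ifp, mb);	/* ownership passes to the stack */
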
struct skb_frag_struct *skb_frags; - struct skb_frag_struct lro_frags[MLX4_EN_MAX_RX_FRAGS]; + struct mbuf **mb_list; struct mlx4_en_rx_desc *rx_desc; - struct sk_buff *skb; + struct mbuf *mb; + struct lro_entry *queued; int index; - int nr; unsigned int length; int polled = 0; - int ip_summed; if (!priv->port_up) return 0; @@ -818,7 +670,7 @@ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, cq->mcq.cons_index & cq->size)) { - skb_frags = ring->rx_info + (index << priv->log_rx_info); + mb_list = ring->rx_info + (index << priv->log_rx_info); rx_desc = ring->buf + (index << ring->log_stride); /* @@ -833,98 +685,70 @@ * Packet is OK - process it. */ length = be32_to_cpu(cqe->byte_cnt); + mb = mlx4_en_rx_mb(priv, rx_desc, mb_list, length); + if (!mb) { + ring->errors++; + goto next; + } + ring->bytes += length; ring->packets++; - if (likely(priv->rx_csum)) { - if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) && - (cqe->checksum == cpu_to_be16(0xffff))) { - priv->port_stats.rx_chksum_good++; - /* This packet is eligible for LRO if it is: - * - DIX Ethernet (type interpretation) - * - TCP/IP (v4) - * - without IP options - * - not an IP fragment */ - if (mlx4_en_can_lro(cqe->status) && - dev->features & NETIF_F_LRO) { + if (unlikely(priv->validate_loopback)) { + validate_loopback(priv, mb); + goto next; + } - nr = mlx4_en_complete_rx_desc( - priv, rx_desc, - skb_frags, lro_frags, - ring->page_alloc, length); - if (!nr) - goto next; - - if (priv->vlgrp && (cqe->vlan_my_qpn & - cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK))) { - lro_vlan_hwaccel_receive_frags( - &ring->lro, lro_frags, - length, length, - priv->vlgrp, - be16_to_cpu(cqe->sl_vid), - NULL, 0); - } else - lro_receive_frags(&ring->lro, - lro_frags, - length, - length, - NULL, 0); - + mb->m_pkthdr.flowid = cq->ring; + mb->m_flags |= M_FLOWID; + mb->m_pkthdr.rcvif = dev; + if (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK) { + mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->sl_vid); + mb->m_flags |= M_VLANTAG; + } + if (likely(priv->rx_csum) && + (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) && + (cqe->checksum == cpu_to_be16(0xffff))) { + priv->port_stats.rx_chksum_good++; + mb->m_pkthdr.csum_flags = + CSUM_IP_CHECKED | CSUM_IP_VALID | + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + mb->m_pkthdr.csum_data = htons(0xffff); + /* This packet is eligible for LRO if it is: + * - DIX Ethernet (type interpretation) + * - TCP/IP (v4) + * - without IP options + * - not an IP fragment + */ + if (mlx4_en_can_lro(cqe->status) && + (dev->if_capenable & IFCAP_LRO)) { + if (ring->lro.lro_cnt != 0 && + tcp_lro_rx(&ring->lro, mb, 0) == 0) goto next; - } - - /* LRO not possible, complete processing here */ - ip_summed = CHECKSUM_UNNECESSARY; - INC_PERF_COUNTER(priv->pstats.lro_misses); - } else { - ip_summed = CHECKSUM_NONE; - priv->port_stats.rx_chksum_none++; } + + /* LRO not possible, complete processing here */ + INC_PERF_COUNTER(priv->pstats.lro_misses); } else { - ip_summed = CHECKSUM_NONE; + mb->m_pkthdr.csum_flags = 0; priv->port_stats.rx_chksum_none++; } - skb = mlx4_en_rx_skb(priv, rx_desc, skb_frags, - ring->page_alloc, length); - if (!skb) { - priv->stats.rx_dropped++; - goto next; - } - - if (unlikely(priv->validate_loopback)) { - validate_loopback(priv, skb); - goto next; - } - - skb->ip_summed = ip_summed; - skb->protocol = eth_type_trans(skb, dev); - skb_record_rx_queue(skb, cq->ring); - /* Push it up the stack */ - if (priv->vlgrp && (be32_to_cpu(cqe->vlan_my_qpn) & - MLX4_CQE_VLAN_PRESENT_MASK)) { - 
vlan_hwaccel_receive_skb(skb, priv->vlgrp, - be16_to_cpu(cqe->sl_vid)); - } else - netif_receive_skb(skb); + dev->if_input(dev, mb); next: ++cq->mcq.cons_index; index = (cq->mcq.cons_index) & ring->size_mask; cqe = &cq->buf[index]; - if (++polled == budget) { - /* We are here because we reached the NAPI budget - - * flush only pending LRO sessions */ - lro_flush_all(&ring->lro); - goto out; - } + if (++polled == budget) + break; } - - /* If CQ is empty flush all LRO sessions unconditionally */ - lro_flush_all(&ring->lro); - -out: + while ((queued = SLIST_FIRST(&ring->lro.lro_active)) != NULL) { + SLIST_REMOVE_HEAD(&ring->lro.lro_active, next); + tcp_lro_flush(&ring->lro, queued); + } AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled); mlx4_cq_set_ci(&cq->mcq); wmb(); /* ensure HW sees CQ consumer before we post new buffers */ @@ -935,21 +759,9 @@ } -void mlx4_en_rx_irq(struct mlx4_cq *mcq) -{ - struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); - struct mlx4_en_priv *priv = netdev_priv(cq->dev); - - if (priv->port_up) - napi_schedule(&cq->napi); - else - mlx4_en_arm_cq(priv, cq); -} - /* Rx CQ polling - called by NAPI */ -int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) +static int mlx4_en_poll_rx_cq(struct mlx4_en_cq *cq, int budget) { - struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi); struct net_device *dev = cq->dev; struct mlx4_en_priv *priv = netdev_priv(dev); int done; @@ -957,89 +769,92 @@ if (priv->rx_ring[cq->ring].use_frags) done = mlx4_en_process_rx_cq(dev, cq, budget); else - done = mlx4_en_process_rx_cq_skb(dev, cq, budget); + done = mlx4_en_process_rx_cq_mb(dev, cq, budget); - /* If we used up all the quota - we're probably not done yet... */ cq->tot_rx += done; - if (done == budget) { - INC_PERF_COUNTER(priv->pstats.napi_quota); - if (cq->tot_rx >= MIN_RX_ARM) { - cq->tot_rx -= MIN_RX_ARM; - mlx4_en_arm_cq(priv, cq); - } - } - else { - /* Done for now */ - napi_complete(napi); - mlx4_en_arm_cq(priv, cq); - cq->tot_rx = 0; - } + return done; } +void mlx4_en_rx_que(void *context, int pending) +{ + struct mlx4_en_cq *cq; -/* Calculate the last offset position that accomodates a full fragment - * (assuming fagment size = stride-align) */ -static int mlx4_en_last_alloc_offset(struct mlx4_en_priv *priv, u16 stride, u16 align) + cq = context; + while (mlx4_en_poll_rx_cq(cq, MLX4_EN_MAX_RX_POLL) + == MLX4_EN_MAX_RX_POLL); + mlx4_en_arm_cq(cq->dev->if_softc, cq); +} + +void mlx4_en_rx_irq(struct mlx4_cq *mcq) { - u16 res = MLX4_EN_ALLOC_SIZE % stride; - u16 offset = MLX4_EN_ALLOC_SIZE - stride - res + align; + struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); + struct mlx4_en_priv *priv = netdev_priv(cq->dev); + int done; - en_dbg(DRV, priv, "Calculated last offset for stride:%d align:%d " - "res:%d offset:%d\n", stride, align, res, offset); - return offset; + done = mlx4_en_poll_rx_cq(cq, MLX4_EN_MAX_RX_POLL); + if (done == MLX4_EN_MAX_RX_POLL) + taskqueue_enqueue(cq->tq, &cq->cq_task); + else + mlx4_en_arm_cq(priv, cq); } +#if MLX4_EN_MAX_RX_FRAGS == 3 static int frag_sizes[] = { FRAG_SZ0, FRAG_SZ1, FRAG_SZ2, - FRAG_SZ3 }; +#elif MLX4_EN_MAX_RX_FRAGS == 2 +static int frag_sizes[] = { + FRAG_SZ0, + FRAG_SZ1, +}; +#else +#error "Unknown MAX_RX_FRAGS" +#endif void mlx4_en_calc_rx_buf(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); - int eff_mtu = dev->mtu + ETH_HLEN + VLAN_HLEN + ETH_LLC_SNAP_SIZE; + int eff_mtu = dev->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETH_LLC_SNAP_SIZE; int buf_size 
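
mlx4_en_rx_irq()/mlx4_en_rx_que() above are the NAPI replacement: the interrupt handler polls at most MLX4_EN_MAX_RX_POLL completions, then defers any backlog to a task rather than looping at interrupt level. The taskqueue setup that pairs with this lives in en_cq.c, outside this excerpt; a sketch under that assumption:

	#include <sys/taskqueue.h>

	TASK_INIT(&cq->cq_task, 0, mlx4_en_rx_que, cq);
	cq->tq = taskqueue_create_fast("mlx4_en_rx", M_NOWAIT,
	    taskqueue_thread_enqueue, &cq->tq);
	taskqueue_start_threads(&cq->tq, 1, PI_NET, "%s rx cq",
	    cq->dev->if_xname);

	/* From the interrupt path, when the poll budget was exhausted: */
	taskqueue_enqueue(cq->tq, &cq->cq_task);
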
= 0; - int i = 0; + int i, frag; - while (buf_size < eff_mtu) { - priv->frag_info[i].frag_size = - (eff_mtu > buf_size + frag_sizes[i]) ? - frag_sizes[i] : eff_mtu - buf_size; - priv->frag_info[i].frag_prefix_size = buf_size; - if (!i) { - priv->frag_info[i].frag_align = NET_IP_ALIGN; - priv->frag_info[i].frag_stride = - ALIGN(frag_sizes[i] + NET_IP_ALIGN, SMP_CACHE_BYTES); - } else { - priv->frag_info[i].frag_align = 0; - priv->frag_info[i].frag_stride = - ALIGN(frag_sizes[i], SMP_CACHE_BYTES); - } - priv->frag_info[i].last_offset = mlx4_en_last_alloc_offset( - priv, priv->frag_info[i].frag_stride, - priv->frag_info[i].frag_align); - buf_size += priv->frag_info[i].frag_size; - i++; + for (i = 0, frag = 0; buf_size < eff_mtu; frag++, i++) { + /* + * Allocate small to large but only as much as is needed for + * the tail. + */ + while (i > 0 && eff_mtu - buf_size <= frag_sizes[i - 1]) + i--; + priv->frag_info[frag].frag_size = frag_sizes[i]; + priv->frag_info[frag].frag_prefix_size = buf_size; + buf_size += priv->frag_info[frag].frag_size; } - priv->num_frags = i; - priv->rx_skb_size = eff_mtu; - priv->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct skb_frag_struct)); + priv->num_frags = frag; + /* + * For use_frags == 0 calculate the size of the external buffer we require. + */ + if (eff_mtu <= MCLBYTES) + priv->rx_mb_size = MCLBYTES; + else if (eff_mtu <= MJUMPAGESIZE) + priv->rx_mb_size = MJUMPAGESIZE; + else if (eff_mtu <= MJUM9BYTES) + priv->rx_mb_size = MJUM9BYTES; + else + priv->rx_mb_size = MJUM16BYTES; + priv->log_rx_info = + ROUNDUP_LOG2(priv->num_frags * sizeof(struct mbuf *)); en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d " "num_frags:%d):\n", eff_mtu, priv->num_frags); for (i = 0; i < priv->num_frags; i++) { - en_dbg(DRV, priv, " frag:%d - size:%d prefix:%d align:%d " - "stride:%d last_offset:%d\n", i, + en_dbg(DRV, priv, " frag:%d - size:%d prefix:%d\n", i, priv->frag_info[i].frag_size, - priv->frag_info[i].frag_prefix_size, - priv->frag_info[i].frag_align, - priv->frag_info[i].frag_stride, - priv->frag_info[i].last_offset); + priv->frag_info[i].frag_prefix_size); } } @@ -1189,8 +1004,3 @@ } mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num); } - - - - - Index: sys/ofed/drivers/net/mlx4/mlx4_en.h =================================================================== --- sys/ofed/drivers/net/mlx4/mlx4_en.h (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/mlx4_en.h (.../head) (revision 219811) @@ -34,11 +34,13 @@ #ifndef _MLX4_EN_H_ #define _MLX4_EN_H_ +#include + +#include #include #include #include #include -#include #include #include @@ -47,19 +49,30 @@ #include #include +#include +#include + #include "en_port.h" #define DRV_NAME "mlx4_en" #define DRV_VERSION "1.5.2" #define DRV_RELDATE "July 2010" +/* XXX */ +#define NETIF_MSG_LINK 0x1 +#define NETIF_MSG_IFDOWN 0x2 +#define NETIF_MSG_HW 0x4 +#define NETIF_MSG_DRV 0x8 +#define NETIF_MSG_INTR 0x10 +#define NETIF_MSG_RX_ERR 0x20 + #define MLX4_EN_MSG_LEVEL (NETIF_MSG_LINK | NETIF_MSG_IFDOWN) #define en_print(level, priv, format, arg...)
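
A worked example of what the rewritten mlx4_en_calc_rx_buf() above produces, assuming a 9000-byte MTU on a 4 KB-page system (the numbers are derived from the code, not taken from the patch):

	/*
	 * eff_mtu = 9000 + ETHER_HDR_LEN (14) + ETHER_VLAN_ENCAP_LEN (4)
	 *         + ETH_LLC_SNAP_SIZE (8) = 9026 bytes
	 *
	 *   frag 0: size 2048 (MCLBYTES),     frag_prefix_size    0
	 *   frag 1: size 4096 (MJUMPAGESIZE), frag_prefix_size 2048
	 *   frag 2: size 4096 (MJUMPAGESIZE), frag_prefix_size 6144
	 *
	 * num_frags = 3: for the last fragment the loop steps back down
	 * the size table, but the 2882-byte tail still exceeds MCLBYTES,
	 * so it stays on MJUMPAGESIZE.  On the use_frags == 0 path,
	 * rx_mb_size = MJUM9BYTES, since 9026 <= 9216.
	 */
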
\ { \ if ((priv)->registered) \ printk(level "%s: %s: " format, DRV_NAME, \ - (priv->dev)->name, ## arg); \ + (priv->dev)->if_xname, ## arg); \ else \ printk(level "%s: %s: Port %d: " format, \ DRV_NAME, dev_name(&priv->mdev->pdev->dev), \ @@ -113,28 +126,34 @@ #define MLX4_EN_WATCHDOG_TIMEOUT (15 * HZ) -#define MLX4_EN_ALLOC_ORDER 2 -#define MLX4_EN_ALLOC_SIZE (PAGE_SIZE << MLX4_EN_ALLOC_ORDER) - #define MLX4_EN_MAX_LRO_DESCRIPTORS 32 #define MLX4_EN_NUM_IPFRAG_SESSIONS 16 -/* Receive fragment sizes; we use at most 4 fragments (for 9600 byte MTU +/* Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU * and 4K allocations) */ +#if MJUMPAGESIZE == 4096 enum { - FRAG_SZ0 = 512 - NET_IP_ALIGN, - FRAG_SZ1 = 1024, - FRAG_SZ2 = 4096, - FRAG_SZ3 = MLX4_EN_ALLOC_SIZE + FRAG_SZ0 = MCLBYTES, + FRAG_SZ1 = MJUMPAGESIZE, + FRAG_SZ2 = MJUMPAGESIZE, }; -#define MLX4_EN_MAX_RX_FRAGS 4 +#define MLX4_EN_MAX_RX_FRAGS 3 +#elif MJUMPAGESIZE == 8192 +enum { + FRAG_SZ0 = MCLBYTES, + FRAG_SZ1 = MJUMPAGESIZE, +}; +#define MLX4_EN_MAX_RX_FRAGS 2 +#else +#error "Unknown PAGE_SIZE" +#endif /* Maximum ring sizes */ #define MLX4_EN_MAX_TX_SIZE 8192 #define MLX4_EN_MAX_RX_SIZE 8192 -/* Minimum ring size for our page-allocation sceme to work */ -#define MLX4_EN_MIN_RX_SIZE (MLX4_EN_ALLOC_SIZE / SMP_CACHE_BYTES) +#define MLX4_EN_MIN_RX_SIZE (128) #define MLX4_EN_MIN_TX_SIZE (4096 / TXBB_SIZE) #define MLX4_EN_SMALL_PKT_SIZE 64 @@ -143,7 +162,9 @@ #define MLX4_EN_NUM_HASH_RINGS 4 #define MLX4_EN_NUM_PPP_RINGS 8 #define MLX4_EN_DEF_TX_RING_SIZE 512 +#define MLX4_EN_DEF_TX_QUEUE_SIZE 4096 #define MLX4_EN_DEF_RX_RING_SIZE 1024 +#define MLX4_EN_MAX_RX_POLL 16 /* Target number of bytes to coalesce with interrupt moderation */ #define MLX4_EN_RX_COAL_TARGET 0x20000 @@ -173,9 +194,9 @@ #define ETH_LLC_SNAP_SIZE 8 -#define SMALL_PACKET_SIZE (256 - NET_IP_ALIGN) -#define HEADER_COPY_SIZE (128 - NET_IP_ALIGN) -#define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN) +#define SMALL_PACKET_SIZE (MHLEN) +#define HEADER_COPY_SIZE (128) +#define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETHER_HDR_LEN) #define MLX4_EN_MIN_MTU 46 #define ETH_BCAST 0xffffffffffffULL @@ -225,9 +246,9 @@ struct mlx4_en_tx_info { - struct sk_buff *skb; + struct mbuf *mb; u32 nr_txbb; - u8 linear; + u8 nr_segs; u8 data_offset; u8 inl; }; @@ -250,12 +271,8 @@ #define MLX4_EN_USE_SRQ 0x01000000 -struct mlx4_en_rx_alloc { - struct page *page; - u16 offset; -}; - struct mlx4_en_tx_ring { + spinlock_t tx_lock; struct mlx4_hwq_resources wqres; u32 size ; /* number of TXBBs */ u32 size_mask; @@ -268,6 +285,7 @@ void *buf; u16 poll_cnt; int blocked; + struct buf_ring *br; struct mlx4_en_tx_info *tx_info; u8 *bounce_buf; u32 last_nr_txbb; @@ -278,22 +296,22 @@ struct mlx4_srq dummy; unsigned long bytes; unsigned long packets; + unsigned long errors; spinlock_t comp_lock; struct mlx4_bf bf; bool bf_enabled; + u64 watchdog_time; }; struct mlx4_en_ipfrag { - struct sk_buff *fragments; - struct sk_buff *last; + struct mbuf *fragments; + struct mbuf *last; __be32 saddr; __be32 daddr; __be16 id; u8 protocol; int total_len; u16 offset; - unsigned int vlan; - __be16 sl_vid; }; struct mlx4_en_rx_desc { @@ -303,8 +321,6 @@ struct mlx4_en_rx_ring { struct mlx4_hwq_resources wqres; - struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS]; - struct net_lro_mgr lro; u32 size ; /* number of Rx descs*/ u32 actual_size; u32 size_mask; @@ -318,8 +334,10 @@ void *rx_info; unsigned long bytes; unsigned long packets; + unsigned
long errors; + unsigned int use_frags; + struct lro_ctrl lro; struct mlx4_en_ipfrag ipfrag[MLX4_EN_NUM_IPFRAG_SESSIONS]; - unsigned int use_frags; }; @@ -343,7 +361,6 @@ int ring; spinlock_t lock; struct net_device *dev; - struct napi_struct napi; /* Per-core Tx cq processing support */ struct timer_list timer; int size; @@ -353,6 +370,8 @@ u16 moder_time; u16 moder_cnt; struct mlx4_cqe *buf; + struct task cq_task; + struct taskqueue *tq; #define MLX4_EN_OPCODE_ERROR 0x1e u32 tot_rx; }; @@ -434,9 +453,6 @@ }; struct mlx4_en_port_stats { - unsigned long lro_aggregated; - unsigned long lro_flushed; - unsigned long lro_no_desc; unsigned long tso_packets; unsigned long queue_stopped; unsigned long wake_queue; @@ -445,26 +461,19 @@ unsigned long rx_chksum_good; unsigned long rx_chksum_none; unsigned long tx_chksum_offload; -#define NUM_PORT_STATS 11 }; struct mlx4_en_perf_stats { u32 tx_poll; u64 tx_pktsz_avg; u32 inflight_avg; - u16 tx_coal_avg; - u16 rx_coal_avg; - u32 napi_quota; -#define NUM_PERF_COUNTERS 6 + u32 tx_coal_avg; + u32 rx_coal_avg; }; struct mlx4_en_frag_info { u16 frag_size; u16 frag_prefix_size; - u16 frag_stride; - u16 frag_align; - u16 last_offset; - }; struct mlx4_en_tx_hash_entry { @@ -478,14 +487,11 @@ struct mlx4_en_dev *mdev; struct mlx4_en_port_profile *prof; struct net_device *dev; - struct vlan_group *vlgrp; bool vlgrp_modified; -#define MLX4_VLREG_SIZE 512 - u8 vlan_register[MLX4_VLREG_SIZE]; - u8 vlan_unregister[MLX4_VLREG_SIZE]; + u32 vlan_register[VLAN_FLTR_SIZE]; + u32 vlan_unregister[VLAN_FLTR_SIZE]; + u32 vlans[VLAN_FLTR_SIZE]; spinlock_t vlan_lock; - struct net_device_stats stats; - struct net_device_stats ret_stats; struct mlx4_en_port_state port_state; spinlock_t stats_lock; @@ -528,7 +534,7 @@ u32 tx_ring_num; u32 rx_ring_num; u32 udp_rings; - u32 rx_skb_size; + u32 rx_mb_size; struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS]; u16 num_frags; u16 log_rx_info; @@ -539,21 +545,29 @@ struct mlx4_en_cq rx_cq[MAX_RX_RINGS]; struct mlx4_en_tx_hash_entry tx_hash[MLX4_EN_TX_HASH_SIZE]; struct work_struct mcast_task; - struct work_struct mac_task; struct work_struct watchdog_task; struct work_struct linkstate_task; struct delayed_work stats_task; struct mlx4_en_perf_stats pstats; struct mlx4_en_pkt_stats pkstats; struct mlx4_en_port_stats port_stats; - struct dev_mc_list *mc_list; struct mlx4_en_stat_out_mbox hw_stats; - int vids[128]; + struct ifmedia media; + eventhandler_tag vlan_attach; + eventhandler_tag vlan_detach; + struct callout watchdog_timer; + volatile int blocked; + struct sysctl_oid *sysctl; + struct sysctl_ctx_list conf_ctx; + struct sysctl_ctx_list stat_ctx; }; +int mlx4_en_transmit(struct net_device *dev, struct mbuf *mb); +void mlx4_en_qflush(struct net_device *dev); + int mlx4_en_rx_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, - struct sk_buff *skb, struct mlx4_cqe *cqe); + struct mbuf *mb, struct mlx4_cqe *cqe); void mlx4_en_flush_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring); void mlx4_en_destroy_netdev(struct net_device *dev); @@ -576,8 +590,7 @@ void mlx4_en_poll_tx_cq(unsigned long data); void mlx4_en_tx_irq(struct mlx4_cq *mcq); -u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb); -int mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); +u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb); int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, u32 size, u16 stride); @@ -598,10 +611,11 @@ int mlx4_en_process_rx_cq(struct 
net_device *dev, struct mlx4_en_cq *cq, int budget); -int mlx4_en_process_rx_cq_skb(struct net_device *dev, +int mlx4_en_process_rx_cq_mb(struct net_device *dev, struct mlx4_en_cq *cq, int budget); -int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget); +void mlx4_en_tx_que(void *context, int pending); +void mlx4_en_rx_que(void *context, int pending); void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, int is_tx, int rss, int qpn, int cqn, struct mlx4_qp_context *context); @@ -617,7 +631,7 @@ void mlx4_en_rx_irq(struct mlx4_cq *mcq); int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); -int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, struct vlan_group *grp); +int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, u32 *vlans); int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx); int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, Index: sys/ofed/drivers/net/mlx4/en_tx.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_tx.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/en_tx.c (.../head) (revision 219811) @@ -31,21 +31,31 @@ * */ -#include +#include "mlx4_en.h" + #include #include -#include -#include #include -#include "mlx4_en.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + enum { MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */ MAX_BF = 256, }; -static int inline_thold __read_mostly = MAX_INLINE; +static int inline_thold = MAX_INLINE; module_param_named(inline_thold, inline_thold, int, 0444); MODULE_PARM_DESC(inline_thold, "treshold for using inline data"); @@ -64,13 +74,23 @@ inline_thold = min(inline_thold, MAX_INLINE); - spin_lock_init(&ring->comp_lock); + mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF); + mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF); + /* Allocate the buf ring */ + ring->br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF, + M_WAITOK, &ring->tx_lock.m); + if (ring->br == NULL) { + en_err(priv, "Failed allocating tx_info ring\n"); + return -ENOMEM; + } + tmp = size * sizeof(struct mlx4_en_tx_info); - ring->tx_info = vmalloc(tmp); + ring->tx_info = kmalloc(tmp, GFP_KERNEL); if (!ring->tx_info) { en_err(priv, "Failed allocating tx_info ring\n"); - return -ENOMEM; + err = -ENOMEM; + goto err_tx; } en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n", ring->tx_info, tmp); @@ -135,7 +155,8 @@ kfree(ring->bounce_buf); ring->bounce_buf = NULL; err_tx: - vfree(ring->tx_info); + buf_ring_free(ring->br, M_DEVBUF); + kfree(ring->tx_info); ring->tx_info = NULL; return err; } @@ -146,6 +167,7 @@ struct mlx4_en_dev *mdev = priv->mdev; en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn); + buf_ring_free(ring->br, M_DEVBUF); if (ring->bf_enabled) mlx4_bf_free(mdev->dev, &ring->bf); mlx4_qp_remove(mdev->dev, &ring->qp); @@ -155,8 +177,10 @@ mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); kfree(ring->bounce_buf); ring->bounce_buf = NULL; - vfree(ring->tx_info); + kfree(ring->tx_info); ring->tx_info = NULL; + mtx_destroy(&ring->tx_lock.m); + mtx_destroy(&ring->comp_lock.m); } int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, @@ -207,10 +231,9 @@ struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE; struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset; - struct sk_buff *skb = 
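
Each TX ring now stages outbound mbufs in a buf_ring, the lock-free ring that backs the drbr_enqueue()/drbr_dequeue() calls later in this file. Its lifecycle reduced to the essential calls (a sketch; the depth and malloc type are the ones the patch uses, and the mutex argument is only consulted by DEBUG_BUFRING assertions):

	#include <sys/param.h>
	#include <sys/buf_ring.h>

	struct buf_ring *br;

	br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF, M_WAITOK,
	    &ring->tx_lock.m);
	/* ... producers enqueue, the lock holder drains ... */
	buf_ring_free(br, M_DEVBUF);	/* free any queued mbufs first */
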
tx_info->skb; - struct skb_frag_struct *frag; + struct mbuf *mb = tx_info->mb; void *end = ring->buf + ring->buf_size; - int frags = skb_shinfo(skb)->nr_frags; + int frags = tx_info->nr_segs; int i; __be32 *ptr = (__be32 *)tx_desc; __be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT)); @@ -218,19 +241,10 @@ /* Optimize the common case when there are no wraparounds */ if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) { if (!tx_info->inl) { - if (tx_info->linear) { + for (i = 0; i < frags; i++) { pci_unmap_single(mdev->pdev, - (dma_addr_t) be64_to_cpu(data->addr), - be32_to_cpu(data->byte_count), - PCI_DMA_TODEVICE); - ++data; - } - - for (i = 0; i < frags; i++) { - frag = &skb_shinfo(skb)->frags[i]; - pci_unmap_page(mdev->pdev, (dma_addr_t) be64_to_cpu(data[i].addr), - frag->size, PCI_DMA_TODEVICE); + data[i].byte_count, PCI_DMA_TODEVICE); } } /* Stamp the freed descriptor */ @@ -241,27 +255,13 @@ } else { if (!tx_info->inl) { - if ((void *) data >= end) { - data = (struct mlx4_wqe_data_seg *) - (ring->buf + ((void *) data - end)); - } - - if (tx_info->linear) { - pci_unmap_single(mdev->pdev, - (dma_addr_t) be64_to_cpu(data->addr), - be32_to_cpu(data->byte_count), - PCI_DMA_TODEVICE); - ++data; - } - for (i = 0; i < frags; i++) { /* Check for wraparound before unmapping */ if ((void *) data >= end) data = (struct mlx4_wqe_data_seg *) ring->buf; - frag = &skb_shinfo(skb)->frags[i]; - pci_unmap_page(mdev->pdev, + pci_unmap_single(mdev->pdev, (dma_addr_t) be64_to_cpu(data->addr), - frag->size, PCI_DMA_TODEVICE); + data->byte_count, PCI_DMA_TODEVICE); ++data; } } @@ -276,7 +276,7 @@ } } - dev_kfree_skb_any(skb); + m_freem(mb); return tx_info->nr_txbb; } @@ -292,8 +292,7 @@ ring->cons, ring->prod); if ((u32) (ring->prod - ring->cons) > ring->size) { - if (netif_msg_tx_err(priv)) - en_warn(priv, "Tx consumer passed producer!\n"); + en_warn(priv, "Tx consumer passed producer!\n"); return 0; } @@ -401,7 +400,9 @@ if ((u32) (ring->prod - ring->cons) <= ring->size - HEADROOM - MAX_DESC_TXBBS) { ring->blocked = 0; - netif_tx_wake_queue(netdev_get_tx_queue(dev, cq->ring)); + if (atomic_fetchadd_int(&priv->blocked, -1) == 1) + atomic_clear_int(&dev->if_drv_flags, + IFF_DRV_OACTIVE); priv->port_stats.wake_queue++; } } @@ -430,7 +431,7 @@ INC_PERF_COUNTER(priv->pstats.tx_poll); - if (!spin_trylock_irq(&ring->comp_lock)) { + if (!spin_trylock(&ring->comp_lock)) { mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); return; } @@ -443,7 +444,7 @@ if (inflight && priv->port_up) mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); - spin_unlock_irq(&ring->comp_lock); + spin_unlock(&ring->comp_lock); } static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv, @@ -486,165 +487,178 @@ /* Poll the CQ every mlx4_en_TX_MODER_POLL packets */ if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0) - if (spin_trylock_irq(&ring->comp_lock)) { + if (spin_trylock(&ring->comp_lock)) { mlx4_en_process_tx_cq(priv->dev, cq); - spin_unlock_irq(&ring->comp_lock); + spin_unlock(&ring->comp_lock); } } -static void *get_frag_ptr(struct sk_buff *skb) +static int is_inline(struct mbuf *mb) { - struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0]; - struct page *page = frag->page; - void *ptr; - ptr = page_address(page); - if (unlikely(!ptr)) - return NULL; + if (inline_thold && mb->m_pkthdr.len <= inline_thold && + (mb->m_pkthdr.csum_flags & CSUM_TSO) == 0) + return 1; - return ptr + frag->page_offset; + return 0; } -static int is_inline(struct sk_buff *skb, void 
**pfrag) +static int inline_size(struct mbuf *mb) { - void *ptr; + int len; - if (inline_thold && !skb_is_gso(skb) && skb->len <= inline_thold) { - if (skb_shinfo(skb)->nr_frags == 1) { - ptr = get_frag_ptr(skb); - if (unlikely(!ptr)) - return 0; + len = mb->m_pkthdr.len; + if (len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg) + <= MLX4_INLINE_ALIGN) + return ALIGN(len + CTRL_SIZE + + sizeof(struct mlx4_wqe_inline_seg), 16); + else + return ALIGN(len + CTRL_SIZE + 2 * + sizeof(struct mlx4_wqe_inline_seg), 16); +} - if (pfrag) - *pfrag = ptr; +static int get_head_size(struct mbuf *mb) +{ + struct tcphdr *th; + struct ip *ip; + int ip_hlen, tcp_hlen; + int len; - return 1; - } else if (unlikely(skb_shinfo(skb)->nr_frags)) - return 0; - else - return 1; - } - - return 0; + len = ETHER_HDR_LEN; + if (mb->m_len < len + sizeof(struct ip)) + return (0); + ip = (struct ip *)(mtod(mb, char *) + len); + if (ip->ip_p != IPPROTO_TCP) + return (0); + ip_hlen = ip->ip_hl << 2; + len += ip_hlen; + if (mb->m_len < len + sizeof(struct tcphdr)) + return (0); + th = (struct tcphdr *)(mtod(mb, char *) + len); + tcp_hlen = th->th_off << 2; + len += tcp_hlen; + if (mb->m_len < len) + return (0); + return (len); } -static int inline_size(struct sk_buff *skb) +static int get_real_size(struct mbuf *mb, struct net_device *dev, int *segsp, + int *lso_header_size) { - if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg) - <= MLX4_INLINE_ALIGN) - return ALIGN(skb->len + CTRL_SIZE + - sizeof(struct mlx4_wqe_inline_seg), 16); - else - return ALIGN(skb->len + CTRL_SIZE + 2 * - sizeof(struct mlx4_wqe_inline_seg), 16); + struct mbuf *m; + int nr_segs; + + nr_segs = 0; + for (m = mb; m != NULL; m = m->m_next) + if (m->m_len) + nr_segs++; + + if (mb->m_pkthdr.csum_flags & CSUM_TSO) { + *lso_header_size = get_head_size(mb); + if (*lso_header_size) { + if (mb->m_len == *lso_header_size) + nr_segs--; + *segsp = nr_segs; + return CTRL_SIZE + nr_segs * DS_SIZE + + ALIGN(*lso_header_size + 4, DS_SIZE); + } + } else + *lso_header_size = 0; + *segsp = nr_segs; + if (is_inline(mb)) + return inline_size(mb); + return (CTRL_SIZE + nr_segs * DS_SIZE); } -static int get_real_size(struct sk_buff *skb, struct net_device *dev, - int *lso_header_size) +static struct mbuf *mb_copy(struct mbuf *mb, int *offp, char *data, int len) { - struct mlx4_en_priv *priv = netdev_priv(dev); - int real_size; + int bytes; + int off; - if (skb_is_gso(skb)) { - *lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb); - real_size = CTRL_SIZE + skb_shinfo(skb)->nr_frags * DS_SIZE + - ALIGN(*lso_header_size + 4, DS_SIZE); - if (unlikely(*lso_header_size != skb_headlen(skb))) { - /* We add a segment for the skb linear buffer only if - * it contains data */ - if (*lso_header_size < skb_headlen(skb)) - real_size += DS_SIZE; - else { - if (netif_msg_tx_err(priv)) - en_warn(priv, "Non-linear headers\n"); - return 0; - } + off = *offp; + while (len) { + bytes = min(mb->m_len - off, len); + if (bytes) + memcpy(data, mb->m_data + off, bytes); + len -= bytes; + data += bytes; + off += bytes; + if (off == mb->m_len) { + off = 0; + mb = mb->m_next; } - } else { - *lso_header_size = 0; - if (!is_inline(skb, NULL)) - real_size = CTRL_SIZE + (skb_shinfo(skb)->nr_frags + 1) * DS_SIZE; - else - real_size = inline_size(skb); } - - return real_size; + *offp = off; + return (mb); } -static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *skb, - int real_size, u16 *vlan_tag, int tx_ind, void *fragptr) +static void build_inline_wqe(struct 
mlx4_en_tx_desc *tx_desc, struct mbuf *mb, + int real_size, u16 *vlan_tag, int tx_ind) { struct mlx4_wqe_inline_seg *inl = &tx_desc->inl; int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl; + int len; + int off; - if (skb->len <= spc) { - inl->byte_count = cpu_to_be32(1 << 31 | skb->len); - skb_copy_from_linear_data(skb, inl + 1, skb_headlen(skb)); - if (skb_shinfo(skb)->nr_frags) - memcpy(((void *)(inl + 1)) + skb_headlen(skb), fragptr, - skb_shinfo(skb)->frags[0].size); - + off = 0; + len = mb->m_pkthdr.len; + if (len <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | len); + mb_copy(mb, &off, (void *)(inl + 1), len); } else { inl->byte_count = cpu_to_be32(1 << 31 | spc); - if (skb_headlen(skb) <= spc) { - skb_copy_from_linear_data(skb, inl + 1, skb_headlen(skb)); - if (skb_headlen(skb) < spc) { - memcpy(((void *)(inl + 1)) + skb_headlen(skb), - fragptr, spc - skb_headlen(skb)); - fragptr += spc - skb_headlen(skb); - } - inl = (void *) (inl + 1) + spc; - memcpy(((void *)(inl + 1)), fragptr, skb->len - spc); - } else { - skb_copy_from_linear_data(skb, inl + 1, spc); - inl = (void *) (inl + 1) + spc; - skb_copy_from_linear_data_offset(skb, spc, inl + 1, - skb_headlen(skb) - spc); - if (skb_shinfo(skb)->nr_frags) - memcpy(((void *)(inl + 1)) + skb_headlen(skb) - spc, - fragptr, skb_shinfo(skb)->frags[0].size); - } - + mb = mb_copy(mb, &off, (void *)(inl + 1), spc); + inl = (void *) (inl + 1) + spc; + mb_copy(mb, &off, (void *)(inl + 1), len - spc); wmb(); - inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc)); + inl->byte_count = cpu_to_be32(1 << 31 | (len - spc)); } tx_desc->ctrl.vlan_tag = cpu_to_be16(*vlan_tag); tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!(*vlan_tag); tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f; } -u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb) +u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb) { struct mlx4_en_priv *priv = netdev_priv(dev); - u16 vlan_tag = 0; - int tx_ind = 0; - struct tcphdr *th = tcp_hdr(skb); - struct iphdr *iph = ip_hdr(skb); struct mlx4_en_tx_hash_entry *entry; + struct ether_header *eth; + struct tcphdr *th; + struct ip *iph; u32 hash_index; + int tx_ind = 0; + u16 vlan_tag = 0; + int len; /* Obtain VLAN information if present */ - if (priv->vlgrp && vlan_tx_tag_present(skb)) { - vlan_tag = vlan_tx_tag_get(skb); + if (mb->m_flags & M_VLANTAG) { + vlan_tag = mb->m_pkthdr.ether_vtag; /* Set the Tx ring to use according to vlan priority */ tx_ind = priv->tx_prio_map[vlan_tag >> 13]; if (tx_ind) return tx_ind; } - + if (mb->m_len < + ETHER_HDR_LEN + sizeof(struct ip) + sizeof(struct tcphdr)) + return MLX4_EN_NUM_HASH_RINGS; + eth = mtod(mb, struct ether_header *); /* Hashing is only done for TCP/IP or UDP/IP packets */ - if (be16_to_cpu(skb->protocol) != ETH_P_IP) + if (be16_to_cpu(eth->ether_type) != ETHERTYPE_IP) return MLX4_EN_NUM_HASH_RINGS; - - hash_index = be32_to_cpu(iph->daddr) & MLX4_EN_TX_HASH_MASK; - switch(iph->protocol) { + len = ETHER_HDR_LEN; + iph = (struct ip *)(mtod(mb, char *) + len); + len += iph->ip_hl << 2; + th = (struct tcphdr *)(mtod(mb, char *) + len); + hash_index = be32_to_cpu(iph->ip_dst.s_addr) & MLX4_EN_TX_HASH_MASK; + switch(iph->ip_p) { case IPPROTO_UDP: break; case IPPROTO_TCP: - if (th) { - hash_index = (hash_index ^ be16_to_cpu(th->dest ^ th->source)) & - MLX4_EN_TX_HASH_MASK; - } + if (mb->m_len < len + sizeof(struct tcphdr)) + return MLX4_EN_NUM_HASH_RINGS; + hash_index = + (hash_index ^ be16_to_cpu(th->th_dport ^ th->th_sport)) & + 
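
mb_copy() above is the chain-aware replacement for skb_copy_from_linear_data() used by build_inline_wqe(): an mbuf chain scatters a packet across m_data/m_len segments linked by m_next, so extracting a byte range means walking the chain. m_copydata(9) performs the same walk for the one-shot case; the driver presumably rolls its own because it needs to resume where the previous copy stopped. The walk in its plainest form:

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/mbuf.h>

	/* Copy len bytes starting at offset off of chain m into buf. */
	while (len > 0) {
		int n = MIN(m->m_len - off, len);

		memcpy(buf, mtod(m, char *) + off, n);
		buf += n;
		len -= n;
		off += n;
		if (off == m->m_len) {	/* segment exhausted, advance */
			off = 0;
			m = m->m_next;
		}
	}
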
MLX4_EN_TX_HASH_MASK; break; default: return MLX4_EN_NUM_HASH_RINGS; @@ -660,7 +674,7 @@ } entry->cnt++; - if (skb->len > MLX4_EN_SMALL_PKT_SIZE) + if (mb->m_pkthdr.len > MLX4_EN_SMALL_PKT_SIZE) entry->big_pkts++; else entry->small_pkts++; @@ -672,7 +686,7 @@ __iowrite64_copy(dst, src, bytecnt / 8); } -int mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) +static int mlx4_en_xmit(struct net_device *dev, int tx_ind, struct mbuf **mbp) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; @@ -680,13 +694,10 @@ struct mlx4_en_cq *cq; struct mlx4_en_tx_desc *tx_desc; struct mlx4_wqe_data_seg *data; - struct skb_frag_struct *frag; struct mlx4_en_tx_info *tx_info; - struct ethhdr *ethh; - u64 mac; - u32 mac_l, mac_h; - int tx_ind = 0; + struct mbuf *m; int nr_txbb; + int nr_segs; int desc_size; int real_size; dma_addr_t dma; @@ -695,13 +706,17 @@ u16 vlan_tag = 0; int i; int lso_header_size; - void *fragptr; bool bounce = false; + struct mbuf *mb; + int defrag = 1; + ring = &priv->tx_ring[tx_ind]; + mb = *mbp; if (!priv->port_up) goto tx_drop; - real_size = get_real_size(skb, dev, &lso_header_size); +retry: + real_size = get_real_size(mb, dev, &nr_segs, &lso_header_size); if (unlikely(!real_size)) goto tx_drop; @@ -709,28 +724,33 @@ desc_size = ALIGN(real_size, TXBB_SIZE); nr_txbb = desc_size / TXBB_SIZE; if (unlikely(nr_txbb > MAX_DESC_TXBBS)) { - if (netif_msg_tx_err(priv)) - en_warn(priv, "Oversized header or SG list\n"); + if (defrag) { + mb = m_defrag(*mbp, M_DONTWAIT); + if (mb == NULL) { + mb = *mbp; + goto tx_drop; + } + *mbp = mb; + defrag = 0; + goto retry; + } goto tx_drop; } - tx_ind = skb->queue_mapping; - ring = &priv->tx_ring[tx_ind]; - if (priv->vlgrp && vlan_tx_tag_present(skb)) - vlan_tag = vlan_tx_tag_get(skb); - /* Check available TXBBs And 2K spare for prefetch */ if (unlikely(((int)(ring->prod - ring->cons)) > ring->size - HEADROOM - MAX_DESC_TXBBS)) { /* every full Tx ring stops queue */ - netif_tx_stop_queue(netdev_get_tx_queue(dev, tx_ind)); + if (ring->blocked == 0) + atomic_add_int(&priv->blocked, 1); + atomic_set_int(&dev->if_drv_flags, IFF_DRV_OACTIVE); ring->blocked = 1; priv->port_stats.queue_stopped++; /* Use interrupts to find out when queue opened */ cq = &priv->tx_cq[tx_ind]; mlx4_en_arm_cq(priv, cq); - return NETDEV_TX_BUSY; + return EBUSY; } /* Track current inflight packets for performance analysis */ @@ -750,19 +770,16 @@ bounce = true; } - /* Save skb in tx_info ring */ - tx_info = &ring->tx_info[index]; - tx_info->skb = skb; - tx_info->nr_txbb = nr_txbb; - /* Prepare ctrl segement apart opcode+ownership, which depends on * whether LSO is used */ + if (mb->m_flags & M_VLANTAG) + vlan_tag = mb->m_pkthdr.ether_vtag; tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!vlan_tag; tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f; tx_desc->ctrl.srcrb_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICITED); - if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { + if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP)) { tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM); priv->port_stats.tx_chksum_offload++; @@ -770,10 +787,13 @@ if (unlikely(priv->validate_loopback)) { /* Copy dst mac address to wqe */ - skb_reset_mac_header(skb); - ethh = eth_hdr(skb); - if (ethh && ethh->h_dest) { - mac = mlx4_en_mac_to_u64(ethh->h_dest); + struct ether_header *ethh; + u64 mac; + u32 mac_l, mac_h; + + ethh = mtod(mb, struct 
ether_header *); + mac = mlx4_en_mac_to_u64(ethh->ether_dhost); + if (mac) { mac_h = (u32) ((mac & 0xffff00000000ULL) >> 16); mac_l = (u32) (mac & 0xffffffff); tx_desc->ctrl.srcrb_flags |= cpu_to_be32(mac_h); @@ -783,6 +803,8 @@ /* Handle LSO (TSO) packets */ if (lso_header_size) { + int segsz; + /* Mark opcode as LSO */ op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) | ((ring->prod & ring->size) ? @@ -790,63 +812,64 @@ /* Fill in the LSO prefix */ tx_desc->lso.mss_hdr_size = cpu_to_be32( - skb_shinfo(skb)->gso_size << 16 | lso_header_size); + mb->m_pkthdr.tso_segsz << 16 | lso_header_size); /* Copy headers; * note that we already verified that it is linear */ - memcpy(tx_desc->lso.header, skb->data, lso_header_size); + memcpy(tx_desc->lso.header, mb->m_data, lso_header_size); data = ((void *) &tx_desc->lso + ALIGN(lso_header_size + 4, DS_SIZE)); priv->port_stats.tso_packets++; - i = ((skb->len - lso_header_size) / skb_shinfo(skb)->gso_size) + - !!((skb->len - lso_header_size) % skb_shinfo(skb)->gso_size); - ring->bytes += skb->len + (i - 1) * lso_header_size; + segsz = mb->m_pkthdr.tso_segsz; + i = ((mb->m_pkthdr.len - lso_header_size) / segsz) + + !!((mb->m_pkthdr.len - lso_header_size) % segsz); + ring->bytes += mb->m_pkthdr.len + (i - 1) * lso_header_size; ring->packets += i; + mb->m_data += lso_header_size; + mb->m_len -= lso_header_size; } else { /* Normal (Non LSO) packet */ op_own = cpu_to_be32(MLX4_OPCODE_SEND) | ((ring->prod & ring->size) ? cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); data = &tx_desc->data; - ring->bytes += max(skb->len, (unsigned int) ETH_ZLEN); + ring->bytes += max(mb->m_pkthdr.len, + (unsigned int)ETHER_MIN_LEN - ETHER_CRC_LEN); ring->packets++; } - AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len); + AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len); - - /* valid only for none inline segments */ + /* Save mb in tx_info ring */ + tx_info = &ring->tx_info[index]; + tx_info->mb = mb; + tx_info->nr_txbb = nr_txbb; + tx_info->nr_segs = nr_segs; + /* valid only for non inline segments */ tx_info->data_offset = (void *) data - (void *) tx_desc; - tx_info->linear = (lso_header_size < skb_headlen(skb) && !is_inline(skb, NULL)) ? 
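
The LSO arithmetic above accounts for header replication: a TSO burst of n segments puts the payload on the wire once but the Ethernet/IP/TCP header n times, so the byte counter is charged for the extra copies. Restated with clearer names (same computation as the surrounding code):

	int segsz = mb->m_pkthdr.tso_segsz;
	int payload = mb->m_pkthdr.len - lso_header_size;
	int nsegs = payload / segsz + !!(payload % segsz);	/* ceiling */

	ring->bytes += mb->m_pkthdr.len + (nsegs - 1) * lso_header_size;
	ring->packets += nsegs;
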
1 : 0; - data += skb_shinfo(skb)->nr_frags + tx_info->linear - 1; - - if (!is_inline(skb, &fragptr)) { - /* Map fragments */ - for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) { - frag = &skb_shinfo(skb)->frags[i]; - dma = pci_map_page(mdev->dev->pdev, frag->page, frag->page_offset, - frag->size, PCI_DMA_TODEVICE); + if (!is_inline(mb)) { + for (i = 0, m = mb; i < nr_segs; i++, m = m->m_next) { + if (m->m_len == 0) { + i--; + continue; + } + dma = pci_map_single(mdev->dev->pdev, m->m_data, + m->m_len, PCI_DMA_TODEVICE); data->addr = cpu_to_be64(dma); data->lkey = cpu_to_be32(mdev->mr.key); wmb(); - data->byte_count = cpu_to_be32(frag->size); - --data; + data->byte_count = cpu_to_be32(m->m_len); + data++; } - - /* Map linear part */ - if (tx_info->linear) { - dma = pci_map_single(mdev->dev->pdev, skb->data + lso_header_size, - skb_headlen(skb) - lso_header_size, PCI_DMA_TODEVICE); - data->addr = cpu_to_be64(dma); - data->lkey = cpu_to_be32(mdev->mr.key); - wmb(); - data->byte_count = cpu_to_be32(skb_headlen(skb) - lso_header_size); + if (lso_header_size) { + mb->m_data -= lso_header_size; + mb->m_len += lso_header_size; } tx_info->inl = 0; } else { - build_inline_wqe(tx_desc, skb, real_size, &vlan_tag, tx_ind, fragptr); + build_inline_wqe(tx_desc, mb, real_size, &vlan_tag, tx_ind); tx_info->inl = 1; } @@ -856,10 +879,6 @@ if (bounce) tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size); - /* Run destructor before passing skb to HW */ - if (likely(!skb_shared(skb))) - skb_orphan(skb); - if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && !vlan_tag) { *(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn; op_own |= htonl((bf_index & 0xffff) << 8); @@ -884,16 +903,133 @@ wmb(); writel(ring->doorbell_qpn, ring->bf.uar->map + MLX4_SEND_DOORBELL); } - dev->trans_start = jiffies; - /* Poll CQ here */ - mlx4_en_xmit_poll(priv, tx_ind); - return 0; tx_drop: - dev_kfree_skb_any(skb); - priv->stats.tx_dropped++; - return NETDEV_TX_OK; + *mbp = NULL; + m_freem(mb); + ring->errors++; + return EINVAL; } + +static int +mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_tx_ring *ring; + struct mbuf *next; + int enqueued, err = 0; + + ring = &priv->tx_ring[tx_ind]; + if ((dev->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING || priv->port_up == 0) { + if (m != NULL) + err = drbr_enqueue(dev, ring->br, m); + return (err); + } + + enqueued = 0; + if (m == NULL) { + next = drbr_dequeue(dev, ring->br); + } else if (drbr_needs_enqueue(dev, ring->br)) { + if ((err = drbr_enqueue(dev, ring->br, m)) != 0) + return (err); + next = drbr_dequeue(dev, ring->br); + } else + next = m; + + /* Process the queue */ + while (next != NULL) { + if ((err = mlx4_en_xmit(dev, tx_ind, &next)) != 0) { + if (next != NULL) + err = drbr_enqueue(dev, ring->br, next); + break; + } + enqueued++; + drbr_stats_update(dev, next->m_pkthdr.len, next->m_flags); + /* Send a copy of the frame to the BPF listener */ + ETHER_BPF_MTAP(dev, next); + if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) + break; + next = drbr_dequeue(dev, ring->br); + } + + if (enqueued > 0) + ring->watchdog_time = ticks; + + return (err); +} + +void +mlx4_en_tx_que(void *context, int pending) +{ + struct mlx4_en_tx_ring *ring; + struct mlx4_en_priv *priv; + struct net_device *dev; + struct mlx4_en_cq *cq; + int tx_ind; + + cq = context; + dev = cq->dev; + priv = dev->if_softc; + tx_ind = cq->ring; + ring = &priv->tx_ring[tx_ind]; + if 
(dev->if_drv_flags & IFF_DRV_RUNNING) { + mlx4_en_xmit_poll(priv, tx_ind); + spin_lock(&ring->tx_lock); + if (!drbr_empty(dev, ring->br)) + mlx4_en_transmit_locked(dev, tx_ind, NULL); + spin_unlock(&ring->tx_lock); + } +} + +int +mlx4_en_transmit(struct ifnet *dev, struct mbuf *m) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_tx_ring *ring; + struct mlx4_en_cq *cq; + int i = 0, err = 0; + + /* Which queue to use */ + if ((m->m_flags & (M_FLOWID | M_VLANTAG)) == M_FLOWID) + i = m->m_pkthdr.flowid % (MLX4_EN_NUM_HASH_RINGS - 1); + else + i = mlx4_en_select_queue(dev, m); + + ring = &priv->tx_ring[i]; + + if (spin_trylock(&ring->tx_lock)) { + err = mlx4_en_transmit_locked(dev, i, m); + spin_unlock(&ring->tx_lock); + /* Poll CQ here */ + mlx4_en_xmit_poll(priv, i); + } else { + err = drbr_enqueue(dev, ring->br, m); + cq = &priv->tx_cq[i]; + taskqueue_enqueue(cq->tq, &cq->cq_task); + } + + return (err); +} + +/* + * Flush ring buffers. + */ +void +mlx4_en_qflush(struct ifnet *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_tx_ring *ring = priv->tx_ring; + struct mbuf *m; + + for (int i = 0; i < priv->tx_ring_num; i++, ring++) { + spin_lock(&ring->tx_lock); + while ((m = buf_ring_dequeue_sc(ring->br)) != NULL) + m_freem(m); + spin_unlock(&ring->tx_lock); + } + if_qflush(dev); +} Index: sys/ofed/drivers/net/mlx4/icm.h =================================================================== --- sys/ofed/drivers/net/mlx4/icm.h (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/icm.h (.../head) (revision 219811) @@ -71,12 +71,6 @@ gfp_t gfp_mask, int coherent); void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent); -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); -void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); -int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end); -void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end); int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, u64 virt, int obj_size, int nobj, int reserved, int use_lowmem, int use_coherent); Index: sys/ofed/drivers/net/mlx4/fw.h =================================================================== --- sys/ofed/drivers/net/mlx4/fw.h (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/fw.h (.../head) (revision 219811) @@ -180,8 +180,6 @@ int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic); int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt); int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages); -int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); -int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); int mlx4_NOP(struct mlx4_dev *dev); int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg); Index: sys/ofed/drivers/net/mlx4/main.c =================================================================== --- sys/ofed/drivers/net/mlx4/main.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/main.c (.../head) (revision 219811) @@ -128,7 +128,7 @@ MODULE_PARM_DESC(log_num_mtt, "log maximum number of memory translation table segments per HCA"); -static int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); +static int log_mtts_per_seg = 0; module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)"); @@ -259,7 +259,7 @@ if 
(dev_cap->min_page_sz > PAGE_SIZE) { mlx4_err(dev, "HCA minimum page size of %d bigger than " - "kernel PAGE_SIZE of %ld, aborting.\n", + "kernel PAGE_SIZE of %d, aborting.\n", dev_cap->min_page_sz, PAGE_SIZE); return -ENODEV; } @@ -1647,6 +1647,8 @@ return -1; } + if (log_mtts_per_seg == 0) + log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) { printk(KERN_WARNING "mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg); return -1; @@ -1683,5 +1685,20 @@ destroy_workqueue(mlx4_wq); } -module_init(mlx4_init); +module_init_order(mlx4_init, SI_ORDER_MIDDLE); module_exit(mlx4_cleanup); + +#undef MODULE_VERSION +#include <sys/module.h> +static int +mlx4_evhand(module_t mod, int event, void *arg) +{ + return (0); +} + +static moduledata_t mlx4_mod = { + .name = "mlx4", + .evhand = mlx4_evhand, +}; +MODULE_VERSION(mlx4, 1); +DECLARE_MODULE(mlx4, mlx4_mod, SI_SUB_SMP, SI_ORDER_ANY); Index: sys/ofed/drivers/net/mlx4/en_cq.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_cq.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/en_cq.c (.../head) (revision 219811) @@ -31,12 +31,12 @@ * */ +#include "mlx4_en.h" + +#include #include #include -#include "mlx4_en.h" - static void mlx4_en_cq_event(struct mlx4_cq *cq, enum mlx4_event event) { return; } @@ -55,14 +55,20 @@ cq->buf_size = cq->size * sizeof(struct mlx4_cqe); cq->vector = (ring + priv->port) % mdev->dev->caps.num_comp_vectors; + TASK_INIT(&cq->cq_task, 0, mlx4_en_rx_que, cq); } else { cq->buf_size = sizeof(struct mlx4_cqe); cq->vector = MLX4_LEAST_ATTACHED_VECTOR; + TASK_INIT(&cq->cq_task, 0, mlx4_en_tx_que, cq); } + cq->tq = taskqueue_create_fast("mlx4_en_que", M_NOWAIT, + taskqueue_thread_enqueue, &cq->tq); + taskqueue_start_threads(&cq->tq, 1, PI_NET, "%s cq", + if_name(priv->dev)); cq->ring = ring; cq->is_tx = mode; - spin_lock_init(&cq->lock); + mtx_init(&cq->lock.m, "mlx4 cq", NULL, MTX_DEF); err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres, cq->buf_size, 2 * PAGE_SIZE); @@ -105,9 +111,6 @@ init_timer(&cq->timer); cq->timer.function = mlx4_en_poll_tx_cq; cq->timer.data = (unsigned long) cq; - } else { - netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq, 64); - napi_enable(&cq->napi); } return 0; @@ -117,22 +120,22 @@ { struct mlx4_en_dev *mdev = priv->mdev; + taskqueue_drain(cq->tq, &cq->cq_task); + taskqueue_free(cq->tq); mlx4_en_unmap_buffer(&cq->wqres.buf); mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size); cq->buf_size = 0; cq->buf = NULL; + mtx_destroy(&cq->lock.m); } void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) { struct mlx4_en_dev *mdev = priv->mdev; + taskqueue_drain(cq->tq, &cq->cq_task); if (cq->is_tx) del_timer(&cq->timer); - else { - napi_disable(&cq->napi); - netif_napi_del(&cq->napi); - } mlx4_cq_free(mdev->dev, &cq->mcq); } Index: sys/ofed/drivers/net/mlx4/alloc.c =================================================================== --- sys/ofed/drivers/net/mlx4/alloc.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/alloc.c (.../head) (revision 219811) @@ -207,6 +207,7 @@ { dma_addr_t t; + buf->direct.buf = NULL; if (size <= max_direct) { buf->nbufs = 1; buf->npages = 1; @@ -228,6 +229,7 @@ int i; buf->direct.buf = NULL; + buf->direct.map = 0; buf->nbufs = (size + PAGE_SIZE - 1) / PAGE_SIZE; buf->npages = buf->nbufs; buf->page_shift = PAGE_SHIFT; @@ -289,6 +291,7 @@ buf->page_list[i].map); kfree(buf->page_list); } + buf->direct.buf = NULL; } EXPORT_SYMBOL_GPL(mlx4_buf_free); 
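
[Editor's note] The en_tx.c hunks above convert mlx4_en_xmit() from the Linux sk_buff entry point to an mbuf-based path and replace ndo_start_xmit with FreeBSD's if_transmit/if_qflush model: one buf_ring per TX ring, a per-ring lock taken with trylock in mlx4_en_transmit(), and a per-CQ taskqueue (mlx4_en_tx_que) that drains traffic deferred under contention. The sketch below shows the same generic pattern, not the driver's code; struct xx_ring, xx_encap(), and the single-ring softc are hypothetical placeholders.

/*
 * Minimal if_transmit sketch: buf_ring + trylock, taskqueue fallback.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/ethernet.h>

struct xx_ring {
	struct mtx		 tx_lock;	/* protects the hardware ring */
	struct buf_ring		*br;		/* software staging queue */
	struct task		 tx_task;	/* runs xx_drain_ring() */
	struct taskqueue	*tq;
};

/* Hypothetical: post one mbuf chain to hardware, consuming it on success. */
static int
xx_encap(struct xx_ring *r, struct mbuf **mp)
{

	m_freem(*mp);		/* stand-in for a posted TX descriptor */
	*mp = NULL;
	return (0);
}

static void
xx_drain_ring(struct ifnet *ifp, struct xx_ring *r)
{
	struct mbuf *m;

	mtx_assert(&r->tx_lock, MA_OWNED);
	while ((m = drbr_dequeue(ifp, r->br)) != NULL) {
		ETHER_BPF_MTAP(ifp, m);	/* tap while we still own the mbuf */
		if (xx_encap(r, &m) != 0) {
			/* Descriptors exhausted: park the mbuf, stop. */
			if (m != NULL)
				drbr_enqueue(ifp, r->br, m);
			break;
		}
	}
}

static int
xx_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct xx_ring *r = ifp->if_softc;	/* one ring, for brevity */
	int error;

	error = drbr_enqueue(ifp, r->br, m);	/* frees m on ENOBUFS */
	if (mtx_trylock(&r->tx_lock)) {
		xx_drain_ring(ifp, r);
		mtx_unlock(&r->tx_lock);
	} else {
		/* Lock owner may have missed our mbuf; let the task drain. */
		taskqueue_enqueue(r->tq, &r->tx_task);
	}
	return (error);
}

The trylock-or-defer step is the point of the design: a contended sender never spins on the ring lock, it just leaves the mbuf on the buf_ring and kicks the taskqueue, which is exactly what mlx4_en_transmit() does above with cq->tq and cq->cq_task.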
Index: sys/ofed/drivers/net/mlx4/en_resources.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_resources.c (.../base) (revision 219811) +++ sys/ofed/drivers/net/mlx4/en_resources.c (.../head) (revision 219811) @@ -69,7 +69,7 @@ struct page **pages; int i; - if (BITS_PER_LONG == 64 || buf->nbufs == 1) + if (buf->direct.buf != NULL || buf->nbufs == 1) return 0; pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL); @@ -89,10 +89,11 @@ void mlx4_en_unmap_buffer(struct mlx4_buf *buf) { - if (BITS_PER_LONG == 64 || buf->nbufs == 1) + if (buf->direct.buf != NULL || buf->nbufs == 1) return; vunmap(buf->direct.buf); + buf->direct.buf = NULL; } void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event) Property changes on: sys/ofed ___________________________________________________________________ Added: svn:mergeinfo Merged /user/mav/ata/sys/ofed:r189793-190578 Merged /projects/cambria/sys/ofed:r186008-186350 Merged /user/piso/sys/ofed:r186543,186723,186725-186726,186742,186770-186771,186774,186777-186779,187984-187985,190555,190572,190589,190592,190614,190625,190830 Merged /projects/quota64/sys/ofed:r184125-207707 Merged /user/piso/ipfw/sys/ofed:r190918,190921,190923,190926 Merged /head/sys/contrib/dev/acpica/ofed:r207340 Merged /projects/ofed/base/sys/ofed:r207767-219808 Merged /head/sys/ofed:r2-4 Merged /user/thompsa/usb/sys/ofed:r187190 Merged /user/dfr/xenhvm/6/sys/ofed:r189304,189451 Merged /user/peter/kinfo/sys/ofed:r185413-185547 Merged /user/dfr/xenhvm/7/sys/ofed:r188574-189614 Index: sys/dev/hptmv/hptproc.c =================================================================== --- sys/dev/hptmv/hptproc.c (.../base) (revision 219811) +++ sys/dev/hptmv/hptproc.c (.../head) (revision 219811) @@ -51,8 +51,8 @@ static char hptproc_buffer[256]; extern char DRIVER_VERSION[]; -#define FORMAL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, int arg2, \ - struct sysctl_req *req +#define FORMAL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, \ + intptr_t arg2, struct sysctl_req *req #define REAL_HANDLER_ARGS oidp, arg1, arg2, req typedef struct sysctl_req HPT_GET_INFO; Index: sys/vm/vm_map.c =================================================================== --- sys/vm/vm_map.c (.../base) (revision 219811) +++ sys/vm/vm_map.c (.../head) (revision 219811) @@ -2324,7 +2324,11 @@ unsigned int last_timestamp; int rv; boolean_t fictitious, need_wakeup, result, user_wire; + vm_prot_t prot; + prot = 0; + if (flags & VM_MAP_WIRE_WRITE) + prot |= VM_PROT_WRITE; user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); @@ -2392,20 +2396,17 @@ * above.) 
*/ entry->eflags |= MAP_ENTRY_IN_TRANSITION; - /* - * - */ + if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 + || (entry->protection & prot) != prot) { + entry->eflags |= MAP_ENTRY_WIRE_SKIPPED; + if ((flags & VM_MAP_WIRE_HOLESOK) == 0) { + end = entry->end; + rv = KERN_INVALID_ADDRESS; + goto done; + } + goto next_entry; + } if (entry->wired_count == 0) { - if ((entry->protection & (VM_PROT_READ|VM_PROT_EXECUTE)) - == 0) { - entry->eflags |= MAP_ENTRY_WIRE_SKIPPED; - if ((flags & VM_MAP_WIRE_HOLESOK) == 0) { - end = entry->end; - rv = KERN_INVALID_ADDRESS; - goto done; - } - goto next_entry; - } entry->wired_count++; saved_start = entry->start; saved_end = entry->end; Index: sys/vm/vm_map.h =================================================================== --- sys/vm/vm_map.h (.../base) (revision 219811) +++ sys/vm/vm_map.h (.../head) (revision 219811) @@ -346,6 +346,8 @@ #define VM_MAP_WIRE_NOHOLES 0 /* region must not have holes */ #define VM_MAP_WIRE_HOLESOK 2 /* region may have holes */ +#define VM_MAP_WIRE_WRITE 4 /* Validate writable. */ + #ifdef _KERNEL boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t); vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t); Index: sys/vm/uma_core.c =================================================================== --- sys/vm/uma_core.c (.../base) (revision 219811) +++ sys/vm/uma_core.c (.../head) (revision 219811) @@ -112,7 +112,7 @@ static uma_zone_t hashzone; /* The boot-time adjusted value for cache line alignment. */ -static int uma_align_cache = 64 - 1; +int uma_align_cache = 64 - 1; static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); Index: sys/net/if.c =================================================================== --- sys/net/if.c (.../base) (revision 219811) +++ sys/net/if.c (.../head) (revision 219811) @@ -1881,6 +1881,11 @@ void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ +struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); +struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); +int (*vlan_tag_p)(struct ifnet *, uint16_t *); +int (*vlan_setcookie_p)(struct ifnet *, void *); +void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs @@ -1935,6 +1940,7 @@ if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); + EVENTHANDLER_INVOKE(ifnet_link_event, ifp, ifp->if_link_state); CURVNET_RESTORE(); } Index: sys/net/if_types.h =================================================================== --- sys/net/if_types.h (.../base) (revision 219811) +++ sys/net/if_types.h (.../head) (revision 219811) @@ -238,6 +238,7 @@ #define IFT_ATMVCIENDPT 0xc2 /* ATM VCI End Point */ #define IFT_OPTICALCHANNEL 0xc3 /* Optical Channel */ #define IFT_OPTICALTRANSPORT 0xc4 /* Optical Transport */ +#define IFT_INFINIBAND 0xc7 /* Infiniband */ #define IFT_BRIDGE 0xd1 /* Transparent bridge interface */ #define IFT_STF 0xd7 /* 6to4 interface */ Index: sys/net/if_vlan_var.h =================================================================== --- sys/net/if_vlan_var.h (.../base) (revision 219811) +++ sys/net/if_vlan_var.h (.../head) (revision 219811) @@ -131,7 +131,25 @@ (*vlan_trunk_cap_p)(_ifp); \ } while (0) +#define VLAN_TRUNKDEV(_ifp) \ + (_ifp)->if_type == IFT_L2VLAN ? (*vlan_trunkdev_p)((_ifp)) : NULL +#define VLAN_TAG(_ifp, _tag) \ + (_ifp)->if_type == IFT_L2VLAN ? 
(*vlan_tag_p)((_ifp), (_tag)) : EINVAL +#define VLAN_COOKIE(_ifp) \ + (_ifp)->if_type == IFT_L2VLAN ? (*vlan_cookie_p)((_ifp)) : NULL +#define VLAN_SETCOOKIE(_ifp, _cookie) \ + (_ifp)->if_type == IFT_L2VLAN ? \ + (*vlan_setcookie_p)((_ifp), (_cookie)) : EINVAL +#define VLAN_DEVAT(_ifp, _tag) \ + (_ifp)->if_vlantrunk != NULL ? (*vlan_devat_p)((_ifp), (_tag)) : NULL + extern void (*vlan_trunk_cap_p)(struct ifnet *); +extern struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); +extern struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); +extern int (*vlan_tag_p)(struct ifnet *, uint16_t *); +extern int (*vlan_setcookie_p)(struct ifnet *, void *); +extern void *(*vlan_cookie_p)(struct ifnet *); + #endif /* _KERNEL */ #endif /* _NET_IF_VLAN_VAR_H_ */ Index: sys/net/if_arp.h =================================================================== --- sys/net/if_arp.h (.../base) (revision 219811) +++ sys/net/if_arp.h (.../head) (revision 219811) @@ -50,6 +50,7 @@ #define ARPHRD_ARCNET 7 /* arcnet hardware format */ #define ARPHRD_FRELAY 15 /* frame relay hardware format */ #define ARPHRD_IEEE1394 24 /* firewire hardware format */ +#define ARPHRD_INFINIBAND 32 /* infiniband hardware format */ u_short ar_pro; /* format of protocol address */ u_char ar_hln; /* length of hardware address */ u_char ar_pln; /* length of protocol address */ Index: sys/net/if_vlan.c =================================================================== --- sys/net/if_vlan.c (.../base) (revision 219811) +++ sys/net/if_vlan.c (.../head) (revision 219811) @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -90,13 +91,14 @@ }; struct vlan_mc_entry { - struct ether_addr mc_addr; + struct sockaddr_dl mc_addr; SLIST_ENTRY(vlan_mc_entry) mc_entries; }; struct ifvlan { struct ifvlantrunk *ifv_trunk; struct ifnet *ifv_ifp; + void *ifv_cookie; #define TRUNK(ifv) ((ifv)->ifv_trunk) #define PARENT(ifv) ((ifv)->ifv_trunk->parent) int ifv_pflags; /* special flags we have set on parent */ @@ -153,12 +155,12 @@ * however on practice it does not. Probably this is because array * is too big to fit into CPU cache. 
*/ -static struct mtx ifv_mtx; -#define VLAN_LOCK_INIT() mtx_init(&ifv_mtx, "vlan_global", NULL, MTX_DEF) -#define VLAN_LOCK_DESTROY() mtx_destroy(&ifv_mtx) -#define VLAN_LOCK_ASSERT() mtx_assert(&ifv_mtx, MA_OWNED) -#define VLAN_LOCK() mtx_lock(&ifv_mtx) -#define VLAN_UNLOCK() mtx_unlock(&ifv_mtx) +static struct sx ifv_lock; +#define VLAN_LOCK_INIT() sx_init(&ifv_lock, "vlan_global") +#define VLAN_LOCK_DESTROY() sx_destroy(&ifv_lock) +#define VLAN_LOCK_ASSERT() sx_assert(&ifv_lock, SA_LOCKED) +#define VLAN_LOCK() sx_xlock(&ifv_lock) +#define VLAN_UNLOCK() sx_xunlock(&ifv_lock) #define TRUNK_LOCK_INIT(trunk) rw_init(&(trunk)->rw, VLANNAME) #define TRUNK_LOCK_DESTROY(trunk) rw_destroy(&(trunk)->rw) #define TRUNK_LOCK(trunk) rw_wlock(&(trunk)->rw) @@ -386,6 +388,47 @@ } } #endif /* 0 */ +#else + +static __inline struct ifvlan * +vlan_gethash(struct ifvlantrunk *trunk, uint16_t tag) +{ + + return trunk->vlans[tag]; +} + +static __inline int +vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) +{ + + if (trunk->vlans[ifv->ifv_tag] != NULL) + return EEXIST; + trunk->vlans[ifv->ifv_tag] = ifv; + trunk->refcnt++; + + return (0); +} + +static __inline int +vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) +{ + + trunk->vlans[ifv->ifv_tag] = NULL; + trunk->refcnt--; + + return (0); +} + +static __inline void +vlan_freehash(struct ifvlantrunk *trunk) +{ +} + +static __inline void +vlan_inithash(struct ifvlantrunk *trunk) +{ +} + #endif /* !VLAN_ARRAY */ static void @@ -394,9 +437,7 @@ VLAN_LOCK_ASSERT(); TRUNK_LOCK(trunk); -#ifndef VLAN_ARRAY vlan_freehash(trunk); -#endif trunk->parent->if_vlantrunk = NULL; TRUNK_UNLOCK(trunk); TRUNK_LOCK_DESTROY(trunk); @@ -421,7 +462,6 @@ struct ifmultiaddr *ifma, *rifma = NULL; struct ifvlan *sc; struct vlan_mc_entry *mc; - struct sockaddr_dl sdl; int error; /*VLAN_LOCK_ASSERT();*/ @@ -432,17 +472,9 @@ CURVNET_SET_QUIET(ifp_p->if_vnet); - bzero((char *)&sdl, sizeof(sdl)); - sdl.sdl_len = sizeof(sdl); - sdl.sdl_family = AF_LINK; - sdl.sdl_index = ifp_p->if_index; - sdl.sdl_type = IFT_ETHER; - sdl.sdl_alen = ETHER_ADDR_LEN; - /* First, remove any existing filter entries. */ while ((mc = SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { - bcopy((char *)&mc->mc_addr, LLADDR(&sdl), ETHER_ADDR_LEN); - error = if_delmulti(ifp_p, (struct sockaddr *)&sdl); + error = if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr); if (error) return (error); SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); @@ -456,12 +488,11 @@ mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT); if (mc == NULL) return (ENOMEM); - bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), - (char *)&mc->mc_addr, ETHER_ADDR_LEN); + bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); + mc->mc_addr.sdl_index = ifp_p->if_index; SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); - bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), - LLADDR(&sdl), ETHER_ADDR_LEN); - error = if_addmulti(ifp_p, (struct sockaddr *)&sdl, &rifma); + error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr, + &rifma); if (error) return (error); } @@ -503,7 +534,8 @@ LIST_FOREACH_SAFE(ifv, &ifp->if_vlantrunk->hash[i], ifv_list, next) { #endif /* VLAN_ARRAY */ VLAN_UNLOCK(); - if_setlladdr(ifv->ifv_ifp, IF_LLADDR(ifp), ETHER_ADDR_LEN); + if_setlladdr(ifv->ifv_ifp, IF_LLADDR(ifp), + ifp->if_addrlen); VLAN_LOCK(); } VLAN_UNLOCK(); @@ -564,6 +596,92 @@ } /* + * Return the trunk device for a virtual interface. 
+ */ +static struct ifnet * +vlan_trunkdev(struct ifnet *ifp) +{ + struct ifvlan *ifv; + + if (ifp->if_type != IFT_L2VLAN) + return (NULL); + ifv = ifp->if_softc; + ifp = NULL; + VLAN_LOCK(); + if (ifv->ifv_trunk) + ifp = PARENT(ifv); + VLAN_UNLOCK(); + return (ifp); +} + +/* + * Return the 16bit vlan tag for this interface. + */ +static int +vlan_tag(struct ifnet *ifp, uint16_t *tagp) +{ + struct ifvlan *ifv; + + if (ifp->if_type != IFT_L2VLAN) + return (EINVAL); + ifv = ifp->if_softc; + *tagp = ifv->ifv_tag; + return (0); +} + +/* + * Return a driver specific cookie for this interface. Synchronization + * with setcookie must be provided by the driver. + */ +static void * +vlan_cookie(struct ifnet *ifp) +{ + struct ifvlan *ifv; + + if (ifp->if_type != IFT_L2VLAN) + return (NULL); + ifv = ifp->if_softc; + return (ifv->ifv_cookie); +} + +/* + * Store a cookie in our softc that drivers can use to store driver + * private per-instance data in. + */ +static int +vlan_setcookie(struct ifnet *ifp, void *cookie) +{ + struct ifvlan *ifv; + + if (ifp->if_type != IFT_L2VLAN) + return (EINVAL); + ifv = ifp->if_softc; + ifv->ifv_cookie = cookie; + return (0); +} + +/* + * Return the vlan device present at the specific tag. + */ +static struct ifnet * +vlan_devat(struct ifnet *ifp, uint16_t tag) +{ + struct ifvlantrunk *trunk; + struct ifvlan *ifv; + + trunk = ifp->if_vlantrunk; + if (trunk == NULL) + return (NULL); + ifp = NULL; + TRUNK_RLOCK(trunk); + ifv = vlan_gethash(trunk, tag); + if (ifv) + ifp = ifv->ifv_ifp; + TRUNK_RUNLOCK(trunk); + return (ifp); +} + +/* * VLAN support can be loaded as a module. The only place in the * system that's intimately aware of this is ether_input. We hook * into this code through vlan_input_p which is defined there and @@ -593,6 +711,11 @@ vlan_input_p = vlan_input; vlan_link_state_p = vlan_link_state; vlan_trunk_cap_p = vlan_trunk_capabilities; + vlan_trunkdev_p = vlan_trunkdev; + vlan_cookie_p = vlan_cookie; + vlan_setcookie_p = vlan_setcookie; + vlan_tag_p = vlan_tag; + vlan_devat_p = vlan_devat; #ifndef VIMAGE if_clone_attach(&vlan_cloner); #endif @@ -615,6 +738,11 @@ vlan_input_p = NULL; vlan_link_state_p = NULL; vlan_trunk_cap_p = NULL; + vlan_trunkdev_p = NULL; + vlan_tag_p = NULL; + vlan_cookie_p = vlan_cookie; + vlan_setcookie_p = vlan_setcookie; + vlan_devat_p = NULL; VLAN_LOCK_DESTROY(); if (bootverbose) printf("vlan: unloaded\n"); @@ -665,7 +793,12 @@ /* Check for . style interface names. */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - if (ifp->if_type != IFT_ETHER) + /* + * We can handle non-ethernet hardware types as long as + * they handle the tagging and headers themselves. + */ + if (ifp->if_type != IFT_ETHER && + (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) continue; if (strncmp(ifp->if_xname, name, strlen(ifp->if_xname)) != 0) continue; @@ -916,7 +1049,7 @@ * devices that just discard such runts instead or mishandle * them somehow. 
*/ - if (soft_pad) { + if (soft_pad && p->if_type == IFT_ETHER) { static char pad[8]; /* just zeros */ int n; @@ -1020,11 +1153,7 @@ } TRUNK_RLOCK(trunk); -#ifdef VLAN_ARRAY - ifv = trunk->vlans[tag]; -#else ifv = vlan_gethash(trunk, tag); -#endif if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { TRUNK_RUNLOCK(trunk); m_freem(m); @@ -1050,7 +1179,8 @@ /* VID numbers 0x0 and 0xFFF are reserved */ if (tag == 0 || tag == 0xFFF) return (EINVAL); - if (p->if_type != IFT_ETHER) + if (p->if_type != IFT_ETHER && + (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) return (EPROTONOSUPPORT); if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS) return (EPROTONOSUPPORT); @@ -1060,15 +1190,11 @@ if (p->if_vlantrunk == NULL) { trunk = malloc(sizeof(struct ifvlantrunk), M_VLAN, M_WAITOK | M_ZERO); -#ifndef VLAN_ARRAY vlan_inithash(trunk); -#endif VLAN_LOCK(); if (p->if_vlantrunk != NULL) { /* A race that that is very unlikely to be hit. */ -#ifndef VLAN_ARRAY vlan_freehash(trunk); -#endif free(trunk, M_VLAN); goto exists; } @@ -1084,18 +1210,9 @@ } ifv->ifv_tag = tag; /* must set this before vlan_inshash() */ -#ifdef VLAN_ARRAY - if (trunk->vlans[tag] != NULL) { - error = EEXIST; - goto done; - } - trunk->vlans[tag] = ifv; - trunk->refcnt++; -#else error = vlan_inshash(trunk, ifv); if (error) goto done; -#endif ifv->ifv_proto = ETHERTYPE_VLAN; ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN; ifv->ifv_mintu = ETHERMIN; @@ -1125,8 +1242,19 @@ ifv->ifv_trunk = trunk; ifp = ifv->ifv_ifp; + /* + * Initialize fields from our parent. This duplicates some + * work with ether_ifattach() but allows for non-ethernet + * interfaces to also work. + */ ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge; ifp->if_baudrate = p->if_baudrate; + ifp->if_output = p->if_output; + ifp->if_input = p->if_input; + ifp->if_resolvemulti = p->if_resolvemulti; + ifp->if_addrlen = p->if_addrlen; + ifp->if_broadcastaddr = p->if_broadcastaddr; + /* * Copy only a selected subset of flags from the parent. * Other flags are none of our business. @@ -1141,10 +1269,12 @@ vlan_capabilities(ifv); /* - * Set up our ``Ethernet address'' to reflect the underlying + * Set up our interface address to reflect the underlying * physical interface's. */ - bcopy(IF_LLADDR(p), IF_LLADDR(ifp), ETHER_ADDR_LEN); + bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen); + ((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen = + p->if_addrlen; /* * Configure multicast addresses that may already be @@ -1187,7 +1317,6 @@ parent = NULL; if (trunk != NULL) { - struct sockaddr_dl sdl; TRUNK_LOCK(trunk); parent = trunk->parent; @@ -1197,17 +1326,7 @@ * empty the list of multicast groups that we may have joined * while we were alive from the parent's list. */ - bzero((char *)&sdl, sizeof(sdl)); - sdl.sdl_len = sizeof(sdl); - sdl.sdl_family = AF_LINK; - sdl.sdl_index = parent->if_index; - sdl.sdl_type = IFT_ETHER; - sdl.sdl_alen = ETHER_ADDR_LEN; - while ((mc = SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { - bcopy((char *)&mc->mc_addr, LLADDR(&sdl), - ETHER_ADDR_LEN); - /* * This may fail if the parent interface is * being detached. Regardless, we should do a @@ -1215,18 +1334,14 @@ * as possible as all callers expect vlan * destruction to succeed. 
*/ - (void)if_delmulti(parent, (struct sockaddr *)&sdl); + (void)if_delmulti(parent, + (struct sockaddr *)&mc->mc_addr); SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); free(mc, M_VLAN); } vlan_setflags(ifp, 0); /* clear special flags on parent */ -#ifdef VLAN_ARRAY - trunk->vlans[ifv->ifv_tag] = NULL; - trunk->refcnt--; -#else vlan_remhash(trunk, ifv); -#endif ifv->ifv_trunk = NULL; /* @@ -1407,14 +1522,31 @@ { struct ifnet *p; struct ifreq *ifr; + struct ifaddr *ifa; struct ifvlan *ifv; struct vlanreq vlr; int error = 0; ifr = (struct ifreq *)data; + ifa = (struct ifaddr *) data; ifv = ifp->if_softc; switch (cmd) { + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; +#ifdef INET + if (ifa->ifa_addr->sa_family == AF_INET) + arp_ifinit(ifp, ifa); +#endif + break; + case SIOCGIFADDR: + { + struct sockaddr *sa; + + sa = (struct sockaddr *)&ifr->ifr_data; + bcopy(IF_LLADDR(ifp), sa->sa_data, ifp->if_addrlen); + } + break; case SIOCGIFMEDIA: VLAN_LOCK(); if (TRUNK(ifv) != NULL) { @@ -1534,7 +1666,8 @@ break; default: - error = ether_ioctl(ifp, cmd, data); + error = EINVAL; + break; } return (error); Index: sys/net/if_llatbl.h =================================================================== --- sys/net/if_llatbl.h (.../base) (revision 219811) +++ sys/net/if_llatbl.h (.../head) (revision 219811) @@ -30,6 +30,8 @@ #ifndef _NET_IF_LLATBL_H_ #define _NET_IF_LLATBL_H_ +#include "opt_ofed.h" + #include #include @@ -72,6 +74,9 @@ union { uint64_t mac_aligned; uint16_t mac16[3]; +#ifdef OFED + uint8_t mac8[20]; /* IB needs 20 bytes. */ +#endif } ll_addr; /* XXX af-private? */ Index: sys/net/if_var.h =================================================================== --- sys/net/if_var.h (.../base) (revision 219811) +++ sys/net/if_var.h (.../head) (revision 219811) @@ -352,6 +352,9 @@ /* interface departure event */ typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t); +/* Interface link state change event */ +typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int); +EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t); /* * interface groups Index: sys/i386/include/endian.h =================================================================== --- sys/i386/include/endian.h (.../base) (revision 219811) +++ sys/i386/include/endian.h (.../head) (revision 219811) @@ -69,50 +69,59 @@ #if defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE_BUILTIN_CONSTANT_P) -#define __byte_swap_int_var(x) \ -__extension__ ({ register __uint32_t __X = (x); \ - __asm ("bswap %0" : "+r" (__X)); \ - __X; }) +#define __bswap64_const(_x) \ + (((_x) >> 56) | \ + (((_x) >> 40) & (0xffULL << 8)) | \ + (((_x) >> 24) & (0xffULL << 16)) | \ + (((_x) >> 8) & (0xffULL << 24)) | \ + (((_x) << 8) & (0xffULL << 32)) | \ + (((_x) << 24) & (0xffULL << 40)) | \ + (((_x) << 40) & (0xffULL << 48)) | \ + ((_x) << 56)) -#ifdef __OPTIMIZE__ +#define __bswap32_const(_x) \ + (((_x) >> 24) | \ + (((_x) & (0xff << 16)) >> 8) | \ + (((_x) & (0xff << 8)) << 8) | \ + ((_x) << 24)) -#define __byte_swap_int_const(x) \ - ((((x) & 0xff000000) >> 24) | \ - (((x) & 0x00ff0000) >> 8) | \ - (((x) & 0x0000ff00) << 8) | \ - (((x) & 0x000000ff) << 24)) -#define __byte_swap_int(x) (__builtin_constant_p(x) ? 
\ - __byte_swap_int_const(x) : __byte_swap_int_var(x)) +#define __bswap16_const(_x) (__uint16_t)((_x) << 8 | (_x) >> 8) -#else /* __OPTIMIZE__ */ - -#define __byte_swap_int(x) __byte_swap_int_var(x) - -#endif /* __OPTIMIZE__ */ - static __inline __uint64_t -__bswap64(__uint64_t _x) +__bswap64_var(__uint64_t __x) { - return ((_x >> 56) | ((_x >> 40) & 0xff00) | ((_x >> 24) & 0xff0000) | - ((_x >> 8) & 0xff000000) | ((_x << 8) & ((__uint64_t)0xff << 32)) | - ((_x << 24) & ((__uint64_t)0xff << 40)) | - ((_x << 40) & ((__uint64_t)0xff << 48)) | ((_x << 56))); + return __bswap64_const(__x); } + static __inline __uint32_t -__bswap32(__uint32_t _x) +__bswap32_var(__uint32_t _x) { - return (__byte_swap_int(_x)); + __asm ("bswap %0" : "+r" (_x)); + return (_x); } static __inline __uint16_t -__bswap16(__uint16_t _x) +__bswap16_var(__uint16_t _x) { - return (_x << 8 | _x >> 8); + + return (__bswap16_const(_x)); } +#define __bswap64(_x) \ + (__builtin_constant_p(_x) ? \ + __bswap64_const((__uint64_t)(_x)) : __bswap64_var(_x)) + +#define __bswap32(_x) \ + (__builtin_constant_p(_x) ? \ + __bswap32_const((__uint32_t)(_x)) : __bswap32_var(_x)) + +#define __bswap16(_x) \ + (__builtin_constant_p(_x) ? \ + __bswap16_const((__uint16_t)(_x)) : __bswap16_var(_x)) + #define __htonl(x) __bswap32(x) #define __htons(x) __bswap16(x) #define __ntohl(x) __bswap32(x) Index: sys/netinet6/nd6.c =================================================================== --- sys/netinet6/nd6.c (.../base) (revision 219811) +++ sys/netinet6/nd6.c (.../head) (revision 219811) @@ -2100,6 +2100,7 @@ #ifdef IFT_CARP case IFT_CARP: #endif + case IFT_INFINIBAND: case IFT_GIF: /* XXX need more cases? */ case IFT_PPP: case IFT_TUNNEL: Index: sys/netinet6/nd6_nbr.c =================================================================== --- sys/netinet6/nd6_nbr.c (.../base) (revision 219811) +++ sys/netinet6/nd6_nbr.c (.../head) (revision 219811) @@ -1132,6 +1132,7 @@ #ifdef IFT_CARP case IFT_CARP: #endif + case IFT_INFINIBAND: case IFT_BRIDGE: case IFT_ISO88025: return IF_LLADDR(ifp); @@ -1449,6 +1450,7 @@ #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif + case IFT_INFINIBAND: in6 = ia->ia_addr.sin6_addr; if (in6_get_hw_ifid(ifp, &in6) == 0 && IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { Index: sys/netinet6/in6.c =================================================================== --- sys/netinet6/in6.c (.../base) (revision 219811) +++ sys/netinet6/in6.c (.../head) (revision 219811) @@ -2298,6 +2298,7 @@ #ifdef IFT_MIP case IFT_MIP: /* ditto */ #endif + case IFT_INFINIBAND: return (64); case IFT_FDDI: /* RFC2467 */ return (64); Property changes on: sys/contrib/pf ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys/contrib/pf:r207767-219808 Property changes on: sys/contrib/octeon-sdk ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /head/sys/contrib/octeon-sdk:r210286-216915 Merged /projects/ofed/base/sys/contrib/octeon-sdk:r211851-219808 Property changes on: sys/contrib/x86emu ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys/contrib/x86emu:r207767-219808 Property changes on: sys/contrib/dev/acpica ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys/contrib/dev/acpica:r207767-219808 Property changes on: sys/cddl/contrib/opensolaris 
___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys/cddl/contrib/opensolaris:r207767-219808 Index: sys/amd64/include/endian.h =================================================================== --- sys/amd64/include/endian.h (.../base) (revision 219811) +++ sys/amd64/include/endian.h (.../head) (revision 219811) @@ -69,73 +69,59 @@ #if defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE_BUILTIN_CONSTANT_P) -#define __byte_swap_int_var(x) \ -__extension__ ({ register __uint32_t __X = (x); \ - __asm ("bswap %0" : "+r" (__X)); \ - __X; }) +#define __bswap64_const(_x) \ + (((_x) >> 56) | \ + (((_x) >> 40) & (0xffUL << 8)) | \ + (((_x) >> 24) & (0xffUL << 16)) | \ + (((_x) >> 8) & (0xffUL << 24)) | \ + (((_x) << 8) & (0xffUL << 32)) | \ + (((_x) << 24) & (0xffUL << 40)) | \ + (((_x) << 40) & (0xffUL << 48)) | \ + ((_x) << 56)) -#ifdef __OPTIMIZE__ +#define __bswap32_const(_x) \ + (((_x) >> 24) | \ + (((_x) & (0xff << 16)) >> 8) | \ + (((_x) & (0xff << 8)) << 8) | \ + ((_x) << 24)) -#define __byte_swap_int_const(x) \ - ((((x) & 0xff000000) >> 24) | \ - (((x) & 0x00ff0000) >> 8) | \ - (((x) & 0x0000ff00) << 8) | \ - (((x) & 0x000000ff) << 24)) -#define __byte_swap_int(x) (__builtin_constant_p(x) ? \ - __byte_swap_int_const(x) : __byte_swap_int_var(x)) +#define __bswap16_const(_x) (__uint16_t)((_x) << 8 | (_x) >> 8) -#else /* __OPTIMIZE__ */ - -#define __byte_swap_int(x) __byte_swap_int_var(x) - -#endif /* __OPTIMIZE__ */ - -#define __byte_swap_long_var(x) \ -__extension__ ({ register __uint64_t __X = (x); \ - __asm ("bswap %0" : "+r" (__X)); \ - __X; }) - -#ifdef __OPTIMIZE__ - -#define __byte_swap_long_const(x) \ - (((x >> 56) | \ - ((x >> 40) & 0xff00) | \ - ((x >> 24) & 0xff0000) | \ - ((x >> 8) & 0xff000000) | \ - ((x << 8) & (0xfful << 32)) | \ - ((x << 24) & (0xfful << 40)) | \ - ((x << 40) & (0xfful << 48)) | \ - ((x << 56)))) - -#define __byte_swap_long(x) (__builtin_constant_p(x) ? \ - __byte_swap_long_const(x) : __byte_swap_long_var(x)) - -#else /* __OPTIMIZE__ */ - -#define __byte_swap_long(x) __byte_swap_long_var(x) - -#endif /* __OPTIMIZE__ */ - static __inline __uint64_t -__bswap64(__uint64_t _x) +__bswap64_var(__uint64_t _x) { - return (__byte_swap_long(_x)); + __asm ("bswap %0" : "+r" (_x)); + return (_x); } static __inline __uint32_t -__bswap32(__uint32_t _x) +__bswap32_var(__uint32_t _x) { - return (__byte_swap_int(_x)); + __asm ("bswap %0" : "+r" (_x)); + return (_x); } static __inline __uint16_t -__bswap16(__uint16_t _x) +__bswap16_var(__uint16_t _x) { - return (_x << 8 | _x >> 8); + + return (__bswap16_const(_x)); } +#define __bswap64(_x) \ + (__builtin_constant_p(_x) ? \ + __bswap64_const((__uint64_t)(_x)) : __bswap64_var(_x)) + +#define __bswap32(_x) \ + (__builtin_constant_p(_x) ? \ + __bswap32_const((__uint32_t)(_x)) : __bswap32_var(_x)) + +#define __bswap16(_x) \ + (__builtin_constant_p(_x) ? 
\ + __bswap16_const((__uint16_t)(_x)) : __bswap16_var(_x)) + #define __htonl(x) __bswap32(x) #define __htons(x) __bswap16(x) #define __ntohl(x) __bswap32(x) Property changes on: sys/amd64/include/xen ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys/amd64/include/xen:r207767-219808 Index: sys/amd64/conf/GENERIC =================================================================== --- sys/amd64/conf/GENERIC (.../base) (revision 219811) +++ sys/amd64/conf/GENERIC (.../head) (revision 219811) @@ -59,6 +59,7 @@ options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) +device hwpmc options AUDIT # Security event auditing options MAC # TrustedBSD MAC Framework #options KDTRACE_FRAME # Ensure frames are compiled in @@ -74,6 +75,7 @@ options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS options WITNESS # Enable checks to detect deadlocks and cycles options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed +options ALT_BREAK_TO_DEBUGGER options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones # Make an SMP-capable kernel by default @@ -86,6 +88,15 @@ device acpi device pci +# Infiniband Bus and drivers +options OFED # Infiniband protocol stack and support +options SDP # Sockets Direct Protocol for infiniband +device ipoib # IP over IB devices +options IPOIB_CM # Use connect mode ipoib +device mlx4ib # ConnectX Infiniband support +device mlxen # ConnectX Ethernet support +device mthca # Infinihost cards + # Floppy drives device fdc Index: sys/sys/sysctl.h =================================================================== --- sys/sys/sysctl.h (.../base) (revision 219811) +++ sys/sys/sysctl.h (.../head) (revision 219811) @@ -117,8 +117,8 @@ #ifdef _KERNEL #include -#define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, int arg2, \ - struct sysctl_req *req +#define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, \ + intptr_t arg2, struct sysctl_req *req /* definitions for sysctl_req 'lock' member */ #define REQ_UNWIRED 1 @@ -160,7 +160,7 @@ int oid_number; u_int oid_kind; void *oid_arg1; - int oid_arg2; + intptr_t oid_arg2; const char *oid_name; int (*oid_handler)(SYSCTL_HANDLER_ARGS); const char *oid_fmt; @@ -746,9 +746,11 @@ /* Dynamic oid handling */ struct sysctl_oid *sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int nbr, const char *name, - int kind, void *arg1, int arg2, + int kind, void *arg1, intptr_t arg2, int (*handler) (SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr); +int sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del, + int recurse); void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name); int sysctl_move_oid(struct sysctl_oid *oidp, struct sysctl_oid_list *parent); Index: sys/sys/sx.h =================================================================== --- sys/sys/sx.h (.../base) (revision 219811) +++ sys/sys/sx.h (.../head) (revision 219811) @@ -118,18 +118,22 @@ struct sx_args { struct sx *sa_sx; const char *sa_desc; + int sa_flags; }; -#define SX_SYSINIT(name, sxa, desc) \ +#define SX_SYSINIT_FLAGS(name, sxa, desc, flags) \ static struct sx_args name##_args = { \ (sxa), \ - (desc) \ + (desc), \ + (flags) \ }; \ SYSINIT(name##_sx_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ sx_sysinit, &name##_args); \ SYSUNINIT(name##_sx_sysuninit, SI_SUB_LOCK, 
SI_ORDER_MIDDLE, \ sx_destroy, (sxa)) +#define SX_SYSINIT(name, sxa, desc) SX_SYSINIT_FLAGS(name, sxa, desc, 0) + /* * Full lock operations that are suitable to be inlined in non-debug kernels. * If the lock can't be acquired or released trivially then the work is Index: sys/sys/jail.h =================================================================== --- sys/sys/jail.h (.../base) (revision 219811) +++ sys/sys/jail.h (.../head) (revision 219811) @@ -379,7 +379,7 @@ int prison_if(struct ucred *cred, struct sockaddr *sa); char *prison_name(struct prison *, struct prison *); int prison_priv_check(struct ucred *cred, int priv); -int sysctl_jail_param(struct sysctl_oid *, void *, int , struct sysctl_req *); +int sysctl_jail_param(SYSCTL_HANDLER_ARGS); #endif /* _KERNEL */ #endif /* !_SYS_JAIL_H_ */ Index: sys/sys/bus.h =================================================================== --- sys/sys/bus.h (.../base) (revision 219811) +++ sys/sys/bus.h (.../head) (revision 219811) @@ -464,7 +464,10 @@ /* * Access functions for devclass. */ +int devclass_add_driver(devclass_t dc, driver_t *driver, + int pass, devclass_t *dcp); devclass_t devclass_create(const char *classname); +int devclass_delete_driver(devclass_t busclass, driver_t *driver); devclass_t devclass_find(const char *classname); const char *devclass_get_name(devclass_t dc); device_t devclass_get_device(devclass_t dc, int unit); Index: sys/sys/interrupt.h =================================================================== --- sys/sys/interrupt.h (.../base) (revision 219811) +++ sys/sys/interrupt.h (.../head) (revision 219811) @@ -176,6 +176,7 @@ int intr_getaffinity(int irq, void *mask); void *intr_handler_source(void *cookie); int intr_setaffinity(int irq, void *mask); +void _intr_drain(int irq); /* Linux compat only. */ int swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, void **cookiep); Index: sys/sys/file.h =================================================================== --- sys/sys/file.h (.../base) (revision 219811) +++ sys/sys/file.h (.../head) (revision 219811) @@ -63,6 +63,7 @@ #define DTYPE_SHM 8 /* swap-backed shared memory */ #define DTYPE_SEM 9 /* posix semaphore */ #define DTYPE_PTS 10 /* pseudo teletype master device */ +#define DTYPE_DEV 11 /* Device specific fd type */ #ifdef _KERNEL Property changes on: sys ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base/sys:r207767-219808 Property changes on: . ___________________________________________________________________ Modified: svn:mergeinfo Merged /projects/ofed/base:r207767-219808
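
[Editor's note] The if_vlan changes above do two things: they let non-Ethernet parents carry vlans as long as the hardware handles tagging (IFCAP_VLAN_HWTAGGING), and they export accessor hooks (vlan_trunkdev_p, vlan_tag_p, vlan_cookie_p, vlan_setcookie_p, vlan_devat_p, wrapped by the VLAN_* macros in if_vlan_var.h) so drivers can resolve vlan children and attach per-vlan state without reaching into if_vlan internals. A hedged consumer sketch follows; struct xx_softc is a hypothetical driver structure, and if_vlan must be loaded for the hook pointers to be non-NULL.

/*
 * Look up the vlan ifnet at a hardware-stripped tag and fetch (or
 * install) per-vlan driver state through the cookie slot.
 */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>

struct xx_softc {
	int	xx_state;		/* hypothetical per-vlan state */
};

static struct xx_softc *
xx_vlan_state(struct ifnet *trunk, uint16_t tag, struct xx_softc *fresh)
{
	struct ifnet *vifp;
	struct xx_softc *sc;
	int error;

	/* NULL when no vlan is configured at this tag on the trunk. */
	vifp = VLAN_DEVAT(trunk, tag);
	if (vifp == NULL)
		return (NULL);
	sc = VLAN_COOKIE(vifp);
	if (sc == NULL && fresh != NULL) {
		/* First sighting: stash our state on the vlan ifnet. */
		error = VLAN_SETCOOKIE(vifp, fresh);
		if (error == 0)
			sc = fresh;
	}
	return (sc);
}

One caveat worth noting: the new macros expand to bare ternaries, so park their result in a local (as with error above) rather than comparing the macro invocation directly, or the comparison will bind inside the expansion.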
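
[Editor's note] The amd64 and i386 endian.h rework above replaces the old __byte_swap_* macros with paired __bswapNN_const/__bswapNN_var forms selected by __builtin_constant_p(): constant operands fold at compile time (and so become legal in static initializers, which the OFED headers want for htonl and friends), while run-time values still compile to a single bswap instruction, now independent of __OPTIMIZE__. A stand-alone user-space illustration of the same technique; the my_* names are ours, not the header's, and the inline asm is x86/amd64-specific.

#include <stdint.h>
#include <stdio.h>

/* Pure-macro form: the compiler folds this for constant operands. */
#define my_bswap32_const(x)				\
	(((x) >> 24) |					\
	(((x) & (0xff << 16)) >> 8) |			\
	(((x) & (0xff << 8)) << 8) |			\
	((x) << 24))

/* Run-time form: one bswap instruction. */
static inline uint32_t
my_bswap32_var(uint32_t x)
{
	__asm("bswap %0" : "+r" (x));
	return (x);
}

#define my_bswap32(x)					\
	(__builtin_constant_p(x) ?			\
	    my_bswap32_const((uint32_t)(x)) : my_bswap32_var(x))

/* Constant operand: folded, hence usable in a static initializer. */
static const uint32_t folded = my_bswap32(0x11223344);

int
main(void)
{
	uint32_t v = 0x11223344;

	printf("%#x\n", folded);		/* prints 0x44332211 */
	printf("%#x\n", my_bswap32(v));		/* run-time bswap path */
	return (0);
}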