From 73ffd55a19a7bbf886a710c81159aef4e56529ac Mon Sep 17 00:00:00 2001 From: Ryan Stone Date: Mon, 5 May 2014 14:56:18 -0400 Subject: [PATCH 13/21] Pass SR-IOV configuration to kernel using an nvlist Pass all SR-IOV configuration to the kernel using an nvlist. The main benefit that this offers is flexibility. It allows a driver to accept any number of parameters of any type supported by the SR-IOV configuration infrastructure without having to make any changes outside of the driver. It also offers the user very fine-grained control over the configuration of the VFs -- if they want, they can have different configuration applied to every VF. --- sys/dev/pci/pci_if.m | 2 + sys/dev/pci/pci_iov.c | 129 +++++++++++++++++++++++++++++++++++++++++--------- sys/sys/iov.h | 93 +++++++++++++++++++++++++++++++++--- 3 files changed, 195 insertions(+), 29 deletions(-) diff --git a/sys/dev/pci/pci_if.m b/sys/dev/pci/pci_if.m index aa6bc6c..dce4395 100644 --- a/sys/dev/pci/pci_if.m +++ b/sys/dev/pci/pci_if.m @@ -184,6 +184,7 @@ METHOD int iov_detach { METHOD int init_iov { device_t dev; uint16_t num_vfs; + const struct nvlist *config; }; METHOD void uninit_iov { @@ -193,5 +194,6 @@ METHOD void uninit_iov { METHOD int add_vf { device_t dev; uint16_t vfnum; + const struct nvlist *config; }; diff --git a/sys/dev/pci/pci_iov.c b/sys/dev/pci/pci_iov.c index 33c177f..c90d7b1 100755 --- a/sys/dev/pci/pci_iov.c +++ b/sys/dev/pci/pci_iov.c @@ -70,6 +70,19 @@ static struct cdevsw iov_cdevsw = { .d_ioctl = pci_iov_ioctl }; +SYSCTL_DECL(_hw_pci); + +/* + * The maximum amount of memory we will allocate for user configuration of an + * SR-IOV device. 1MB ought to be enough for anyone, but leave this + * configurable just in case. 
+ */ +static u_long pci_iov_max_config = 1024 * 1024; +TUNABLE_ULONG("hw.pci.iov_max_config", &pci_iov_max_config); +SYSCTL_ULONG(_hw_pci, OID_AUTO, iov_max_config, CTLFLAG_RW, &pci_iov_max_config, + 0, "Maximum allowed size of SR-IOV configuration."); + + #define IOV_READ(d, r, w) \ pci_read_config((d)->cfg.dev, (d)->cfg.iov->iov_pos + r, w) @@ -348,6 +361,51 @@ pci_iov_add_bars(struct pcicfg_iov *iov, struct pci_devinfo *dinfo) } } +static int +pci_iov_parse_config(struct pcicfg_iov *iov, struct pci_iov_arg *arg, + nvlist_t **ret) +{ + void *packed_config; + nvlist_t *config; + int error; + + config = NULL; + packed_config = NULL; + + if (arg->len > pci_iov_max_config) { + error = EMSGSIZE; + goto out; + } + + packed_config = malloc(arg->len, M_SRIOV, M_WAITOK); + + error = copyin(arg->config, packed_config, arg->len); + if (error != 0) + goto out; + + config = nvlist_unpack(packed_config, arg->len); + if (config == NULL) { + error = EINVAL; + goto out; + } + + error = pci_iov_schema_validate_config(iov->iov_schema, config); + if (error != 0) + goto out; + + error = nvlist_error(config); + if (error != 0) + goto out; + + *ret = config; + config = NULL; + +out: + nvlist_destroy(config); + free(packed_config, M_SRIOV); + return (error); +} + /* * Set the ARI_EN bit in the lowest-numbered PCI function with the SR-IOV * capability. 
This bit is only writeable on the lowest-numbered PF but @@ -422,6 +480,16 @@ pci_iov_config_page_size(struct pci_devinfo *dinfo) } static int +pci_init_iov(device_t dev, uint16_t num_vfs, const nvlist_t *config) +{ + const nvlist_t *device, *driver_config; + + device = nvlist_get_nvlist(config, PF_CONFIG_NAME); + driver_config = nvlist_get_nvlist(device, DRIVER_CONFIG_NAME); + return (PCI_INIT_IOV(dev, num_vfs, driver_config)); +} + +static int pci_iov_init_rman(struct pcicfg_iov *iov) { int error; @@ -477,12 +545,15 @@ pci_iov_setup_bars(struct pci_devinfo *dinfo) } static void -pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const char *driver, +pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const nvlist_t *config, uint16_t first_rid, uint16_t rid_stride) { + char device_name[VF_MAX_NAME]; + const nvlist_t *device, *driver_config, *iov_config; device_t bus, dev, vf; struct pcicfg_iov *iov; struct pci_devinfo *vfinfo; + const char *driver; int i, error; uint16_t vid, did, next_rid; @@ -494,6 +565,20 @@ pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const char *driver, did = IOV_READ(dinfo, PCIR_SRIOV_VF_DID, 2); for (i = 0; i < iov->iov_num_vfs; i++, next_rid += rid_stride) { + snprintf(device_name, sizeof(device_name), VF_PREFIX"%d", i); + device = nvlist_get_nvlist(config, device_name); + iov_config = nvlist_get_nvlist(device, IOV_CONFIG_NAME); + driver_config = nvlist_get_nvlist(device, DRIVER_CONFIG_NAME); + + /* + * If we are creating passthrough devices then force the ppt + * driver to attach to prevent a VF driver from claiming the + * VFs. 
+ */ + if (nvlist_get_bool(iov_config, "passthrough")) + driver = "ppt"; + else + driver = NULL; vf = pci_add_iov_child(bus, sizeof(*vfinfo), next_rid, vid, did, driver); @@ -504,7 +589,7 @@ pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const char *driver, pci_iov_add_bars(iov, vfinfo); - error = PCI_ADD_VF(dev, i); + error = PCI_ADD_VF(dev, i, driver_config); if (error != 0) { device_printf(dev, "Failed to add VF %d\n", i); pci_delete_child(bus, vf); @@ -518,14 +603,14 @@ static int pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg) { device_t bus, dev; - const char *driver; struct pci_devinfo *dinfo; struct pcicfg_iov *iov; + nvlist_t *config; int i, error; uint16_t rid_off, rid_stride; uint16_t first_rid, last_rid; uint16_t iov_ctl; - uint16_t total_vfs; + uint16_t num_vfs, total_vfs; int iov_inited; mtx_lock(&Giant); @@ -534,6 +619,7 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg) dev = dinfo->cfg.dev; bus = device_get_parent(dev); iov_inited = 0; + config = NULL; if ((iov->iov_flags & IOV_BUSY) || iov->iov_num_vfs != 0) { mtx_unlock(&Giant); @@ -541,22 +627,17 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg) } iov->iov_flags |= IOV_BUSY; - total_vfs = IOV_READ(dinfo, PCIR_SRIOV_TOTAL_VFS, 2); + error = pci_iov_parse_config(iov, arg, &config); + if (error != 0) + goto out; - if (arg->num_vfs > total_vfs) { + num_vfs = pci_iov_config_get_num_vfs(config); + total_vfs = IOV_READ(dinfo, PCIR_SRIOV_TOTAL_VFS, 2); + if (num_vfs > total_vfs) { error = EINVAL; goto out; } - /* - * If we are creating passthrough devices then force the ppt driver to - * attach to prevent a VF driver from claming the VFs. 
- */ - if (arg->passthrough) - driver = "ppt"; - else - driver = NULL; - error = pci_iov_config_page_size(dinfo); if (error != 0) goto out; @@ -565,19 +646,18 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg) if (error != 0) goto out; - error = PCI_INIT_IOV(dev, arg->num_vfs); - + error = pci_init_iov(dev, num_vfs, config); if (error != 0) goto out; - iov_inited = 1; - IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, arg->num_vfs, 2); + + IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, num_vfs, 2); rid_off = IOV_READ(dinfo, PCIR_SRIOV_VF_OFF, 2); rid_stride = IOV_READ(dinfo, PCIR_SRIOV_VF_STRIDE, 2); first_rid = pci_get_rid(dev) + rid_off; - last_rid = first_rid + (arg->num_vfs - 1) * rid_stride; + last_rid = first_rid + (num_vfs - 1) * rid_stride; /* We don't yet support allocating extra bus numbers for VFs. */ if (pci_get_bus(dev) != PCI_RID2BUS(last_rid)) { @@ -593,7 +673,7 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg) if (error != 0) goto out; - iov->iov_num_vfs = arg->num_vfs; + iov->iov_num_vfs = num_vfs; error = pci_iov_setup_bars(dinfo); if (error != 0) @@ -605,7 +685,10 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg) /* Per specification, we must wait 100ms before accessing VFs. 
*/ msleep(iov, &Giant, 0, "iov", hz/10); - pci_iov_enumerate_vfs(dinfo, driver, first_rid, rid_stride); + pci_iov_enumerate_vfs(dinfo, config, first_rid, rid_stride); + + nvlist_destroy(config); + iov->iov_flags &= ~IOV_BUSY; mtx_unlock(&Giant); return (0); @@ -628,6 +711,8 @@ out: rman_fini(&iov->rman); iov->iov_flags &= ~IOV_RMAN_INITED; } + + nvlist_destroy(config); iov->iov_num_vfs = 0; iov->iov_flags &= ~IOV_BUSY; mtx_unlock(&Giant); diff --git a/sys/sys/iov.h b/sys/sys/iov.h index ad23692..c6ef532 100755 --- a/sys/sys/iov.h +++ b/sys/sys/iov.h @@ -46,12 +46,6 @@ #define DEFAULT_SCHEMA_NAME "DEFAULT" #define REQUIRED_SCHEMA_NAME "REQUIRED" -struct pci_iov_arg -{ - int num_vfs; - int passthrough; -}; - /* * Because each PF device is expected to expose a unique set of possible * configurations, the SR-IOV infrastructure dynamically queries the PF @@ -168,7 +162,92 @@ struct pci_iov_schema int error; }; -#define IOV_CONFIG _IOWR('p', 10, struct pci_iov_arg) +/* + * SR-IOV configuration is passed to the kernel as a packed nvlist. See nv(3) + * for the details of the nvlist API. The expected format of the nvlist is: + * + * BASIC RULES + * 1) All keys are case-insensitive. + * 2) No keys that are not specified below may exist at any level of the + * config nvlist. + * 3) Unless otherwise specified, all keys are optional. It should go without + * saying a key being mandatory is transitive: that is, if a key is + * specified to contain a sub-node that contains a mandatory key, then + * the outer key is implicitly mandatory. If a key is mandatory then the + * associated value is also mandatory. + * 4) Order of keys is irrelevant. + * + * TOP LEVEL OF CONFIG NVLIST + * 1) All keys specified in this section are mandatory. + * 2) There must be a top-level key with the name PF_CONFIG_NAME. The value + * associated is an nvlist that follows the "device node" format. The + * parameters in this node specify parameters that apply to the PF. 
+ * 3) For every VF being configured (this is set via the "num_vfs" parameter + * in the PF section), there must be a top-level key whose name is VF_PREFIX + * immediately followed by the index of the VF as a decimal integer. For + * example, this would be VF-0 for the first VF. VFs are numbered starting + * from 0. The value associated with this key follows the "device node" + * format. The parameters in this node specify configuration that applies + * to the VF specified in the key. + * + * DEVICE NODES + * 1) All keys specified in this section are mandatory. + * 2) The device node must contain a key with the name DRIVER_CONFIG_NAME. The + * value associated with this key is an nvlist following the subsystem node + * format. The parameters in this key specify configuration that is specific + * to a particular device driver. + * 3) The device node must contain a key with the name IOV_CONFIG_NAME. The + * value associated with this key is an nvlist following the subsystem node + * format. The parameters in this key specify configuration that is consumed + * by the SR-IOV infrastructure. + * + * SUBSYSTEM NODES + * 1) A subsystem node specifies configuration parameters that apply to a + * particular subsystem (driver or infrastructure) of a particular device + * (PF or individual VF). + * Note: We will refer to the section of the configuration schema that + * specifies the parameters for this subsystem and device + * configuration as the device/subsystem schema. + * 2) The subsystem node must contain only keys that correspond to parameters + * that are specified in the device/subsystem schema. + * 3) Every parameter specified as required in the device/subsystem schema is + * a mandatory key in the subsystem node. + * Note: All parameters that are not required in device/subsystem schema are + * optional keys. In particular, any parameter specified to have a + * default value in the device/subsystem schema is optional. 
The + * kernel is responsible for applying default values. + * 4) The value of every parameter in the device node must conform to the + * restrictions of the type specified for that parameter in the device/ + * subsystem schema. + * + * The following is an example of a valid configuration, when validated against + * the schema example given above. + * + * PF (NVLIST): + * driver (NVLIST): + * iov (NVLIST): + * num_vfs (NUMBER): 3 (3) (0x3) + * device (STRING): [ix0] + * VF-0 (NVLIST): + * driver (NVLIST): + * vlan (NUMBER): 1000 (1000) (0x3e8) + * iov (NVLIST): + * passthrough (BOOL): TRUE + * VF-1 (NVLIST): + * driver (NVLIST): + * iov (NVLIST): + * VF-2 (NVLIST): + * driver (NVLIST): + * mac-addr (BINARY): 6 020102030405 + * iov (NVLIST): + */ +struct pci_iov_arg +{ + void *config; + size_t len; +}; + +#define IOV_CONFIG _IOW('p', 10, struct pci_iov_arg) #define IOV_DELETE _IO('p', 11) #define IOV_GET_SCHEMA _IOWR('p', 12, struct pci_iov_schema) -- 1.9.2