commit 6b9fde2b470326a240154f2c8b6ad16cb86c8371 Author: Ryan Stone Date: Thu Aug 4 14:11:36 2016 Move debug counters to per-ring structure Currently the mlx4_en driver increments a number of port-wide counters in the rx and tx paths. This causes the cache line containing those counters to ping-pong between CPUs when more than one ring is being used simultaneously, lowering perf. The driver already has the infrastructure to maintain separate per-ring counters and then aggregate them together periodically, so make use of that to fix the cache problems. diff --git a/sys/ofed/drivers/net/mlx4/en_port.c b/sys/ofed/drivers/net/mlx4/en_port.c index 2a1c4ba..ec7d3f9 100644 --- a/sys/ofed/drivers/net/mlx4/en_port.c +++ b/sys/ofed/drivers/net/mlx4/en_port.c @@ -191,11 +191,14 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) priv->port_stats.tx_chksum_offload = 0; priv->port_stats.queue_stopped = 0; priv->port_stats.wake_queue = 0; + priv->port_stats.oversized_packets = 0; + priv->port_stats.tso_packets = 0; for (i = 0; i < priv->tx_ring_num; i++) { priv->port_stats.tx_chksum_offload += priv->tx_ring[i]->tx_csum; priv->port_stats.queue_stopped += priv->tx_ring[i]->queue_stopped; priv->port_stats.wake_queue += priv->tx_ring[i]->wake_queue; priv->port_stats.oversized_packets += priv->tx_ring[i]->oversized_packets; + priv->port_stats.tso_packets += priv->tx_ring[i]->tso_packets; } /* RX Statistics */ priv->pkstats.rx_packets = be64_to_cpu(mlx4_en_stats->RTOT_prio_0) + diff --git a/sys/ofed/drivers/net/mlx4/en_rx.c b/sys/ofed/drivers/net/mlx4/en_rx.c index 2711b22..954278c 100644 --- a/sys/ofed/drivers/net/mlx4/en_rx.c +++ b/sys/ofed/drivers/net/mlx4/en_rx.c @@ -630,7 +630,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) && (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) && (cqe->checksum == cpu_to_be16(0xffff))) { - priv->port_stats.rx_chksum_good++; + ring->csum_ok++; mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; @@ -654,7 +654,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud INC_PERF_COUNTER(priv->pstats.lro_misses); } else { mb->m_pkthdr.csum_flags = 0; - priv->port_stats.rx_chksum_none++; + ring->csum_none++; } /* Push it up the stack */ diff --git a/sys/ofed/drivers/net/mlx4/en_tx.c b/sys/ofed/drivers/net/mlx4/en_tx.c index 9090c51..ae21d33 100644 --- a/sys/ofed/drivers/net/mlx4/en_tx.c +++ b/sys/ofed/drivers/net/mlx4/en_tx.c @@ -448,7 +448,6 @@ static int mlx4_en_process_tx_cq(struct net_device *dev, if (atomic_fetchadd_int(&priv->blocked, -1) == 1) atomic_clear_int(&dev->if_drv_flags ,IFF_DRV_OACTIVE); ring->wake_queue++; - priv->port_stats.wake_queue++; } return (0); } @@ -714,7 +713,6 @@ static int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); priv->port_stats.queue_stopped++; ring->blocked = 1; - priv->port_stats.queue_stopped++; ring->queue_stopped++; /* Use interrupts to find out when queue opened */ @@ -758,7 +756,6 @@ static int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp /* do statistics */ if (likely(tx_desc->ctrl.srcrb_flags != CTRL_FLAGS)) { - priv->port_stats.tx_chksum_offload++; ring->tx_csum++; } @@ -796,7 +793,7 @@ static int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp num_pkts = DIV_ROUND_UP(payload_len, mss); ring->bytes += payload_len + (num_pkts * ihs); ring->packets += num_pkts; - priv->port_stats.tso_packets++; + ring->tso_packets++; /* store pointer to inline header */ dseg_inline = dseg; /* copy data inline */ diff --git a/sys/ofed/drivers/net/mlx4/mlx4_en.h b/sys/ofed/drivers/net/mlx4/mlx4_en.h index 17afa9e..f6ea011 100644 --- a/sys/ofed/drivers/net/mlx4/mlx4_en.h +++ b/sys/ofed/drivers/net/mlx4/mlx4_en.h @@ -278,6 +278,7 @@ struct mlx4_en_tx_ring { unsigned long queue_stopped; unsigned long oversized_packets; unsigned long wake_queue; + unsigned long tso_packets; struct mlx4_bf bf; bool bf_enabled; int hwtstamp_tx_type;