Diffstat (limited to 'drivers/net/ethernet/sfc')
-rw-r--r--  drivers/net/ethernet/sfc/efx.c          | 267
-rw-r--r--  drivers/net/ethernet/sfc/efx.h          |  14
-rw-r--r--  drivers/net/ethernet/sfc/enum.h         |  12
-rw-r--r--  drivers/net/ethernet/sfc/ethtool.c      |   4
-rw-r--r--  drivers/net/ethernet/sfc/falcon.c       |  17
-rw-r--r--  drivers/net/ethernet/sfc/filter.c       | 247
-rw-r--r--  drivers/net/ethernet/sfc/mcdi_pcol.h    |   1
-rw-r--r--  drivers/net/ethernet/sfc/net_driver.h   |  97
-rw-r--r--  drivers/net/ethernet/sfc/nic.c          |  94
-rw-r--r--  drivers/net/ethernet/sfc/ptp.c          | 116
-rw-r--r--  drivers/net/ethernet/sfc/rx.c           | 783
-rw-r--r--  drivers/net/ethernet/sfc/siena.c        |  25
12 files changed, 1065 insertions, 612 deletions
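Before the hunks below: the largest datapath change replaces the old MTU-sized rx_buffer_len with a page-split scheme, set up in efx_start_datapath() (efx.c) and the new efx_rx_config_page_split() (rx.c). The following is a minimal standalone sketch of that arithmetic, not driver code. Only EFX_RX_USR_BUF_SIZE (1824) and EFX_RX_PREFERRED_BATCH (8) come from the patch; the 4096-byte page, 64-byte L1 cache line, EFX_PAGE_IP_ALIGN of 2 and 16-byte struct efx_rx_page_state are assumptions made here for the worked example.

/* Standalone sketch of the arithmetic in the new efx_rx_config_page_split().
 * Assumed, not taken from the patch: 4096-byte pages, 64-byte L1 cache
 * lines, EFX_PAGE_IP_ALIGN == 2, 16-byte struct efx_rx_page_state.
 */
#include <stdio.h>

#define ALIGN(x, a)            (((x) + (a) - 1) & ~((a) - 1))
#define DIV_ROUND_UP(n, d)     (((n) + (d) - 1) / (d))

#define PAGE_SIZE              4096u
#define L1_CACHE_BYTES         64u
#define EFX_PAGE_IP_ALIGN      2u      /* assumed NET_IP_ALIGN */
#define RX_PAGE_STATE_SIZE     16u     /* assumed sizeof(struct efx_rx_page_state) */
#define EFX_RX_USR_BUF_SIZE    1824u   /* from the patch: scatter buffer size */
#define EFX_RX_PREFERRED_BATCH 8u      /* from the patch */

int main(void)
{
	/* Scatter mode with page order 0, as chosen in efx_start_datapath() */
	unsigned int rx_dma_len = EFX_RX_USR_BUF_SIZE;
	unsigned int step = ALIGN(rx_dma_len + EFX_PAGE_IP_ALIGN, L1_CACHE_BYTES);
	unsigned int bufs_per_page = (PAGE_SIZE - RX_PAGE_STATE_SIZE) / step;
	unsigned int truesize = PAGE_SIZE / bufs_per_page;
	unsigned int pages_per_batch =
		DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH, bufs_per_page);

	printf("step=%u bufs/page=%u truesize=%u pages/batch=%u\n",
	       step, bufs_per_page, truesize, pages_per_batch);
	/* Prints: step=1856 bufs/page=2 truesize=2048 pages/batch=4 */
	return 0;
}

Under those assumptions each 4K page carries two 1856-byte strides, so refilling the preferred batch of 8 descriptors costs 4 pages and sk_buff::truesize is amortised to 2048 bytes per buffer, which is what the new "step=... bpp=..." debug line in efx_start_datapath() reports.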
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 0bc00991d31..01b99206139 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -22,6 +22,7 @@  #include <linux/topology.h>  #include <linux/gfp.h>  #include <linux/cpu_rmap.h> +#include <linux/aer.h>  #include "net_driver.h"  #include "efx.h"  #include "nic.h" @@ -71,21 +72,21 @@ const char *const efx_loopback_mode_names[] = {  const unsigned int efx_reset_type_max = RESET_TYPE_MAX;  const char *const efx_reset_type_names[] = { -	[RESET_TYPE_INVISIBLE]     = "INVISIBLE", -	[RESET_TYPE_ALL]           = "ALL", -	[RESET_TYPE_WORLD]         = "WORLD", -	[RESET_TYPE_DISABLE]       = "DISABLE", -	[RESET_TYPE_TX_WATCHDOG]   = "TX_WATCHDOG", -	[RESET_TYPE_INT_ERROR]     = "INT_ERROR", -	[RESET_TYPE_RX_RECOVERY]   = "RX_RECOVERY", -	[RESET_TYPE_RX_DESC_FETCH] = "RX_DESC_FETCH", -	[RESET_TYPE_TX_DESC_FETCH] = "TX_DESC_FETCH", -	[RESET_TYPE_TX_SKIP]       = "TX_SKIP", -	[RESET_TYPE_MC_FAILURE]    = "MC_FAILURE", +	[RESET_TYPE_INVISIBLE]          = "INVISIBLE", +	[RESET_TYPE_ALL]                = "ALL", +	[RESET_TYPE_RECOVER_OR_ALL]     = "RECOVER_OR_ALL", +	[RESET_TYPE_WORLD]              = "WORLD", +	[RESET_TYPE_RECOVER_OR_DISABLE] = "RECOVER_OR_DISABLE", +	[RESET_TYPE_DISABLE]            = "DISABLE", +	[RESET_TYPE_TX_WATCHDOG]        = "TX_WATCHDOG", +	[RESET_TYPE_INT_ERROR]          = "INT_ERROR", +	[RESET_TYPE_RX_RECOVERY]        = "RX_RECOVERY", +	[RESET_TYPE_RX_DESC_FETCH]      = "RX_DESC_FETCH", +	[RESET_TYPE_TX_DESC_FETCH]      = "TX_DESC_FETCH", +	[RESET_TYPE_TX_SKIP]            = "TX_SKIP", +	[RESET_TYPE_MC_FAILURE]         = "MC_FAILURE",  }; -#define EFX_MAX_MTU (9 * 1024) -  /* Reset workqueue. If any NIC has a hardware failure then a reset will be   * queued onto this work queue. This is not a per-nic work queue, because   * efx_reset_work() acquires the rtnl lock, so resets are naturally serialised. @@ -117,9 +118,12 @@ MODULE_PARM_DESC(separate_tx_channels,  static int napi_weight = 64;  /* This is the time (in jiffies) between invocations of the hardware - * monitor.  On Falcon-based NICs, this will: + * monitor. + * On Falcon-based NICs, this will:   * - Check the on-board hardware monitor;   * - Poll the link state and reconfigure the hardware as necessary. + * On Siena-based NICs for power systems with EEH support, this will give EEH a + * chance to start.   */  static unsigned int efx_monitor_interval = 1 * HZ; @@ -203,13 +207,14 @@ static void efx_stop_all(struct efx_nic *efx);  #define EFX_ASSERT_RESET_SERIALISED(efx)		\  	do {						\  		if ((efx->state == STATE_READY) ||	\ +		    (efx->state == STATE_RECOVERY) ||	\  		    (efx->state == STATE_DISABLED))	\  			ASSERT_RTNL();			\  	} while (0)  static int efx_check_disabled(struct efx_nic *efx)  { -	if (efx->state == STATE_DISABLED) { +	if (efx->state == STATE_DISABLED || efx->state == STATE_RECOVERY) {  		netif_err(efx, drv, efx->net_dev,  			  "device is disabled due to earlier errors\n");  		return -EIO; @@ -242,15 +247,9 @@ static int efx_process_channel(struct efx_channel *channel, int budget)  		struct efx_rx_queue *rx_queue =  			efx_channel_get_rx_queue(channel); -		/* Deliver last RX packet. 
*/ -		if (channel->rx_pkt) { -			__efx_rx_packet(channel, channel->rx_pkt); -			channel->rx_pkt = NULL; -		} -		if (rx_queue->enabled) { -			efx_rx_strategy(channel); +		efx_rx_flush_packet(channel); +		if (rx_queue->enabled)  			efx_fast_push_rx_descriptors(rx_queue); -		}  	}  	return spent; @@ -625,20 +624,51 @@ fail:   */  static void efx_start_datapath(struct efx_nic *efx)  { +	bool old_rx_scatter = efx->rx_scatter;  	struct efx_tx_queue *tx_queue;  	struct efx_rx_queue *rx_queue;  	struct efx_channel *channel; +	size_t rx_buf_len;  	/* Calculate the rx buffer allocation parameters required to  	 * support the current MTU, including padding for header  	 * alignment and overruns.  	 */ -	efx->rx_buffer_len = (max(EFX_PAGE_IP_ALIGN, NET_IP_ALIGN) + -			      EFX_MAX_FRAME_LEN(efx->net_dev->mtu) + -			      efx->type->rx_buffer_hash_size + -			      efx->type->rx_buffer_padding); -	efx->rx_buffer_order = get_order(efx->rx_buffer_len + -					 sizeof(struct efx_rx_page_state)); +	efx->rx_dma_len = (efx->type->rx_buffer_hash_size + +			   EFX_MAX_FRAME_LEN(efx->net_dev->mtu) + +			   efx->type->rx_buffer_padding); +	rx_buf_len = (sizeof(struct efx_rx_page_state) + +		      EFX_PAGE_IP_ALIGN + efx->rx_dma_len); +	if (rx_buf_len <= PAGE_SIZE) { +		efx->rx_scatter = false; +		efx->rx_buffer_order = 0; +	} else if (efx->type->can_rx_scatter) { +		BUILD_BUG_ON(sizeof(struct efx_rx_page_state) + +			     EFX_PAGE_IP_ALIGN + EFX_RX_USR_BUF_SIZE > +			     PAGE_SIZE / 2); +		efx->rx_scatter = true; +		efx->rx_dma_len = EFX_RX_USR_BUF_SIZE; +		efx->rx_buffer_order = 0; +	} else { +		efx->rx_scatter = false; +		efx->rx_buffer_order = get_order(rx_buf_len); +	} + +	efx_rx_config_page_split(efx); +	if (efx->rx_buffer_order) +		netif_dbg(efx, drv, efx->net_dev, +			  "RX buf len=%u; page order=%u batch=%u\n", +			  efx->rx_dma_len, efx->rx_buffer_order, +			  efx->rx_pages_per_batch); +	else +		netif_dbg(efx, drv, efx->net_dev, +			  "RX buf len=%u step=%u bpp=%u; page batch=%u\n", +			  efx->rx_dma_len, efx->rx_page_buf_step, +			  efx->rx_bufs_per_page, efx->rx_pages_per_batch); + +	/* RX filters also have scatter-enabled flags */ +	if (efx->rx_scatter != old_rx_scatter) +		efx_filter_update_rx_scatter(efx);  	/* We must keep at least one descriptor in a TX ring empty.  	 * We could avoid this when the queue size does not exactly @@ -655,16 +685,12 @@ static void efx_start_datapath(struct efx_nic *efx)  		efx_for_each_channel_tx_queue(tx_queue, channel)  			efx_init_tx_queue(tx_queue); -		/* The rx buffer allocation strategy is MTU dependent */ -		efx_rx_strategy(channel); -  		efx_for_each_channel_rx_queue(rx_queue, channel) {  			efx_init_rx_queue(rx_queue);  			efx_nic_generate_fill_event(rx_queue);  		} -		WARN_ON(channel->rx_pkt != NULL); -		efx_rx_strategy(channel); +		WARN_ON(channel->rx_pkt_n_frags);  	}  	if (netif_device_present(efx->net_dev)) @@ -683,7 +709,7 @@ static void efx_stop_datapath(struct efx_nic *efx)  	BUG_ON(efx->port_enabled);  	/* Only perform flush if dma is enabled */ -	if (dev->is_busmaster) { +	if (dev->is_busmaster && efx->state != STATE_RECOVERY) {  		rc = efx_nic_flush_queues(efx);  		if (rc && EFX_WORKAROUND_7803(efx)) { @@ -1596,13 +1622,15 @@ static void efx_start_all(struct efx_nic *efx)  	efx_start_port(efx);  	efx_start_datapath(efx); -	/* Start the hardware monitor if there is one. 
Otherwise (we're link -	 * event driven), we have to poll the PHY because after an event queue -	 * flush, we could have a missed a link state change */ -	if (efx->type->monitor != NULL) { +	/* Start the hardware monitor if there is one */ +	if (efx->type->monitor != NULL)  		queue_delayed_work(efx->workqueue, &efx->monitor_work,  				   efx_monitor_interval); -	} else { + +	/* If link state detection is normally event-driven, we have +	 * to poll now because we could have missed a change +	 */ +	if (efx_nic_rev(efx) >= EFX_REV_SIENA_A0) {  		mutex_lock(&efx->mac_lock);  		if (efx->phy_op->poll(efx))  			efx_link_status_changed(efx); @@ -2309,7 +2337,9 @@ int efx_reset(struct efx_nic *efx, enum reset_type method)  out:  	/* Leave device stopped if necessary */ -	disabled = rc || method == RESET_TYPE_DISABLE; +	disabled = rc || +		method == RESET_TYPE_DISABLE || +		method == RESET_TYPE_RECOVER_OR_DISABLE;  	rc2 = efx_reset_up(efx, method, !disabled);  	if (rc2) {  		disabled = true; @@ -2328,13 +2358,48 @@ out:  	return rc;  } +/* Try recovery mechanisms. + * For now only EEH is supported. + * Returns 0 if the recovery mechanisms are unsuccessful. + * Returns a non-zero value otherwise. + */ +static int efx_try_recovery(struct efx_nic *efx) +{ +#ifdef CONFIG_EEH +	/* A PCI error can occur and not be seen by EEH because nothing +	 * happens on the PCI bus. In this case the driver may fail and +	 * schedule a 'recover or reset', leading to this recovery handler. +	 * Manually call the eeh failure check function. +	 */ +	struct eeh_dev *eehdev = +		of_node_to_eeh_dev(pci_device_to_OF_node(efx->pci_dev)); + +	if (eeh_dev_check_failure(eehdev)) { +		/* The EEH mechanisms will handle the error and reset the +		 * device if necessary. +		 */ +		return 1; +	} +#endif +	return 0; +} +  /* The worker thread exists so that code that cannot sleep can   * schedule a reset for later.   */  static void efx_reset_work(struct work_struct *data)  {  	struct efx_nic *efx = container_of(data, struct efx_nic, reset_work); -	unsigned long pending = ACCESS_ONCE(efx->reset_pending); +	unsigned long pending; +	enum reset_type method; + +	pending = ACCESS_ONCE(efx->reset_pending); +	method = fls(pending) - 1; + +	if ((method == RESET_TYPE_RECOVER_OR_DISABLE || +	     method == RESET_TYPE_RECOVER_OR_ALL) && +	    efx_try_recovery(efx)) +		return;  	if (!pending)  		return; @@ -2346,7 +2411,7 @@ static void efx_reset_work(struct work_struct *data)  	 * it cannot change again.  	 
*/  	if (efx->state == STATE_READY) -		(void)efx_reset(efx, fls(pending) - 1); +		(void)efx_reset(efx, method);  	rtnl_unlock();  } @@ -2355,11 +2420,20 @@ void efx_schedule_reset(struct efx_nic *efx, enum reset_type type)  {  	enum reset_type method; +	if (efx->state == STATE_RECOVERY) { +		netif_dbg(efx, drv, efx->net_dev, +			  "recovering: skip scheduling %s reset\n", +			  RESET_TYPE(type)); +		return; +	} +  	switch (type) {  	case RESET_TYPE_INVISIBLE:  	case RESET_TYPE_ALL: +	case RESET_TYPE_RECOVER_OR_ALL:  	case RESET_TYPE_WORLD:  	case RESET_TYPE_DISABLE: +	case RESET_TYPE_RECOVER_OR_DISABLE:  		method = type;  		netif_dbg(efx, drv, efx->net_dev, "scheduling %s reset\n",  			  RESET_TYPE(method)); @@ -2569,6 +2643,8 @@ static void efx_pci_remove(struct pci_dev *pci_dev)  	efx_fini_struct(efx);  	pci_set_drvdata(pci_dev, NULL);  	free_netdev(efx->net_dev); + +	pci_disable_pcie_error_reporting(pci_dev);  };  /* NIC VPD information @@ -2741,6 +2817,11 @@ static int efx_pci_probe(struct pci_dev *pci_dev,  		netif_warn(efx, probe, efx->net_dev,  			   "failed to create MTDs (%d)\n", rc); +	rc = pci_enable_pcie_error_reporting(pci_dev); +	if (rc && rc != -EINVAL) +		netif_warn(efx, probe, efx->net_dev, +			   "pci_enable_pcie_error_reporting failed (%d)\n", rc); +  	return 0;   fail4: @@ -2865,12 +2946,112 @@ static const struct dev_pm_ops efx_pm_ops = {  	.restore	= efx_pm_resume,  }; +/* A PCI error affecting this device was detected. + * At this point MMIO and DMA may be disabled. + * Stop the software path and request a slot reset. + */ +static pci_ers_result_t efx_io_error_detected(struct pci_dev *pdev, +					      enum pci_channel_state state) +{ +	pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED; +	struct efx_nic *efx = pci_get_drvdata(pdev); + +	if (state == pci_channel_io_perm_failure) +		return PCI_ERS_RESULT_DISCONNECT; + +	rtnl_lock(); + +	if (efx->state != STATE_DISABLED) { +		efx->state = STATE_RECOVERY; +		efx->reset_pending = 0; + +		efx_device_detach_sync(efx); + +		efx_stop_all(efx); +		efx_stop_interrupts(efx, false); + +		status = PCI_ERS_RESULT_NEED_RESET; +	} else { +		/* If the interface is disabled we don't want to do anything +		 * with it. +		 */ +		status = PCI_ERS_RESULT_RECOVERED; +	} + +	rtnl_unlock(); + +	pci_disable_device(pdev); + +	return status; +} + +/* Fake a successfull reset, which will be performed later in efx_io_resume. */ +static pci_ers_result_t efx_io_slot_reset(struct pci_dev *pdev) +{ +	struct efx_nic *efx = pci_get_drvdata(pdev); +	pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED; +	int rc; + +	if (pci_enable_device(pdev)) { +		netif_err(efx, hw, efx->net_dev, +			  "Cannot re-enable PCI device after reset.\n"); +		status =  PCI_ERS_RESULT_DISCONNECT; +	} + +	rc = pci_cleanup_aer_uncorrect_error_status(pdev); +	if (rc) { +		netif_err(efx, hw, efx->net_dev, +		"pci_cleanup_aer_uncorrect_error_status failed (%d)\n", rc); +		/* Non-fatal error. Continue. */ +	} + +	return status; +} + +/* Perform the actual reset and resume I/O operations. 
*/ +static void efx_io_resume(struct pci_dev *pdev) +{ +	struct efx_nic *efx = pci_get_drvdata(pdev); +	int rc; + +	rtnl_lock(); + +	if (efx->state == STATE_DISABLED) +		goto out; + +	rc = efx_reset(efx, RESET_TYPE_ALL); +	if (rc) { +		netif_err(efx, hw, efx->net_dev, +			  "efx_reset failed after PCI error (%d)\n", rc); +	} else { +		efx->state = STATE_READY; +		netif_dbg(efx, hw, efx->net_dev, +			  "Done resetting and resuming IO after PCI error.\n"); +	} + +out: +	rtnl_unlock(); +} + +/* For simplicity and reliability, we always require a slot reset and try to + * reset the hardware when a pci error affecting the device is detected. + * We leave both the link_reset and mmio_enabled callback unimplemented: + * with our request for slot reset the mmio_enabled callback will never be + * called, and the link_reset callback is not used by AER or EEH mechanisms. + */ +static struct pci_error_handlers efx_err_handlers = { +	.error_detected = efx_io_error_detected, +	.slot_reset	= efx_io_slot_reset, +	.resume		= efx_io_resume, +}; +  static struct pci_driver efx_pci_driver = {  	.name		= KBUILD_MODNAME,  	.id_table	= efx_pci_table,  	.probe		= efx_pci_probe,  	.remove		= efx_pci_remove,  	.driver.pm	= &efx_pm_ops, +	.err_handler	= &efx_err_handlers,  };  /************************************************************************** diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index d2f790df6dc..8372da239b4 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -33,17 +33,22 @@ extern int efx_setup_tc(struct net_device *net_dev, u8 num_tc);  extern unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);  /* RX */ +extern void efx_rx_config_page_split(struct efx_nic *efx);  extern int efx_probe_rx_queue(struct efx_rx_queue *rx_queue);  extern void efx_remove_rx_queue(struct efx_rx_queue *rx_queue);  extern void efx_init_rx_queue(struct efx_rx_queue *rx_queue);  extern void efx_fini_rx_queue(struct efx_rx_queue *rx_queue); -extern void efx_rx_strategy(struct efx_channel *channel);  extern void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue);  extern void efx_rx_slow_fill(unsigned long context); -extern void __efx_rx_packet(struct efx_channel *channel, -			    struct efx_rx_buffer *rx_buf); -extern void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index, +extern void __efx_rx_packet(struct efx_channel *channel); +extern void efx_rx_packet(struct efx_rx_queue *rx_queue, +			  unsigned int index, unsigned int n_frags,  			  unsigned int len, u16 flags); +static inline void efx_rx_flush_packet(struct efx_channel *channel) +{ +	if (channel->rx_pkt_n_frags) +		__efx_rx_packet(channel); +}  extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);  #define EFX_MAX_DMAQ_SIZE 4096UL @@ -67,6 +72,7 @@ extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);  extern int efx_probe_filters(struct efx_nic *efx);  extern void efx_restore_filters(struct efx_nic *efx);  extern void efx_remove_filters(struct efx_nic *efx); +extern void efx_filter_update_rx_scatter(struct efx_nic *efx);  extern s32 efx_filter_insert_filter(struct efx_nic *efx,  				    struct efx_filter_spec *spec,  				    bool replace); diff --git a/drivers/net/ethernet/sfc/enum.h b/drivers/net/ethernet/sfc/enum.h index 182dbe2cc6e..ab8fb5889e5 100644 --- a/drivers/net/ethernet/sfc/enum.h +++ b/drivers/net/ethernet/sfc/enum.h @@ -137,8 +137,12 @@ enum efx_loopback_mode {   * Reset methods are numbered in order of increasing scope.   
*   * @RESET_TYPE_INVISIBLE: Reset datapath and MAC (Falcon only) + * @RESET_TYPE_RECOVER_OR_ALL: Try to recover. Apply RESET_TYPE_ALL + * if unsuccessful.   * @RESET_TYPE_ALL: Reset datapath, MAC and PHY   * @RESET_TYPE_WORLD: Reset as much as possible + * @RESET_TYPE_RECOVER_OR_DISABLE: Try to recover. Apply RESET_TYPE_DISABLE if + * unsuccessful.   * @RESET_TYPE_DISABLE: Reset datapath, MAC and PHY; leave NIC disabled   * @RESET_TYPE_TX_WATCHDOG: reset due to TX watchdog   * @RESET_TYPE_INT_ERROR: reset due to internal error @@ -150,9 +154,11 @@ enum efx_loopback_mode {   */  enum reset_type {  	RESET_TYPE_INVISIBLE = 0, -	RESET_TYPE_ALL = 1, -	RESET_TYPE_WORLD = 2, -	RESET_TYPE_DISABLE = 3, +	RESET_TYPE_RECOVER_OR_ALL = 1, +	RESET_TYPE_ALL = 2, +	RESET_TYPE_WORLD = 3, +	RESET_TYPE_RECOVER_OR_DISABLE = 4, +	RESET_TYPE_DISABLE = 5,  	RESET_TYPE_MAX_METHOD,  	RESET_TYPE_TX_WATCHDOG,  	RESET_TYPE_INT_ERROR, diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 8e61cd06f66..6e768175e7e 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -154,6 +154,7 @@ static const struct efx_ethtool_stat efx_ethtool_stats[] = {  	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_tcp_udp_chksum_err),  	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_mcast_mismatch),  	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_frm_trunc), +	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_nodesc_trunc),  };  /* Number of ethtool statistics */ @@ -978,7 +979,8 @@ static int efx_ethtool_set_class_rule(struct efx_nic *efx,  	     rule->m_ext.data[1]))  		return -EINVAL; -	efx_filter_init_rx(&spec, EFX_FILTER_PRI_MANUAL, 0, +	efx_filter_init_rx(&spec, EFX_FILTER_PRI_MANUAL, +			   efx->rx_scatter ? EFX_FILTER_FLAG_RX_SCATTER : 0,  			   (rule->ring_cookie == RX_CLS_FLOW_DISC) ?  			   0xfff : rule->ring_cookie); diff --git a/drivers/net/ethernet/sfc/falcon.c b/drivers/net/ethernet/sfc/falcon.c index 49bcd196e10..4486102fa9b 100644 --- a/drivers/net/ethernet/sfc/falcon.c +++ b/drivers/net/ethernet/sfc/falcon.c @@ -1546,10 +1546,6 @@ static int falcon_probe_nic(struct efx_nic *efx)  static void falcon_init_rx_cfg(struct efx_nic *efx)  { -	/* Prior to Siena the RX DMA engine will split each frame at -	 * intervals of RX_USR_BUF_SIZE (32-byte units). We set it to -	 * be so large that that never happens. */ -	const unsigned huge_buf_size = (3 * 4096) >> 5;  	/* RX control FIFO thresholds (32 entries) */  	const unsigned ctrl_xon_thr = 20;  	const unsigned ctrl_xoff_thr = 25; @@ -1557,10 +1553,15 @@ static void falcon_init_rx_cfg(struct efx_nic *efx)  	efx_reado(efx, ®, FR_AZ_RX_CFG);  	if (efx_nic_rev(efx) <= EFX_REV_FALCON_A1) { -		/* Data FIFO size is 5.5K */ +		/* Data FIFO size is 5.5K.  The RX DMA engine only +		 * supports scattering for user-mode queues, but will +		 * split DMA writes at intervals of RX_USR_BUF_SIZE +		 * (32-byte units) even for kernel-mode queues.  We +		 * set it to be so large that that never happens. 
+		 */  		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_DESC_PUSH_EN, 0);  		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_USR_BUF_SIZE, -				    huge_buf_size); +				    (3 * 4096) >> 5);  		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XON_MAC_TH, 512 >> 8);  		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XOFF_MAC_TH, 2048 >> 8);  		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XON_TX_TH, ctrl_xon_thr); @@ -1569,7 +1570,7 @@ static void falcon_init_rx_cfg(struct efx_nic *efx)  		/* Data FIFO size is 80K; register fields moved */  		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_DESC_PUSH_EN, 0);  		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_USR_BUF_SIZE, -				    huge_buf_size); +				    EFX_RX_USR_BUF_SIZE >> 5);  		/* Send XON and XOFF at ~3 * max MTU away from empty/full */  		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_XON_MAC_TH, 27648 >> 8);  		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_XOFF_MAC_TH, 54272 >> 8); @@ -1815,6 +1816,7 @@ const struct efx_nic_type falcon_a1_nic_type = {  	.evq_rptr_tbl_base = FR_AA_EVQ_RPTR_KER,  	.max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),  	.rx_buffer_padding = 0x24, +	.can_rx_scatter = false,  	.max_interrupt_mode = EFX_INT_MODE_MSI,  	.phys_addr_channels = 4,  	.timer_period_max =  1 << FRF_AB_TC_TIMER_VAL_WIDTH, @@ -1865,6 +1867,7 @@ const struct efx_nic_type falcon_b0_nic_type = {  	.max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),  	.rx_buffer_hash_size = 0x10,  	.rx_buffer_padding = 0, +	.can_rx_scatter = true,  	.max_interrupt_mode = EFX_INT_MODE_MSIX,  	.phys_addr_channels = 32, /* Hardware limit is 64, but the legacy  				   * interrupt handler only supports 32 diff --git a/drivers/net/ethernet/sfc/filter.c b/drivers/net/ethernet/sfc/filter.c index 8af42cd1fed..2397f0e8d3e 100644 --- a/drivers/net/ethernet/sfc/filter.c +++ b/drivers/net/ethernet/sfc/filter.c @@ -66,6 +66,10 @@ struct efx_filter_state {  #endif  }; +static void efx_filter_table_clear_entry(struct efx_nic *efx, +					 struct efx_filter_table *table, +					 unsigned int filter_idx); +  /* The filter hash function is LFSR polynomial x^16 + x^3 + 1 of a 32-bit   * key derived from the n-tuple.  The initial LFSR state is 0xffff. */  static u16 efx_filter_hash(u32 key) @@ -168,6 +172,25 @@ static void efx_filter_push_rx_config(struct efx_nic *efx)  			filter_ctl, FRF_CZ_MULTICAST_NOMATCH_RSS_ENABLED,  			!!(table->spec[EFX_FILTER_INDEX_MC_DEF].flags &  			   EFX_FILTER_FLAG_RX_RSS)); + +		/* There is a single bit to enable RX scatter for all +		 * unmatched packets.  Only set it if scatter is +		 * enabled in both filter specs. +		 */ +		EFX_SET_OWORD_FIELD( +			filter_ctl, FRF_BZ_SCATTER_ENBL_NO_MATCH_Q, +			!!(table->spec[EFX_FILTER_INDEX_UC_DEF].flags & +			   table->spec[EFX_FILTER_INDEX_MC_DEF].flags & +			   EFX_FILTER_FLAG_RX_SCATTER)); +	} else if (efx_nic_rev(efx) >= EFX_REV_FALCON_B0) { +		/* We don't expose 'default' filters because unmatched +		 * packets always go to the queue number found in the +		 * RSS table.  But we still need to set the RX scatter +		 * bit here. 
+		 */ +		EFX_SET_OWORD_FIELD( +			filter_ctl, FRF_BZ_SCATTER_ENBL_NO_MATCH_Q, +			efx->rx_scatter);  	}  	efx_writeo(efx, &filter_ctl, FR_BZ_RX_FILTER_CTL); @@ -409,9 +432,18 @@ static void efx_filter_reset_rx_def(struct efx_nic *efx, unsigned filter_idx)  	struct efx_filter_state *state = efx->filter_state;  	struct efx_filter_table *table = &state->table[EFX_FILTER_TABLE_RX_DEF];  	struct efx_filter_spec *spec = &table->spec[filter_idx]; +	enum efx_filter_flags flags = 0; + +	/* If there's only one channel then disable RSS for non VF +	 * traffic, thereby allowing VFs to use RSS when the PF can't. +	 */ +	if (efx->n_rx_channels > 1) +		flags |= EFX_FILTER_FLAG_RX_RSS; + +	if (efx->rx_scatter) +		flags |= EFX_FILTER_FLAG_RX_SCATTER; -	efx_filter_init_rx(spec, EFX_FILTER_PRI_MANUAL, -			   EFX_FILTER_FLAG_RX_RSS, 0); +	efx_filter_init_rx(spec, EFX_FILTER_PRI_MANUAL, flags, 0);  	spec->type = EFX_FILTER_UC_DEF + filter_idx;  	table->used_bitmap[0] |= 1 << filter_idx;  } @@ -463,13 +495,6 @@ static u32 efx_filter_build(efx_oword_t *filter, struct efx_filter_spec *spec)  		break;  	} -	case EFX_FILTER_TABLE_RX_DEF: -		/* One filter spec per type */ -		BUILD_BUG_ON(EFX_FILTER_INDEX_UC_DEF != 0); -		BUILD_BUG_ON(EFX_FILTER_INDEX_MC_DEF != -			     EFX_FILTER_MC_DEF - EFX_FILTER_UC_DEF); -		return spec->type - EFX_FILTER_UC_DEF; -  	case EFX_FILTER_TABLE_RX_MAC: {  		bool is_wild = spec->type == EFX_FILTER_MAC_WILD;  		EFX_POPULATE_OWORD_7( @@ -520,42 +545,6 @@ static bool efx_filter_equal(const struct efx_filter_spec *left,  	return true;  } -static int efx_filter_search(struct efx_filter_table *table, -			     struct efx_filter_spec *spec, u32 key, -			     bool for_insert, unsigned int *depth_required) -{ -	unsigned hash, incr, filter_idx, depth, depth_max; - -	hash = efx_filter_hash(key); -	incr = efx_filter_increment(key); - -	filter_idx = hash & (table->size - 1); -	depth = 1; -	depth_max = (for_insert ? -		     (spec->priority <= EFX_FILTER_PRI_HINT ? -		      FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX) : -		     table->search_depth[spec->type]); - -	for (;;) { -		/* Return success if entry is used and matches this spec -		 * or entry is unused and we are trying to insert. -		 */ -		if (test_bit(filter_idx, table->used_bitmap) ? -		    efx_filter_equal(spec, &table->spec[filter_idx]) : -		    for_insert) { -			*depth_required = depth; -			return filter_idx; -		} - -		/* Return failure if we reached the maximum search depth */ -		if (depth == depth_max) -			return for_insert ? -EBUSY : -ENOENT; - -		filter_idx = (filter_idx + incr) & (table->size - 1); -		++depth; -	} -} -  /*   * Construct/deconstruct external filter IDs.  At least the RX filter   * IDs must be ordered by matching priority, for RX NFC semantics. @@ -650,44 +639,111 @@ u32 efx_filter_get_rx_id_limit(struct efx_nic *efx)   * efx_filter_insert_filter - add or replace a filter   * @efx: NIC in which to insert the filter   * @spec: Specification for the filter - * @replace: Flag for whether the specified filter may replace a filter - *	with an identical match expression and equal or lower priority + * @replace_equal: Flag for whether the specified filter may replace an + *	existing filter with equal priority   *   * On success, return the filter ID.   * On failure, return a negative error code. + * + * If an existing filter has equal match values to the new filter + * spec, then the new filter might replace it, depending on the + * relative priorities.  
If the existing filter has lower priority, or + * if @replace_equal is set and it has equal priority, then it is + * replaced.  Otherwise the function fails, returning -%EPERM if + * the existing filter has higher priority or -%EEXIST if it has + * equal priority.   */  s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec, -			     bool replace) +			     bool replace_equal)  {  	struct efx_filter_state *state = efx->filter_state;  	struct efx_filter_table *table = efx_filter_spec_table(state, spec); -	struct efx_filter_spec *saved_spec;  	efx_oword_t filter; -	unsigned int filter_idx, depth = 0; -	u32 key; +	int rep_index, ins_index; +	unsigned int depth = 0;  	int rc;  	if (!table || table->size == 0)  		return -EINVAL; -	key = efx_filter_build(&filter, spec); -  	netif_vdbg(efx, hw, efx->net_dev,  		   "%s: type %d search_depth=%d", __func__, spec->type,  		   table->search_depth[spec->type]); -	spin_lock_bh(&state->lock); +	if (table->id == EFX_FILTER_TABLE_RX_DEF) { +		/* One filter spec per type */ +		BUILD_BUG_ON(EFX_FILTER_INDEX_UC_DEF != 0); +		BUILD_BUG_ON(EFX_FILTER_INDEX_MC_DEF != +			     EFX_FILTER_MC_DEF - EFX_FILTER_UC_DEF); +		rep_index = spec->type - EFX_FILTER_INDEX_UC_DEF; +		ins_index = rep_index; -	rc = efx_filter_search(table, spec, key, true, &depth); -	if (rc < 0) -		goto out; -	filter_idx = rc; -	BUG_ON(filter_idx >= table->size); -	saved_spec = &table->spec[filter_idx]; +		spin_lock_bh(&state->lock); +	} else { +		/* Search concurrently for +		 * (1) a filter to be replaced (rep_index): any filter +		 *     with the same match values, up to the current +		 *     search depth for this type, and +		 * (2) the insertion point (ins_index): (1) or any +		 *     free slot before it or up to the maximum search +		 *     depth for this priority +		 * We fail if we cannot find (2). +		 * +		 * We can stop once either +		 * (a) we find (1), in which case we have definitely +		 *     found (2) as well; or +		 * (b) we have searched exhaustively for (1), and have +		 *     either found (2) or searched exhaustively for it +		 */ +		u32 key = efx_filter_build(&filter, spec); +		unsigned int hash = efx_filter_hash(key); +		unsigned int incr = efx_filter_increment(key); +		unsigned int max_rep_depth = table->search_depth[spec->type]; +		unsigned int max_ins_depth = +			spec->priority <= EFX_FILTER_PRI_HINT ? +			FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX; +		unsigned int i = hash & (table->size - 1); + +		ins_index = -1; +		depth = 1; + +		spin_lock_bh(&state->lock); + +		for (;;) { +			if (!test_bit(i, table->used_bitmap)) { +				if (ins_index < 0) +					ins_index = i; +			} else if (efx_filter_equal(spec, &table->spec[i])) { +				/* Case (a) */ +				if (ins_index < 0) +					ins_index = i; +				rep_index = i; +				break; +			} + +			if (depth >= max_rep_depth && +			    (ins_index >= 0 || depth >= max_ins_depth)) { +				/* Case (b) */ +				if (ins_index < 0) { +					rc = -EBUSY; +					goto out; +				} +				rep_index = -1; +				break; +			} + +			i = (i + incr) & (table->size - 1); +			++depth; +		} +	} + +	/* If we found a filter to be replaced, check whether we +	 * should do so +	 */ +	if (rep_index >= 0) { +		struct efx_filter_spec *saved_spec = &table->spec[rep_index]; -	if (test_bit(filter_idx, table->used_bitmap)) { -		/* Should we replace the existing filter? 
*/ -		if (!replace) { +		if (spec->priority == saved_spec->priority && !replace_equal) {  			rc = -EEXIST;  			goto out;  		} @@ -695,11 +751,14 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,  			rc = -EPERM;  			goto out;  		} -	} else { -		__set_bit(filter_idx, table->used_bitmap); +	} + +	/* Insert the filter */ +	if (ins_index != rep_index) { +		__set_bit(ins_index, table->used_bitmap);  		++table->used;  	} -	*saved_spec = *spec; +	table->spec[ins_index] = *spec;  	if (table->id == EFX_FILTER_TABLE_RX_DEF) {  		efx_filter_push_rx_config(efx); @@ -713,13 +772,19 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,  		}  		efx_writeo(efx, &filter, -			   table->offset + table->step * filter_idx); +			   table->offset + table->step * ins_index); + +		/* If we were able to replace a filter by inserting +		 * at a lower depth, clear the replaced filter +		 */ +		if (ins_index != rep_index && rep_index >= 0) +			efx_filter_table_clear_entry(efx, table, rep_index);  	}  	netif_vdbg(efx, hw, efx->net_dev,  		   "%s: filter type %d index %d rxq %u set", -		   __func__, spec->type, filter_idx, spec->dmaq_id); -	rc = efx_filter_make_id(spec, filter_idx); +		   __func__, spec->type, ins_index, spec->dmaq_id); +	rc = efx_filter_make_id(spec, ins_index);  out:  	spin_unlock_bh(&state->lock); @@ -1060,6 +1125,50 @@ void efx_remove_filters(struct efx_nic *efx)  	kfree(state);  } +/* Update scatter enable flags for filters pointing to our own RX queues */ +void efx_filter_update_rx_scatter(struct efx_nic *efx) +{ +	struct efx_filter_state *state = efx->filter_state; +	enum efx_filter_table_id table_id; +	struct efx_filter_table *table; +	efx_oword_t filter; +	unsigned int filter_idx; + +	spin_lock_bh(&state->lock); + +	for (table_id = EFX_FILTER_TABLE_RX_IP; +	     table_id <= EFX_FILTER_TABLE_RX_DEF; +	     table_id++) { +		table = &state->table[table_id]; + +		for (filter_idx = 0; filter_idx < table->size; filter_idx++) { +			if (!test_bit(filter_idx, table->used_bitmap) || +			    table->spec[filter_idx].dmaq_id >= +			    efx->n_rx_channels) +				continue; + +			if (efx->rx_scatter) +				table->spec[filter_idx].flags |= +					EFX_FILTER_FLAG_RX_SCATTER; +			else +				table->spec[filter_idx].flags &= +					~EFX_FILTER_FLAG_RX_SCATTER; + +			if (table_id == EFX_FILTER_TABLE_RX_DEF) +				/* Pushed by efx_filter_push_rx_config() */ +				continue; + +			efx_filter_build(&filter, &table->spec[filter_idx]); +			efx_writeo(efx, &filter, +				   table->offset + table->step * filter_idx); +		} +	} + +	efx_filter_push_rx_config(efx); + +	spin_unlock_bh(&state->lock); +} +  #ifdef CONFIG_RFS_ACCEL  int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, diff --git a/drivers/net/ethernet/sfc/mcdi_pcol.h b/drivers/net/ethernet/sfc/mcdi_pcol.h index 9d426d0457b..c5c9747861b 100644 --- a/drivers/net/ethernet/sfc/mcdi_pcol.h +++ b/drivers/net/ethernet/sfc/mcdi_pcol.h @@ -553,6 +553,7 @@  #define          MC_CMD_PTP_MODE_V1_VLAN 0x1 /* enum */  #define          MC_CMD_PTP_MODE_V2 0x2 /* enum */  #define          MC_CMD_PTP_MODE_V2_VLAN 0x3 /* enum */ +#define          MC_CMD_PTP_MODE_V2_ENHANCED 0x4 /* enum */  /* MC_CMD_PTP_IN_DISABLE msgrequest */  #define    MC_CMD_PTP_IN_DISABLE_LEN 8 diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index 0a90abd2421..9bd433a095c 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -69,6 +69,12 @@  #define 
EFX_TXQ_TYPES		4  #define EFX_MAX_TX_QUEUES	(EFX_TXQ_TYPES * EFX_MAX_CHANNELS) +/* Maximum possible MTU the driver supports */ +#define EFX_MAX_MTU (9 * 1024) + +/* Size of an RX scatter buffer.  Small enough to pack 2 into a 4K page. */ +#define EFX_RX_USR_BUF_SIZE 1824 +  /* Forward declare Precision Time Protocol (PTP) support structure. */  struct efx_ptp_data; @@ -206,25 +212,23 @@ struct efx_tx_queue {  /**   * struct efx_rx_buffer - An Efx RX data buffer   * @dma_addr: DMA base address of the buffer - * @skb: The associated socket buffer. Valid iff !(@flags & %EFX_RX_BUF_PAGE). - *	Will be %NULL if the buffer slot is currently free. - * @page: The associated page buffer. Valif iff @flags & %EFX_RX_BUF_PAGE. + * @page: The associated page buffer.   *	Will be %NULL if the buffer slot is currently free. - * @page_offset: Offset within page. Valid iff @flags & %EFX_RX_BUF_PAGE. - * @len: Buffer length, in bytes. - * @flags: Flags for buffer and packet state. + * @page_offset: If pending: offset in @page of DMA base address. + *	If completed: offset in @page of Ethernet header. + * @len: If pending: length for DMA descriptor. + *	If completed: received length, excluding hash prefix. + * @flags: Flags for buffer and packet state.  These are only set on the + *	first buffer of a scattered packet.   */  struct efx_rx_buffer {  	dma_addr_t dma_addr; -	union { -		struct sk_buff *skb; -		struct page *page; -	} u; +	struct page *page;  	u16 page_offset;  	u16 len;  	u16 flags;  }; -#define EFX_RX_BUF_PAGE		0x0001 +#define EFX_RX_BUF_LAST_IN_PAGE	0x0001  #define EFX_RX_PKT_CSUMMED	0x0002  #define EFX_RX_PKT_DISCARD	0x0004 @@ -260,14 +264,23 @@ struct efx_rx_page_state {   * @added_count: Number of buffers added to the receive queue.   * @notified_count: Number of buffers given to NIC (<= @added_count).   * @removed_count: Number of buffers removed from the receive queue. + * @scatter_n: Number of buffers used by current packet + * @page_ring: The ring to store DMA mapped pages for reuse. + * @page_add: Counter to calculate the write pointer for the recycle ring. + * @page_remove: Counter to calculate the read pointer for the recycle ring. + * @page_recycle_count: The number of pages that have been recycled. + * @page_recycle_failed: The number of pages that couldn't be recycled because + *      the kernel still held a reference to them. + * @page_recycle_full: The number of pages that were released because the + *      recycle ring was full. + * @page_ptr_mask: The number of pages in the RX recycle ring minus 1.   * @max_fill: RX descriptor maximum fill level (<= ring size)   * @fast_fill_trigger: RX descriptor fill level that will trigger a fast fill   *	(<= @max_fill)   * @min_fill: RX descriptor minimum non-zero fill level.   *	This records the minimum fill level observed when a ring   *	refill was triggered. - * @alloc_page_count: RX allocation strategy counter. - * @alloc_skb_count: RX allocation strategy counter. + * @recycle_count: RX buffer recycle counter.   * @slow_fill: Timer used to defer efx_nic_generate_fill_event().   
*/  struct efx_rx_queue { @@ -279,15 +292,22 @@ struct efx_rx_queue {  	bool enabled;  	bool flush_pending; -	int added_count; -	int notified_count; -	int removed_count; +	unsigned int added_count; +	unsigned int notified_count; +	unsigned int removed_count; +	unsigned int scatter_n; +	struct page **page_ring; +	unsigned int page_add; +	unsigned int page_remove; +	unsigned int page_recycle_count; +	unsigned int page_recycle_failed; +	unsigned int page_recycle_full; +	unsigned int page_ptr_mask;  	unsigned int max_fill;  	unsigned int fast_fill_trigger;  	unsigned int min_fill;  	unsigned int min_overfill; -	unsigned int alloc_page_count; -	unsigned int alloc_skb_count; +	unsigned int recycle_count;  	struct timer_list slow_fill;  	unsigned int slow_fill_count;  }; @@ -336,10 +356,6 @@ enum efx_rx_alloc_method {   * @event_test_cpu: Last CPU to handle interrupt or test event for this channel   * @irq_count: Number of IRQs since last adaptive moderation decision   * @irq_mod_score: IRQ moderation score - * @rx_alloc_level: Watermark based heuristic counter for pushing descriptors - *	and diagnostic counters - * @rx_alloc_push_pages: RX allocation method currently in use for pushing - *	descriptors   * @n_rx_tobe_disc: Count of RX_TOBE_DISC errors   * @n_rx_ip_hdr_chksum_err: Count of RX IP header checksum errors   * @n_rx_tcp_udp_chksum_err: Count of RX TCP and UDP checksum errors @@ -347,6 +363,12 @@ enum efx_rx_alloc_method {   * @n_rx_frm_trunc: Count of RX_FRM_TRUNC errors   * @n_rx_overlength: Count of RX_OVERLENGTH errors   * @n_skbuff_leaks: Count of skbuffs leaked due to RX overrun + * @n_rx_nodesc_trunc: Number of RX packets truncated and then dropped due to + *	lack of descriptors + * @rx_pkt_n_frags: Number of fragments in next packet to be delivered by + *	__efx_rx_packet(), or zero if there is none + * @rx_pkt_index: Ring index of first buffer for next packet to be delivered + *	by __efx_rx_packet(), if @rx_pkt_n_frags != 0   * @rx_queue: RX queue for this channel   * @tx_queue: TX queues for this channel   */ @@ -371,9 +393,6 @@ struct efx_channel {  	unsigned int rfs_filters_added;  #endif -	int rx_alloc_level; -	int rx_alloc_push_pages; -  	unsigned n_rx_tobe_disc;  	unsigned n_rx_ip_hdr_chksum_err;  	unsigned n_rx_tcp_udp_chksum_err; @@ -381,11 +400,10 @@ struct efx_channel {  	unsigned n_rx_frm_trunc;  	unsigned n_rx_overlength;  	unsigned n_skbuff_leaks; +	unsigned int n_rx_nodesc_trunc; -	/* Used to pipeline received packets in order to optimise memory -	 * access with prefetches. 
-	 */ -	struct efx_rx_buffer *rx_pkt; +	unsigned int rx_pkt_n_frags; +	unsigned int rx_pkt_index;  	struct efx_rx_queue rx_queue;  	struct efx_tx_queue tx_queue[EFX_TXQ_TYPES]; @@ -410,7 +428,7 @@ struct efx_channel_type {  	void (*post_remove)(struct efx_channel *);  	void (*get_name)(struct efx_channel *, char *buf, size_t len);  	struct efx_channel *(*copy)(const struct efx_channel *); -	void (*receive_skb)(struct efx_channel *, struct sk_buff *); +	bool (*receive_skb)(struct efx_channel *, struct sk_buff *);  	bool keep_eventq;  }; @@ -446,6 +464,7 @@ enum nic_state {  	STATE_UNINIT = 0,	/* device being probed/removed or is frozen */  	STATE_READY = 1,	/* hardware ready and netdev registered */  	STATE_DISABLED = 2,	/* device disabled due to hardware errors */ +	STATE_RECOVERY = 3,	/* device recovering from PCI error */  };  /* @@ -684,10 +703,13 @@ struct vfdi_status;   * @n_channels: Number of channels in use   * @n_rx_channels: Number of channels used for RX (= number of RX queues)   * @n_tx_channels: Number of channels used for TX - * @rx_buffer_len: RX buffer length + * @rx_dma_len: Current maximum RX DMA length   * @rx_buffer_order: Order (log2) of number of pages for each RX buffer + * @rx_buffer_truesize: Amortised allocation size of an RX buffer, + *	for use in sk_buff::truesize   * @rx_hash_key: Toeplitz hash key for RSS   * @rx_indir_table: Indirection table for RSS + * @rx_scatter: Scatter mode enabled for receives   * @int_error_count: Number of internal errors seen recently   * @int_error_expire: Time at which error count will be expired   * @irq_status: Interrupt status buffer @@ -800,10 +822,15 @@ struct efx_nic {  	unsigned rss_spread;  	unsigned tx_channel_offset;  	unsigned n_tx_channels; -	unsigned int rx_buffer_len; +	unsigned int rx_dma_len;  	unsigned int rx_buffer_order; +	unsigned int rx_buffer_truesize; +	unsigned int rx_page_buf_step; +	unsigned int rx_bufs_per_page; +	unsigned int rx_pages_per_batch;  	u8 rx_hash_key[40];  	u32 rx_indir_table[128]; +	bool rx_scatter;  	unsigned int_error_count;  	unsigned long int_error_expire; @@ -934,8 +961,9 @@ static inline unsigned int efx_port_num(struct efx_nic *efx)   * @evq_ptr_tbl_base: Event queue pointer table base address   * @evq_rptr_tbl_base: Event queue read-pointer table base address   * @max_dma_mask: Maximum possible DMA mask - * @rx_buffer_hash_size: Size of hash at start of RX buffer - * @rx_buffer_padding: Size of padding at end of RX buffer + * @rx_buffer_hash_size: Size of hash at start of RX packet + * @rx_buffer_padding: Size of padding at end of RX packet + * @can_rx_scatter: NIC is able to scatter packet to multiple buffers   * @max_interrupt_mode: Highest capability interrupt mode supported   *	from &enum efx_init_mode.   
* @phys_addr_channels: Number of channels with physically addressed @@ -983,6 +1011,7 @@ struct efx_nic_type {  	u64 max_dma_mask;  	unsigned int rx_buffer_hash_size;  	unsigned int rx_buffer_padding; +	bool can_rx_scatter;  	unsigned int max_interrupt_mode;  	unsigned int phys_addr_channels;  	unsigned int timer_period_max; diff --git a/drivers/net/ethernet/sfc/nic.c b/drivers/net/ethernet/sfc/nic.c index eaa8e874a3c..b0503cd8c2a 100644 --- a/drivers/net/ethernet/sfc/nic.c +++ b/drivers/net/ethernet/sfc/nic.c @@ -305,11 +305,11 @@ int efx_nic_alloc_buffer(struct efx_nic *efx, struct efx_buffer *buffer,  			 unsigned int len)  {  	buffer->addr = dma_alloc_coherent(&efx->pci_dev->dev, len, -					  &buffer->dma_addr, GFP_ATOMIC); +					  &buffer->dma_addr, +					  GFP_ATOMIC | __GFP_ZERO);  	if (!buffer->addr)  		return -ENOMEM;  	buffer->len = len; -	memset(buffer->addr, 0, len);  	return 0;  } @@ -592,12 +592,22 @@ void efx_nic_init_rx(struct efx_rx_queue *rx_queue)  	struct efx_nic *efx = rx_queue->efx;  	bool is_b0 = efx_nic_rev(efx) >= EFX_REV_FALCON_B0;  	bool iscsi_digest_en = is_b0; +	bool jumbo_en; + +	/* For kernel-mode queues in Falcon A1, the JUMBO flag enables +	 * DMA to continue after a PCIe page boundary (and scattering +	 * is not possible).  In Falcon B0 and Siena, it enables +	 * scatter. +	 */ +	jumbo_en = !is_b0 || efx->rx_scatter;  	netif_dbg(efx, hw, efx->net_dev,  		  "RX queue %d ring in special buffers %d-%d\n",  		  efx_rx_queue_index(rx_queue), rx_queue->rxd.index,  		  rx_queue->rxd.index + rx_queue->rxd.entries - 1); +	rx_queue->scatter_n = 0; +  	/* Pin RX descriptor ring */  	efx_init_special_buffer(efx, &rx_queue->rxd); @@ -614,8 +624,7 @@ void efx_nic_init_rx(struct efx_rx_queue *rx_queue)  			      FRF_AZ_RX_DESCQ_SIZE,  			      __ffs(rx_queue->rxd.entries),  			      FRF_AZ_RX_DESCQ_TYPE, 0 /* kernel queue */ , -			      /* For >=B0 this is scatter so disable */ -			      FRF_AZ_RX_DESCQ_JUMBO, !is_b0, +			      FRF_AZ_RX_DESCQ_JUMBO, jumbo_en,  			      FRF_AZ_RX_DESCQ_EN, 1);  	efx_writeo_table(efx, &rx_desc_ptr, efx->type->rxd_ptr_tbl_base,  			 efx_rx_queue_index(rx_queue)); @@ -969,13 +978,24 @@ static u16 efx_handle_rx_not_ok(struct efx_rx_queue *rx_queue,  		EFX_RX_PKT_DISCARD : 0;  } -/* Handle receive events that are not in-order. */ -static void +/* Handle receive events that are not in-order. Return true if this + * can be handled as a partial packet discard, false if it's more + * serious. + */ +static bool  efx_handle_rx_bad_index(struct efx_rx_queue *rx_queue, unsigned index)  { +	struct efx_channel *channel = efx_rx_queue_channel(rx_queue);  	struct efx_nic *efx = rx_queue->efx;  	unsigned expected, dropped; +	if (rx_queue->scatter_n && +	    index == ((rx_queue->removed_count + rx_queue->scatter_n - 1) & +		      rx_queue->ptr_mask)) { +		++channel->n_rx_nodesc_trunc; +		return true; +	} +  	expected = rx_queue->removed_count & rx_queue->ptr_mask;  	dropped = (index - expected) & rx_queue->ptr_mask;  	netif_info(efx, rx_err, efx->net_dev, @@ -984,6 +1004,7 @@ efx_handle_rx_bad_index(struct efx_rx_queue *rx_queue, unsigned index)  	efx_schedule_reset(efx, EFX_WORKAROUND_5676(efx) ?  			   
RESET_TYPE_RX_RECOVERY : RESET_TYPE_DISABLE); +	return false;  }  /* Handle a packet received event @@ -999,7 +1020,7 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)  	unsigned int rx_ev_desc_ptr, rx_ev_byte_cnt;  	unsigned int rx_ev_hdr_type, rx_ev_mcast_pkt;  	unsigned expected_ptr; -	bool rx_ev_pkt_ok; +	bool rx_ev_pkt_ok, rx_ev_sop, rx_ev_cont;  	u16 flags;  	struct efx_rx_queue *rx_queue;  	struct efx_nic *efx = channel->efx; @@ -1007,21 +1028,56 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)  	if (unlikely(ACCESS_ONCE(efx->reset_pending)))  		return; -	/* Basic packet information */ -	rx_ev_byte_cnt = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_BYTE_CNT); -	rx_ev_pkt_ok = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_PKT_OK); -	rx_ev_hdr_type = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_HDR_TYPE); -	WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT)); -	WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_SOP) != 1); +	rx_ev_cont = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT); +	rx_ev_sop = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_SOP);  	WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_Q_LABEL) !=  		channel->channel);  	rx_queue = efx_channel_get_rx_queue(channel);  	rx_ev_desc_ptr = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_DESC_PTR); -	expected_ptr = rx_queue->removed_count & rx_queue->ptr_mask; -	if (unlikely(rx_ev_desc_ptr != expected_ptr)) -		efx_handle_rx_bad_index(rx_queue, rx_ev_desc_ptr); +	expected_ptr = ((rx_queue->removed_count + rx_queue->scatter_n) & +			rx_queue->ptr_mask); + +	/* Check for partial drops and other errors */ +	if (unlikely(rx_ev_desc_ptr != expected_ptr) || +	    unlikely(rx_ev_sop != (rx_queue->scatter_n == 0))) { +		if (rx_ev_desc_ptr != expected_ptr && +		    !efx_handle_rx_bad_index(rx_queue, rx_ev_desc_ptr)) +			return; + +		/* Discard all pending fragments */ +		if (rx_queue->scatter_n) { +			efx_rx_packet( +				rx_queue, +				rx_queue->removed_count & rx_queue->ptr_mask, +				rx_queue->scatter_n, 0, EFX_RX_PKT_DISCARD); +			rx_queue->removed_count += rx_queue->scatter_n; +			rx_queue->scatter_n = 0; +		} + +		/* Return if there is no new fragment */ +		if (rx_ev_desc_ptr != expected_ptr) +			return; + +		/* Discard new fragment if not SOP */ +		if (!rx_ev_sop) { +			efx_rx_packet( +				rx_queue, +				rx_queue->removed_count & rx_queue->ptr_mask, +				1, 0, EFX_RX_PKT_DISCARD); +			++rx_queue->removed_count; +			return; +		} +	} + +	++rx_queue->scatter_n; +	if (rx_ev_cont) +		return; + +	rx_ev_byte_cnt = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_BYTE_CNT); +	rx_ev_pkt_ok = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_PKT_OK); +	rx_ev_hdr_type = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_HDR_TYPE);  	if (likely(rx_ev_pkt_ok)) {  		/* If packet is marked as OK and packet type is TCP/IP or @@ -1049,7 +1105,11 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)  	channel->irq_mod_score += 2;  	/* Handle received packet */ -	efx_rx_packet(rx_queue, rx_ev_desc_ptr, rx_ev_byte_cnt, flags); +	efx_rx_packet(rx_queue, +		      rx_queue->removed_count & rx_queue->ptr_mask, +		      rx_queue->scatter_n, rx_ev_byte_cnt, flags); +	rx_queue->removed_count += rx_queue->scatter_n; +	rx_queue->scatter_n = 0;  }  /* If this flush done event corresponds to a &struct efx_tx_queue, then diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index 3f93624fc27..07f6baa15c0 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -99,6 +99,9 @@  #define PTP_V2_VERSION_LENGTH	1  #define 
PTP_V2_VERSION_OFFSET	29 +#define PTP_V2_UUID_LENGTH	8 +#define PTP_V2_UUID_OFFSET	48 +  /* Although PTP V2 UUIDs are comprised a ClockIdentity (8) and PortNumber (2),   * the MC only captures the last six bytes of the clock identity. These values   * reflect those, not the ones used in the standard.  The standard permits @@ -429,13 +432,10 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,  	unsigned number_readings = (response_length /  			       MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN);  	unsigned i; -	unsigned min; -	unsigned min_set = 0;  	unsigned total;  	unsigned ngood = 0;  	unsigned last_good = 0;  	struct efx_ptp_data *ptp = efx->ptp_data; -	bool min_valid = false;  	u32 last_sec;  	u32 start_sec;  	struct timespec delta; @@ -443,35 +443,17 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,  	if (number_readings == 0)  		return -EAGAIN; -	/* Find minimum value in this set of results, discarding clearly -	 * erroneous results. +	/* Read the set of results and increment stats for any results that +	 * appera to be erroneous.  	 */  	for (i = 0; i < number_readings; i++) {  		efx_ptp_read_timeset(synch_buf, &ptp->timeset[i]);  		synch_buf += MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN; -		if (ptp->timeset[i].window > SYNCHRONISATION_GRANULARITY_NS) { -			if (min_valid) { -				if (ptp->timeset[i].window < min_set) -					min_set = ptp->timeset[i].window; -			} else { -				min_valid = true; -				min_set = ptp->timeset[i].window; -			} -		} -	} - -	if (min_valid) { -		if (ptp->base_sync_valid && (min_set > ptp->base_sync_ns)) -			min = ptp->base_sync_ns; -		else -			min = min_set; -	} else { -		min = SYNCHRONISATION_GRANULARITY_NS;  	} -	/* Discard excessively long synchronise durations.  The MC times -	 * when it finishes reading the host time so the corrected window -	 * time should be fairly constant for a given platform. +	/* Find the last good host-MC synchronization result. The MC times +	 * when it finishes reading the host time so the corrected window time +	 * should be fairly constant for a given platform.  	 */  	total = 0;  	for (i = 0; i < number_readings; i++) @@ -489,8 +471,8 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,  	if (ngood == 0) {  		netif_warn(efx, drv, efx->net_dev, -			   "PTP no suitable synchronisations %dns %dns\n", -			   ptp->base_sync_ns, min_set); +			   "PTP no suitable synchronisations %dns\n", +			   ptp->base_sync_ns);  		return -EAGAIN;  	} @@ -1006,43 +988,53 @@ bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)   * the receive timestamp from the MC - this will probably occur after the   * packet arrival because of the processing in the MC.   */ -static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb) +static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)  {  	struct efx_nic *efx = channel->efx;  	struct efx_ptp_data *ptp = efx->ptp_data;  	struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb; -	u8 *data; +	u8 *match_data_012, *match_data_345;  	unsigned int version;  	match->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS);  	/* Correct version? 
*/  	if (ptp->mode == MC_CMD_PTP_MODE_V1) { -		if (skb->len < PTP_V1_MIN_LENGTH) { -			netif_receive_skb(skb); -			return; +		if (!pskb_may_pull(skb, PTP_V1_MIN_LENGTH)) { +			return false;  		}  		version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]);  		if (version != PTP_VERSION_V1) { -			netif_receive_skb(skb); -			return; +			return false;  		} + +		/* PTP V1 uses all six bytes of the UUID to match the packet +		 * to the timestamp +		 */ +		match_data_012 = skb->data + PTP_V1_UUID_OFFSET; +		match_data_345 = skb->data + PTP_V1_UUID_OFFSET + 3;  	} else { -		if (skb->len < PTP_V2_MIN_LENGTH) { -			netif_receive_skb(skb); -			return; +		if (!pskb_may_pull(skb, PTP_V2_MIN_LENGTH)) { +			return false;  		}  		version = skb->data[PTP_V2_VERSION_OFFSET]; - -		BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2); -		BUILD_BUG_ON(PTP_V1_UUID_OFFSET != PTP_V2_MC_UUID_OFFSET); -		BUILD_BUG_ON(PTP_V1_UUID_LENGTH != PTP_V2_MC_UUID_LENGTH); -		BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET); -		BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH); -  		if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) { -			netif_receive_skb(skb); -			return; +			return false; +		} + +		/* The original V2 implementation uses bytes 2-7 of +		 * the UUID to match the packet to the timestamp. This +		 * discards two of the bytes of the MAC address used +		 * to create the UUID (SF bug 33070).  The PTP V2 +		 * enhanced mode fixes this issue and uses bytes 0-2 +		 * and byte 5-7 of the UUID. +		 */ +		match_data_345 = skb->data + PTP_V2_UUID_OFFSET + 5; +		if (ptp->mode == MC_CMD_PTP_MODE_V2) { +			match_data_012 = skb->data + PTP_V2_UUID_OFFSET + 2; +		} else { +			match_data_012 = skb->data + PTP_V2_UUID_OFFSET + 0; +			BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2_ENHANCED);  		}  	} @@ -1056,14 +1048,19 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)  		timestamps = skb_hwtstamps(skb);  		memset(timestamps, 0, sizeof(*timestamps)); +		/* We expect the sequence number to be in the same position in +		 * the packet for PTP V1 and V2 +		 */ +		BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET); +		BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH); +  		/* Extract UUID/Sequence information */ -		data = skb->data + PTP_V1_UUID_OFFSET; -		match->words[0] = (data[0]         | -				   (data[1] << 8)  | -				   (data[2] << 16) | -				   (data[3] << 24)); -		match->words[1] = (data[4]         | -				   (data[5] << 8)  | +		match->words[0] = (match_data_012[0]         | +				   (match_data_012[1] << 8)  | +				   (match_data_012[2] << 16) | +				   (match_data_345[0] << 24)); +		match->words[1] = (match_data_345[1]         | +				   (match_data_345[2] << 8)  |  				   (skb->data[PTP_V1_SEQUENCE_OFFSET +  					      PTP_V1_SEQUENCE_LENGTH - 1] <<  				    16)); @@ -1073,6 +1070,8 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)  	skb_queue_tail(&ptp->rxq, skb);  	queue_work(ptp->workwq, &ptp->work); + +	return true;  }  /* Transmit a PTP packet.  
This has to be transmitted by the MC @@ -1167,7 +1166,7 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)  	 * timestamped  	 */  		init->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT; -		new_mode = MC_CMD_PTP_MODE_V2; +		new_mode = MC_CMD_PTP_MODE_V2_ENHANCED;  		enable_wanted = true;  		break;  	case HWTSTAMP_FILTER_PTP_V2_EVENT: @@ -1186,7 +1185,14 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)  	if (init->tx_type != HWTSTAMP_TX_OFF)  		enable_wanted = true; +	/* Old versions of the firmware do not support the improved +	 * UUID filtering option (SF bug 33070).  If the firmware does +	 * not accept the enhanced mode, fall back to the standard PTP +	 * v2 UUID filtering. +	 */  	rc = efx_ptp_change_mode(efx, enable_wanted, new_mode); +	if ((rc != 0) && (new_mode == MC_CMD_PTP_MODE_V2_ENHANCED)) +		rc = efx_ptp_change_mode(efx, enable_wanted, MC_CMD_PTP_MODE_V2);  	if (rc != 0)  		return rc; diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index bb579a6128c..e73e30bac10 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -16,6 +16,7 @@  #include <linux/udp.h>  #include <linux/prefetch.h>  #include <linux/moduleparam.h> +#include <linux/iommu.h>  #include <net/ip.h>  #include <net/checksum.h>  #include "net_driver.h" @@ -24,85 +25,39 @@  #include "selftest.h"  #include "workarounds.h" -/* Number of RX descriptors pushed at once. */ -#define EFX_RX_BATCH  8 +/* Preferred number of descriptors to fill at once */ +#define EFX_RX_PREFERRED_BATCH 8U -/* Maximum size of a buffer sharing a page */ -#define EFX_RX_HALF_PAGE ((PAGE_SIZE >> 1) - sizeof(struct efx_rx_page_state)) +/* Number of RX buffers to recycle pages for.  When creating the RX page recycle + * ring, this number is divided by the number of buffers per page to calculate + * the number of pages to store in the RX page recycle ring. + */ +#define EFX_RECYCLE_RING_SIZE_IOMMU 4096 +#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_PREFERRED_BATCH)  /* Size of buffer allocated for skb header area. */  #define EFX_SKB_HEADERS  64u -/* - * rx_alloc_method - RX buffer allocation method - * - * This driver supports two methods for allocating and using RX buffers: - * each RX buffer may be backed by an skb or by an order-n page. - * - * When GRO is in use then the second method has a lower overhead, - * since we don't have to allocate then free skbs on reassembled frames. - * - * Values: - *   - RX_ALLOC_METHOD_AUTO = 0 - *   - RX_ALLOC_METHOD_SKB  = 1 - *   - RX_ALLOC_METHOD_PAGE = 2 - * - * The heuristic for %RX_ALLOC_METHOD_AUTO is a simple hysteresis count - * controlled by the parameters below. - * - *   - Since pushing and popping descriptors are separated by the rx_queue - *     size, so the watermarks should be ~rxd_size. - *   - The performance win by using page-based allocation for GRO is less - *     than the performance hit of using page-based allocation of non-GRO, - *     so the watermarks should reflect this. - * - * Per channel we maintain a single variable, updated by each channel: - * - *   rx_alloc_level += (gro_performed ? RX_ALLOC_FACTOR_GRO : - *                      RX_ALLOC_FACTOR_SKB) - * Per NAPI poll interval, we constrain rx_alloc_level to 0..MAX (which - * limits the hysteresis), and update the allocation strategy: - * - *   rx_alloc_method = (rx_alloc_level > RX_ALLOC_LEVEL_GRO ? 
- *                      RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB) - */ -static int rx_alloc_method = RX_ALLOC_METHOD_AUTO; - -#define RX_ALLOC_LEVEL_GRO 0x2000 -#define RX_ALLOC_LEVEL_MAX 0x3000 -#define RX_ALLOC_FACTOR_GRO 1 -#define RX_ALLOC_FACTOR_SKB (-2) -  /* This is the percentage fill level below which new RX descriptors   * will be added to the RX descriptor ring.   */  static unsigned int rx_refill_threshold; +/* Each packet can consume up to ceil(max_frame_len / buffer_size) buffers */ +#define EFX_RX_MAX_FRAGS DIV_ROUND_UP(EFX_MAX_FRAME_LEN(EFX_MAX_MTU), \ +				      EFX_RX_USR_BUF_SIZE) +  /*   * RX maximum head room required.   * - * This must be at least 1 to prevent overflow and at least 2 to allow - * pipelined receives. + * This must be at least 1 to prevent overflow, plus one packet-worth + * to allow pipelined receives.   */ -#define EFX_RXD_HEAD_ROOM 2 +#define EFX_RXD_HEAD_ROOM (1 + EFX_RX_MAX_FRAGS) -/* Offset of ethernet header within page */ -static inline unsigned int efx_rx_buf_offset(struct efx_nic *efx, -					     struct efx_rx_buffer *buf) +static inline u8 *efx_rx_buf_va(struct efx_rx_buffer *buf)  { -	return buf->page_offset + efx->type->rx_buffer_hash_size; -} -static inline unsigned int efx_rx_buf_size(struct efx_nic *efx) -{ -	return PAGE_SIZE << efx->rx_buffer_order; -} - -static u8 *efx_rx_buf_eh(struct efx_nic *efx, struct efx_rx_buffer *buf) -{ -	if (buf->flags & EFX_RX_BUF_PAGE) -		return page_address(buf->u.page) + efx_rx_buf_offset(efx, buf); -	else -		return (u8 *)buf->u.skb->data + efx->type->rx_buffer_hash_size; +	return page_address(buf->page) + buf->page_offset;  }  static inline u32 efx_rx_buf_hash(const u8 *eh) @@ -119,66 +74,81 @@ static inline u32 efx_rx_buf_hash(const u8 *eh)  #endif  } -/** - * efx_init_rx_buffers_skb - create EFX_RX_BATCH skb-based RX buffers - * - * @rx_queue:		Efx RX queue - * - * This allocates EFX_RX_BATCH skbs, maps them for DMA, and populates a - * struct efx_rx_buffer for each one. Return a negative error code or 0 - * on success. May fail having only inserted fewer than EFX_RX_BATCH - * buffers. - */ -static int efx_init_rx_buffers_skb(struct efx_rx_queue *rx_queue) +static inline struct efx_rx_buffer * +efx_rx_buf_next(struct efx_rx_queue *rx_queue, struct efx_rx_buffer *rx_buf)  { -	struct efx_nic *efx = rx_queue->efx; -	struct net_device *net_dev = efx->net_dev; -	struct efx_rx_buffer *rx_buf; -	struct sk_buff *skb; -	int skb_len = efx->rx_buffer_len; -	unsigned index, count; +	if (unlikely(rx_buf == efx_rx_buffer(rx_queue, rx_queue->ptr_mask))) +		return efx_rx_buffer(rx_queue, 0); +	else +		return rx_buf + 1; +} -	for (count = 0; count < EFX_RX_BATCH; ++count) { -		index = rx_queue->added_count & rx_queue->ptr_mask; -		rx_buf = efx_rx_buffer(rx_queue, index); +static inline void efx_sync_rx_buffer(struct efx_nic *efx, +				      struct efx_rx_buffer *rx_buf, +				      unsigned int len) +{ +	dma_sync_single_for_cpu(&efx->pci_dev->dev, rx_buf->dma_addr, len, +				DMA_FROM_DEVICE); +} -		rx_buf->u.skb = skb = netdev_alloc_skb(net_dev, skb_len); -		if (unlikely(!skb)) -			return -ENOMEM; +void efx_rx_config_page_split(struct efx_nic *efx) +{ +	efx->rx_page_buf_step = ALIGN(efx->rx_dma_len + EFX_PAGE_IP_ALIGN, +				      L1_CACHE_BYTES); +	efx->rx_bufs_per_page = efx->rx_buffer_order ? 
1 : +		((PAGE_SIZE - sizeof(struct efx_rx_page_state)) / +		 efx->rx_page_buf_step); +	efx->rx_buffer_truesize = (PAGE_SIZE << efx->rx_buffer_order) / +		efx->rx_bufs_per_page; +	efx->rx_pages_per_batch = DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH, +					       efx->rx_bufs_per_page); +} + +/* Check the RX page recycle ring for a page that can be reused. */ +static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue) +{ +	struct efx_nic *efx = rx_queue->efx; +	struct page *page; +	struct efx_rx_page_state *state; +	unsigned index; -		/* Adjust the SKB for padding */ -		skb_reserve(skb, NET_IP_ALIGN); -		rx_buf->len = skb_len - NET_IP_ALIGN; -		rx_buf->flags = 0; +	index = rx_queue->page_remove & rx_queue->page_ptr_mask; +	page = rx_queue->page_ring[index]; +	if (page == NULL) +		return NULL; -		rx_buf->dma_addr = dma_map_single(&efx->pci_dev->dev, -						  skb->data, rx_buf->len, -						  DMA_FROM_DEVICE); -		if (unlikely(dma_mapping_error(&efx->pci_dev->dev, -					       rx_buf->dma_addr))) { -			dev_kfree_skb_any(skb); -			rx_buf->u.skb = NULL; -			return -EIO; -		} +	rx_queue->page_ring[index] = NULL; +	/* page_remove cannot exceed page_add. */ +	if (rx_queue->page_remove != rx_queue->page_add) +		++rx_queue->page_remove; -		++rx_queue->added_count; -		++rx_queue->alloc_skb_count; +	/* If page_count is 1 then we hold the only reference to this page. */ +	if (page_count(page) == 1) { +		++rx_queue->page_recycle_count; +		return page; +	} else { +		state = page_address(page); +		dma_unmap_page(&efx->pci_dev->dev, state->dma_addr, +			       PAGE_SIZE << efx->rx_buffer_order, +			       DMA_FROM_DEVICE); +		put_page(page); +		++rx_queue->page_recycle_failed;  	} -	return 0; +	return NULL;  }  /** - * efx_init_rx_buffers_page - create EFX_RX_BATCH page-based RX buffers + * efx_init_rx_buffers - create EFX_RX_BATCH page-based RX buffers   *   * @rx_queue:		Efx RX queue   * - * This allocates memory for EFX_RX_BATCH receive buffers, maps them for DMA, - * and populates struct efx_rx_buffers for each one. Return a negative error - * code or 0 on success. If a single page can be split between two buffers, - * then the page will either be inserted fully, or not at at all. + * This allocates a batch of pages, maps them for DMA, and populates + * struct efx_rx_buffers for each one. Return a negative error code or + * 0 on success. If a single page can be used for multiple buffers, + * then the page will either be inserted fully, or not at all.   
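A worked example of the page-split arithmetic in efx_rx_config_page_split() above, written as a standalone userspace sketch. PAGE_SIZE, the cache line, the IP-alignment pad, the per-page state header size and rx_dma_len are all assumed values for a 1500-byte MTU on a 4 KiB-page system with rx_buffer_order 0; the driver takes them from the kernel.

#include <stdio.h>

#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int page_size = 4096;		/* assumed PAGE_SIZE */
	unsigned int cacheline = 64;		/* assumed L1_CACHE_BYTES */
	unsigned int ip_align = 2;		/* assumed EFX_PAGE_IP_ALIGN */
	unsigned int state_size = 16;		/* assumed page-state header size */
	unsigned int rx_dma_len = 1536;		/* assumed DMA length for a 1500 MTU */

	unsigned int buf_step = ALIGN_UP(rx_dma_len + ip_align, cacheline);
	unsigned int bufs_per_page = (page_size - state_size) / buf_step;
	unsigned int truesize = page_size / bufs_per_page;
	unsigned int pages_per_batch = DIV_ROUND_UP(8, bufs_per_page);

	printf("step=%u bufs/page=%u truesize=%u pages/batch=%u\n",
	       buf_step, bufs_per_page, truesize, pages_per_batch);
	return 0;
}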
*/ -static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue) +static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)  {  	struct efx_nic *efx = rx_queue->efx;  	struct efx_rx_buffer *rx_buf; @@ -188,150 +158,140 @@ static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue)  	dma_addr_t dma_addr;  	unsigned index, count; -	/* We can split a page between two buffers */ -	BUILD_BUG_ON(EFX_RX_BATCH & 1); - -	for (count = 0; count < EFX_RX_BATCH; ++count) { -		page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC, -				   efx->rx_buffer_order); -		if (unlikely(page == NULL)) -			return -ENOMEM; -		dma_addr = dma_map_page(&efx->pci_dev->dev, page, 0, -					efx_rx_buf_size(efx), -					DMA_FROM_DEVICE); -		if (unlikely(dma_mapping_error(&efx->pci_dev->dev, dma_addr))) { -			__free_pages(page, efx->rx_buffer_order); -			return -EIO; +	count = 0; +	do { +		page = efx_reuse_page(rx_queue); +		if (page == NULL) { +			page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC, +					   efx->rx_buffer_order); +			if (unlikely(page == NULL)) +				return -ENOMEM; +			dma_addr = +				dma_map_page(&efx->pci_dev->dev, page, 0, +					     PAGE_SIZE << efx->rx_buffer_order, +					     DMA_FROM_DEVICE); +			if (unlikely(dma_mapping_error(&efx->pci_dev->dev, +						       dma_addr))) { +				__free_pages(page, efx->rx_buffer_order); +				return -EIO; +			} +			state = page_address(page); +			state->dma_addr = dma_addr; +		} else { +			state = page_address(page); +			dma_addr = state->dma_addr;  		} -		state = page_address(page); -		state->refcnt = 0; -		state->dma_addr = dma_addr;  		dma_addr += sizeof(struct efx_rx_page_state);  		page_offset = sizeof(struct efx_rx_page_state); -	split: -		index = rx_queue->added_count & rx_queue->ptr_mask; -		rx_buf = efx_rx_buffer(rx_queue, index); -		rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN; -		rx_buf->u.page = page; -		rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN; -		rx_buf->len = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN; -		rx_buf->flags = EFX_RX_BUF_PAGE; -		++rx_queue->added_count; -		++rx_queue->alloc_page_count; -		++state->refcnt; - -		if ((~count & 1) && (efx->rx_buffer_len <= EFX_RX_HALF_PAGE)) { -			/* Use the second half of the page */ +		do { +			index = rx_queue->added_count & rx_queue->ptr_mask; +			rx_buf = efx_rx_buffer(rx_queue, index); +			rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN; +			rx_buf->page = page; +			rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN; +			rx_buf->len = efx->rx_dma_len; +			rx_buf->flags = 0; +			++rx_queue->added_count;  			get_page(page); -			dma_addr += (PAGE_SIZE >> 1); -			page_offset += (PAGE_SIZE >> 1); -			++count; -			goto split; -		} -	} +			dma_addr += efx->rx_page_buf_step; +			page_offset += efx->rx_page_buf_step; +		} while (page_offset + efx->rx_page_buf_step <= PAGE_SIZE); + +		rx_buf->flags = EFX_RX_BUF_LAST_IN_PAGE; +	} while (++count < efx->rx_pages_per_batch);  	return 0;  } +/* Unmap a DMA-mapped page.  This function is only called for the final RX + * buffer in a page. 
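The EFX_RX_MAX_FRAGS and EFX_RXD_HEAD_ROOM sizing earlier in this file can be checked with a quick standalone calculation; the jumbo frame length and per-buffer payload below are stand-ins, not the driver's actual EFX_MAX_FRAME_LEN()/EFX_RX_USR_BUF_SIZE values.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int max_frame_len = 9 * 1024 + 18;	/* assumed jumbo frame incl. headers */
	unsigned int usr_buf_size = 1824;		/* assumed per-buffer payload */

	unsigned int max_frags = DIV_ROUND_UP(max_frame_len, usr_buf_size);
	unsigned int head_room = 1 + max_frags;		/* 1 to avoid overflow + 1 packet */

	printf("max frags per packet: %u, descriptor head room: %u\n",
	       max_frags, head_room);
	return 0;
}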
+ */  static void efx_unmap_rx_buffer(struct efx_nic *efx, -				struct efx_rx_buffer *rx_buf, -				unsigned int used_len) +				struct efx_rx_buffer *rx_buf)  { -	if ((rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.page) { -		struct efx_rx_page_state *state; +	struct page *page = rx_buf->page; -		state = page_address(rx_buf->u.page); -		if (--state->refcnt == 0) { -			dma_unmap_page(&efx->pci_dev->dev, -				       state->dma_addr, -				       efx_rx_buf_size(efx), -				       DMA_FROM_DEVICE); -		} else if (used_len) { -			dma_sync_single_for_cpu(&efx->pci_dev->dev, -						rx_buf->dma_addr, used_len, -						DMA_FROM_DEVICE); -		} -	} else if (!(rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.skb) { -		dma_unmap_single(&efx->pci_dev->dev, rx_buf->dma_addr, -				 rx_buf->len, DMA_FROM_DEVICE); +	if (page) { +		struct efx_rx_page_state *state = page_address(page); +		dma_unmap_page(&efx->pci_dev->dev, +			       state->dma_addr, +			       PAGE_SIZE << efx->rx_buffer_order, +			       DMA_FROM_DEVICE);  	}  } -static void efx_free_rx_buffer(struct efx_nic *efx, -			       struct efx_rx_buffer *rx_buf) +static void efx_free_rx_buffer(struct efx_rx_buffer *rx_buf)  { -	if ((rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.page) { -		__free_pages(rx_buf->u.page, efx->rx_buffer_order); -		rx_buf->u.page = NULL; -	} else if (!(rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.skb) { -		dev_kfree_skb_any(rx_buf->u.skb); -		rx_buf->u.skb = NULL; +	if (rx_buf->page) { +		put_page(rx_buf->page); +		rx_buf->page = NULL;  	}  } -static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue, -			       struct efx_rx_buffer *rx_buf) +/* Attempt to recycle the page if there is an RX recycle ring; the page can + * only be added if this is the final RX buffer, to prevent pages being used in + * the descriptor ring and appearing in the recycle ring simultaneously. + */ +static void efx_recycle_rx_page(struct efx_channel *channel, +				struct efx_rx_buffer *rx_buf)  { -	efx_unmap_rx_buffer(rx_queue->efx, rx_buf, 0); -	efx_free_rx_buffer(rx_queue->efx, rx_buf); -} +	struct page *page = rx_buf->page; +	struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); +	struct efx_nic *efx = rx_queue->efx; +	unsigned index; -/* Attempt to resurrect the other receive buffer that used to share this page, - * which had previously been passed up to the kernel and freed. */ -static void efx_resurrect_rx_buffer(struct efx_rx_queue *rx_queue, -				    struct efx_rx_buffer *rx_buf) -{ -	struct efx_rx_page_state *state = page_address(rx_buf->u.page); -	struct efx_rx_buffer *new_buf; -	unsigned fill_level, index; +	/* Only recycle the page after processing the final buffer. */ +	if (!(rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE)) +		return; -	/* +1 because efx_rx_packet() incremented removed_count. +1 because -	 * we'd like to insert an additional descriptor whilst leaving -	 * EFX_RXD_HEAD_ROOM for the non-recycle path */ -	fill_level = (rx_queue->added_count - rx_queue->removed_count + 2); -	if (unlikely(fill_level > rx_queue->max_fill)) { -		/* We could place "state" on a list, and drain the list in -		 * efx_fast_push_rx_descriptors(). For now, this will do. */ +	index = rx_queue->page_add & rx_queue->page_ptr_mask; +	if (rx_queue->page_ring[index] == NULL) { +		unsigned read_index = rx_queue->page_remove & +			rx_queue->page_ptr_mask; + +		/* The next slot in the recycle ring is available, but +		 * increment page_remove if the read pointer currently +		 * points here. 
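A toy model of the recycle-ring bookkeeping performed by efx_recycle_rx_page() above: free-running add/remove counters masked into a power-of-two ring, with the remove index advanced when it points at the slot being filled. Pages are stood in by non-zero integers; DMA mapping, reference counts and the driver's initial page_add offset are deliberately left out.

#include <stdio.h>
#include <string.h>

#define RING_SIZE 8		/* assumed; must be a power of two */

struct page_ring {
	int slot[RING_SIZE];	/* 0 means empty */
	unsigned int add, remove;
};

static int ring_recycle(struct page_ring *r, int page)
{
	unsigned int index = r->add & (RING_SIZE - 1);

	if (r->slot[index] == 0) {
		/* Mirror the adjustment in the hunk above: if the remove
		 * index points at the slot being filled, advance it.
		 */
		if ((r->remove & (RING_SIZE - 1)) == index)
			++r->remove;
		r->slot[index] = page;
		++r->add;
		return 1;		/* recycled */
	}
	return 0;			/* ring full: caller unmaps and frees the page */
}

int main(void)
{
	struct page_ring r;

	memset(&r, 0, sizeof(r));
	for (int page = 1; page <= 10; page++)
		printf("page %d %s\n", page,
		       ring_recycle(&r, page) ? "recycled" : "dropped");
	return 0;
}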
+		 */ +		if (read_index == index) +			++rx_queue->page_remove; +		rx_queue->page_ring[index] = page; +		++rx_queue->page_add;  		return;  	} +	++rx_queue->page_recycle_full; +	efx_unmap_rx_buffer(efx, rx_buf); +	put_page(rx_buf->page); +} -	++state->refcnt; -	get_page(rx_buf->u.page); +static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue, +			       struct efx_rx_buffer *rx_buf) +{ +	/* Release the page reference we hold for the buffer. */ +	if (rx_buf->page) +		put_page(rx_buf->page); -	index = rx_queue->added_count & rx_queue->ptr_mask; -	new_buf = efx_rx_buffer(rx_queue, index); -	new_buf->dma_addr = rx_buf->dma_addr ^ (PAGE_SIZE >> 1); -	new_buf->u.page = rx_buf->u.page; -	new_buf->len = rx_buf->len; -	new_buf->flags = EFX_RX_BUF_PAGE; -	++rx_queue->added_count; +	/* If this is the last buffer in a page, unmap and free it. */ +	if (rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE) { +		efx_unmap_rx_buffer(rx_queue->efx, rx_buf); +		efx_free_rx_buffer(rx_buf); +	} +	rx_buf->page = NULL;  } -/* Recycle the given rx buffer directly back into the rx_queue. There is - * always room to add this buffer, because we've just popped a buffer. */ -static void efx_recycle_rx_buffer(struct efx_channel *channel, -				  struct efx_rx_buffer *rx_buf) +/* Recycle the pages that are used by buffers that have just been received. */ +static void efx_recycle_rx_buffers(struct efx_channel *channel, +				   struct efx_rx_buffer *rx_buf, +				   unsigned int n_frags)  { -	struct efx_nic *efx = channel->efx;  	struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); -	struct efx_rx_buffer *new_buf; -	unsigned index; - -	rx_buf->flags &= EFX_RX_BUF_PAGE; - -	if ((rx_buf->flags & EFX_RX_BUF_PAGE) && -	    efx->rx_buffer_len <= EFX_RX_HALF_PAGE && -	    page_count(rx_buf->u.page) == 1) -		efx_resurrect_rx_buffer(rx_queue, rx_buf); -	index = rx_queue->added_count & rx_queue->ptr_mask; -	new_buf = efx_rx_buffer(rx_queue, index); - -	memcpy(new_buf, rx_buf, sizeof(*new_buf)); -	rx_buf->u.page = NULL; -	++rx_queue->added_count; +	do { +		efx_recycle_rx_page(channel, rx_buf); +		rx_buf = efx_rx_buf_next(rx_queue, rx_buf); +	} while (--n_frags);  }  /** @@ -348,8 +308,8 @@ static void efx_recycle_rx_buffer(struct efx_channel *channel,   */  void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)  { -	struct efx_channel *channel = efx_rx_queue_channel(rx_queue); -	unsigned fill_level; +	struct efx_nic *efx = rx_queue->efx; +	unsigned int fill_level, batch_size;  	int space, rc = 0;  	/* Calculate current fill level, and exit if we don't need to fill */ @@ -364,28 +324,26 @@ void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)  			rx_queue->min_fill = fill_level;  	} +	batch_size = efx->rx_pages_per_batch * efx->rx_bufs_per_page;  	space = rx_queue->max_fill - fill_level; -	EFX_BUG_ON_PARANOID(space < EFX_RX_BATCH); +	EFX_BUG_ON_PARANOID(space < batch_size);  	netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,  		   "RX queue %d fast-filling descriptor ring from" -		   " level %d to level %d using %s allocation\n", +		   " level %d to level %d\n",  		   efx_rx_queue_index(rx_queue), fill_level, -		   rx_queue->max_fill, -		   channel->rx_alloc_push_pages ? 
"page" : "skb"); +		   rx_queue->max_fill); +  	do { -		if (channel->rx_alloc_push_pages) -			rc = efx_init_rx_buffers_page(rx_queue); -		else -			rc = efx_init_rx_buffers_skb(rx_queue); +		rc = efx_init_rx_buffers(rx_queue);  		if (unlikely(rc)) {  			/* Ensure that we don't leave the rx queue empty */  			if (rx_queue->added_count == rx_queue->removed_count)  				efx_schedule_slow_fill(rx_queue);  			goto out;  		} -	} while ((space -= EFX_RX_BATCH) >= EFX_RX_BATCH); +	} while ((space -= batch_size) >= batch_size);  	netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,  		   "RX queue %d fast-filled descriptor ring " @@ -408,7 +366,7 @@ void efx_rx_slow_fill(unsigned long context)  static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,  				     struct efx_rx_buffer *rx_buf, -				     int len, bool *leak_packet) +				     int len)  {  	struct efx_nic *efx = rx_queue->efx;  	unsigned max_len = rx_buf->len - efx->type->rx_buffer_padding; @@ -428,11 +386,6 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,  				  "RX event (0x%x > 0x%x+0x%x). Leaking\n",  				  efx_rx_queue_index(rx_queue), len, max_len,  				  efx->type->rx_buffer_padding); -		/* If this buffer was skb-allocated, then the meta -		 * data at the end of the skb will be trashed. So -		 * we have no choice but to leak the fragment. -		 */ -		*leak_packet = !(rx_buf->flags & EFX_RX_BUF_PAGE);  		efx_schedule_reset(efx, RESET_TYPE_RX_RECOVERY);  	} else {  		if (net_ratelimit()) @@ -448,212 +401,238 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,  /* Pass a received packet up through GRO.  GRO can handle pages   * regardless of checksum state and skbs with a good checksum.   */ -static void efx_rx_packet_gro(struct efx_channel *channel, -			      struct efx_rx_buffer *rx_buf, -			      const u8 *eh) +static void +efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf, +		  unsigned int n_frags, u8 *eh)  {  	struct napi_struct *napi = &channel->napi_str;  	gro_result_t gro_result; +	struct efx_nic *efx = channel->efx; +	struct sk_buff *skb; + +	skb = napi_get_frags(napi); +	if (unlikely(!skb)) { +		while (n_frags--) { +			put_page(rx_buf->page); +			rx_buf->page = NULL; +			rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf); +		} +		return; +	} -	if (rx_buf->flags & EFX_RX_BUF_PAGE) { -		struct efx_nic *efx = channel->efx; -		struct page *page = rx_buf->u.page; -		struct sk_buff *skb; +	if (efx->net_dev->features & NETIF_F_RXHASH) +		skb->rxhash = efx_rx_buf_hash(eh); +	skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ? 
+			  CHECKSUM_UNNECESSARY : CHECKSUM_NONE); -		rx_buf->u.page = NULL; +	for (;;) { +		skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, +				   rx_buf->page, rx_buf->page_offset, +				   rx_buf->len); +		rx_buf->page = NULL; +		skb->len += rx_buf->len; +		if (skb_shinfo(skb)->nr_frags == n_frags) +			break; -		skb = napi_get_frags(napi); -		if (!skb) { -			put_page(page); -			return; -		} +		rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf); +	} + +	skb->data_len = skb->len; +	skb->truesize += n_frags * efx->rx_buffer_truesize; + +	skb_record_rx_queue(skb, channel->rx_queue.core_index); + +	gro_result = napi_gro_frags(napi); +	if (gro_result != GRO_DROP) +		channel->irq_mod_score += 2; +} -		if (efx->net_dev->features & NETIF_F_RXHASH) -			skb->rxhash = efx_rx_buf_hash(eh); +/* Allocate and construct an SKB around page fragments */ +static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel, +				     struct efx_rx_buffer *rx_buf, +				     unsigned int n_frags, +				     u8 *eh, int hdr_len) +{ +	struct efx_nic *efx = channel->efx; +	struct sk_buff *skb; -		skb_fill_page_desc(skb, 0, page, -				   efx_rx_buf_offset(efx, rx_buf), rx_buf->len); +	/* Allocate an SKB to store the headers */ +	skb = netdev_alloc_skb(efx->net_dev, hdr_len + EFX_PAGE_SKB_ALIGN); +	if (unlikely(skb == NULL)) +		return NULL; -		skb->len = rx_buf->len; -		skb->data_len = rx_buf->len; -		skb->truesize += rx_buf->len; -		skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ? -				  CHECKSUM_UNNECESSARY : CHECKSUM_NONE); +	EFX_BUG_ON_PARANOID(rx_buf->len < hdr_len); -		skb_record_rx_queue(skb, channel->rx_queue.core_index); +	skb_reserve(skb, EFX_PAGE_SKB_ALIGN); +	memcpy(__skb_put(skb, hdr_len), eh, hdr_len); -		gro_result = napi_gro_frags(napi); -	} else { -		struct sk_buff *skb = rx_buf->u.skb; +	/* Append the remaining page(s) onto the frag list */ +	if (rx_buf->len > hdr_len) { +		rx_buf->page_offset += hdr_len; +		rx_buf->len -= hdr_len; -		EFX_BUG_ON_PARANOID(!(rx_buf->flags & EFX_RX_PKT_CSUMMED)); -		rx_buf->u.skb = NULL; -		skb->ip_summed = CHECKSUM_UNNECESSARY; +		for (;;) { +			skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, +					   rx_buf->page, rx_buf->page_offset, +					   rx_buf->len); +			rx_buf->page = NULL; +			skb->len += rx_buf->len; +			skb->data_len += rx_buf->len; +			if (skb_shinfo(skb)->nr_frags == n_frags) +				break; -		gro_result = napi_gro_receive(napi, skb); +			rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf); +		} +	} else { +		__free_pages(rx_buf->page, efx->rx_buffer_order); +		rx_buf->page = NULL; +		n_frags = 0;  	} -	if (gro_result == GRO_NORMAL) { -		channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB; -	} else if (gro_result != GRO_DROP) { -		channel->rx_alloc_level += RX_ALLOC_FACTOR_GRO; -		channel->irq_mod_score += 2; -	} +	skb->truesize += n_frags * efx->rx_buffer_truesize; + +	/* Move past the ethernet header */ +	skb->protocol = eth_type_trans(skb, efx->net_dev); + +	return skb;  }  void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index, -		   unsigned int len, u16 flags) +		   unsigned int n_frags, unsigned int len, u16 flags)  {  	struct efx_nic *efx = rx_queue->efx;  	struct efx_channel *channel = efx_rx_queue_channel(rx_queue);  	struct efx_rx_buffer *rx_buf; -	bool leak_packet = false;  	rx_buf = efx_rx_buffer(rx_queue, index);  	rx_buf->flags |= flags; -	/* This allows the refill path to post another buffer. -	 * EFX_RXD_HEAD_ROOM ensures that the slot we are using -	 * isn't overwritten yet. 
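A quick worked example of the length bookkeeping done when an skb is built around page fragments, as in efx_rx_mk_skb() above: the first hdr_len bytes are copied into the skb's linear area, the first fragment shrinks by the same amount, and the remaining fragment lengths make up the paged data. All numbers are invented (e.g. the first fragment is shown already trimmed of the hash prefix), and 64 stands in for EFX_SKB_HEADERS.

#include <stdio.h>

int main(void)
{
	unsigned int frag_len[3] = { 1808, 1824, 352 };	/* invented fragment lengths */
	unsigned int n_frags = 3;
	unsigned int hdr_len = 64;			/* assumed EFX_SKB_HEADERS */
	unsigned int len, data_len = 0;

	len = hdr_len;					/* linear part: copied headers */
	frag_len[0] -= hdr_len;				/* first fragment loses the copied bytes */
	for (unsigned int i = 0; i < n_frags; i++) {
		len += frag_len[i];
		data_len += frag_len[i];		/* paged part of the skb */
	}
	printf("skb len %u, data_len %u, linear %u\n", len, data_len, len - data_len);
	return 0;
}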
-	 */ -	rx_queue->removed_count++; - -	/* Validate the length encoded in the event vs the descriptor pushed */ -	efx_rx_packet__check_len(rx_queue, rx_buf, len, &leak_packet); +	/* Validate the number of fragments and completed length */ +	if (n_frags == 1) { +		efx_rx_packet__check_len(rx_queue, rx_buf, len); +	} else if (unlikely(n_frags > EFX_RX_MAX_FRAGS) || +		   unlikely(len <= (n_frags - 1) * EFX_RX_USR_BUF_SIZE) || +		   unlikely(len > n_frags * EFX_RX_USR_BUF_SIZE) || +		   unlikely(!efx->rx_scatter)) { +		/* If this isn't an explicit discard request, either +		 * the hardware or the driver is broken. +		 */ +		WARN_ON(!(len == 0 && rx_buf->flags & EFX_RX_PKT_DISCARD)); +		rx_buf->flags |= EFX_RX_PKT_DISCARD; +	}  	netif_vdbg(efx, rx_status, efx->net_dev, -		   "RX queue %d received id %x at %llx+%x %s%s\n", +		   "RX queue %d received ids %x-%x len %d %s%s\n",  		   efx_rx_queue_index(rx_queue), index, -		   (unsigned long long)rx_buf->dma_addr, len, +		   (index + n_frags - 1) & rx_queue->ptr_mask, len,  		   (rx_buf->flags & EFX_RX_PKT_CSUMMED) ? " [SUMMED]" : "",  		   (rx_buf->flags & EFX_RX_PKT_DISCARD) ? " [DISCARD]" : ""); -	/* Discard packet, if instructed to do so */ +	/* Discard packet, if instructed to do so.  Process the +	 * previous receive first. +	 */  	if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) { -		if (unlikely(leak_packet)) -			channel->n_skbuff_leaks++; -		else -			efx_recycle_rx_buffer(channel, rx_buf); - -		/* Don't hold off the previous receive */ -		rx_buf = NULL; -		goto out; +		efx_rx_flush_packet(channel); +		put_page(rx_buf->page); +		efx_recycle_rx_buffers(channel, rx_buf, n_frags); +		return;  	} -	/* Release and/or sync DMA mapping - assumes all RX buffers -	 * consumed in-order per RX queue +	if (n_frags == 1) +		rx_buf->len = len; + +	/* Release and/or sync the DMA mapping - assumes all RX buffers +	 * consumed in-order per RX queue.  	 */ -	efx_unmap_rx_buffer(efx, rx_buf, len); +	efx_sync_rx_buffer(efx, rx_buf, rx_buf->len);  	/* Prefetch nice and early so data will (hopefully) be in cache by  	 * the time we look at it.  	 */ -	prefetch(efx_rx_buf_eh(efx, rx_buf)); +	prefetch(efx_rx_buf_va(rx_buf)); + +	rx_buf->page_offset += efx->type->rx_buffer_hash_size; +	rx_buf->len -= efx->type->rx_buffer_hash_size; + +	if (n_frags > 1) { +		/* Release/sync DMA mapping for additional fragments. +		 * Fix length for last fragment. +		 */ +		unsigned int tail_frags = n_frags - 1; + +		for (;;) { +			rx_buf = efx_rx_buf_next(rx_queue, rx_buf); +			if (--tail_frags == 0) +				break; +			efx_sync_rx_buffer(efx, rx_buf, EFX_RX_USR_BUF_SIZE); +		} +		rx_buf->len = len - (n_frags - 1) * EFX_RX_USR_BUF_SIZE; +		efx_sync_rx_buffer(efx, rx_buf, rx_buf->len); +	} + +	/* All fragments have been DMA-synced, so recycle buffers and pages. */ +	rx_buf = efx_rx_buffer(rx_queue, index); +	efx_recycle_rx_buffers(channel, rx_buf, n_frags);  	/* Pipeline receives so that we give time for packet headers to be  	 * prefetched into cache.  	 
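The scatter validation in efx_rx_packet() above reduces to a range test on the completed length, and only the final fragment of a valid scattered packet carries a partial buffer. A standalone version with assumed buffer-size and fragment-limit values:

#include <stdio.h>

static int scatter_len_ok(unsigned int n_frags, unsigned int len,
			  unsigned int max_frags, unsigned int usr_buf)
{
	if (n_frags > max_frags)
		return 0;
	if (len <= (n_frags - 1) * usr_buf)
		return 0;	/* would have fitted in fewer fragments */
	if (len > n_frags * usr_buf)
		return 0;	/* cannot fit in n_frags fragments */
	return 1;
}

int main(void)
{
	unsigned int usr_buf = 1824, max_frags = 6;	/* assumed values */
	unsigned int n_frags = 3, len = 4000;		/* completed length from the event */

	if (scatter_len_ok(n_frags, len, max_frags, usr_buf)) {
		/* Middle fragments are synced for a full buffer; only the
		 * last one is synced for the remainder computed here.
		 */
		unsigned int last = len - (n_frags - 1) * usr_buf;

		printf("valid: last fragment is %u bytes\n", last);	/* 352 */
	} else {
		printf("inconsistent event: packet would be discarded\n");
	}
	return 0;
}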
*/ -	rx_buf->len = len - efx->type->rx_buffer_hash_size; -out: -	if (channel->rx_pkt) -		__efx_rx_packet(channel, channel->rx_pkt); -	channel->rx_pkt = rx_buf; +	efx_rx_flush_packet(channel); +	channel->rx_pkt_n_frags = n_frags; +	channel->rx_pkt_index = index;  } -static void efx_rx_deliver(struct efx_channel *channel, -			   struct efx_rx_buffer *rx_buf) +static void efx_rx_deliver(struct efx_channel *channel, u8 *eh, +			   struct efx_rx_buffer *rx_buf, +			   unsigned int n_frags)  {  	struct sk_buff *skb; +	u16 hdr_len = min_t(u16, rx_buf->len, EFX_SKB_HEADERS); -	/* We now own the SKB */ -	skb = rx_buf->u.skb; -	rx_buf->u.skb = NULL; +	skb = efx_rx_mk_skb(channel, rx_buf, n_frags, eh, hdr_len); +	if (unlikely(skb == NULL)) { +		efx_free_rx_buffer(rx_buf); +		return; +	} +	skb_record_rx_queue(skb, channel->rx_queue.core_index);  	/* Set the SKB flags */  	skb_checksum_none_assert(skb); -	/* Record the rx_queue */ -	skb_record_rx_queue(skb, channel->rx_queue.core_index); - -	/* Pass the packet up */  	if (channel->type->receive_skb) -		channel->type->receive_skb(channel, skb); -	else -		netif_receive_skb(skb); +		if (channel->type->receive_skb(channel, skb)) +			return; -	/* Update allocation strategy method */ -	channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB; +	/* Pass the packet up */ +	netif_receive_skb(skb);  }  /* Handle a received packet.  Second half: Touches packet payload. */ -void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf) +void __efx_rx_packet(struct efx_channel *channel)  {  	struct efx_nic *efx = channel->efx; -	u8 *eh = efx_rx_buf_eh(efx, rx_buf); +	struct efx_rx_buffer *rx_buf = +		efx_rx_buffer(&channel->rx_queue, channel->rx_pkt_index); +	u8 *eh = efx_rx_buf_va(rx_buf);  	/* If we're in loopback test, then pass the packet directly to the  	 * loopback layer, and free the rx_buf here  	 */  	if (unlikely(efx->loopback_selftest)) {  		efx_loopback_rx_packet(efx, eh, rx_buf->len); -		efx_free_rx_buffer(efx, rx_buf); -		return; -	} - -	if (!(rx_buf->flags & EFX_RX_BUF_PAGE)) { -		struct sk_buff *skb = rx_buf->u.skb; - -		prefetch(skb_shinfo(skb)); - -		skb_reserve(skb, efx->type->rx_buffer_hash_size); -		skb_put(skb, rx_buf->len); - -		if (efx->net_dev->features & NETIF_F_RXHASH) -			skb->rxhash = efx_rx_buf_hash(eh); - -		/* Move past the ethernet header. 
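The recycle-ring sizing in efx_init_rx_recycle_ring() above can be modelled in a few lines: pick a buffer budget depending on whether an IOMMU is in use, convert it to pages, then round up to a power of two so a simple mask can index the ring. The buffer budgets match the EFX_RECYCLE_RING_SIZE_* defines at the top of rx.c; bufs_per_page is an assumed value.

#include <stdio.h>

static unsigned int round_up_pow2(unsigned int x)
{
	unsigned int r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	int have_iommu = 1;			/* assumed */
	unsigned int bufs_per_page = 2;		/* assumed for a 1500-byte MTU */
	unsigned int bufs = have_iommu ? 4096 : 2 * 8;
	unsigned int ring_size = round_up_pow2(bufs / bufs_per_page);

	printf("page ring: %u entries, ptr mask 0x%x\n",
	       ring_size, ring_size - 1);
	return 0;
}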
rx_buf->data still points -		 * at the ethernet header */ -		skb->protocol = eth_type_trans(skb, efx->net_dev); - -		skb_record_rx_queue(skb, channel->rx_queue.core_index); +		efx_free_rx_buffer(rx_buf); +		goto out;  	}  	if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))  		rx_buf->flags &= ~EFX_RX_PKT_CSUMMED; -	if (likely(rx_buf->flags & (EFX_RX_BUF_PAGE | EFX_RX_PKT_CSUMMED)) && -	    !channel->type->receive_skb) -		efx_rx_packet_gro(channel, rx_buf, eh); +	if (!channel->type->receive_skb) +		efx_rx_packet_gro(channel, rx_buf, channel->rx_pkt_n_frags, eh);  	else -		efx_rx_deliver(channel, rx_buf); -} - -void efx_rx_strategy(struct efx_channel *channel) -{ -	enum efx_rx_alloc_method method = rx_alloc_method; - -	if (channel->type->receive_skb) { -		channel->rx_alloc_push_pages = false; -		return; -	} - -	/* Only makes sense to use page based allocation if GRO is enabled */ -	if (!(channel->efx->net_dev->features & NETIF_F_GRO)) { -		method = RX_ALLOC_METHOD_SKB; -	} else if (method == RX_ALLOC_METHOD_AUTO) { -		/* Constrain the rx_alloc_level */ -		if (channel->rx_alloc_level < 0) -			channel->rx_alloc_level = 0; -		else if (channel->rx_alloc_level > RX_ALLOC_LEVEL_MAX) -			channel->rx_alloc_level = RX_ALLOC_LEVEL_MAX; - -		/* Decide on the allocation method */ -		method = ((channel->rx_alloc_level > RX_ALLOC_LEVEL_GRO) ? -			  RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB); -	} - -	/* Push the option */ -	channel->rx_alloc_push_pages = (method == RX_ALLOC_METHOD_PAGE); +		efx_rx_deliver(channel, eh, rx_buf, channel->rx_pkt_n_frags); +out: +	channel->rx_pkt_n_frags = 0;  }  int efx_probe_rx_queue(struct efx_rx_queue *rx_queue) @@ -683,9 +662,32 @@ int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)  		kfree(rx_queue->buffer);  		rx_queue->buffer = NULL;  	} +  	return rc;  } +static void efx_init_rx_recycle_ring(struct efx_nic *efx, +				     struct efx_rx_queue *rx_queue) +{ +	unsigned int bufs_in_recycle_ring, page_ring_size; + +	/* Set the RX recycle ring size */ +#ifdef CONFIG_PPC64 +	bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU; +#else +	if (efx->pci_dev->dev.iommu_group) +		bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU; +	else +		bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_NOIOMMU; +#endif /* CONFIG_PPC64 */ + +	page_ring_size = roundup_pow_of_two(bufs_in_recycle_ring / +					    efx->rx_bufs_per_page); +	rx_queue->page_ring = kcalloc(page_ring_size, +				      sizeof(*rx_queue->page_ring), GFP_KERNEL); +	rx_queue->page_ptr_mask = page_ring_size - 1; +} +  void efx_init_rx_queue(struct efx_rx_queue *rx_queue)  {  	struct efx_nic *efx = rx_queue->efx; @@ -699,10 +701,18 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue)  	rx_queue->notified_count = 0;  	rx_queue->removed_count = 0;  	rx_queue->min_fill = -1U; +	efx_init_rx_recycle_ring(efx, rx_queue); + +	rx_queue->page_remove = 0; +	rx_queue->page_add = rx_queue->page_ptr_mask + 1; +	rx_queue->page_recycle_count = 0; +	rx_queue->page_recycle_failed = 0; +	rx_queue->page_recycle_full = 0;  	/* Initialise limit fields */  	max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM; -	max_trigger = max_fill - EFX_RX_BATCH; +	max_trigger = +		max_fill - efx->rx_pages_per_batch * efx->rx_bufs_per_page;  	if (rx_refill_threshold != 0) {  		trigger = max_fill * min(rx_refill_threshold, 100U) / 100U;  		if (trigger > max_trigger) @@ -722,6 +732,7 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue)  void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)  {  	int i; +	struct efx_nic *efx = rx_queue->efx;  	struct 
efx_rx_buffer *rx_buf;  	netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev, @@ -733,13 +744,32 @@ void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)  	del_timer_sync(&rx_queue->slow_fill);  	efx_nic_fini_rx(rx_queue); -	/* Release RX buffers NB start at index 0 not current HW ptr */ +	/* Release RX buffers from the current read ptr to the write ptr */  	if (rx_queue->buffer) { -		for (i = 0; i <= rx_queue->ptr_mask; i++) { -			rx_buf = efx_rx_buffer(rx_queue, i); +		for (i = rx_queue->removed_count; i < rx_queue->added_count; +		     i++) { +			unsigned index = i & rx_queue->ptr_mask; +			rx_buf = efx_rx_buffer(rx_queue, index);  			efx_fini_rx_buffer(rx_queue, rx_buf);  		}  	} + +	/* Unmap and release the pages in the recycle ring. Remove the ring. */ +	for (i = 0; i <= rx_queue->page_ptr_mask; i++) { +		struct page *page = rx_queue->page_ring[i]; +		struct efx_rx_page_state *state; + +		if (page == NULL) +			continue; + +		state = page_address(page); +		dma_unmap_page(&efx->pci_dev->dev, state->dma_addr, +			       PAGE_SIZE << efx->rx_buffer_order, +			       DMA_FROM_DEVICE); +		put_page(page); +	} +	kfree(rx_queue->page_ring); +	rx_queue->page_ring = NULL;  }  void efx_remove_rx_queue(struct efx_rx_queue *rx_queue) @@ -754,9 +784,6 @@ void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)  } -module_param(rx_alloc_method, int, 0644); -MODULE_PARM_DESC(rx_alloc_method, "Allocation method used for RX buffers"); -  module_param(rx_refill_threshold, uint, 0444);  MODULE_PARM_DESC(rx_refill_threshold,  		 "RX descriptor ring refill threshold (%)"); diff --git a/drivers/net/ethernet/sfc/siena.c b/drivers/net/ethernet/sfc/siena.c index ba40f67e4f0..51669244d15 100644 --- a/drivers/net/ethernet/sfc/siena.c +++ b/drivers/net/ethernet/sfc/siena.c @@ -202,7 +202,7 @@ out:  static enum reset_type siena_map_reset_reason(enum reset_type reason)  { -	return RESET_TYPE_ALL; +	return RESET_TYPE_RECOVER_OR_ALL;  }  static int siena_map_reset_flags(u32 *flags) @@ -245,6 +245,22 @@ static int siena_reset_hw(struct efx_nic *efx, enum reset_type method)  		return efx_mcdi_reset_port(efx);  } +#ifdef CONFIG_EEH +/* When a PCI device is isolated from the bus, a subsequent MMIO read is + * required for the kernel EEH mechanisms to notice. As the Solarflare driver + * was written to minimise MMIO read (for latency) then a periodic call to check + * the EEH status of the device is required so that device recovery can happen + * in a timely fashion. 
+ */ +static void siena_monitor(struct efx_nic *efx) +{ +	struct eeh_dev *eehdev = +		of_node_to_eeh_dev(pci_device_to_OF_node(efx->pci_dev)); + +	eeh_dev_check_failure(eehdev); +} +#endif +  static int siena_probe_nvconfig(struct efx_nic *efx)  {  	u32 caps = 0; @@ -398,6 +414,8 @@ static int siena_init_nic(struct efx_nic *efx)  	EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_INSRT_HDR, 1);  	EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_ALG, 1);  	EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_IP_HASH, 1); +	EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_USR_BUF_SIZE, +			    EFX_RX_USR_BUF_SIZE >> 5);  	efx_writeo(efx, &temp, FR_AZ_RX_CFG);  	/* Set hash key for IPv4 */ @@ -665,7 +683,11 @@ const struct efx_nic_type siena_a0_nic_type = {  	.init = siena_init_nic,  	.dimension_resources = siena_dimension_resources,  	.fini = efx_port_dummy_op_void, +#ifdef CONFIG_EEH +	.monitor = siena_monitor, +#else  	.monitor = NULL, +#endif  	.map_reset_reason = siena_map_reset_reason,  	.map_reset_flags = siena_map_reset_flags,  	.reset = siena_reset_hw, @@ -698,6 +720,7 @@ const struct efx_nic_type siena_a0_nic_type = {  	.max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),  	.rx_buffer_hash_size = 0x10,  	.rx_buffer_padding = 0, +	.can_rx_scatter = true,  	.max_interrupt_mode = EFX_INT_MODE_MSIX,  	.phys_addr_channels = 32, /* Hardware limit is 64, but the legacy  				   * interrupt handler only supports 32  |
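The FRF_BZ_RX_USR_BUF_SIZE write in siena_init_nic() shifts the scatter buffer size right by 5, i.e. the register field is expressed in 32-byte units. A quick standalone check, assuming EFX_RX_USR_BUF_SIZE is 1824 (the shift is only exact if the value is a multiple of 32; the real constant lives in net_driver.h):

#include <stdio.h>

int main(void)
{
	unsigned int usr_buf_size = 1824;	/* assumed EFX_RX_USR_BUF_SIZE */
	unsigned int field = usr_buf_size >> 5;	/* value written to the register field */

	printf("register field value: %u (%u bytes)\n", field, field << 5);
	return 0;
}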