Diffstat (limited to 'net/core/dev.c')
-rw-r--r--   net/core/dev.c   378
1 file changed, 284 insertions(+), 94 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 821cb1628e5..e54acde839d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -90,6 +90,7 @@
 #include <linux/if_ether.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
+#include <linux/ethtool.h>
 #include <linux/notifier.h>
 #include <linux/skbuff.h>
 #include <net/net_namespace.h>
@@ -120,6 +121,9 @@
 #include <linux/ctype.h>
 #include <linux/if_arp.h>
 #include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
 
 #include "net-sysfs.h"
@@ -257,7 +261,7 @@ DEFINE_PER_CPU(struct softnet_data, softnet_data);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 /*
- * register_netdevice() inits dev->_xmit_lock and sets lockdep class
+ * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  * according to dev->type
  */
 static const unsigned short netdev_lock_type[] =
@@ -961,6 +965,12 @@ void netdev_state_change(struct net_device *dev)
 	}
 }
 
+void netdev_bonding_change(struct net_device *dev)
+{
+	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
+}
+EXPORT_SYMBOL(netdev_bonding_change);
+
 /**
  *	dev_load 	- load a network module
  *	@net: the applicable net namespace
@@ -1117,6 +1127,29 @@ int dev_close(struct net_device *dev)
 }
 
 
+/**
+ *	dev_disable_lro - disable Large Receive Offload on a device
+ *	@dev: device
+ *
+ *	Disable Large Receive Offload (LRO) on a net device.  Must be
+ *	called under RTNL.  This is needed if received packets may be
+ *	forwarded to another interface.
+ */
+void dev_disable_lro(struct net_device *dev)
+{
+	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
+	    dev->ethtool_ops->set_flags) {
+		u32 flags = dev->ethtool_ops->get_flags(dev);
+		if (flags & ETH_FLAG_LRO) {
+			flags &= ~ETH_FLAG_LRO;
+			dev->ethtool_ops->set_flags(dev, flags);
+		}
+	}
+	WARN_ON(dev->features & NETIF_F_LRO);
+}
+EXPORT_SYMBOL(dev_disable_lro);
+
+
 static int dev_boot_phase = 1;
 
 /*
@@ -1290,16 +1323,18 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 }
 
-void __netif_schedule(struct net_device *dev)
+void __netif_schedule(struct Qdisc *q)
 {
-	if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
-		unsigned long flags;
+	BUG_ON(q == &noop_qdisc);
+
+	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) {
 		struct softnet_data *sd;
+		unsigned long flags;
 
 		local_irq_save(flags);
 		sd = &__get_cpu_var(softnet_data);
-		dev->next_sched = sd->output_queue;
-		sd->output_queue = dev;
+		q->next_sched = sd->output_queue;
+		sd->output_queue = q;
 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
 		local_irq_restore(flags);
 	}
@@ -1566,7 +1601,8 @@ static int dev_gso_segment(struct sk_buff *skb)
 	return 0;
 }
 
-int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
+			struct netdev_queue *txq)
 {
 	if (likely(!skb->next)) {
 		if (!list_empty(&ptype_all))
@@ -1595,9 +1631,7 @@ gso:
 			skb->next = nskb;
 			return rc;
 		}
-		if (unlikely((netif_queue_stopped(dev) ||
-			     netif_subqueue_stopped(dev, skb)) &&
-			     skb->next))
+		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
 			return NETDEV_TX_BUSY;
 	} while (skb->next);
 
@@ -1634,9 +1668,71 @@ out_kfree_skb:
  *          --BLG
  */
 
+static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
+{
+	u32 *addr, *ports, hash, ihl;
+	u8 ip_proto;
+	int alen;
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		ip_proto = ip_hdr(skb)->protocol;
+		addr = &ip_hdr(skb)->saddr;
+		ihl = ip_hdr(skb)->ihl;
+		alen = 2;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		ip_proto = ipv6_hdr(skb)->nexthdr;
+		addr = &ipv6_hdr(skb)->saddr.s6_addr32[0];
+		ihl = (40 >> 2);
+		alen = 8;
+		break;
+	default:
+		return 0;
+	}
+
+	ports = (u32 *) (skb_network_header(skb) + (ihl * 4));
+
+	hash = 0;
+	while (alen--)
+		hash ^= *addr++;
+
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+		hash ^= *ports;
+		break;
+
+	default:
+		break;
+	}
+
+	return hash % dev->real_num_tx_queues;
+}
+
+static struct netdev_queue *dev_pick_tx(struct net_device *dev,
+					struct sk_buff *skb)
+{
+	u16 queue_index = 0;
+
+	if (dev->select_queue)
+		queue_index = dev->select_queue(dev, skb);
+	else if (dev->real_num_tx_queues > 1)
+		queue_index = simple_tx_hash(dev, skb);
+
+	skb_set_queue_mapping(skb, queue_index);
+	return netdev_get_tx_queue(dev, queue_index);
+}
+
 int dev_queue_xmit(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
+	struct netdev_queue *txq;
 	struct Qdisc *q;
 	int rc = -ENOMEM;
@@ -1669,44 +1765,29 @@ int dev_queue_xmit(struct sk_buff *skb)
 	}
 
 gso:
-	spin_lock_prefetch(&dev->queue_lock);
-
 	/* Disable soft irqs for various locks below. Also
 	 * stops preemption for RCU.
 	 */
 	rcu_read_lock_bh();
 
-	/* Updates of qdisc are serialized by queue_lock.
-	 * The struct Qdisc which is pointed to by qdisc is now a
-	 * rcu structure - it may be accessed without acquiring
-	 * a lock (but the structure may be stale.) The freeing of the
-	 * qdisc will be deferred until it's known that there are no
-	 * more references to it.
-	 *
-	 * If the qdisc has an enqueue function, we still need to
-	 * hold the queue_lock before calling it, since queue_lock
-	 * also serializes access to the device queue.
-	 */
+	txq = dev_pick_tx(dev, skb);
+	q = rcu_dereference(txq->qdisc);
 
-	q = rcu_dereference(dev->qdisc);
 #ifdef CONFIG_NET_CLS_ACT
 	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
 #endif
 	if (q->enqueue) {
-		/* Grab device queue */
-		spin_lock(&dev->queue_lock);
-		q = dev->qdisc;
-		if (q->enqueue) {
-			/* reset queue_mapping to zero */
-			skb_set_queue_mapping(skb, 0);
-			rc = q->enqueue(skb, q);
-			qdisc_run(dev);
-			spin_unlock(&dev->queue_lock);
+		spinlock_t *root_lock = qdisc_root_lock(q);
 
-			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
-			goto out;
-		}
-		spin_unlock(&dev->queue_lock);
+		spin_lock(root_lock);
+
+		rc = q->enqueue(skb, q);
+		qdisc_run(q);
+
+		spin_unlock(root_lock);
+
+		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
+		goto out;
 	}
 
 	/* The device has no queue. Common case for software devices:
@@ -1724,19 +1805,18 @@ gso:
 	if (dev->flags & IFF_UP) {
 		int cpu = smp_processor_id(); /* ok because BHs are off */
 
-		if (dev->xmit_lock_owner != cpu) {
+		if (txq->xmit_lock_owner != cpu) {
 
-			HARD_TX_LOCK(dev, cpu);
+			HARD_TX_LOCK(dev, txq, cpu);
 
-			if (!netif_queue_stopped(dev) &&
-			    !netif_subqueue_stopped(dev, skb)) {
+			if (!netif_tx_queue_stopped(txq)) {
 				rc = 0;
-				if (!dev_hard_start_xmit(skb, dev)) {
-					HARD_TX_UNLOCK(dev);
+				if (!dev_hard_start_xmit(skb, dev, txq)) {
+					HARD_TX_UNLOCK(dev, txq);
 					goto out;
 				}
 			}
-			HARD_TX_UNLOCK(dev);
+			HARD_TX_UNLOCK(dev, txq);
 			if (net_ratelimit())
 				printk(KERN_CRIT "Virtual device %s asks to "
 				       "queue packet!\n", dev->name);
@@ -1880,7 +1960,7 @@ static void net_tx_action(struct softirq_action *h)
 	}
 
 	if (sd->output_queue) {
-		struct net_device *head;
+		struct Qdisc *head;
 
 		local_irq_disable();
 		head = sd->output_queue;
@@ -1888,17 +1968,20 @@
 		local_irq_enable();
 
 		while (head) {
-			struct net_device *dev = head;
+			struct Qdisc *q = head;
+			spinlock_t *root_lock;
+
 			head = head->next_sched;
 
 			smp_mb__before_clear_bit();
-			clear_bit(__LINK_STATE_SCHED, &dev->state);
+			clear_bit(__QDISC_STATE_SCHED, &q->state);
 
-			if (spin_trylock(&dev->queue_lock)) {
-				qdisc_run(dev);
-				spin_unlock(&dev->queue_lock);
+			root_lock = qdisc_root_lock(q);
+			if (spin_trylock(root_lock)) {
+				qdisc_run(q);
+				spin_unlock(root_lock);
 			} else {
-				netif_schedule(dev);
+				__netif_schedule(q);
 			}
 		}
 	}
@@ -1979,10 +2062,11 @@ static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
  */
 static int ing_filter(struct sk_buff *skb)
 {
-	struct Qdisc *q;
 	struct net_device *dev = skb->dev;
-	int result = TC_ACT_OK;
 	u32 ttl = G_TC_RTTL(skb->tc_verd);
+	struct netdev_queue *rxq;
+	int result = TC_ACT_OK;
+	struct Qdisc *q;
 
 	if (MAX_RED_LOOP < ttl++) {
 		printk(KERN_WARNING
@@ -1994,10 +2078,14 @@ static int ing_filter(struct sk_buff *skb)
 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
 
-	spin_lock(&dev->ingress_lock);
-	if ((q = dev->qdisc_ingress) != NULL)
+	rxq = &dev->rx_queue;
+
+	q = rxq->qdisc;
+	if (q) {
+		spin_lock(qdisc_lock(q));
 		result = q->enqueue(skb, q);
-	spin_unlock(&dev->ingress_lock);
+		spin_unlock(qdisc_lock(q));
+	}
 
 	return result;
 }
@@ -2006,7 +2094,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 					 struct packet_type **pt_prev,
 					 int *ret, struct net_device *orig_dev)
 {
-	if (!skb->dev->qdisc_ingress)
+	if (!skb->dev->rx_queue.qdisc)
 		goto out;
 
 	if (*pt_prev) {
@@ -2030,6 +2118,33 @@ out:
 }
 #endif
 
+/*
+ * 	netif_nit_deliver - deliver received packets to network taps
+ * 	@skb: buffer
+ *
+ * 	This function is used to deliver incoming packets to network
+ * 	taps. It should be used when the normal netif_receive_skb path
+ * 	is bypassed, for example because of VLAN acceleration.
+ */
+void netif_nit_deliver(struct sk_buff *skb)
+{
+	struct packet_type *ptype;
+
+	if (list_empty(&ptype_all))
+		return;
+
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb->mac_len = skb->network_header - skb->mac_header;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype, &ptype_all, list) {
+		if (!ptype->dev || ptype->dev == skb->dev)
+			deliver_skb(skb, ptype, skb->dev);
+	}
+	rcu_read_unlock();
+}
+
 /**
  *	netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
@@ -2769,16 +2884,29 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
 	return 0;
 }
 
-static void __dev_set_promiscuity(struct net_device *dev, int inc)
+static int __dev_set_promiscuity(struct net_device *dev, int inc)
 {
 	unsigned short old_flags = dev->flags;
 
 	ASSERT_RTNL();
 
-	if ((dev->promiscuity += inc) == 0)
-		dev->flags &= ~IFF_PROMISC;
-	else
-		dev->flags |= IFF_PROMISC;
+	dev->flags |= IFF_PROMISC;
+	dev->promiscuity += inc;
+	if (dev->promiscuity == 0) {
+		/*
+		 * Avoid overflow.
+		 * If inc causes overflow, untouch promisc and return error.
+		 */
+		if (inc < 0)
+			dev->flags &= ~IFF_PROMISC;
+		else {
+			dev->promiscuity -= inc;
+			printk(KERN_WARNING "%s: promiscuity touches roof, "
				"set promiscuity failed, promiscuity feature "
				"of device might be broken.\n", dev->name);
+			return -EOVERFLOW;
+		}
+	}
 	if (dev->flags != old_flags) {
 		printk(KERN_INFO "device %s %s promiscuous mode\n",
 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
@@ -2796,6 +2924,7 @@ static void __dev_set_promiscuity(struct net_device *dev, int inc)
 		if (dev->change_rx_flags)
 			dev->change_rx_flags(dev, IFF_PROMISC);
 	}
+	return 0;
 }
 
 /**
@@ -2807,14 +2936,19 @@ static void __dev_set_promiscuity(struct net_device *dev, int inc)
 *	remains above zero the interface remains promiscuous. Once it hits zero
 *	the device reverts back to normal filtering operation. A negative inc
 *	value is used to drop promiscuity on the device.
+ *	Return 0 if successful or a negative errno code on error.
 */
-void dev_set_promiscuity(struct net_device *dev, int inc)
+int dev_set_promiscuity(struct net_device *dev, int inc)
 {
 	unsigned short old_flags = dev->flags;
+	int err;
 
-	__dev_set_promiscuity(dev, inc);
+	err = __dev_set_promiscuity(dev, inc);
+	if (err < 0)
+		return err;
 	if (dev->flags != old_flags)
 		dev_set_rx_mode(dev);
+	return err;
 }
 
 /**
@@ -2827,22 +2961,38 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
 *	to all interfaces. Once it hits zero the device reverts back to normal
 *	filtering operation. A negative @inc value is used to drop the counter
 *	when releasing a resource needing all multicasts.
+ *	Return 0 if successful or a negative errno code on error.
 */
-void dev_set_allmulti(struct net_device *dev, int inc)
+int dev_set_allmulti(struct net_device *dev, int inc)
 {
 	unsigned short old_flags = dev->flags;
 
 	ASSERT_RTNL();
 
 	dev->flags |= IFF_ALLMULTI;
-	if ((dev->allmulti += inc) == 0)
-		dev->flags &= ~IFF_ALLMULTI;
+	dev->allmulti += inc;
+	if (dev->allmulti == 0) {
+		/*
+		 * Avoid overflow.
+		 * If inc causes overflow, untouch allmulti and return error.
+		 */
+		if (inc < 0)
+			dev->flags &= ~IFF_ALLMULTI;
+		else {
+			dev->allmulti -= inc;
+			printk(KERN_WARNING "%s: allmulti touches roof, "
				"set allmulti failed, allmulti feature of "
				"device might be broken.\n", dev->name);
+			return -EOVERFLOW;
+		}
+	}
 	if (dev->flags ^ old_flags) {
 		if (dev->change_rx_flags)
 			dev->change_rx_flags(dev, IFF_ALLMULTI);
 		dev_set_rx_mode(dev);
 	}
+	return 0;
 }
 
 /*
@@ -2881,9 +3031,9 @@ void __dev_set_rx_mode(struct net_device *dev)
 
 void dev_set_rx_mode(struct net_device *dev)
 {
-	netif_tx_lock_bh(dev);
+	netif_addr_lock_bh(dev);
 	__dev_set_rx_mode(dev);
-	netif_tx_unlock_bh(dev);
+	netif_addr_unlock_bh(dev);
 }
 
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
@@ -2961,11 +3111,11 @@ int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
 
 	ASSERT_RTNL();
 
-	netif_tx_lock_bh(dev);
+	netif_addr_lock_bh(dev);
 	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
 	if (!err)
 		__dev_set_rx_mode(dev);
-	netif_tx_unlock_bh(dev);
+	netif_addr_unlock_bh(dev);
 	return err;
 }
 EXPORT_SYMBOL(dev_unicast_delete);
@@ -2987,11 +3137,11 @@ int dev_unicast_add(struct net_device *dev, void *addr, int alen)
 
 	ASSERT_RTNL();
 
-	netif_tx_lock_bh(dev);
+	netif_addr_lock_bh(dev);
 	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
 	if (!err)
 		__dev_set_rx_mode(dev);
-	netif_tx_unlock_bh(dev);
+	netif_addr_unlock_bh(dev);
 	return err;
 }
 EXPORT_SYMBOL(dev_unicast_add);
@@ -3058,12 +3208,12 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from)
 {
 	int err = 0;
 
-	netif_tx_lock_bh(to);
+	netif_addr_lock_bh(to);
 	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
 			      &from->uc_list, &from->uc_count);
 	if (!err)
 		__dev_set_rx_mode(to);
-	netif_tx_unlock_bh(to);
+	netif_addr_unlock_bh(to);
 	return err;
 }
 EXPORT_SYMBOL(dev_unicast_sync);
@@ -3079,15 +3229,15 @@ EXPORT_SYMBOL(dev_unicast_sync);
 */
 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
 {
-	netif_tx_lock_bh(from);
-	netif_tx_lock_bh(to);
+	netif_addr_lock_bh(from);
+	netif_addr_lock(to);
 
 	__dev_addr_unsync(&to->uc_list, &to->uc_count,
 			  &from->uc_list, &from->uc_count);
 	__dev_set_rx_mode(to);
 
-	netif_tx_unlock_bh(to);
-	netif_tx_unlock_bh(from);
+	netif_addr_unlock(to);
+	netif_addr_unlock_bh(from);
 }
 EXPORT_SYMBOL(dev_unicast_unsync);
@@ -3107,7 +3257,7 @@ static void __dev_addr_discard(struct dev_addr_list **list)
 
 static void dev_addr_discard(struct net_device *dev)
 {
-	netif_tx_lock_bh(dev);
+	netif_addr_lock_bh(dev);
 
 	__dev_addr_discard(&dev->uc_list);
 	dev->uc_count = 0;
@@ -3115,7 +3265,7 @@ static void dev_addr_discard(struct net_device *dev)
 	__dev_addr_discard(&dev->mc_list);
 	dev->mc_count = 0;
 
-	netif_tx_unlock_bh(dev);
+	netif_addr_unlock_bh(dev);
 }
 
 unsigned dev_get_flags(const struct net_device *dev)
@@ -3688,6 +3838,21 @@ static void rollback_registered(struct net_device *dev)
 	dev_put(dev);
 }
 
+static void __netdev_init_queue_locks_one(struct net_device *dev,
+					  struct netdev_queue *dev_queue,
+					  void *_unused)
+{
+	spin_lock_init(&dev_queue->_xmit_lock);
+	netdev_set_lockdep_class(&dev_queue->_xmit_lock, dev->type);
+	dev_queue->xmit_lock_owner = -1;
+}
+
+static void netdev_init_queue_locks(struct net_device *dev)
+{
+	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
+	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
+}
+
 /**
 *	register_netdevice	- register a network device
 *	@dev: device to register
@@ -3722,11 +3887,10 @@ int register_netdevice(struct net_device *dev)
 	BUG_ON(!dev_net(dev));
 	net = dev_net(dev);
 
-	spin_lock_init(&dev->queue_lock);
-	spin_lock_init(&dev->_xmit_lock);
-	netdev_set_lockdep_class(&dev->_xmit_lock, dev->type);
-	dev->xmit_lock_owner = -1;
-	spin_lock_init(&dev->ingress_lock);
+	spin_lock_init(&dev->addr_list_lock);
+	spin_lock_init(&dev->qdisc_list_lock);
+	INIT_LIST_HEAD(&dev->qdisc_list);
+	netdev_init_queue_locks(dev);
 
 	dev->iflink = -1;
@@ -4007,6 +4171,19 @@ static struct net_device_stats *internal_stats(struct net_device *dev)
 	return &dev->stats;
 }
 
+static void netdev_init_one_queue(struct net_device *dev,
+				  struct netdev_queue *queue,
+				  void *_unused)
+{
+	queue->dev = dev;
+}
+
+static void netdev_init_queues(struct net_device *dev)
+{
+	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
+	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+}
+
 /**
 *	alloc_netdev_mq - allocate network device
 *	@sizeof_priv:	size of private data to allocate space for
@@ -4021,14 +4198,14 @@ static struct net_device_stats *internal_stats(struct net_device *dev)
 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		void (*setup)(struct net_device *), unsigned int queue_count)
 {
-	void *p;
+	struct netdev_queue *tx;
 	struct net_device *dev;
 	int alloc_size;
+	void *p;
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
 
-	alloc_size = sizeof(struct net_device) +
-		     sizeof(struct net_device_subqueue) * (queue_count - 1);
+	alloc_size = sizeof(struct net_device);
 	if (sizeof_priv) {
 		/* ensure 32-byte alignment of private area */
 		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
@@ -4043,22 +4220,33 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
+	tx = kzalloc(sizeof(struct netdev_queue) * queue_count, GFP_KERNEL);
+	if (!tx) {
+		printk(KERN_ERR "alloc_netdev: Unable to allocate "
+		       "tx qdiscs.\n");
+		kfree(p);
+		return NULL;
+	}
+
 	dev = (struct net_device *)
 		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
 	dev->padded = (char *)dev - (char *)p;
 	dev_net_set(dev, &init_net);
 
+	dev->_tx = tx;
+	dev->num_tx_queues = queue_count;
+	dev->real_num_tx_queues = queue_count;
+
 	if (sizeof_priv) {
 		dev->priv = ((char *)dev +
-			     ((sizeof(struct net_device) +
-			       (sizeof(struct net_device_subqueue) *
-				(queue_count - 1)) + NETDEV_ALIGN_CONST)
+			     ((sizeof(struct net_device) + NETDEV_ALIGN_CONST)
			      & ~NETDEV_ALIGN_CONST));
 	}
-	dev->egress_subqueue_count = queue_count;
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	netdev_init_queues(dev);
+
 	dev->get_stats = internal_stats;
 	netpoll_netdev_init(dev);
 	setup(dev);
@@ -4079,6 +4267,8 @@ void free_netdev(struct net_device *dev)
 {
 	release_net(dev_net(dev));
 
+	kfree(dev->_tx);
+
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		kfree((char *)dev - dev->padded);
@@ -4260,7 +4450,7 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 			    void *ocpu)
 {
 	struct sk_buff **list_skb;
-	struct net_device **list_net;
+	struct Qdisc **list_net;
 	struct sk_buff *skb;
 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
 	struct softnet_data *sd, *oldsd;
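The core of the transmit-path change above is queue selection: dev_pick_tx() uses the driver's select_queue hook if one is provided, and otherwise falls back to simple_tx_hash(), which XOR-folds the IP address words (and, for the listed L4 protocols, the port word) and reduces the result modulo real_num_tx_queues, so all packets of one flow map to the same TX queue. The following standalone C program re-implements that idea in userspace purely as an illustration; struct flow and pick_tx_queue() are names invented for this sketch and are not part of the patch.

/*
 * Illustrative only: a userspace re-implementation of the XOR-fold hash
 * that simple_tx_hash() in the diff uses to spread flows across TX queues.
 * The names below are made up for this example, not kernel API.
 */
#include <stdint.h>
#include <stdio.h>

struct flow {
	uint32_t saddr;		/* example IPv4 source address */
	uint32_t daddr;		/* example IPv4 destination address */
	uint32_t ports;		/* source and destination port packed into one word */
};

/* XOR the address words, mix in the port word, then reduce modulo the queue count. */
static unsigned int pick_tx_queue(const struct flow *fl, unsigned int nqueues)
{
	uint32_t hash = fl->saddr ^ fl->daddr ^ fl->ports;

	return hash % nqueues;
}

int main(void)
{
	struct flow fl = { 0x0a000001, 0x0a000002, (12345u << 16) | 80u };

	/* Packets of the same flow always land on the same queue. */
	printf("queue = %u of 8\n", pick_tx_queue(&fl, 8));
	return 0;
}

Keeping each flow on a single queue preserves per-flow packet ordering while still spreading unrelated flows across the available hardware queues.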
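A smaller but API-visible change is that dev_set_promiscuity() and dev_set_allmulti() now return 0 or a negative errno (-EOVERFLOW when the reference counter would wrap) instead of void. The sketch below shows the error-checking pattern a caller would presumably follow after this change; my_open() is a hypothetical function written for this example, not code from the patch.

/*
 * Hypothetical caller: dev_set_promiscuity() and dev_set_allmulti() now
 * return 0 or a negative errno, so callers are expected to check the
 * result instead of assuming success.  Sketch only; my_open() is not
 * part of this patch.
 */
#include <linux/netdevice.h>

static int my_open(struct net_device *dev)
{
	int err;

	err = dev_set_promiscuity(dev, 1);	/* take a promiscuity reference */
	if (err < 0)
		return err;			/* counter would have overflowed */

	err = dev_set_allmulti(dev, 1);
	if (err < 0) {
		dev_set_promiscuity(dev, -1);	/* drop the reference taken above */
		return err;
	}
	return 0;
}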