Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile          |   6
-rw-r--r--  net/core/dev.c             | 307
-rw-r--r--  net/core/dst.c             |   2
-rw-r--r--  net/core/ethtool.c         | 638
-rw-r--r--  net/core/flow_dissector.c  | 143
-rw-r--r--  net/core/neighbour.c       | 209
-rw-r--r--  net/core/net-sysfs.c       | 279
-rw-r--r--  net/core/netpoll.c         |  10
-rw-r--r--  net/core/netprio_cgroup.c  | 344
-rw-r--r--  net/core/pktgen.c          |  17
-rw-r--r--  net/core/rtnetlink.c       |  25
-rw-r--r--  net/core/secure_seq.c      |   6
-rw-r--r--  net/core/skbuff.c          |  89
-rw-r--r--  net/core/sock.c            | 197
-rw-r--r--  net/core/sock_diag.c       | 169
-rw-r--r--  net/core/sysctl_net_core.c |   9
16 files changed, 1550 insertions, 900 deletions
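
A large share of the churn below (dev.c, ethtool.c) is the conversion of feature masks from u32 to the new netdev_features_t type, with features addressed by NETIF_F_*_BIT index instead of hard-coded mask values. A minimal userspace sketch of that representation; the names mirror the kernel's, but the bit assignments here are illustrative, not the real definitions:

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for the kernel's netdev_features_t (a 64-bit
 * bitmap); the bit positions below are made up for the demo. */
typedef uint64_t netdev_features_t;

enum {
	NETIF_F_SG_BIT,        /* scatter-gather */
	NETIF_F_IP_CSUM_BIT,   /* IPv4 checksum offload */
	NETIF_F_TSO_BIT,       /* TCP segmentation offload */
	NETDEV_FEATURE_COUNT
};

#define __NETIF_F(name)	((netdev_features_t)1 << NETIF_F_##name##_BIT)
#define NETIF_F_SG	__NETIF_F(SG)
#define NETIF_F_IP_CSUM	__NETIF_F(IP_CSUM)
#define NETIF_F_TSO	__NETIF_F(TSO)

int main(void)
{
	netdev_features_t features = NETIF_F_SG | NETIF_F_TSO;

	/* With 64 bits there is no overflow problem when feature number 33
	 * arrives, which is what exhausted the old u32 masks. */
	printf("features = %#llx\n", (unsigned long long)features);
	return 0;
}
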
diff --git a/net/core/Makefile b/net/core/Makefile index 0d357b1c4e5..674641b13ae 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -3,12 +3,13 @@  #  obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ -	 gen_stats.o gen_estimator.o net_namespace.o secure_seq.o +	 gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o  obj-$(CONFIG_SYSCTL) += sysctl_net_core.o  obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ -			neighbour.o rtnetlink.o utils.o link_watch.o filter.o +			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ +			sock_diag.o  obj-$(CONFIG_XFRM) += flow.o  obj-y += net-sysfs.o @@ -19,3 +20,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o  obj-$(CONFIG_TRACEPOINTS) += net-traces.o  obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o  obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o +obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o diff --git a/net/core/dev.c b/net/core/dev.c index 5a13edfc9f7..f494675471a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -133,10 +133,9 @@  #include <linux/pci.h>  #include <linux/inetdevice.h>  #include <linux/cpu_rmap.h> -#include <linux/if_tunnel.h> -#include <linux/if_pppox.h> -#include <linux/ppp_defs.h>  #include <linux/net_tstamp.h> +#include <linux/jump_label.h> +#include <net/flow_keys.h>  #include "net-sysfs.h" @@ -1320,8 +1319,6 @@ EXPORT_SYMBOL(dev_close);   */  void dev_disable_lro(struct net_device *dev)  { -	u32 flags; -  	/*  	 * If we're trying to disable lro on a vlan device  	 * use the underlying physical device instead @@ -1329,15 +1326,9 @@ void dev_disable_lro(struct net_device *dev)  	if (is_vlan_dev(dev))  		dev = vlan_dev_real_dev(dev); -	if (dev->ethtool_ops && dev->ethtool_ops->get_flags) -		flags = dev->ethtool_ops->get_flags(dev); -	else -		flags = ethtool_op_get_flags(dev); +	dev->wanted_features &= ~NETIF_F_LRO; +	netdev_update_features(dev); -	if (!(flags & ETH_FLAG_LRO)) -		return; - -	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);  	if (unlikely(dev->features & NETIF_F_LRO))  		netdev_WARN(dev, "failed to disable LRO!\n");  } @@ -1450,34 +1441,55 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)  }  EXPORT_SYMBOL(call_netdevice_notifiers); -/* When > 0 there are consumers of rx skb time stamps */ -static atomic_t netstamp_needed = ATOMIC_INIT(0); +static struct jump_label_key netstamp_needed __read_mostly; +#ifdef HAVE_JUMP_LABEL +/* We are not allowed to call jump_label_dec() from irq context + * If net_disable_timestamp() is called from irq context, defer the + * jump_label_dec() calls. 
+ */ +static atomic_t netstamp_needed_deferred; +#endif  void net_enable_timestamp(void)  { -	atomic_inc(&netstamp_needed); +#ifdef HAVE_JUMP_LABEL +	int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + +	if (deferred) { +		while (--deferred) +			jump_label_dec(&netstamp_needed); +		return; +	} +#endif +	WARN_ON(in_interrupt()); +	jump_label_inc(&netstamp_needed);  }  EXPORT_SYMBOL(net_enable_timestamp);  void net_disable_timestamp(void)  { -	atomic_dec(&netstamp_needed); +#ifdef HAVE_JUMP_LABEL +	if (in_interrupt()) { +		atomic_inc(&netstamp_needed_deferred); +		return; +	} +#endif +	jump_label_dec(&netstamp_needed);  }  EXPORT_SYMBOL(net_disable_timestamp);  static inline void net_timestamp_set(struct sk_buff *skb)  { -	if (atomic_read(&netstamp_needed)) +	skb->tstamp.tv64 = 0; +	if (static_branch(&netstamp_needed))  		__net_timestamp(skb); -	else -		skb->tstamp.tv64 = 0;  } -static inline void net_timestamp_check(struct sk_buff *skb) -{ -	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) -		__net_timestamp(skb); -} +#define net_timestamp_check(COND, SKB)			\ +	if (static_branch(&netstamp_needed)) {		\ +		if ((COND) && !(SKB)->tstamp.tv64)	\ +			__net_timestamp(SKB);		\ +	}						\  static int net_hwtstamp_validate(struct ifreq *ifr)  { @@ -1924,7 +1936,8 @@ EXPORT_SYMBOL(skb_checksum_help);   *	It may return NULL if the skb requires no segmentation.  This is   *	only possible when GSO is used for verifying header integrity.   */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) +struct sk_buff *skb_gso_segment(struct sk_buff *skb, +	netdev_features_t features)  {  	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);  	struct packet_type *ptype; @@ -1954,9 +1967,9 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)  		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)  			dev->ethtool_ops->get_drvinfo(dev, &info); -		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n", -		     info.driver, dev ? dev->features : 0L, -		     skb->sk ? skb->sk->sk_route_caps : 0L, +		WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d ip_summed=%d\n", +		     info.driver, dev ? &dev->features : NULL, +		     skb->sk ? &skb->sk->sk_route_caps : NULL,  		     skb->len, skb->data_len, skb->ip_summed);  		if (skb_header_cloned(skb) && @@ -2065,7 +2078,7 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)   *	This function segments the given skb and stores the list of segments   *	in skb->next.   
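
The netstamp hunks above replace an atomic counter with a jump label (static branch) so the timestamp test costs nothing when no one listens; since jump_label_dec() may not be called from irq context, net_disable_timestamp() parks decrements in netstamp_needed_deferred and the next net_enable_timestamp() drains them. A rough userspace analogue of that defer-and-drain pattern, with a plain counter standing in for the jump label (the names here are mine, not kernel API):

#include <stdio.h>
#include <stdatomic.h>

static atomic_int key_enabled;     /* stand-in for the jump label count */
static atomic_int deferred_decs;   /* decrements we could not do "now"  */

static void timestamp_enable(void)
{
	/* Drain decrements deferred by timestamp_disable_in_irq(); the
	 * +1 we want cancels one deferred -1, hence the --deferred. */
	int deferred = atomic_exchange(&deferred_decs, 0);

	if (deferred) {
		while (--deferred)
			atomic_fetch_sub(&key_enabled, 1);
		return;
	}
	atomic_fetch_add(&key_enabled, 1);
}

static void timestamp_disable_in_irq(void)
{
	/* Can't touch the key from "irq" context; remember the -1. */
	atomic_fetch_add(&deferred_decs, 1);
}

int main(void)
{
	timestamp_enable();          /* key: 1 */
	timestamp_disable_in_irq();  /* key still 1, deferred: 1 */
	timestamp_enable();          /* +1 and deferred -1 cancel: key stays 1 */
	printf("key_enabled = %d\n", atomic_load(&key_enabled));
	return 0;
}
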
*/ -static int dev_gso_segment(struct sk_buff *skb, int features) +static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)  {  	struct sk_buff *segs; @@ -2104,7 +2117,7 @@ static inline void skb_orphan_try(struct sk_buff *skb)  	}  } -static bool can_checksum_protocol(unsigned long features, __be16 protocol) +static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)  {  	return ((features & NETIF_F_GEN_CSUM) ||  		((features & NETIF_F_V4_CSUM) && @@ -2115,7 +2128,8 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)  		 protocol == htons(ETH_P_FCOE)));  } -static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features) +static netdev_features_t harmonize_features(struct sk_buff *skb, +	__be16 protocol, netdev_features_t features)  {  	if (!can_checksum_protocol(features, protocol)) {  		features &= ~NETIF_F_ALL_CSUM; @@ -2127,10 +2141,10 @@ static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features  	return features;  } -u32 netif_skb_features(struct sk_buff *skb) +netdev_features_t netif_skb_features(struct sk_buff *skb)  {  	__be16 protocol = skb->protocol; -	u32 features = skb->dev->features; +	netdev_features_t features = skb->dev->features;  	if (protocol == htons(ETH_P_8021Q)) {  		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; @@ -2176,7 +2190,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,  	unsigned int skb_len;  	if (likely(!skb->next)) { -		u32 features; +		netdev_features_t features;  		/*  		 * If device doesn't need skb->dst, release it right now while @@ -2257,7 +2271,7 @@ gso:  			return rc;  		}  		txq_trans_update(txq); -		if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) +		if (unlikely(netif_xmit_stopped(txq) && skb->next))  			return NETDEV_TX_BUSY;  	} while (skb->next); @@ -2457,6 +2471,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,  	return rc;  } +#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +static void skb_update_prio(struct sk_buff *skb) +{ +	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); + +	if ((!skb->priority) && (skb->sk) && map) +		skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx]; +} +#else +#define skb_update_prio(skb) +#endif +  static DEFINE_PER_CPU(int, xmit_recursion);  #define RECURSION_LIMIT 10 @@ -2497,6 +2523,8 @@ int dev_queue_xmit(struct sk_buff *skb)  	 */  	rcu_read_lock_bh(); +	skb_update_prio(skb); +  	txq = dev_pick_tx(dev, skb);  	q = rcu_dereference_bh(txq->qdisc); @@ -2531,7 +2559,7 @@ int dev_queue_xmit(struct sk_buff *skb)  			HARD_TX_LOCK(dev, txq, cpu); -			if (!netif_tx_queue_stopped(txq)) { +			if (!netif_xmit_stopped(txq)) {  				__this_cpu_inc(xmit_recursion);  				rc = dev_hard_start_xmit(skb, dev, txq);  				__this_cpu_dec(xmit_recursion); @@ -2592,123 +2620,28 @@ static inline void ____napi_schedule(struct softnet_data *sd,   */  void __skb_get_rxhash(struct sk_buff *skb)  { -	int nhoff, hash = 0, poff; -	const struct ipv6hdr *ip6; -	const struct iphdr *ip; -	const struct vlan_hdr *vlan; -	u8 ip_proto; -	u32 addr1, addr2; -	u16 proto; -	union { -		u32 v32; -		u16 v16[2]; -	} ports; - -	nhoff = skb_network_offset(skb); -	proto = skb->protocol; - -again: -	switch (proto) { -	case __constant_htons(ETH_P_IP): -ip: -		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) -			goto done; - -		ip = (const struct iphdr *) (skb->data + nhoff); -		if (ip_is_fragment(ip)) -			ip_proto = 0; -		else -			ip_proto = ip->protocol; -		addr1 = (__force u32) 
ip->saddr; -		addr2 = (__force u32) ip->daddr; -		nhoff += ip->ihl * 4; -		break; -	case __constant_htons(ETH_P_IPV6): -ipv6: -		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) -			goto done; - -		ip6 = (const struct ipv6hdr *) (skb->data + nhoff); -		ip_proto = ip6->nexthdr; -		addr1 = (__force u32) ip6->saddr.s6_addr32[3]; -		addr2 = (__force u32) ip6->daddr.s6_addr32[3]; -		nhoff += 40; -		break; -	case __constant_htons(ETH_P_8021Q): -		if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff)) -			goto done; -		vlan = (const struct vlan_hdr *) (skb->data + nhoff); -		proto = vlan->h_vlan_encapsulated_proto; -		nhoff += sizeof(*vlan); -		goto again; -	case __constant_htons(ETH_P_PPP_SES): -		if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff)) -			goto done; -		proto = *((__be16 *) (skb->data + nhoff + -				      sizeof(struct pppoe_hdr))); -		nhoff += PPPOE_SES_HLEN; -		switch (proto) { -		case __constant_htons(PPP_IP): -			goto ip; -		case __constant_htons(PPP_IPV6): -			goto ipv6; -		default: -			goto done; -		} -	default: -		goto done; -	} - -	switch (ip_proto) { -	case IPPROTO_GRE: -		if (pskb_may_pull(skb, nhoff + 16)) { -			u8 *h = skb->data + nhoff; -			__be16 flags = *(__be16 *)h; +	struct flow_keys keys; +	u32 hash; -			/* -			 * Only look inside GRE if version zero and no -			 * routing -			 */ -			if (!(flags & (GRE_VERSION|GRE_ROUTING))) { -				proto = *(__be16 *)(h + 2); -				nhoff += 4; -				if (flags & GRE_CSUM) -					nhoff += 4; -				if (flags & GRE_KEY) -					nhoff += 4; -				if (flags & GRE_SEQ) -					nhoff += 4; -				goto again; -			} -		} -		break; -	case IPPROTO_IPIP: -		goto again; -	default: -		break; -	} +	if (!skb_flow_dissect(skb, &keys)) +		return; -	ports.v32 = 0; -	poff = proto_ports_offset(ip_proto); -	if (poff >= 0) { -		nhoff += poff; -		if (pskb_may_pull(skb, nhoff + 4)) { -			ports.v32 = * (__force u32 *) (skb->data + nhoff); -			if (ports.v16[1] < ports.v16[0]) -				swap(ports.v16[0], ports.v16[1]); -			skb->l4_rxhash = 1; -		} +	if (keys.ports) { +		if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]) +			swap(keys.port16[0], keys.port16[1]); +		skb->l4_rxhash = 1;  	}  	/* get a consistent hash (same value on both flow directions) */ -	if (addr2 < addr1) -		swap(addr1, addr2); +	if ((__force u32)keys.dst < (__force u32)keys.src) +		swap(keys.dst, keys.src); -	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); +	hash = jhash_3words((__force u32)keys.dst, +			    (__force u32)keys.src, +			    (__force u32)keys.ports, hashrnd);  	if (!hash)  		hash = 1; -done:  	skb->rxhash = hash;  }  EXPORT_SYMBOL(__skb_get_rxhash); @@ -2719,6 +2652,8 @@ EXPORT_SYMBOL(__skb_get_rxhash);  struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;  EXPORT_SYMBOL(rps_sock_flow_table); +struct jump_label_key rps_needed __read_mostly; +  static struct rps_dev_flow *  set_rps_cpu(struct net_device *dev, struct sk_buff *skb,  	    struct rps_dev_flow *rflow, u16 next_cpu) @@ -2998,12 +2933,11 @@ int netif_rx(struct sk_buff *skb)  	if (netpoll_rx(skb))  		return NET_RX_DROP; -	if (netdev_tstamp_prequeue) -		net_timestamp_check(skb); +	net_timestamp_check(netdev_tstamp_prequeue, skb);  	trace_netif_rx(skb);  #ifdef CONFIG_RPS -	{ +	if (static_branch(&rps_needed))	{  		struct rps_dev_flow voidflow, *rflow = &voidflow;  		int cpu; @@ -3018,14 +2952,13 @@ int netif_rx(struct sk_buff *skb)  		rcu_read_unlock();  		preempt_enable(); -	} -#else +	} else +#endif  	{  		unsigned int qtail;  		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);  		put_cpu();  	} -#endif  	
return ret;  }  EXPORT_SYMBOL(netif_rx); @@ -3231,8 +3164,7 @@ static int __netif_receive_skb(struct sk_buff *skb)  	int ret = NET_RX_DROP;  	__be16 type; -	if (!netdev_tstamp_prequeue) -		net_timestamp_check(skb); +	net_timestamp_check(!netdev_tstamp_prequeue, skb);  	trace_netif_receive_skb(skb); @@ -3363,14 +3295,13 @@ out:   */  int netif_receive_skb(struct sk_buff *skb)  { -	if (netdev_tstamp_prequeue) -		net_timestamp_check(skb); +	net_timestamp_check(netdev_tstamp_prequeue, skb);  	if (skb_defer_rx_timestamp(skb))  		return NET_RX_SUCCESS;  #ifdef CONFIG_RPS -	{ +	if (static_branch(&rps_needed)) {  		struct rps_dev_flow voidflow, *rflow = &voidflow;  		int cpu, ret; @@ -3381,16 +3312,12 @@ int netif_receive_skb(struct sk_buff *skb)  		if (cpu >= 0) {  			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);  			rcu_read_unlock(); -		} else { -			rcu_read_unlock(); -			ret = __netif_receive_skb(skb); +			return ret;  		} - -		return ret; +		rcu_read_unlock();  	} -#else -	return __netif_receive_skb(skb);  #endif +	return __netif_receive_skb(skb);  }  EXPORT_SYMBOL(netif_receive_skb); @@ -4539,7 +4466,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)  static int __dev_set_promiscuity(struct net_device *dev, int inc)  { -	unsigned short old_flags = dev->flags; +	unsigned int old_flags = dev->flags;  	uid_t uid;  	gid_t gid; @@ -4596,7 +4523,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)   */  int dev_set_promiscuity(struct net_device *dev, int inc)  { -	unsigned short old_flags = dev->flags; +	unsigned int old_flags = dev->flags;  	int err;  	err = __dev_set_promiscuity(dev, inc); @@ -4623,7 +4550,7 @@ EXPORT_SYMBOL(dev_set_promiscuity);  int dev_set_allmulti(struct net_device *dev, int inc)  { -	unsigned short old_flags = dev->flags; +	unsigned int old_flags = dev->flags;  	ASSERT_RTNL(); @@ -4726,7 +4653,7 @@ EXPORT_SYMBOL(dev_get_flags);  int __dev_change_flags(struct net_device *dev, unsigned int flags)  { -	int old_flags = dev->flags; +	unsigned int old_flags = dev->flags;  	int ret;  	ASSERT_RTNL(); @@ -4809,10 +4736,10 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)   *	Change settings on device based state flags. The flags are   *	in the userspace exported format.   */ -int dev_change_flags(struct net_device *dev, unsigned flags) +int dev_change_flags(struct net_device *dev, unsigned int flags)  { -	int ret, changes; -	int old_flags = dev->flags; +	int ret; +	unsigned int changes, old_flags = dev->flags;  	ret = __dev_change_flags(dev, flags);  	if (ret < 0) @@ -5369,7 +5296,8 @@ static void rollback_registered(struct net_device *dev)  	list_del(&single);  } -static u32 netdev_fix_features(struct net_device *dev, u32 features) +static netdev_features_t netdev_fix_features(struct net_device *dev, +	netdev_features_t features)  {  	/* Fix illegal checksum combinations */  	if ((features & NETIF_F_HW_CSUM) && @@ -5378,12 +5306,6 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features)  		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);  	} -	if ((features & NETIF_F_NO_CSUM) && -	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { -		netdev_warn(dev, "mixed no checksumming and other settings.\n"); -		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); -	} -  	/* Fix illegal SG+CSUM combinations. 
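
netdev_fix_features() above enforces dependencies between offloads rather than trusting drivers to request sane combinations; with NETIF_F_NO_CSUM gone, the illegal-checksum case reduces to generic plus protocol-specific checksumming. A toy version of such a fixup pass over a cut-down feature set (the rules below paraphrase, not reproduce, the kernel's list):

#include <stdio.h>
#include <stdint.h>

#define F_SG       (1u << 0)  /* scatter-gather */
#define F_IP_CSUM  (1u << 1)  /* protocol-specific tx checksum */
#define F_HW_CSUM  (1u << 2)  /* generic tx checksum */
#define F_TSO      (1u << 3)  /* TCP segmentation offload */
#define F_ALL_CSUM (F_IP_CSUM | F_HW_CSUM)

static uint32_t fix_features(uint32_t features)
{
	/* Generic checksumming subsumes the protocol-specific kind. */
	if ((features & F_HW_CSUM) && (features & F_IP_CSUM))
		features &= ~F_IP_CSUM;

	/* SG without any checksum offload is illegal: drop SG. */
	if ((features & F_SG) && !(features & F_ALL_CSUM))
		features &= ~F_SG;

	/* TSO depends on SG. */
	if ((features & F_TSO) && !(features & F_SG))
		features &= ~F_TSO;

	return features;
}

int main(void)
{
	/* TSO requested without checksum offload: SG falls, then TSO. */
	printf("%#x -> %#x\n", F_SG | F_TSO, fix_features(F_SG | F_TSO));
	return 0;
}
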
*/  	if ((features & NETIF_F_SG) &&  	    !(features & NETIF_F_ALL_CSUM)) { @@ -5431,7 +5353,7 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features)  int __netdev_update_features(struct net_device *dev)  { -	u32 features; +	netdev_features_t features;  	int err = 0;  	ASSERT_RTNL(); @@ -5447,16 +5369,16 @@ int __netdev_update_features(struct net_device *dev)  	if (dev->features == features)  		return 0; -	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n", -		dev->features, features); +	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", +		&dev->features, &features);  	if (dev->netdev_ops->ndo_set_features)  		err = dev->netdev_ops->ndo_set_features(dev, features);  	if (unlikely(err < 0)) {  		netdev_err(dev, -			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n", -			err, features, dev->features); +			"set_features() failed (%d); wanted %pNF, left %pNF\n", +			err, &features, &dev->features);  		return -1;  	} @@ -5555,6 +5477,9 @@ static void netdev_init_one_queue(struct net_device *dev,  	queue->xmit_lock_owner = -1;  	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);  	queue->dev = dev; +#ifdef CONFIG_BQL +	dql_init(&queue->dql, HZ); +#endif  }  static int netif_alloc_netdev_queues(struct net_device *dev) @@ -5640,11 +5565,12 @@ int register_netdevice(struct net_device *dev)  	dev->wanted_features = dev->features & dev->hw_features;  	/* Turn on no cache copy if HW is doing checksum */ -	dev->hw_features |= NETIF_F_NOCACHE_COPY; -	if ((dev->features & NETIF_F_ALL_CSUM) && -	    !(dev->features & NETIF_F_NO_CSUM)) { -		dev->wanted_features |= NETIF_F_NOCACHE_COPY; -		dev->features |= NETIF_F_NOCACHE_COPY; +	if (!(dev->flags & IFF_LOOPBACK)) { +		dev->hw_features |= NETIF_F_NOCACHE_COPY; +		if (dev->features & NETIF_F_ALL_CSUM) { +			dev->wanted_features |= NETIF_F_NOCACHE_COPY; +			dev->features |= NETIF_F_NOCACHE_COPY; +		}  	}  	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices. @@ -6380,7 +6306,8 @@ static int dev_cpu_callback(struct notifier_block *nfb,   *	@one to the master device with current feature set @all.  Will not   *	enable anything that is off in @mask. Returns the new feature set.   */ -u32 netdev_increment_features(u32 all, u32 one, u32 mask) +netdev_features_t netdev_increment_features(netdev_features_t all, +	netdev_features_t one, netdev_features_t mask)  {  	if (mask & NETIF_F_GEN_CSUM)  		mask |= NETIF_F_ALL_CSUM; @@ -6389,10 +6316,6 @@ u32 netdev_increment_features(u32 all, u32 one, u32 mask)  	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;  	all &= one | ~NETIF_F_ALL_FOR_ALL; -	/* If device needs checksumming, downgrade to it. */ -	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM)) -		all &= ~NETIF_F_NO_CSUM; -  	/* If one device supports hw checksumming, set for all. 
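
netdev_increment_features() above folds one slave's feature set into a bond or bridge master's: ONE_FOR_ALL bits are OR-ed in (one capable slave suffices), while ALL_FOR_ALL bits are AND-ed down (every slave must have them). A sketch of that merge with two hypothetical slaves and made-up bits:

#include <stdio.h>
#include <stdint.h>

#define F_GSO     (1u << 0)   /* ONE_FOR_ALL: any slave suffices   */
#define F_HIGHDMA (1u << 1)   /* ALL_FOR_ALL: every slave needed   */
#define ONE_FOR_ALL F_GSO
#define ALL_FOR_ALL F_HIGHDMA

/* Fold one slave's features into the master's running set. */
static uint32_t increment_features(uint32_t all, uint32_t one, uint32_t mask)
{
	all |= one & ONE_FOR_ALL & mask;  /* OR in the "any slave" bits      */
	all &= one | ~ALL_FOR_ALL;        /* AND down the "every slave" bits */
	return all;
}

int main(void)
{
	uint32_t mask = ~0u;
	uint32_t all = F_HIGHDMA;                    /* master starting point */

	all = increment_features(all, F_GSO | F_HIGHDMA, mask); /* slave 1 */
	all = increment_features(all, F_GSO, mask);  /* slave 2: no HIGHDMA   */

	/* GSO survives (one slave had it); HIGHDMA is lost (slave 2 lacks it). */
	printf("merged = %#x (GSO=%d HIGHDMA=%d)\n", all,
	       !!(all & F_GSO), !!(all & F_HIGHDMA));
	return 0;
}
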
*/  	if (all & NETIF_F_GEN_CSUM)  		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); diff --git a/net/core/dst.c b/net/core/dst.c index d5e2c4c0910..43d94cedbf7 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -366,7 +366,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,  		dev_hold(dst->dev);  		dev_put(dev);  		rcu_read_lock(); -		neigh = dst_get_neighbour(dst); +		neigh = dst_get_neighbour_noref(dst);  		if (neigh && neigh->dev == dev) {  			neigh->dev = dst->dev;  			dev_hold(dst->dev); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f4448170712..597732c989c 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -36,235 +36,44 @@ u32 ethtool_op_get_link(struct net_device *dev)  }  EXPORT_SYMBOL(ethtool_op_get_link); -u32 ethtool_op_get_tx_csum(struct net_device *dev) -{ -	return (dev->features & NETIF_F_ALL_CSUM) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_tx_csum); - -int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) -{ -	if (data) -		dev->features |= NETIF_F_IP_CSUM; -	else -		dev->features &= ~NETIF_F_IP_CSUM; - -	return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tx_csum); - -int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data) -{ -	if (data) -		dev->features |= NETIF_F_HW_CSUM; -	else -		dev->features &= ~NETIF_F_HW_CSUM; - -	return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); - -int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data) -{ -	if (data) -		dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; -	else -		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); - -	return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum); - -u32 ethtool_op_get_sg(struct net_device *dev) -{ -	return (dev->features & NETIF_F_SG) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_sg); - -int ethtool_op_set_sg(struct net_device *dev, u32 data) -{ -	if (data) -		dev->features |= NETIF_F_SG; -	else -		dev->features &= ~NETIF_F_SG; - -	return 0; -} -EXPORT_SYMBOL(ethtool_op_set_sg); - -u32 ethtool_op_get_tso(struct net_device *dev) -{ -	return (dev->features & NETIF_F_TSO) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_tso); - -int ethtool_op_set_tso(struct net_device *dev, u32 data) -{ -	if (data) -		dev->features |= NETIF_F_TSO; -	else -		dev->features &= ~NETIF_F_TSO; - -	return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tso); - -u32 ethtool_op_get_ufo(struct net_device *dev) -{ -	return (dev->features & NETIF_F_UFO) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_ufo); - -int ethtool_op_set_ufo(struct net_device *dev, u32 data) -{ -	if (data) -		dev->features |= NETIF_F_UFO; -	else -		dev->features &= ~NETIF_F_UFO; -	return 0; -} -EXPORT_SYMBOL(ethtool_op_set_ufo); - -/* the following list of flags are the same as their associated - * NETIF_F_xxx values in include/linux/netdevice.h - */ -static const u32 flags_dup_features = -	(ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE | -	 ETH_FLAG_RXHASH); - -u32 ethtool_op_get_flags(struct net_device *dev) -{ -	/* in the future, this function will probably contain additional -	 * handling for flags which are not so easily handled -	 * by a simple masking operation -	 */ - -	return dev->features & flags_dup_features; -} -EXPORT_SYMBOL(ethtool_op_get_flags); - -/* Check if device can enable (or disable) particular feature coded in "data" - * argument. Flags "supported" describe features that can be toggled by device. - * If feature can not be toggled, it state (enabled or disabled) must match - * hardcoded device features state, otherwise flags are marked as invalid. 
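
The legacy ETH_FLAG_* helpers deleted above do not take the userspace ABI with them: further down, __ethtool_get_flags() and __ethtool_set_flags() reimplement ETHTOOL_GFLAGS/ETHTOOL_SFLAGS as a plain translation between flag bits and feature bits. The mapping in isolation, shown for two of the five flags (bit values illustrative):

#include <stdio.h>
#include <stdint.h>

/* Legacy ethtool flag bits (illustrative values). */
#define ETH_FLAG_LRO    (1u << 0)
#define ETH_FLAG_RXHASH (1u << 1)

/* New-style feature bits (illustrative values). */
#define F_LRO    (1u << 4)
#define F_RXHASH (1u << 9)

static uint32_t flags_to_features(uint32_t data)
{
	uint32_t features = 0;

	if (data & ETH_FLAG_LRO)
		features |= F_LRO;
	if (data & ETH_FLAG_RXHASH)
		features |= F_RXHASH;
	return features;
}

static uint32_t features_to_flags(uint32_t features)
{
	uint32_t flags = 0;

	if (features & F_LRO)
		flags |= ETH_FLAG_LRO;
	if (features & F_RXHASH)
		flags |= ETH_FLAG_RXHASH;
	return flags;
}

int main(void)
{
	uint32_t f = flags_to_features(ETH_FLAG_LRO | ETH_FLAG_RXHASH);

	/* Round-trips: both encodings carry the same offload information. */
	printf("features=%#x flags=%#x\n", f, features_to_flags(f));
	return 0;
}
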
- */ -bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported) -{ -	u32 features = dev->features & flags_dup_features; -	/* "data" can contain only flags_dup_features bits, -	 * see __ethtool_set_flags */ - -	return (features & ~supported) != (data & ~supported); -} -EXPORT_SYMBOL(ethtool_invalid_flags); - -int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported) -{ -	if (ethtool_invalid_flags(dev, data, supported)) -		return -EINVAL; - -	dev->features = ((dev->features & ~flags_dup_features) | -			 (data & flags_dup_features)); -	return 0; -} -EXPORT_SYMBOL(ethtool_op_set_flags); -  /* Handlers for each ethtool command */ -#define ETHTOOL_DEV_FEATURE_WORDS	1 - -static void ethtool_get_features_compat(struct net_device *dev, -	struct ethtool_get_features_block *features) -{ -	if (!dev->ethtool_ops) -		return; - -	/* getting RX checksum */ -	if (dev->ethtool_ops->get_rx_csum) -		if (dev->ethtool_ops->get_rx_csum(dev)) -			features[0].active |= NETIF_F_RXCSUM; - -	/* mark legacy-changeable features */ -	if (dev->ethtool_ops->set_sg) -		features[0].available |= NETIF_F_SG; -	if (dev->ethtool_ops->set_tx_csum) -		features[0].available |= NETIF_F_ALL_CSUM; -	if (dev->ethtool_ops->set_tso) -		features[0].available |= NETIF_F_ALL_TSO; -	if (dev->ethtool_ops->set_rx_csum) -		features[0].available |= NETIF_F_RXCSUM; -	if (dev->ethtool_ops->set_flags) -		features[0].available |= flags_dup_features; -} - -static int ethtool_set_feature_compat(struct net_device *dev, -	int (*legacy_set)(struct net_device *, u32), -	struct ethtool_set_features_block *features, u32 mask) -{ -	u32 do_set; - -	if (!legacy_set) -		return 0; - -	if (!(features[0].valid & mask)) -		return 0; - -	features[0].valid &= ~mask; - -	do_set = !!(features[0].requested & mask); - -	if (legacy_set(dev, do_set) < 0) -		netdev_info(dev, -			"Legacy feature change (%s) failed for 0x%08x\n", -			do_set ? 
"set" : "clear", mask); - -	return 1; -} - -static int ethtool_set_flags_compat(struct net_device *dev, -	int (*legacy_set)(struct net_device *, u32), -	struct ethtool_set_features_block *features, u32 mask) -{ -	u32 value; - -	if (!legacy_set) -		return 0; - -	if (!(features[0].valid & mask)) -		return 0; +#define ETHTOOL_DEV_FEATURE_WORDS	((NETDEV_FEATURE_COUNT + 31) / 32) -	value = dev->features & ~features[0].valid; -	value |= features[0].requested; +static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { +	[NETIF_F_SG_BIT] =               "tx-scatter-gather", +	[NETIF_F_IP_CSUM_BIT] =          "tx-checksum-ipv4", +	[NETIF_F_HW_CSUM_BIT] =          "tx-checksum-ip-generic", +	[NETIF_F_IPV6_CSUM_BIT] =        "tx-checksum-ipv6", +	[NETIF_F_HIGHDMA_BIT] =          "highdma", +	[NETIF_F_FRAGLIST_BIT] =         "tx-scatter-gather-fraglist", +	[NETIF_F_HW_VLAN_TX_BIT] =       "tx-vlan-hw-insert", -	features[0].valid &= ~mask; - -	if (legacy_set(dev, value & mask) < 0) -		netdev_info(dev, "Legacy flags change failed\n"); - -	return 1; -} - -static int ethtool_set_features_compat(struct net_device *dev, -	struct ethtool_set_features_block *features) -{ -	int compat; - -	if (!dev->ethtool_ops) -		return 0; +	[NETIF_F_HW_VLAN_RX_BIT] =       "rx-vlan-hw-parse", +	[NETIF_F_HW_VLAN_FILTER_BIT] =   "rx-vlan-filter", +	[NETIF_F_VLAN_CHALLENGED_BIT] =  "vlan-challenged", +	[NETIF_F_GSO_BIT] =              "tx-generic-segmentation", +	[NETIF_F_LLTX_BIT] =             "tx-lockless", +	[NETIF_F_NETNS_LOCAL_BIT] =      "netns-local", +	[NETIF_F_GRO_BIT] =              "rx-gro", +	[NETIF_F_LRO_BIT] =              "rx-lro", -	compat  = ethtool_set_feature_compat(dev, dev->ethtool_ops->set_sg, -		features, NETIF_F_SG); -	compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tx_csum, -		features, NETIF_F_ALL_CSUM); -	compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tso, -		features, NETIF_F_ALL_TSO); -	compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_rx_csum, -		features, NETIF_F_RXCSUM); -	compat |= ethtool_set_flags_compat(dev, dev->ethtool_ops->set_flags, -		features, flags_dup_features); +	[NETIF_F_TSO_BIT] =              "tx-tcp-segmentation", +	[NETIF_F_UFO_BIT] =              "tx-udp-fragmentation", +	[NETIF_F_GSO_ROBUST_BIT] =       "tx-gso-robust", +	[NETIF_F_TSO_ECN_BIT] =          "tx-tcp-ecn-segmentation", +	[NETIF_F_TSO6_BIT] =             "tx-tcp6-segmentation", +	[NETIF_F_FSO_BIT] =              "tx-fcoe-segmentation", -	return compat; -} +	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc", +	[NETIF_F_SCTP_CSUM_BIT] =        "tx-checksum-sctp", +	[NETIF_F_FCOE_MTU_BIT] =         "fcoe-mtu", +	[NETIF_F_NTUPLE_BIT] =           "rx-ntuple-filter", +	[NETIF_F_RXHASH_BIT] =           "rx-hashing", +	[NETIF_F_RXCSUM_BIT] =           "rx-checksum", +	[NETIF_F_NOCACHE_COPY_BIT] =     "tx-nocache-copy", +	[NETIF_F_LOOPBACK_BIT] =         "loopback", +};  static int ethtool_get_features(struct net_device *dev, void __user *useraddr)  { @@ -272,18 +81,21 @@ static int ethtool_get_features(struct net_device *dev, void __user *useraddr)  		.cmd = ETHTOOL_GFEATURES,  		.size = ETHTOOL_DEV_FEATURE_WORDS,  	}; -	struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = { -		{ -			.available = dev->hw_features, -			.requested = dev->wanted_features, -			.active = dev->features, -			.never_changed = NETIF_F_NEVER_CHANGE, -		}, -	}; +	struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS];  	u32 __user 
*sizeaddr;  	u32 copy_size; +	int i; -	ethtool_get_features_compat(dev, features); +	/* in case feature bits run out again */ +	BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); + +	for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { +		features[i].available = (u32)(dev->hw_features >> (32 * i)); +		features[i].requested = (u32)(dev->wanted_features >> (32 * i)); +		features[i].active = (u32)(dev->features >> (32 * i)); +		features[i].never_changed = +			(u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); +	}  	sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);  	if (get_user(copy_size, sizeaddr)) @@ -305,7 +117,8 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)  {  	struct ethtool_sfeatures cmd;  	struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; -	int ret = 0; +	netdev_features_t wanted = 0, valid = 0; +	int i, ret = 0;  	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))  		return -EFAULT; @@ -317,65 +130,29 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)  	if (copy_from_user(features, useraddr, sizeof(features)))  		return -EFAULT; -	if (features[0].valid & ~NETIF_F_ETHTOOL_BITS) -		return -EINVAL; +	for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { +		valid |= (netdev_features_t)features[i].valid << (32 * i); +		wanted |= (netdev_features_t)features[i].requested << (32 * i); +	} -	if (ethtool_set_features_compat(dev, features)) -		ret |= ETHTOOL_F_COMPAT; +	if (valid & ~NETIF_F_ETHTOOL_BITS) +		return -EINVAL; -	if (features[0].valid & ~dev->hw_features) { -		features[0].valid &= dev->hw_features; +	if (valid & ~dev->hw_features) { +		valid &= dev->hw_features;  		ret |= ETHTOOL_F_UNSUPPORTED;  	} -	dev->wanted_features &= ~features[0].valid; -	dev->wanted_features |= features[0].valid & features[0].requested; +	dev->wanted_features &= ~valid; +	dev->wanted_features |= wanted & valid;  	__netdev_update_features(dev); -	if ((dev->wanted_features ^ dev->features) & features[0].valid) +	if ((dev->wanted_features ^ dev->features) & valid)  		ret |= ETHTOOL_F_WISH;  	return ret;  } -static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = { -	/* NETIF_F_SG */              "tx-scatter-gather", -	/* NETIF_F_IP_CSUM */         "tx-checksum-ipv4", -	/* NETIF_F_NO_CSUM */         "tx-checksum-unneeded", -	/* NETIF_F_HW_CSUM */         "tx-checksum-ip-generic", -	/* NETIF_F_IPV6_CSUM */       "tx-checksum-ipv6", -	/* NETIF_F_HIGHDMA */         "highdma", -	/* NETIF_F_FRAGLIST */        "tx-scatter-gather-fraglist", -	/* NETIF_F_HW_VLAN_TX */      "tx-vlan-hw-insert", - -	/* NETIF_F_HW_VLAN_RX */      "rx-vlan-hw-parse", -	/* NETIF_F_HW_VLAN_FILTER */  "rx-vlan-filter", -	/* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged", -	/* NETIF_F_GSO */             "tx-generic-segmentation", -	/* NETIF_F_LLTX */            "tx-lockless", -	/* NETIF_F_NETNS_LOCAL */     "netns-local", -	/* NETIF_F_GRO */             "rx-gro", -	/* NETIF_F_LRO */             "rx-lro", - -	/* NETIF_F_TSO */             "tx-tcp-segmentation", -	/* NETIF_F_UFO */             "tx-udp-fragmentation", -	/* NETIF_F_GSO_ROBUST */      "tx-gso-robust", -	/* NETIF_F_TSO_ECN */         "tx-tcp-ecn-segmentation", -	/* NETIF_F_TSO6 */            "tx-tcp6-segmentation", -	/* NETIF_F_FSO */             "tx-fcoe-segmentation", -	"", -	"", - -	/* NETIF_F_FCOE_CRC */        "tx-checksum-fcoe-crc", -	/* NETIF_F_SCTP_CSUM */       "tx-checksum-sctp", -	/* NETIF_F_FCOE_MTU */        "fcoe-mtu", -	
/* NETIF_F_NTUPLE */          "rx-ntuple-filter", -	/* NETIF_F_RXHASH */          "rx-hashing", -	/* NETIF_F_RXCSUM */          "rx-checksum", -	/* NETIF_F_NOCACHE_COPY */    "tx-nocache-copy", -	/* NETIF_F_LOOPBACK */        "loopback", -}; -  static int __ethtool_get_sset_count(struct net_device *dev, int sset)  {  	const struct ethtool_ops *ops = dev->ethtool_ops; @@ -402,7 +179,7 @@ static void __ethtool_get_strings(struct net_device *dev,  		ops->get_strings(dev, stringset, data);  } -static u32 ethtool_get_feature_mask(u32 eth_cmd) +static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)  {  	/* feature masks of legacy discrete ethtool ops */ @@ -433,136 +210,82 @@ static u32 ethtool_get_feature_mask(u32 eth_cmd)  	}  } -static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd) -{ -	const struct ethtool_ops *ops = dev->ethtool_ops; - -	if (!ops) -		return NULL; - -	switch (ethcmd) { -	case ETHTOOL_GTXCSUM: -		return ops->get_tx_csum; -	case ETHTOOL_GRXCSUM: -		return ops->get_rx_csum; -	case ETHTOOL_SSG: -		return ops->get_sg; -	case ETHTOOL_STSO: -		return ops->get_tso; -	case ETHTOOL_SUFO: -		return ops->get_ufo; -	default: -		return NULL; -	} -} - -static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev) -{ -	return !!(dev->features & NETIF_F_ALL_CSUM); -} -  static int ethtool_get_one_feature(struct net_device *dev,  	char __user *useraddr, u32 ethcmd)  { -	u32 mask = ethtool_get_feature_mask(ethcmd); +	netdev_features_t mask = ethtool_get_feature_mask(ethcmd);  	struct ethtool_value edata = {  		.cmd = ethcmd,  		.data = !!(dev->features & mask),  	}; -	/* compatibility with discrete get_ ops */ -	if (!(dev->hw_features & mask)) { -		u32 (*actor)(struct net_device *); - -		actor = __ethtool_get_one_feature_actor(dev, ethcmd); - -		/* bug compatibility with old get_rx_csum */ -		if (ethcmd == ETHTOOL_GRXCSUM && !actor) -			actor = __ethtool_get_rx_csum_oldbug; - -		if (actor) -			edata.data = actor(dev); -	} -  	if (copy_to_user(useraddr, &edata, sizeof(edata)))  		return -EFAULT;  	return 0;  } -static int __ethtool_set_tx_csum(struct net_device *dev, u32 data); -static int __ethtool_set_rx_csum(struct net_device *dev, u32 data); -static int __ethtool_set_sg(struct net_device *dev, u32 data); -static int __ethtool_set_tso(struct net_device *dev, u32 data); -static int __ethtool_set_ufo(struct net_device *dev, u32 data); -  static int ethtool_set_one_feature(struct net_device *dev,  	void __user *useraddr, u32 ethcmd)  {  	struct ethtool_value edata; -	u32 mask; +	netdev_features_t mask;  	if (copy_from_user(&edata, useraddr, sizeof(edata)))  		return -EFAULT;  	mask = ethtool_get_feature_mask(ethcmd);  	mask &= dev->hw_features; -	if (mask) { -		if (edata.data) -			dev->wanted_features |= mask; -		else -			dev->wanted_features &= ~mask; +	if (!mask) +		return -EOPNOTSUPP; -		__netdev_update_features(dev); -		return 0; -	} +	if (edata.data) +		dev->wanted_features |= mask; +	else +		dev->wanted_features &= ~mask; -	/* Driver is not converted to ndo_fix_features or does not -	 * support changing this offload. In the latter case it won't -	 * have corresponding ethtool_ops field set. -	 * -	 * Following part is to be removed after all drivers advertise -	 * their changeable features in netdev->hw_features and stop -	 * using discrete offload setting ops. 
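
ETHTOOL_DEV_FEATURE_WORDS above sizes the ETHTOOL_GFEATURES/SFEATURES ABI in 32-bit blocks, so a netdev_features_t wider than 32 bits still round-trips: ethtool_get_features() shifts each word out, and ethtool_set_features() shifts them back in. The same split-and-join in a standalone sketch:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t netdev_features_t;

#define NETDEV_FEATURE_COUNT 40  /* pretend value; anything > 32 works */
#define FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32)

int main(void)
{
	netdev_features_t features = 0x123456789aULL;  /* 40-bit pattern */
	uint32_t words[FEATURE_WORDS];
	netdev_features_t back = 0;
	int i;

	/* Split into 32-bit words, as ethtool_get_features() does. */
	for (i = 0; i < FEATURE_WORDS; i++)
		words[i] = (uint32_t)(features >> (32 * i));

	/* Reassemble, as ethtool_set_features() does. */
	for (i = 0; i < FEATURE_WORDS; i++)
		back |= (netdev_features_t)words[i] << (32 * i);

	printf("%#llx -> {%#x, %#x} -> %#llx\n",
	       (unsigned long long)features, words[0], words[1],
	       (unsigned long long)back);
	return 0;
}
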
-	 */ +	__netdev_update_features(dev); -	switch (ethcmd) { -	case ETHTOOL_STXCSUM: -		return __ethtool_set_tx_csum(dev, edata.data); -	case ETHTOOL_SRXCSUM: -		return __ethtool_set_rx_csum(dev, edata.data); -	case ETHTOOL_SSG: -		return __ethtool_set_sg(dev, edata.data); -	case ETHTOOL_STSO: -		return __ethtool_set_tso(dev, edata.data); -	case ETHTOOL_SUFO: -		return __ethtool_set_ufo(dev, edata.data); -	default: -		return -EOPNOTSUPP; -	} +	return 0;  } -int __ethtool_set_flags(struct net_device *dev, u32 data) +#define ETH_ALL_FLAGS    (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ +			  ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_RX | \ +			  NETIF_F_HW_VLAN_TX | NETIF_F_NTUPLE | NETIF_F_RXHASH) + +static u32 __ethtool_get_flags(struct net_device *dev)  { -	u32 changed; +	u32 flags = 0; + +	if (dev->features & NETIF_F_LRO)	flags |= ETH_FLAG_LRO; +	if (dev->features & NETIF_F_HW_VLAN_RX)	flags |= ETH_FLAG_RXVLAN; +	if (dev->features & NETIF_F_HW_VLAN_TX)	flags |= ETH_FLAG_TXVLAN; +	if (dev->features & NETIF_F_NTUPLE)	flags |= ETH_FLAG_NTUPLE; +	if (dev->features & NETIF_F_RXHASH)	flags |= ETH_FLAG_RXHASH; + +	return flags; +} -	if (data & ~flags_dup_features) +static int __ethtool_set_flags(struct net_device *dev, u32 data) +{ +	netdev_features_t features = 0, changed; + +	if (data & ~ETH_ALL_FLAGS)  		return -EINVAL; -	/* legacy set_flags() op */ -	if (dev->ethtool_ops->set_flags) { -		if (unlikely(dev->hw_features & flags_dup_features)) -			netdev_warn(dev, -				"driver BUG: mixed hw_features and set_flags()\n"); -		return dev->ethtool_ops->set_flags(dev, data); -	} +	if (data & ETH_FLAG_LRO)	features |= NETIF_F_LRO; +	if (data & ETH_FLAG_RXVLAN)	features |= NETIF_F_HW_VLAN_RX; +	if (data & ETH_FLAG_TXVLAN)	features |= NETIF_F_HW_VLAN_TX; +	if (data & ETH_FLAG_NTUPLE)	features |= NETIF_F_NTUPLE; +	if (data & ETH_FLAG_RXHASH)	features |= NETIF_F_RXHASH;  	/* allow changing only bits set in hw_features */ -	changed = (data ^ dev->features) & flags_dup_features; +	changed = (features ^ dev->features) & ETH_ALL_FEATURES;  	if (changed & ~dev->hw_features)  		return (changed & dev->hw_features) ? 
-EINVAL : -EOPNOTSUPP;  	dev->wanted_features = -		(dev->wanted_features & ~changed) | (data & dev->hw_features); +		(dev->wanted_features & ~changed) | (features & changed);  	__netdev_update_features(dev); @@ -792,34 +515,44 @@ err_out:  static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,  						     void __user *useraddr)  { -	struct ethtool_rxfh_indir *indir; -	u32 table_size; -	size_t full_size; +	u32 user_size, dev_size; +	u32 *indir;  	int ret; -	if (!dev->ethtool_ops->get_rxfh_indir) +	if (!dev->ethtool_ops->get_rxfh_indir_size || +	    !dev->ethtool_ops->get_rxfh_indir) +		return -EOPNOTSUPP; +	dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); +	if (dev_size == 0)  		return -EOPNOTSUPP; -	if (copy_from_user(&table_size, +	if (copy_from_user(&user_size,  			   useraddr + offsetof(struct ethtool_rxfh_indir, size), -			   sizeof(table_size))) +			   sizeof(user_size)))  		return -EFAULT; -	if (table_size > -	    (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) -		return -ENOMEM; -	full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; -	indir = kzalloc(full_size, GFP_USER); +	if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), +			 &dev_size, sizeof(dev_size))) +		return -EFAULT; + +	/* If the user buffer size is 0, this is just a query for the +	 * device table size.  Otherwise, if it's smaller than the +	 * device table size it's an error. +	 */ +	if (user_size < dev_size) +		return user_size == 0 ? 0 : -EINVAL; + +	indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);  	if (!indir)  		return -ENOMEM; -	indir->cmd = ETHTOOL_GRXFHINDIR; -	indir->size = table_size;  	ret = dev->ethtool_ops->get_rxfh_indir(dev, indir);  	if (ret)  		goto out; -	if (copy_to_user(useraddr, indir, full_size)) +	if (copy_to_user(useraddr + +			 offsetof(struct ethtool_rxfh_indir, ring_index[0]), +			 indir, dev_size * sizeof(indir[0])))  		ret = -EFAULT;  out: @@ -830,30 +563,56 @@ out:  static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,  						     void __user *useraddr)  { -	struct ethtool_rxfh_indir *indir; -	u32 table_size; -	size_t full_size; +	struct ethtool_rxnfc rx_rings; +	u32 user_size, dev_size, i; +	u32 *indir;  	int ret; -	if (!dev->ethtool_ops->set_rxfh_indir) +	if (!dev->ethtool_ops->get_rxfh_indir_size || +	    !dev->ethtool_ops->set_rxfh_indir || +	    !dev->ethtool_ops->get_rxnfc) +		return -EOPNOTSUPP; +	dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); +	if (dev_size == 0)  		return -EOPNOTSUPP; -	if (copy_from_user(&table_size, +	if (copy_from_user(&user_size,  			   useraddr + offsetof(struct ethtool_rxfh_indir, size), -			   sizeof(table_size))) +			   sizeof(user_size)))  		return -EFAULT; -	if (table_size > -	    (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) -		return -ENOMEM; -	full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; -	indir = kmalloc(full_size, GFP_USER); +	if (user_size != 0 && user_size != dev_size) +		return -EINVAL; + +	indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);  	if (!indir)  		return -ENOMEM; -	if (copy_from_user(indir, useraddr, full_size)) { -		ret = -EFAULT; +	rx_rings.cmd = ETHTOOL_GRXRINGS; +	ret = dev->ethtool_ops->get_rxnfc(dev, &rx_rings, NULL); +	if (ret)  		goto out; + +	if (user_size == 0) { +		for (i = 0; i < dev_size; i++) +			indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); +	} else { +		if (copy_from_user(indir, +				  useraddr + +				  offsetof(struct ethtool_rxfh_indir, +					 
  ring_index[0]), +				  dev_size * sizeof(indir[0]))) { +			ret = -EFAULT; +			goto out; +		} + +		/* Validate ring indices */ +		for (i = 0; i < dev_size; i++) { +			if (indir[i] >= rx_rings.data) { +				ret = -EINVAL; +				goto out; +			} +		}  	}  	ret = dev->ethtool_ops->set_rxfh_indir(dev, indir); @@ -1231,81 +990,6 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)  	return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);  } -static int __ethtool_set_sg(struct net_device *dev, u32 data) -{ -	int err; - -	if (!dev->ethtool_ops->set_sg) -		return -EOPNOTSUPP; - -	if (data && !(dev->features & NETIF_F_ALL_CSUM)) -		return -EINVAL; - -	if (!data && dev->ethtool_ops->set_tso) { -		err = dev->ethtool_ops->set_tso(dev, 0); -		if (err) -			return err; -	} - -	if (!data && dev->ethtool_ops->set_ufo) { -		err = dev->ethtool_ops->set_ufo(dev, 0); -		if (err) -			return err; -	} -	return dev->ethtool_ops->set_sg(dev, data); -} - -static int __ethtool_set_tx_csum(struct net_device *dev, u32 data) -{ -	int err; - -	if (!dev->ethtool_ops->set_tx_csum) -		return -EOPNOTSUPP; - -	if (!data && dev->ethtool_ops->set_sg) { -		err = __ethtool_set_sg(dev, 0); -		if (err) -			return err; -	} - -	return dev->ethtool_ops->set_tx_csum(dev, data); -} - -static int __ethtool_set_rx_csum(struct net_device *dev, u32 data) -{ -	if (!dev->ethtool_ops->set_rx_csum) -		return -EOPNOTSUPP; - -	if (!data) -		dev->features &= ~NETIF_F_GRO; - -	return dev->ethtool_ops->set_rx_csum(dev, data); -} - -static int __ethtool_set_tso(struct net_device *dev, u32 data) -{ -	if (!dev->ethtool_ops->set_tso) -		return -EOPNOTSUPP; - -	if (data && !(dev->features & NETIF_F_SG)) -		return -EINVAL; - -	return dev->ethtool_ops->set_tso(dev, data); -} - -static int __ethtool_set_ufo(struct net_device *dev, u32 data) -{ -	if (!dev->ethtool_ops->set_ufo) -		return -EOPNOTSUPP; -	if (data && !(dev->features & NETIF_F_SG)) -		return -EINVAL; -	if (data && !((dev->features & NETIF_F_GEN_CSUM) || -		(dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) -			== (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) -		return -EINVAL; -	return dev->ethtool_ops->set_ufo(dev, data); -} -  static int ethtool_self_test(struct net_device *dev, char __user *useraddr)  {  	struct ethtool_test test; @@ -1771,9 +1455,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  		break;  	case ETHTOOL_GFLAGS:  		rc = ethtool_get_value(dev, useraddr, ethcmd, -				       (dev->ethtool_ops->get_flags ? 
-					dev->ethtool_ops->get_flags : -					ethtool_op_get_flags)); +					__ethtool_get_flags);  		break;  	case ETHTOOL_SFLAGS:  		rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c new file mode 100644 index 00000000000..0985b9b14b8 --- /dev/null +++ b/net/core/flow_dissector.c @@ -0,0 +1,143 @@ +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <linux/if_tunnel.h> +#include <linux/if_pppox.h> +#include <linux/ppp_defs.h> +#include <net/flow_keys.h> + +/* copy saddr & daddr, possibly using 64bit load/store + * Equivalent to :	flow->src = iph->saddr; + *			flow->dst = iph->daddr; + */ +static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) +{ +	BUILD_BUG_ON(offsetof(typeof(*flow), dst) != +		     offsetof(typeof(*flow), src) + sizeof(flow->src)); +	memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); +} + +bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +{ +	int poff, nhoff = skb_network_offset(skb); +	u8 ip_proto; +	__be16 proto = skb->protocol; + +	memset(flow, 0, sizeof(*flow)); + +again: +	switch (proto) { +	case __constant_htons(ETH_P_IP): { +		const struct iphdr *iph; +		struct iphdr _iph; +ip: +		iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); +		if (!iph) +			return false; + +		if (ip_is_fragment(iph)) +			ip_proto = 0; +		else +			ip_proto = iph->protocol; +		iph_to_flow_copy_addrs(flow, iph); +		nhoff += iph->ihl * 4; +		break; +	} +	case __constant_htons(ETH_P_IPV6): { +		const struct ipv6hdr *iph; +		struct ipv6hdr _iph; +ipv6: +		iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); +		if (!iph) +			return false; + +		ip_proto = iph->nexthdr; +		flow->src = iph->saddr.s6_addr32[3]; +		flow->dst = iph->daddr.s6_addr32[3]; +		nhoff += sizeof(struct ipv6hdr); +		break; +	} +	case __constant_htons(ETH_P_8021Q): { +		const struct vlan_hdr *vlan; +		struct vlan_hdr _vlan; + +		vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); +		if (!vlan) +			return false; + +		proto = vlan->h_vlan_encapsulated_proto; +		nhoff += sizeof(*vlan); +		goto again; +	} +	case __constant_htons(ETH_P_PPP_SES): { +		struct { +			struct pppoe_hdr hdr; +			__be16 proto; +		} *hdr, _hdr; +		hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); +		if (!hdr) +			return false; +		proto = hdr->proto; +		nhoff += PPPOE_SES_HLEN; +		switch (proto) { +		case __constant_htons(PPP_IP): +			goto ip; +		case __constant_htons(PPP_IPV6): +			goto ipv6; +		default: +			return false; +		} +	} +	default: +		return false; +	} + +	switch (ip_proto) { +	case IPPROTO_GRE: { +		struct gre_hdr { +			__be16 flags; +			__be16 proto; +		} *hdr, _hdr; + +		hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); +		if (!hdr) +			return false; +		/* +		 * Only look inside GRE if version zero and no +		 * routing +		 */ +		if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { +			proto = hdr->proto; +			nhoff += 4; +			if (hdr->flags & GRE_CSUM) +				nhoff += 4; +			if (hdr->flags & GRE_KEY) +				nhoff += 4; +			if (hdr->flags & GRE_SEQ) +				nhoff += 4; +			goto again; +		} +		break; +	} +	case IPPROTO_IPIP: +		goto again; +	default: +		break; +	} + +	flow->ip_proto = ip_proto; +	poff = proto_ports_offset(ip_proto); +	if (poff >= 0) { +		__be32 *ports, _ports; + +		nhoff += poff; +		ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports); +		if (ports) +			flow->ports = 
*ports; +	} + +	return true; +} +EXPORT_SYMBOL(skb_flow_dissect); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5ac07d31fbc..4af151e1bf5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -238,6 +238,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)  				   it to safe state.  				 */  				skb_queue_purge(&n->arp_queue); +				n->arp_queue_len_bytes = 0;  				n->output = neigh_blackhole;  				if (n->nud_state & NUD_VALID)  					n->nud_state = NUD_NOARP; @@ -272,7 +273,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)  }  EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)  {  	struct neighbour *n = NULL;  	unsigned long now = jiffies; @@ -287,7 +288,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)  			goto out_entries;  	} -	n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC); +	if (tbl->entry_size) +		n = kzalloc(tbl->entry_size, GFP_ATOMIC); +	else { +		int sz = sizeof(*n) + tbl->key_len; + +		sz = ALIGN(sz, NEIGH_PRIV_ALIGN); +		sz += dev->neigh_priv_len; +		n = kzalloc(sz, GFP_ATOMIC); +	}  	if (!n)  		goto out_entries; @@ -462,7 +471,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,  	u32 hash_val;  	int key_len = tbl->key_len;  	int error; -	struct neighbour *n1, *rc, *n = neigh_alloc(tbl); +	struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);  	struct neigh_hash_table *nht;  	if (!n) { @@ -480,6 +489,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,  		goto out_neigh_release;  	} +	if (dev->netdev_ops->ndo_neigh_construct) { +		error = dev->netdev_ops->ndo_neigh_construct(n); +		if (error < 0) { +			rc = ERR_PTR(error); +			goto out_neigh_release; +		} +	} +  	/* Device specific setup. 
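
The new flow_dissector.c above factors the protocol walk out of __skb_get_rxhash(); the caller then canonicalizes the extracted keys (smaller port first, smaller address first, so both directions of a flow hash alike) before mixing them. A self-contained imitation of that canonicalize-and-hash step on pre-extracted IPv4 keys; the mixer below is a stand-in, not the kernel's jhash_3words():

#include <stdio.h>
#include <stdint.h>

struct flow_keys {           /* loosely mirrors the kernel struct */
	uint32_t src, dst;   /* IPv4 addresses as raw 32-bit values */
	uint16_t port16[2];  /* source, destination ports           */
	uint8_t  ip_proto;
};

/* Stand-in mixer; the kernel uses jhash_3words() with a random seed. */
static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c)
{
	uint32_t h = a * 0x9e3779b1u ^ b;

	h = (h << 13 | h >> 19) + c;
	return h ? h : 1;     /* like the kernel, never return 0 */
}

static uint32_t flow_hash(struct flow_keys k)
{
	uint16_t tmp16;
	uint32_t tmp32, ports;

	/* Canonical order: smaller port first, then smaller address first. */
	if (k.port16[1] < k.port16[0]) {
		tmp16 = k.port16[0]; k.port16[0] = k.port16[1]; k.port16[1] = tmp16;
	}
	if (k.dst < k.src) {
		tmp32 = k.src; k.src = k.dst; k.dst = tmp32;
	}
	ports = (uint32_t)k.port16[0] << 16 | k.port16[1];
	return mix3(k.dst, k.src, ports);
}

int main(void)
{
	struct flow_keys a = { 0x0a000001, 0x0a000002, { 12345, 80 }, 6 };
	struct flow_keys b = { 0x0a000002, 0x0a000001, { 80, 12345 }, 6 };

	/* Same TCP flow seen from both directions: hashes must match. */
	printf("%08x %08x\n", flow_hash(a), flow_hash(b));
	return 0;
}
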
*/  	if (n->parms->neigh_setup &&  	    (error = n->parms->neigh_setup(n)) < 0) { @@ -677,18 +694,14 @@ static inline void neigh_parms_put(struct neigh_parms *parms)  		neigh_parms_destroy(parms);  } -static void neigh_destroy_rcu(struct rcu_head *head) -{ -	struct neighbour *neigh = container_of(head, struct neighbour, rcu); - -	kmem_cache_free(neigh->tbl->kmem_cachep, neigh); -}  /*   *	neighbour must already be out of the table;   *   */  void neigh_destroy(struct neighbour *neigh)  { +	struct net_device *dev = neigh->dev; +  	NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);  	if (!neigh->dead) { @@ -702,14 +715,18 @@ void neigh_destroy(struct neighbour *neigh)  		printk(KERN_WARNING "Impossible event.\n");  	skb_queue_purge(&neigh->arp_queue); +	neigh->arp_queue_len_bytes = 0; + +	if (dev->netdev_ops->ndo_neigh_destroy) +		dev->netdev_ops->ndo_neigh_destroy(neigh); -	dev_put(neigh->dev); +	dev_put(dev);  	neigh_parms_put(neigh->parms);  	NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);  	atomic_dec(&neigh->tbl->entries); -	call_rcu(&neigh->rcu, neigh_destroy_rcu); +	kfree_rcu(neigh, rcu);  }  EXPORT_SYMBOL(neigh_destroy); @@ -842,6 +859,7 @@ static void neigh_invalidate(struct neighbour *neigh)  		write_lock(&neigh->lock);  	}  	skb_queue_purge(&neigh->arp_queue); +	neigh->arp_queue_len_bytes = 0;  }  static void neigh_probe(struct neighbour *neigh) @@ -980,15 +998,20 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)  	if (neigh->nud_state == NUD_INCOMPLETE) {  		if (skb) { -			if (skb_queue_len(&neigh->arp_queue) >= -			    neigh->parms->queue_len) { +			while (neigh->arp_queue_len_bytes + skb->truesize > +			       neigh->parms->queue_len_bytes) {  				struct sk_buff *buff; +  				buff = __skb_dequeue(&neigh->arp_queue); +				if (!buff) +					break; +				neigh->arp_queue_len_bytes -= buff->truesize;  				kfree_skb(buff);  				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);  			}  			skb_dst_force(skb);  			__skb_queue_tail(&neigh->arp_queue, skb); +			neigh->arp_queue_len_bytes += skb->truesize;  		}  		rc = 1;  	} @@ -1167,7 +1190,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,  			rcu_read_lock();  			/* On shaper/eql skb->dst->neighbour != neigh :( */ -			if (dst && (n2 = dst_get_neighbour(dst)) != NULL) +			if (dst && (n2 = dst_get_neighbour_noref(dst)) != NULL)  				n1 = n2;  			n1->output(n1, skb);  			rcu_read_unlock(); @@ -1175,6 +1198,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,  			write_lock_bh(&neigh->lock);  		}  		skb_queue_purge(&neigh->arp_queue); +		neigh->arp_queue_len_bytes = 0;  	}  out:  	if (update_isrouter) { @@ -1477,11 +1501,6 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)  	tbl->parms.reachable_time =  			  neigh_rand_reach_time(tbl->parms.base_reachable_time); -	if (!tbl->kmem_cachep) -		tbl->kmem_cachep = -			kmem_cache_create(tbl->id, tbl->entry_size, 0, -					  SLAB_HWCACHE_ALIGN|SLAB_PANIC, -					  NULL);  	tbl->stats = alloc_percpu(struct neigh_statistics);  	if (!tbl->stats)  		panic("cannot create neighbour cache statistics"); @@ -1566,9 +1585,6 @@ int neigh_table_clear(struct neigh_table *tbl)  	free_percpu(tbl->stats);  	tbl->stats = NULL; -	kmem_cache_destroy(tbl->kmem_cachep); -	tbl->kmem_cachep = NULL; -  	return 0;  }  EXPORT_SYMBOL(neigh_table_clear); @@ -1747,7 +1763,11 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)  		NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);  	NLA_PUT_U32(skb, NDTPA_REFCNT, 
atomic_read(&parms->refcnt)); -	NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len); +	NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes); +	/* approximative value for deprecated QUEUE_LEN (in packets) */ +	NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, +		    DIV_ROUND_UP(parms->queue_len_bytes, +				 SKB_TRUESIZE(ETH_FRAME_LEN)));  	NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);  	NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);  	NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes); @@ -1974,7 +1994,11 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  			switch (i) {  			case NDTPA_QUEUE_LEN: -				p->queue_len = nla_get_u32(tbp[i]); +				p->queue_len_bytes = nla_get_u32(tbp[i]) * +						     SKB_TRUESIZE(ETH_FRAME_LEN); +				break; +			case NDTPA_QUEUE_LENBYTES: +				p->queue_len_bytes = nla_get_u32(tbp[i]);  				break;  			case NDTPA_PROXY_QLEN:  				p->proxy_qlen = nla_get_u32(tbp[i]); @@ -2638,117 +2662,158 @@ EXPORT_SYMBOL(neigh_app_ns);  #ifdef CONFIG_SYSCTL -#define NEIGH_VARS_MAX 19 +static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer, +			   size_t *lenp, loff_t *ppos) +{ +	int size, ret; +	ctl_table tmp = *ctl; + +	tmp.data = &size; +	size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN)); +	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); +	if (write && !ret) +		*(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); +	return ret; +} + +enum { +	NEIGH_VAR_MCAST_PROBE, +	NEIGH_VAR_UCAST_PROBE, +	NEIGH_VAR_APP_PROBE, +	NEIGH_VAR_RETRANS_TIME, +	NEIGH_VAR_BASE_REACHABLE_TIME, +	NEIGH_VAR_DELAY_PROBE_TIME, +	NEIGH_VAR_GC_STALETIME, +	NEIGH_VAR_QUEUE_LEN, +	NEIGH_VAR_QUEUE_LEN_BYTES, +	NEIGH_VAR_PROXY_QLEN, +	NEIGH_VAR_ANYCAST_DELAY, +	NEIGH_VAR_PROXY_DELAY, +	NEIGH_VAR_LOCKTIME, +	NEIGH_VAR_RETRANS_TIME_MS, +	NEIGH_VAR_BASE_REACHABLE_TIME_MS, +	NEIGH_VAR_GC_INTERVAL, +	NEIGH_VAR_GC_THRESH1, +	NEIGH_VAR_GC_THRESH2, +	NEIGH_VAR_GC_THRESH3, +	NEIGH_VAR_MAX +};  static struct neigh_sysctl_table {  	struct ctl_table_header *sysctl_header; -	struct ctl_table neigh_vars[NEIGH_VARS_MAX]; +	struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];  	char *dev_name;  } neigh_sysctl_template __read_mostly = {  	.neigh_vars = { -		{ +		[NEIGH_VAR_MCAST_PROBE] = {  			.procname	= "mcast_solicit",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec,  		}, -		{ +		[NEIGH_VAR_UCAST_PROBE] = {  			.procname	= "ucast_solicit",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec,  		}, -		{ +		[NEIGH_VAR_APP_PROBE] = {  			.procname	= "app_solicit",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec,  		}, -		{ +		[NEIGH_VAR_RETRANS_TIME] = {  			.procname	= "retrans_time",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec_userhz_jiffies,  		}, -		{ +		[NEIGH_VAR_BASE_REACHABLE_TIME] = {  			.procname	= "base_reachable_time",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec_jiffies,  		}, -		{ +		[NEIGH_VAR_DELAY_PROBE_TIME] = {  			.procname	= "delay_first_probe_time",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec_jiffies,  		}, -		{ +		[NEIGH_VAR_GC_STALETIME] = {  			.procname	= "gc_stale_time",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec_jiffies,  		}, -		{ +		[NEIGH_VAR_QUEUE_LEN] = {  			.procname	= "unres_qlen",  			.maxlen		= sizeof(int),  			.mode		= 0644, +			.proc_handler	= proc_unres_qlen, 
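
proc_unres_qlen() above keeps the old packet-count sysctl usable on top of the new byte-based queue_len_bytes: reads divide by an estimated per-packet truesize and writes multiply by it, the same SKB_TRUESIZE(ETH_FRAME_LEN) estimate the NDTPA_QUEUE_LEN netlink compat path uses. The conversion in isolation, with an assumed truesize constant (the real value depends on sk_buff and skb_shared_info sizes):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Assumed stand-in for SKB_TRUESIZE(ETH_FRAME_LEN). */
#define PKT_TRUESIZE 2048

static int qlen_bytes_to_pkts(int bytes)
{
	return DIV_ROUND_UP(bytes, PKT_TRUESIZE);
}

static int qlen_pkts_to_bytes(int pkts)
{
	return pkts * PKT_TRUESIZE;
}

int main(void)
{
	int bytes = qlen_pkts_to_bytes(3);   /* legacy default: 3 packets */

	/* Round-trip: the byte budget reads back as the same packet count. */
	printf("3 pkts -> %d bytes -> %d pkts\n",
	       bytes, qlen_bytes_to_pkts(bytes));
	return 0;
}
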
+		}, +		[NEIGH_VAR_QUEUE_LEN_BYTES] = { +			.procname	= "unres_qlen_bytes", +			.maxlen		= sizeof(int), +			.mode		= 0644,  			.proc_handler	= proc_dointvec,  		}, -		{ +		[NEIGH_VAR_PROXY_QLEN] = {  			.procname	= "proxy_qlen",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec,  		}, -		{ +		[NEIGH_VAR_ANYCAST_DELAY] = {  			.procname	= "anycast_delay",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec_userhz_jiffies,  		}, -		{ +		[NEIGH_VAR_PROXY_DELAY] = {  			.procname	= "proxy_delay",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec_userhz_jiffies,  		}, -		{ +		[NEIGH_VAR_LOCKTIME] = {  			.procname	= "locktime",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec_userhz_jiffies,  		}, -		{ +		[NEIGH_VAR_RETRANS_TIME_MS] = {  			.procname	= "retrans_time_ms",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec_ms_jiffies,  		}, -		{ +		[NEIGH_VAR_BASE_REACHABLE_TIME_MS] = {  			.procname	= "base_reachable_time_ms",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec_ms_jiffies,  		}, -		{ +		[NEIGH_VAR_GC_INTERVAL] = {  			.procname	= "gc_interval",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec_jiffies,  		}, -		{ +		[NEIGH_VAR_GC_THRESH1] = {  			.procname	= "gc_thresh1",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec,  		}, -		{ +		[NEIGH_VAR_GC_THRESH2] = {  			.procname	= "gc_thresh2",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec,  		}, -		{ +		[NEIGH_VAR_GC_THRESH3] = {  			.procname	= "gc_thresh3",  			.maxlen		= sizeof(int),  			.mode		= 0644, @@ -2781,47 +2846,49 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,  	if (!t)  		goto err; -	t->neigh_vars[0].data  = &p->mcast_probes; -	t->neigh_vars[1].data  = &p->ucast_probes; -	t->neigh_vars[2].data  = &p->app_probes; -	t->neigh_vars[3].data  = &p->retrans_time; -	t->neigh_vars[4].data  = &p->base_reachable_time; -	t->neigh_vars[5].data  = &p->delay_probe_time; -	t->neigh_vars[6].data  = &p->gc_staletime; -	t->neigh_vars[7].data  = &p->queue_len; -	t->neigh_vars[8].data  = &p->proxy_qlen; -	t->neigh_vars[9].data  = &p->anycast_delay; -	t->neigh_vars[10].data = &p->proxy_delay; -	t->neigh_vars[11].data = &p->locktime; -	t->neigh_vars[12].data  = &p->retrans_time; -	t->neigh_vars[13].data  = &p->base_reachable_time; +	t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data  = &p->mcast_probes; +	t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data  = &p->ucast_probes; +	t->neigh_vars[NEIGH_VAR_APP_PROBE].data  = &p->app_probes; +	t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data  = &p->retrans_time; +	t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data  = &p->base_reachable_time; +	t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data  = &p->delay_probe_time; +	t->neigh_vars[NEIGH_VAR_GC_STALETIME].data  = &p->gc_staletime; +	t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data  = &p->queue_len_bytes; +	t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data  = &p->queue_len_bytes; +	t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data  = &p->proxy_qlen; +	t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data  = &p->anycast_delay; +	t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay; +	t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime; +	t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data  = &p->retrans_time; +	t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data  = &p->base_reachable_time;  	if 
(dev) {  		dev_name_source = dev->name;  		/* Terminate the table early */ -		memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14])); +		memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, +		       sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));  	} else {  		dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname; -		t->neigh_vars[14].data = (int *)(p + 1); -		t->neigh_vars[15].data = (int *)(p + 1) + 1; -		t->neigh_vars[16].data = (int *)(p + 1) + 2; -		t->neigh_vars[17].data = (int *)(p + 1) + 3; +		t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1); +		t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1; +		t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2; +		t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3;  	}  	if (handler) {  		/* RetransTime */ -		t->neigh_vars[3].proc_handler = handler; -		t->neigh_vars[3].extra1 = dev; +		t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; +		t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev;  		/* ReachableTime */ -		t->neigh_vars[4].proc_handler = handler; -		t->neigh_vars[4].extra1 = dev; +		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; +		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev;  		/* RetransTime (in milliseconds)*/ -		t->neigh_vars[12].proc_handler = handler; -		t->neigh_vars[12].extra1 = dev; +		t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; +		t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev;  		/* ReachableTime (in milliseconds) */ -		t->neigh_vars[13].proc_handler = handler; -		t->neigh_vars[13].extra1 = dev; +		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; +		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;  	}  	t->dev_name = kstrdup(dev_name_source, GFP_KERNEL); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 385aefe5364..4b4d0b0a354 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -21,6 +21,7 @@  #include <linux/wireless.h>  #include <linux/vmalloc.h>  #include <linux/export.h> +#include <linux/jiffies.h>  #include <net/wext.h>  #include "net-sysfs.h" @@ -606,9 +607,12 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,  	rcu_assign_pointer(queue->rps_map, map);  	spin_unlock(&rps_map_lock); -	if (old_map) +	if (map) +		jump_label_inc(&rps_needed); +	if (old_map) {  		kfree_rcu(old_map, rcu); - +		jump_label_dec(&rps_needed); +	}  	free_cpumask_var(mask);  	return len;  } @@ -783,7 +787,7 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)  #endif  } -#ifdef CONFIG_XPS +#ifdef CONFIG_SYSFS  /*   * netdev_queue sysfs structures and functions.   */ @@ -829,6 +833,133 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {  	.store = netdev_queue_attr_store,  }; +static ssize_t show_trans_timeout(struct netdev_queue *queue, +				  struct netdev_queue_attribute *attribute, +				  char *buf) +{ +	unsigned long trans_timeout; + +	spin_lock_irq(&queue->_xmit_lock); +	trans_timeout = queue->trans_timeout; +	spin_unlock_irq(&queue->_xmit_lock); + +	return sprintf(buf, "%lu", trans_timeout); +} + +static struct netdev_queue_attribute queue_trans_timeout = +	__ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); + +#ifdef CONFIG_BQL +/* + * Byte queue limits sysfs structures and functions. 
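+ * Exposed per TX queue under
+ * /sys/class/net/<dev>/queues/tx-<n>/byte_queue_limits/ as limit,
+ * limit_max, limit_min, hold_time (in msecs) and inflight.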
+ */ +static ssize_t bql_show(char *buf, unsigned int value) +{ +	return sprintf(buf, "%u\n", value); +} + +static ssize_t bql_set(const char *buf, const size_t count, +		       unsigned int *pvalue) +{ +	unsigned int value; +	int err; + +	if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) +		value = DQL_MAX_LIMIT; +	else { +		err = kstrtouint(buf, 10, &value); +		if (err < 0) +			return err; +		if (value > DQL_MAX_LIMIT) +			return -EINVAL; +	} + +	*pvalue = value; + +	return count; +} + +static ssize_t bql_show_hold_time(struct netdev_queue *queue, +				  struct netdev_queue_attribute *attr, +				  char *buf) +{ +	struct dql *dql = &queue->dql; + +	return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); +} + +static ssize_t bql_set_hold_time(struct netdev_queue *queue, +				 struct netdev_queue_attribute *attribute, +				 const char *buf, size_t len) +{ +	struct dql *dql = &queue->dql; +	unsigned value; +	int err; + +	err = kstrtouint(buf, 10, &value); +	if (err < 0) +		return err; + +	dql->slack_hold_time = msecs_to_jiffies(value); + +	return len; +} + +static struct netdev_queue_attribute bql_hold_time_attribute = +	__ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time, +	    bql_set_hold_time); + +static ssize_t bql_show_inflight(struct netdev_queue *queue, +				 struct netdev_queue_attribute *attr, +				 char *buf) +{ +	struct dql *dql = &queue->dql; + +	return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); +} + +static struct netdev_queue_attribute bql_inflight_attribute = +	__ATTR(inflight, S_IRUGO | S_IWUSR, bql_show_inflight, NULL); + +#define BQL_ATTR(NAME, FIELD)						\ +static ssize_t bql_show_ ## NAME(struct netdev_queue *queue,		\ +				 struct netdev_queue_attribute *attr,	\ +				 char *buf)				\ +{									\ +	return bql_show(buf, queue->dql.FIELD);				\ +}									\ +									\ +static ssize_t bql_set_ ## NAME(struct netdev_queue *queue,		\ +				struct netdev_queue_attribute *attr,	\ +				const char *buf, size_t len)		\ +{									\ +	return bql_set(buf, len, &queue->dql.FIELD);			\ +}									\ +									\ +static struct netdev_queue_attribute bql_ ## NAME ## _attribute =	\ +	__ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME,		\ +	    bql_set_ ## NAME); + +BQL_ATTR(limit, limit) +BQL_ATTR(limit_max, max_limit) +BQL_ATTR(limit_min, min_limit) + +static struct attribute *dql_attrs[] = { +	&bql_limit_attribute.attr, +	&bql_limit_max_attribute.attr, +	&bql_limit_min_attribute.attr, +	&bql_hold_time_attribute.attr, +	&bql_inflight_attribute.attr, +	NULL +}; + +static struct attribute_group dql_group = { +	.name  = "byte_queue_limits", +	.attrs  = dql_attrs, +}; +#endif /* CONFIG_BQL */ + +#ifdef CONFIG_XPS  static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)  {  	struct net_device *dev = queue->dev; @@ -893,6 +1024,52 @@ static DEFINE_MUTEX(xps_map_mutex);  #define xmap_dereference(P)		\  	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) +static void xps_queue_release(struct netdev_queue *queue) +{ +	struct net_device *dev = queue->dev; +	struct xps_dev_maps *dev_maps; +	struct xps_map *map; +	unsigned long index; +	int i, pos, nonempty = 0; + +	index = get_netdev_queue_index(queue); + +	mutex_lock(&xps_map_mutex); +	dev_maps = xmap_dereference(dev->xps_maps); + +	if (dev_maps) { +		for_each_possible_cpu(i) { +			map = xmap_dereference(dev_maps->cpu_map[i]); +			if (!map) +				continue; + +			for (pos = 0; pos < map->len; pos++) +				if (map->queues[pos] == index) +					break; + +			if (pos < map->len) { +				
if (map->len > 1) +					map->queues[pos] = +					    map->queues[--map->len]; +				else { +					RCU_INIT_POINTER(dev_maps->cpu_map[i], +					    NULL); +					kfree_rcu(map, rcu); +					map = NULL; +				} +			} +			if (map) +				nonempty = 1; +		} + +		if (!nonempty) { +			RCU_INIT_POINTER(dev->xps_maps, NULL); +			kfree_rcu(dev_maps, rcu); +		} +	} +	mutex_unlock(&xps_map_mutex); +} +  static ssize_t store_xps_map(struct netdev_queue *queue,  		      struct netdev_queue_attribute *attribute,  		      const char *buf, size_t len) @@ -904,7 +1081,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,  	struct xps_map *map, *new_map;  	struct xps_dev_maps *dev_maps, *new_dev_maps;  	int nonempty = 0; -	int numa_node = -2; +	int numa_node_id = -2;  	if (!capable(CAP_NET_ADMIN))  		return -EPERM; @@ -947,10 +1124,10 @@ static ssize_t store_xps_map(struct netdev_queue *queue,  		need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu);  #ifdef CONFIG_NUMA  		if (need_set) { -			if (numa_node == -2) -				numa_node = cpu_to_node(cpu); -			else if (numa_node != cpu_to_node(cpu)) -				numa_node = -1; +			if (numa_node_id == -2) +				numa_node_id = cpu_to_node(cpu); +			else if (numa_node_id != cpu_to_node(cpu)) +				numa_node_id = -1;  		}  #endif  		if (need_set && pos >= map_len) { @@ -1000,7 +1177,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,  	if (dev_maps)  		kfree_rcu(dev_maps, rcu); -	netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : +	netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? numa_node_id :  					    NUMA_NO_NODE);  	mutex_unlock(&xps_map_mutex); @@ -1023,58 +1200,23 @@ error:  static struct netdev_queue_attribute xps_cpus_attribute =      __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); +#endif /* CONFIG_XPS */  static struct attribute *netdev_queue_default_attrs[] = { +	&queue_trans_timeout.attr, +#ifdef CONFIG_XPS  	&xps_cpus_attribute.attr, +#endif  	NULL  };  static void netdev_queue_release(struct kobject *kobj)  {  	struct netdev_queue *queue = to_netdev_queue(kobj); -	struct net_device *dev = queue->dev; -	struct xps_dev_maps *dev_maps; -	struct xps_map *map; -	unsigned long index; -	int i, pos, nonempty = 0; - -	index = get_netdev_queue_index(queue); - -	mutex_lock(&xps_map_mutex); -	dev_maps = xmap_dereference(dev->xps_maps); - -	if (dev_maps) { -		for_each_possible_cpu(i) { -			map = xmap_dereference(dev_maps->cpu_map[i]); -			if (!map) -				continue; - -			for (pos = 0; pos < map->len; pos++) -				if (map->queues[pos] == index) -					break; - -			if (pos < map->len) { -				if (map->len > 1) -					map->queues[pos] = -					    map->queues[--map->len]; -				else { -					RCU_INIT_POINTER(dev_maps->cpu_map[i], -					    NULL); -					kfree_rcu(map, rcu); -					map = NULL; -				} -			} -			if (map) -				nonempty = 1; -		} - -		if (!nonempty) { -			RCU_INIT_POINTER(dev->xps_maps, NULL); -			kfree_rcu(dev_maps, rcu); -		} -	} -	mutex_unlock(&xps_map_mutex); +#ifdef CONFIG_XPS +	xps_queue_release(queue); +#endif  	memset(kobj, 0, sizeof(*kobj));  	dev_put(queue->dev); @@ -1095,22 +1237,29 @@ static int netdev_queue_add_kobject(struct net_device *net, int index)  	kobj->kset = net->queues_kset;  	error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,  	    "tx-%u", index); -	if (error) { -		kobject_put(kobj); -		return error; -	} +	if (error) +		goto exit; + +#ifdef CONFIG_BQL +	error = sysfs_create_group(kobj, &dql_group); +	if (error) +		goto exit; +#endif  	kobject_uevent(kobj, KOBJ_ADD);  	
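 	/* reference dropped by dev_put() in netdev_queue_release() */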
dev_hold(queue->dev); +	return 0; +exit: +	kobject_put(kobj);  	return error;  } -#endif /* CONFIG_XPS */ +#endif /* CONFIG_SYSFS */  int  netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)  { -#ifdef CONFIG_XPS +#ifdef CONFIG_SYSFS  	int i;  	int error = 0; @@ -1122,20 +1271,26 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)  		}  	} -	while (--i >= new_num) -		kobject_put(&net->_tx[i].kobj); +	while (--i >= new_num) { +		struct netdev_queue *queue = net->_tx + i; + +#ifdef CONFIG_BQL +		sysfs_remove_group(&queue->kobj, &dql_group); +#endif +		kobject_put(&queue->kobj); +	}  	return error;  #else  	return 0; -#endif +#endif /* CONFIG_SYSFS */  }  static int register_queue_kobjects(struct net_device *net)  {  	int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; -#if defined(CONFIG_RPS) || defined(CONFIG_XPS) +#ifdef CONFIG_SYSFS  	net->queues_kset = kset_create_and_add("queues",  	    NULL, &net->dev.kobj);  	if (!net->queues_kset) @@ -1176,7 +1331,7 @@ static void remove_queue_kobjects(struct net_device *net)  	net_rx_queue_update_kobjects(net, real_rx, 0);  	netdev_queue_update_kobjects(net, real_tx, 0); -#if defined(CONFIG_RPS) || defined(CONFIG_XPS) +#ifdef CONFIG_SYSFS  	kset_unregister(net->queues_kset);  #endif  } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index cf64c1ffa4c..0d38808a230 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -76,7 +76,7 @@ static void queue_process(struct work_struct *work)  		local_irq_save(flags);  		__netif_tx_lock(txq, smp_processor_id()); -		if (netif_tx_queue_frozen_or_stopped(txq) || +		if (netif_xmit_frozen_or_stopped(txq) ||  		    ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {  			skb_queue_head(&npinfo->txq, skb);  			__netif_tx_unlock(txq); @@ -317,7 +317,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,  		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;  		     tries > 0; --tries) {  			if (__netif_tx_trylock(txq)) { -				if (!netif_tx_queue_stopped(txq)) { +				if (!netif_xmit_stopped(txq)) {  					status = ops->ndo_start_xmit(skb, dev);  					if (status == NETDEV_TX_OK)  						txq_trans_update(txq); @@ -422,6 +422,7 @@ static void arp_reply(struct sk_buff *skb)  	struct sk_buff *send_skb;  	struct netpoll *np, *tmp;  	unsigned long flags; +	int hlen, tlen;  	int hits = 0;  	if (list_empty(&npinfo->rx_np)) @@ -479,8 +480,9 @@ static void arp_reply(struct sk_buff *skb)  		if (tip != np->local_ip)  			continue; -		send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), -				    LL_RESERVED_SPACE(np->dev)); +		hlen = LL_RESERVED_SPACE(np->dev); +		tlen = np->dev->needed_tailroom; +		send_skb = find_skb(np, size + hlen + tlen, hlen);  		if (!send_skb)  			continue; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c new file mode 100644 index 00000000000..3a9fd4826b7 --- /dev/null +++ b/net/core/netprio_cgroup.c @@ -0,0 +1,344 @@ +/* + * net/core/netprio_cgroup.c	Priority Control Group + * + *		This program is free software; you can redistribute it and/or + *		modify it under the terms of the GNU General Public License + *		as published by the Free Software Foundation; either version + *		2 of the License, or (at your option) any later version. 
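+ *
+ * Userspace interface (see write_priomap() below): each cgroup gets a
+ * read-only net_prio.prioidx file and a net_prio.ifpriomap file; writing
+ * an "<ifname> <prio>" pair such as "eth0 5" to ifpriomap maps the
+ * cgroup's traffic on that device to priority 5.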
+ * + * Authors:	Neil Horman <nhorman@tuxdriver.com> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/cgroup.h> +#include <linux/rcupdate.h> +#include <linux/atomic.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> +#include <net/netprio_cgroup.h> + +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, +					       struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); + +struct cgroup_subsys net_prio_subsys = { +	.name		= "net_prio", +	.create		= cgrp_create, +	.destroy	= cgrp_destroy, +	.populate	= cgrp_populate, +#ifdef CONFIG_NETPRIO_CGROUP +	.subsys_id	= net_prio_subsys_id, +#endif +	.module		= THIS_MODULE +}; + +#define PRIOIDX_SZ 128 + +static unsigned long prioidx_map[PRIOIDX_SZ]; +static DEFINE_SPINLOCK(prioidx_map_lock); +static atomic_t max_prioidx = ATOMIC_INIT(0); + +static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) +{ +	return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), +			    struct cgroup_netprio_state, css); +} + +static int get_prioidx(u32 *prio) +{ +	unsigned long flags; +	u32 prioidx; + +	spin_lock_irqsave(&prioidx_map_lock, flags); +	prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); +	set_bit(prioidx, prioidx_map); +	spin_unlock_irqrestore(&prioidx_map_lock, flags); +	if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) +		return -ENOSPC; + +	atomic_set(&max_prioidx, prioidx); +	*prio = prioidx; +	return 0; +} + +static void put_prioidx(u32 idx) +{ +	unsigned long flags; + +	spin_lock_irqsave(&prioidx_map_lock, flags); +	clear_bit(idx, prioidx_map); +	spin_unlock_irqrestore(&prioidx_map_lock, flags); +} + +static void extend_netdev_table(struct net_device *dev, u32 new_len) +{ +	size_t new_size = sizeof(struct netprio_map) + +			   ((sizeof(u32) * new_len)); +	struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); +	struct netprio_map *old_priomap; +	int i; + +	old_priomap  = rtnl_dereference(dev->priomap); + +	if (!new_priomap) { +		printk(KERN_WARNING "Unable to alloc new priomap!\n"); +		return; +	} + +	for (i = 0; +	     old_priomap && (i < old_priomap->priomap_len); +	     i++) +		new_priomap->priomap[i] = old_priomap->priomap[i]; + +	new_priomap->priomap_len = new_len; + +	rcu_assign_pointer(dev->priomap, new_priomap); +	if (old_priomap) +		kfree_rcu(old_priomap, rcu); +} + +static void update_netdev_tables(void) +{ +	struct net_device *dev; +	u32 max_len = atomic_read(&max_prioidx); +	struct netprio_map *map; + +	rtnl_lock(); +	for_each_netdev(&init_net, dev) { +		map = rtnl_dereference(dev->priomap); +		if ((!map) || +		    (map->priomap_len < max_len)) +			extend_netdev_table(dev, max_len); +	} +	rtnl_unlock(); +} + +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, +						 struct cgroup *cgrp) +{ +	struct cgroup_netprio_state *cs; +	int ret; + +	cs = kzalloc(sizeof(*cs), GFP_KERNEL); +	if (!cs) +		return ERR_PTR(-ENOMEM); + +	if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) { +		kfree(cs); +		return ERR_PTR(-EINVAL); +	} + +	ret = get_prioidx(&cs->prioidx); +	if (ret != 0) { +		printk(KERN_WARNING "No space in priority index array\n"); +		kfree(cs); +		return ERR_PTR(ret); +	} + +	return &cs->css; +} + +static void cgrp_destroy(struct 
cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct cgroup_netprio_state *cs;
+	struct net_device *dev;
+	struct netprio_map *map;
+
+	cs = cgrp_netprio_state(cgrp);
+	rtnl_lock();
+	for_each_netdev(&init_net, dev) {
+		map = rtnl_dereference(dev->priomap);
+		if (map)
+			map->priomap[cs->prioidx] = 0;
+	}
+	rtnl_unlock();
+	put_prioidx(cs->prioidx);
+	kfree(cs);
+}
+
+static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)
+{
+	return (u64)cgrp_netprio_state(cgrp)->prioidx;
+}
+
+static int read_priomap(struct cgroup *cont, struct cftype *cft,
+			struct cgroup_map_cb *cb)
+{
+	struct net_device *dev;
+	u32 prioidx = cgrp_netprio_state(cont)->prioidx;
+	u32 priority;
+	struct netprio_map *map;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
+		map = rcu_dereference(dev->priomap);
+		priority = map ? map->priomap[prioidx] : 0;
+		cb->fill(cb, dev->name, priority);
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
+			 const char *buffer)
+{
+	char *devname = kstrdup(buffer, GFP_KERNEL);
+	int ret = -EINVAL;
+	u32 prioidx = cgrp_netprio_state(cgrp)->prioidx;
+	unsigned long priority;
+	char *priostr;
+	struct net_device *dev;
+	struct netprio_map *map;
+
+	if (!devname)
+		return -ENOMEM;
+
+	/*
+	 * Minimally sized valid priomap string
+	 */
+	if (strlen(devname) < 3)
+		goto out_free_devname;
+
+	priostr = strstr(devname, " ");
+	if (!priostr)
+		goto out_free_devname;
+
+	/*
+	 * Separate the devname from the associated priority
+	 * and advance the priostr pointer to the priority value
+	 */
+	*priostr = '\0';
+	priostr++;
+
+	/*
+	 * If priostr now points at the string terminator, we're at the end
+	 * of the passed in string, and it's not a valid write
+	 */
+	if (*priostr == '\0')
+		goto out_free_devname;
+
+	ret = kstrtoul(priostr, 10, &priority);
+	if (ret < 0)
+		goto out_free_devname;
+
+	ret = -ENODEV;
+
+	dev = dev_get_by_name(&init_net, devname);
+	if (!dev)
+		goto out_free_devname;
+
+	update_netdev_tables();
+	ret = 0;
+	rcu_read_lock();
+	map = rcu_dereference(dev->priomap);
+	if (map)
+		map->priomap[prioidx] = priority;
+	rcu_read_unlock();
+	dev_put(dev);
+
+out_free_devname:
+	kfree(devname);
+	return ret;
+}
+
+static struct cftype ss_files[] = {
+	{
+		.name = "prioidx",
+		.read_u64 = read_prioidx,
+	},
+	{
+		.name = "ifpriomap",
+		.read_map = read_priomap,
+		.write_string = write_priomap,
+	},
+};
+
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
+}
+
+static int netprio_device_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct netprio_map *old;
+	u32 max_len = atomic_read(&max_prioidx);
+
+	/*
+	 * Note this is called with rtnl_lock held so we have update side
+	 * protection on our rcu assignments
+	 */
+
+	switch (event) {
+
+	case NETDEV_REGISTER:
+		if (max_len)
+			extend_netdev_table(dev, max_len);
+		break;
+	case NETDEV_UNREGISTER:
+		old = rtnl_dereference(dev->priomap);
+		RCU_INIT_POINTER(dev->priomap, NULL);
+		if (old)
+			kfree_rcu(old, rcu);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block netprio_device_notifier = {
+	.notifier_call = netprio_device_event
+};
+
+static int __init init_cgroup_netprio(void)
+{
+	int ret;
+
+	ret = cgroup_load_subsys(&net_prio_subsys);
+	if (ret)
+		goto out;
+#ifndef CONFIG_NETPRIO_CGROUP
+	smp_wmb();
+	net_prio_subsys_id = net_prio_subsys.subsys_id;
+#endif
+
+	
register_netdevice_notifier(&netprio_device_notifier); + +out: +	return ret; +} + +static void __exit exit_cgroup_netprio(void) +{ +	struct netprio_map *old; +	struct net_device *dev; + +	unregister_netdevice_notifier(&netprio_device_notifier); + +	cgroup_unload_subsys(&net_prio_subsys); + +#ifndef CONFIG_NETPRIO_CGROUP +	net_prio_subsys_id = -1; +	synchronize_rcu(); +#endif + +	rtnl_lock(); +	for_each_netdev(&init_net, dev) { +		old = rtnl_dereference(dev->priomap); +		RCU_INIT_POINTER(dev->priomap, NULL); +		if (old) +			kfree_rcu(old, rcu); +	} +	rtnl_unlock(); +} + +module_init(init_cgroup_netprio); +module_exit(exit_cgroup_netprio); +MODULE_LICENSE("GPL v2"); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 0001c243b35..449fe0f068f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1304,7 +1304,7 @@ static ssize_t pktgen_if_write(struct file *file,  		scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);  		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr); -		ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr); +		pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr;  		if (debug)  			printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf); @@ -1327,8 +1327,7 @@ static ssize_t pktgen_if_write(struct file *file,  		scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);  		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr); -		ipv6_addr_copy(&pkt_dev->cur_in6_daddr, -			       &pkt_dev->min_in6_daddr); +		pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr;  		if (debug)  			printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf); @@ -1371,7 +1370,7 @@ static ssize_t pktgen_if_write(struct file *file,  		scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);  		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr); -		ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr); +		pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr;  		if (debug)  			printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf); @@ -2079,9 +2078,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)  				     ifp = ifp->if_next) {  					if (ifp->scope == IFA_LINK &&  					    !(ifp->flags & IFA_F_TENTATIVE)) { -						ipv6_addr_copy(&pkt_dev-> -							       cur_in6_saddr, -							       &ifp->addr); +						pkt_dev->cur_in6_saddr = ifp->addr;  						err = 0;  						break;  					} @@ -2958,8 +2955,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,  	iph->payload_len = htons(sizeof(struct udphdr) + datalen);  	iph->nexthdr = IPPROTO_UDP; -	ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr); -	ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr); +	iph->daddr = pkt_dev->cur_in6_daddr; +	iph->saddr = pkt_dev->cur_in6_saddr;  	skb->mac_header = (skb->network_header - ETH_HLEN -  			   pkt_dev->pkt_overhead); @@ -3345,7 +3342,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  	__netif_tx_lock_bh(txq); -	if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) { +	if (unlikely(netif_xmit_frozen_or_stopped(txq))) {  		ret = NETDEV_TX_BUSY;  		pkt_dev->last_ok = 0;  		goto unlock; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9083e82bdae..dbf2ddafd52 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -273,6 +273,17 @@ EXPORT_SYMBOL_GPL(rtnl_unregister_all);  static LIST_HEAD(link_ops); +static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +{ +	const struct rtnl_link_ops *ops; + +	list_for_each_entry(ops, &link_ops, list) { +		if (!strcmp(ops->kind, kind)) +			return ops; +	} +	return NULL; +} +  /**   * __rtnl_link_register - Register rtnl_link_ops 
with rtnetlink.
 * @ops: struct rtnl_link_ops * to register
@@ -285,6 +296,9 @@ static LIST_HEAD(link_ops);
 */
 int __rtnl_link_register(struct rtnl_link_ops *ops)
 {
+	if (rtnl_link_ops_get(ops->kind))
+		return -EEXIST;
+
 	if (!ops->dellink)
 		ops->dellink = unregister_netdevice_queue;
@@ -351,17 +365,6 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops)
 }
 EXPORT_SYMBOL_GPL(rtnl_link_unregister);
 
-static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
-{
-	const struct rtnl_link_ops *ops;
-
-	list_for_each_entry(ops, &link_ops, list) {
-		if (!strcmp(ops->kind, kind))
-			return ops;
-	}
-	return NULL;
-}
-
 static size_t rtnl_link_get_size(const struct net_device *dev)
 {
 	const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 925991ae6f5..6fd44606fdd 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -36,7 +36,7 @@ static u32 seq_scale(u32 seq)
 }
 #endif
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
 __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
 				   __be16 sport, __be16 dport)
 {
@@ -134,7 +134,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
 EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
 #endif
 
-#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
+#if IS_ENABLED(CONFIG_IP_DCCP)
 u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 				__be16 sport, __be16 dport)
 {
@@ -156,7 +156,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 }
 EXPORT_SYMBOL(secure_dccp_sequence_number);
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
 u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
 				  __be16 sport, __be16 dport)
 {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3c30ee4a571..da0c97f2fab 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -245,6 +245,55 @@ nodata:
 EXPORT_SYMBOL(__alloc_skb);
 
 /**
+ * build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ *
+ * Allocate a new &sk_buff. The caller provides the space holding the head
+ * and skb_shared_info. @data must have been allocated by kmalloc().
+ * The return value is the new skb buffer.
+ * On failure the return is %NULL, and @data is not freed.
+ * Notes :
+ *  Before IO, the driver allocates only the data buffer, where the NIC
+ *  places the incoming frame.
+ *  The driver should add room at the head (NET_SKB_PAD) and
+ *  MUST add room at the tail (SKB_DATA_ALIGN(skb_shared_info)).
+ *  After IO, the driver calls build_skb() to allocate the sk_buff and
+ *  populate it before handing the packet to the stack.
+ *  RX rings only contain data buffers, not full skbs.
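+ *
+ *  Illustrative RX sketch, not code from this patch (rx_buf and
+ *  frame_len are placeholder names for driver state):
+ *	skb = build_skb(rx_buf);
+ *	if (unlikely(!skb)) {
+ *		kfree(rx_buf);	/* @data is not freed on failure */
+ *		return;
+ *	}
+ *	skb_reserve(skb, NET_SKB_PAD);
+ *	skb_put(skb, frame_len);
+ *	napi_gro_receive(napi, skb);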
+ */ +struct sk_buff *build_skb(void *data) +{ +	struct skb_shared_info *shinfo; +	struct sk_buff *skb; +	unsigned int size; + +	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); +	if (!skb) +		return NULL; + +	size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + +	memset(skb, 0, offsetof(struct sk_buff, tail)); +	skb->truesize = SKB_TRUESIZE(size); +	atomic_set(&skb->users, 1); +	skb->head = data; +	skb->data = data; +	skb_reset_tail_pointer(skb); +	skb->end = skb->tail + size; +#ifdef NET_SKBUFF_DATA_USES_OFFSET +	skb->mac_header = ~0U; +#endif + +	/* make sure we initialize shinfo sequentially */ +	shinfo = skb_shinfo(skb); +	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); +	atomic_set(&shinfo->dataref, 1); +	kmemcheck_annotate_variable(shinfo->destructor_arg); + +	return skb; +} +EXPORT_SYMBOL(build_skb); + +/**   *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device   *	@dev: network device to receive on   *	@length: length to allocate @@ -403,7 +452,7 @@ static void skb_release_head_state(struct sk_buff *skb)  		WARN_ON(in_irq());  		skb->destructor(skb);  	} -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK)  	nf_conntrack_put(skb->nfct);  #endif  #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED @@ -553,15 +602,14 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)  	new->ip_summed		= old->ip_summed;  	skb_copy_queue_mapping(new, old);  	new->priority		= old->priority; -#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) +#if IS_ENABLED(CONFIG_IP_VS)  	new->ipvs_property	= old->ipvs_property;  #endif  	new->protocol		= old->protocol;  	new->mark		= old->mark;  	new->skb_iif		= old->skb_iif;  	__nf_copy(new, old); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ -    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)  	new->nf_trace		= old->nf_trace;  #endif  #ifdef CONFIG_NET_SCHED @@ -791,8 +839,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)  EXPORT_SYMBOL(skb_copy);  /** - *	pskb_copy	-	create copy of an sk_buff with private head. + *	__pskb_copy	-	create copy of an sk_buff with private head.   *	@skb: buffer to copy + *	@headroom: headroom of new skb   *	@gfp_mask: allocation priority   *   *	Make a copy of both an &sk_buff and part of its data, located @@ -803,16 +852,16 @@ EXPORT_SYMBOL(skb_copy);   *	The returned buffer has a reference count of 1.   */ -struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) +struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)  { -	unsigned int size = skb_end_pointer(skb) - skb->head; +	unsigned int size = skb_headlen(skb) + headroom;  	struct sk_buff *n = alloc_skb(size, gfp_mask);  	if (!n)  		goto out;  	/* Set the data pointer */ -	skb_reserve(n, skb_headroom(skb)); +	skb_reserve(n, headroom);  	/* Set the tail pointer and length */  	skb_put(n, skb_headlen(skb));  	/* Copy the bytes */ @@ -848,7 +897,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)  out:  	return n;  } -EXPORT_SYMBOL(pskb_copy); +EXPORT_SYMBOL(__pskb_copy);  /**   *	pskb_expand_head - reallocate header of &sk_buff @@ -2621,7 +2670,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);   *	a pointer to the first in a list of new skbs for the segments.   *	In case of error it returns ERR_PTR(err).   
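 *	Typically called from protocol ->gso_segment() implementations.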
*/ -struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) +struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)  {  	struct sk_buff *segs = NULL;  	struct sk_buff *tail = NULL; @@ -3169,6 +3218,26 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,  }  EXPORT_SYMBOL_GPL(skb_tstamp_tx); +void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) +{ +	struct sock *sk = skb->sk; +	struct sock_exterr_skb *serr; +	int err; + +	skb->wifi_acked_valid = 1; +	skb->wifi_acked = acked; + +	serr = SKB_EXT_ERR(skb); +	memset(serr, 0, sizeof(*serr)); +	serr->ee.ee_errno = ENOMSG; +	serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; + +	err = sock_queue_err_skb(sk, skb); +	if (err) +		kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); +  /**   * skb_partial_csum_set - set up and verify partial csum values for packet diff --git a/net/core/sock.c b/net/core/sock.c index b23f174ab84..002939cfc06 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -111,6 +111,7 @@  #include <linux/init.h>  #include <linux/highmem.h>  #include <linux/user_namespace.h> +#include <linux/jump_label.h>  #include <asm/uaccess.h>  #include <asm/system.h> @@ -125,6 +126,7 @@  #include <net/xfrm.h>  #include <linux/ipsec.h>  #include <net/cls_cgroup.h> +#include <net/netprio_cgroup.h>  #include <linux/filter.h> @@ -134,6 +136,46 @@  #include <net/tcp.h>  #endif +static DEFINE_MUTEX(proto_list_mutex); +static LIST_HEAD(proto_list); + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ +	struct proto *proto; +	int ret = 0; + +	mutex_lock(&proto_list_mutex); +	list_for_each_entry(proto, &proto_list, node) { +		if (proto->init_cgroup) { +			ret = proto->init_cgroup(cgrp, ss); +			if (ret) +				goto out; +		} +	} + +	mutex_unlock(&proto_list_mutex); +	return ret; +out: +	list_for_each_entry_continue_reverse(proto, &proto_list, node) +		if (proto->destroy_cgroup) +			proto->destroy_cgroup(cgrp, ss); +	mutex_unlock(&proto_list_mutex); +	return ret; +} + +void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ +	struct proto *proto; + +	mutex_lock(&proto_list_mutex); +	list_for_each_entry_reverse(proto, &proto_list, node) +		if (proto->destroy_cgroup) +			proto->destroy_cgroup(cgrp, ss); +	mutex_unlock(&proto_list_mutex); +} +#endif +  /*   * Each address family might have different locking rules, so we have   * one slock key per address family: @@ -141,6 +183,9 @@  static struct lock_class_key af_family_keys[AF_MAX];  static struct lock_class_key af_family_slock_keys[AF_MAX]; +struct jump_label_key memcg_socket_limit_enabled; +EXPORT_SYMBOL(memcg_socket_limit_enabled); +  /*   * Make lock validator output more readable. 
(we pre-construct these   * strings build-time, so that runtime initialization of socket @@ -221,10 +266,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;  int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);  EXPORT_SYMBOL(sysctl_optmem_max); -#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) +#if defined(CONFIG_CGROUPS) +#if !defined(CONFIG_NET_CLS_CGROUP)  int net_cls_subsys_id = -1;  EXPORT_SYMBOL_GPL(net_cls_subsys_id);  #endif +#if !defined(CONFIG_NETPRIO_CGROUP) +int net_prio_subsys_id = -1; +EXPORT_SYMBOL_GPL(net_prio_subsys_id); +#endif +#endif  static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)  { @@ -269,14 +320,14 @@ static void sock_warn_obsolete_bsdism(const char *name)  	}  } -static void sock_disable_timestamp(struct sock *sk, int flag) +#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) + +static void sock_disable_timestamp(struct sock *sk, unsigned long flags)  { -	if (sock_flag(sk, flag)) { -		sock_reset_flag(sk, flag); -		if (!sock_flag(sk, SOCK_TIMESTAMP) && -		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) { +	if (sk->sk_flags & flags) { +		sk->sk_flags &= ~flags; +		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))  			net_disable_timestamp(); -		}  	}  } @@ -678,7 +729,7 @@ set_rcvbuf:  					      SOCK_TIMESTAMPING_RX_SOFTWARE);  		else  			sock_disable_timestamp(sk, -					       SOCK_TIMESTAMPING_RX_SOFTWARE); +					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));  		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,  				  val & SOF_TIMESTAMPING_SOFTWARE);  		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, @@ -736,6 +787,11 @@ set_rcvbuf:  	case SO_RXQ_OVFL:  		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);  		break; + +	case SO_WIFI_STATUS: +		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); +		break; +  	default:  		ret = -ENOPROTOOPT;  		break; @@ -957,6 +1013,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);  		break; +	case SO_WIFI_STATUS: +		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS); +		break; +  	default:  		return -ENOPROTOOPT;  	} @@ -1107,6 +1167,18 @@ void sock_update_classid(struct sock *sk)  		sk->sk_classid = classid;  }  EXPORT_SYMBOL(sock_update_classid); + +void sock_update_netprioidx(struct sock *sk) +{ +	struct cgroup_netprio_state *state; +	if (in_interrupt()) +		return; +	rcu_read_lock(); +	state = task_netprio_state(current); +	sk->sk_cgrp_prioidx = state ? 
state->prioidx : 0; +	rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(sock_update_netprioidx);  #endif  /** @@ -1134,6 +1206,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,  		atomic_set(&sk->sk_wmem_alloc, 1);  		sock_update_classid(sk); +		sock_update_netprioidx(sk);  	}  	return sk; @@ -1154,8 +1227,7 @@ static void __sk_free(struct sock *sk)  		RCU_INIT_POINTER(sk->sk_filter, NULL);  	} -	sock_disable_timestamp(sk, SOCK_TIMESTAMP); -	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); +	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);  	if (atomic_read(&sk->sk_omem_alloc))  		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", @@ -1200,7 +1272,14 @@ void sk_release_kernel(struct sock *sk)  }  EXPORT_SYMBOL(sk_release_kernel); -struct sock *sk_clone(const struct sock *sk, const gfp_t priority) +/** + *	sk_clone_lock - clone a socket, and lock its clone + *	@sk: the socket to clone + *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + *	Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + */ +struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)  {  	struct sock *newsk; @@ -1284,16 +1363,15 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)  		newsk->sk_wq = NULL;  		if (newsk->sk_prot->sockets_allocated) -			percpu_counter_inc(newsk->sk_prot->sockets_allocated); +			sk_sockets_allocated_inc(newsk); -		if (sock_flag(newsk, SOCK_TIMESTAMP) || -		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE)) +		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)  			net_enable_timestamp();  	}  out:  	return newsk;  } -EXPORT_SYMBOL_GPL(sk_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock);  void sk_setup_caps(struct sock *sk, struct dst_entry *dst)  { @@ -1673,30 +1751,34 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)  	struct proto *prot = sk->sk_prot;  	int amt = sk_mem_pages(size);  	long allocated; +	int parent_status = UNDER_LIMIT;  	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; -	allocated = atomic_long_add_return(amt, prot->memory_allocated); + +	allocated = sk_memory_allocated_add(sk, amt, &parent_status);  	/* Under limit. */ -	if (allocated <= prot->sysctl_mem[0]) { -		if (prot->memory_pressure && *prot->memory_pressure) -			*prot->memory_pressure = 0; +	if (parent_status == UNDER_LIMIT && +			allocated <= sk_prot_mem_limits(sk, 0)) { +		sk_leave_memory_pressure(sk);  		return 1;  	} -	/* Under pressure. */ -	if (allocated > prot->sysctl_mem[1]) -		if (prot->enter_memory_pressure) -			prot->enter_memory_pressure(sk); +	/* Under pressure. (we or our parents) */ +	if ((parent_status > SOFT_LIMIT) || +			allocated > sk_prot_mem_limits(sk, 1)) +		sk_enter_memory_pressure(sk); -	/* Over hard limit. 
*/ -	if (allocated > prot->sysctl_mem[2]) +	/* Over hard limit (we or our parents) */ +	if ((parent_status == OVER_LIMIT) || +			(allocated > sk_prot_mem_limits(sk, 2)))  		goto suppress_allocation;  	/* guarantee minimum buffer size under pressure */  	if (kind == SK_MEM_RECV) {  		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])  			return 1; +  	} else { /* SK_MEM_SEND */  		if (sk->sk_type == SOCK_STREAM) {  			if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) @@ -1706,13 +1788,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)  				return 1;  	} -	if (prot->memory_pressure) { +	if (sk_has_memory_pressure(sk)) {  		int alloc; -		if (!*prot->memory_pressure) +		if (!sk_under_memory_pressure(sk))  			return 1; -		alloc = percpu_counter_read_positive(prot->sockets_allocated); -		if (prot->sysctl_mem[2] > alloc * +		alloc = sk_sockets_allocated_read_positive(sk); +		if (sk_prot_mem_limits(sk, 2) > alloc *  		    sk_mem_pages(sk->sk_wmem_queued +  				 atomic_read(&sk->sk_rmem_alloc) +  				 sk->sk_forward_alloc)) @@ -1735,7 +1817,9 @@ suppress_allocation:  	/* Alas. Undo changes. */  	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; -	atomic_long_sub(amt, prot->memory_allocated); + +	sk_memory_allocated_sub(sk, amt, parent_status); +  	return 0;  }  EXPORT_SYMBOL(__sk_mem_schedule); @@ -1746,15 +1830,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);   */  void __sk_mem_reclaim(struct sock *sk)  { -	struct proto *prot = sk->sk_prot; - -	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, -		   prot->memory_allocated); +	sk_memory_allocated_sub(sk, +				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0);  	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; -	if (prot->memory_pressure && *prot->memory_pressure && -	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) -		*prot->memory_pressure = 0; +	if (sk_under_memory_pressure(sk) && +	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) +		sk_leave_memory_pressure(sk);  }  EXPORT_SYMBOL(__sk_mem_reclaim); @@ -2125,16 +2207,15 @@ EXPORT_SYMBOL(sock_get_timestampns);  void sock_enable_timestamp(struct sock *sk, int flag)  {  	if (!sock_flag(sk, flag)) { +		unsigned long previous_flags = sk->sk_flags; +  		sock_set_flag(sk, flag);  		/*  		 * we just set one of the two flags which require net  		 * time stamping, but time stamping might have been on  		 * already because of the other one  		 */ -		if (!sock_flag(sk, -				flag == SOCK_TIMESTAMP ? 
-				SOCK_TIMESTAMPING_RX_SOFTWARE : -				SOCK_TIMESTAMP)) +		if (!(previous_flags & SK_FLAGS_TIMESTAMP))  			net_enable_timestamp();  	}  } @@ -2246,9 +2327,6 @@ void sk_common_release(struct sock *sk)  }  EXPORT_SYMBOL(sk_common_release); -static DEFINE_RWLOCK(proto_list_lock); -static LIST_HEAD(proto_list); -  #ifdef CONFIG_PROC_FS  #define PROTO_INUSE_NR	64	/* should be enough for the first time */  struct prot_inuse { @@ -2397,10 +2475,10 @@ int proto_register(struct proto *prot, int alloc_slab)  		}  	} -	write_lock(&proto_list_lock); +	mutex_lock(&proto_list_mutex);  	list_add(&prot->node, &proto_list);  	assign_proto_idx(prot); -	write_unlock(&proto_list_lock); +	mutex_unlock(&proto_list_mutex);  	return 0;  out_free_timewait_sock_slab_name: @@ -2423,10 +2501,10 @@ EXPORT_SYMBOL(proto_register);  void proto_unregister(struct proto *prot)  { -	write_lock(&proto_list_lock); +	mutex_lock(&proto_list_mutex);  	release_proto_idx(prot);  	list_del(&prot->node); -	write_unlock(&proto_list_lock); +	mutex_unlock(&proto_list_mutex);  	if (prot->slab != NULL) {  		kmem_cache_destroy(prot->slab); @@ -2449,9 +2527,9 @@ EXPORT_SYMBOL(proto_unregister);  #ifdef CONFIG_PROC_FS  static void *proto_seq_start(struct seq_file *seq, loff_t *pos) -	__acquires(proto_list_lock) +	__acquires(proto_list_mutex)  { -	read_lock(&proto_list_lock); +	mutex_lock(&proto_list_mutex);  	return seq_list_start_head(&proto_list, *pos);  } @@ -2461,25 +2539,36 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)  }  static void proto_seq_stop(struct seq_file *seq, void *v) -	__releases(proto_list_lock) +	__releases(proto_list_mutex)  { -	read_unlock(&proto_list_lock); +	mutex_unlock(&proto_list_mutex);  }  static char proto_method_implemented(const void *method)  {  	return method == NULL ? 'n' : 'y';  } +static long sock_prot_memory_allocated(struct proto *proto) +{ +	return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L; +} + +static char *sock_prot_memory_pressure(struct proto *proto) +{ +	return proto->memory_pressure != NULL ? +	proto_memory_pressure(proto) ? "yes" : "no" : "NI"; +}  static void proto_seq_printf(struct seq_file *seq, struct proto *proto)  { +  	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "  			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",  		   proto->name,  		   proto->obj_size,  		   sock_prot_inuse_get(seq_file_net(seq), proto), -		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, -		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", +		   sock_prot_memory_allocated(proto), +		   sock_prot_memory_pressure(proto),  		   proto->max_header,  		   proto->slab == NULL ? 
"no" : "yes",  		   module_name(proto->owner), diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c new file mode 100644 index 00000000000..711bdefe775 --- /dev/null +++ b/net/core/sock_diag.c @@ -0,0 +1,169 @@ +#include <linux/mutex.h> +#include <linux/socket.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/net_namespace.h> +#include <linux/module.h> + +#include <linux/inet_diag.h> +#include <linux/sock_diag.h> + +static struct sock_diag_handler *sock_diag_handlers[AF_MAX]; +static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); +static DEFINE_MUTEX(sock_diag_table_mutex); + +int sock_diag_check_cookie(void *sk, __u32 *cookie) +{ +	if ((cookie[0] != INET_DIAG_NOCOOKIE || +	     cookie[1] != INET_DIAG_NOCOOKIE) && +	    ((u32)(unsigned long)sk != cookie[0] || +	     (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1])) +		return -ESTALE; +	else +		return 0; +} +EXPORT_SYMBOL_GPL(sock_diag_check_cookie); + +void sock_diag_save_cookie(void *sk, __u32 *cookie) +{ +	cookie[0] = (u32)(unsigned long)sk; +	cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); +} +EXPORT_SYMBOL_GPL(sock_diag_save_cookie); + +void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +{ +	mutex_lock(&sock_diag_table_mutex); +	inet_rcv_compat = fn; +	mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); + +void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +{ +	mutex_lock(&sock_diag_table_mutex); +	inet_rcv_compat = NULL; +	mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); + +int sock_diag_register(struct sock_diag_handler *hndl) +{ +	int err = 0; + +	if (hndl->family >= AF_MAX) +		return -EINVAL; + +	mutex_lock(&sock_diag_table_mutex); +	if (sock_diag_handlers[hndl->family]) +		err = -EBUSY; +	else +		sock_diag_handlers[hndl->family] = hndl; +	mutex_unlock(&sock_diag_table_mutex); + +	return err; +} +EXPORT_SYMBOL_GPL(sock_diag_register); + +void sock_diag_unregister(struct sock_diag_handler *hnld) +{ +	int family = hnld->family; + +	if (family >= AF_MAX) +		return; + +	mutex_lock(&sock_diag_table_mutex); +	BUG_ON(sock_diag_handlers[family] != hnld); +	sock_diag_handlers[family] = NULL; +	mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_unregister); + +static inline struct sock_diag_handler *sock_diag_lock_handler(int family) +{ +	if (sock_diag_handlers[family] == NULL) +		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, +				NETLINK_SOCK_DIAG, family); + +	mutex_lock(&sock_diag_table_mutex); +	return sock_diag_handlers[family]; +} + +static inline void sock_diag_unlock_handler(struct sock_diag_handler *h) +{ +	mutex_unlock(&sock_diag_table_mutex); +} + +static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ +	int err; +	struct sock_diag_req *req = NLMSG_DATA(nlh); +	struct sock_diag_handler *hndl; + +	if (nlmsg_len(nlh) < sizeof(*req)) +		return -EINVAL; + +	hndl = sock_diag_lock_handler(req->sdiag_family); +	if (hndl == NULL) +		err = -ENOENT; +	else +		err = hndl->dump(skb, nlh); +	sock_diag_unlock_handler(hndl); + +	return err; +} + +static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ +	int ret; + +	switch (nlh->nlmsg_type) { +	case TCPDIAG_GETSOCK: +	case DCCPDIAG_GETSOCK: +		if (inet_rcv_compat == NULL) +			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, +					NETLINK_SOCK_DIAG, AF_INET); + +		mutex_lock(&sock_diag_table_mutex); +		
if (inet_rcv_compat != NULL)
+			ret = inet_rcv_compat(skb, nlh);
+		else
+			ret = -EOPNOTSUPP;
+		mutex_unlock(&sock_diag_table_mutex);
+
+		return ret;
+	case SOCK_DIAG_BY_FAMILY:
+		return __sock_diag_rcv_msg(skb, nlh);
+	default:
+		return -EINVAL;
+	}
+}
+
+static DEFINE_MUTEX(sock_diag_mutex);
+
+static void sock_diag_rcv(struct sk_buff *skb)
+{
+	mutex_lock(&sock_diag_mutex);
+	netlink_rcv_skb(skb, &sock_diag_rcv_msg);
+	mutex_unlock(&sock_diag_mutex);
+}
+
+struct sock *sock_diag_nlsk;
+EXPORT_SYMBOL_GPL(sock_diag_nlsk);
+
+static int __init sock_diag_init(void)
+{
+	sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0,
+					sock_diag_rcv, NULL, THIS_MODULE);
+	return sock_diag_nlsk == NULL ? -ENOMEM : 0;
+}
+
+static void __exit sock_diag_exit(void)
+{
+	netlink_kernel_release(sock_diag_nlsk);
+}
+
+module_init(sock_diag_init);
+module_exit(sock_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 77a65f03148..d05559d4d9c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -68,8 +68,13 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 		if (sock_table != orig_sock_table) {
 			rcu_assign_pointer(rps_sock_flow_table, sock_table);
-			synchronize_rcu();
-			vfree(orig_sock_table);
+			if (sock_table)
+				jump_label_inc(&rps_needed);
+			if (orig_sock_table) {
+				jump_label_dec(&rps_needed);
+				synchronize_rcu();
+				vfree(orig_sock_table);
+			}
 		}
 	}
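The sock_diag core above only routes SOCK_DIAG_BY_FAMILY requests; the per-family dump logic lives in protocol modules that register a struct sock_diag_handler. A minimal sketch of such a module, assuming the handler layout used above (AF_PACKET is chosen purely as an example family, and the dump body is a stub, not code from this patch):

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>

/* Invoked by __sock_diag_rcv_msg() with sock_diag_table_mutex held. */
static int example_diag_dump(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	/* Parse the struct sock_diag_req payload and reply over
	 * sock_diag_nlsk, e.g. via netlink_dump_start(). */
	return -EOPNOTSUPP;	/* stub */
}

static struct sock_diag_handler example_diag_handler = {
	.family	= AF_PACKET,	/* example family only */
	.dump	= example_diag_dump,
};

static int __init example_diag_init(void)
{
	return sock_diag_register(&example_diag_handler);
}

static void __exit example_diag_exit(void)
{
	sock_diag_unregister(&example_diag_handler);
}

module_init(example_diag_init);
module_exit(example_diag_exit);
MODULE_LICENSE("GPL");

Since sock_diag_lock_handler() calls request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family) when no handler is registered, a real module would also declare the matching module alias so it can be loaded on demand.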