Diffstat (limited to 'net/core')
-rw-r--r--   net/core/Makefile           |    3
-rw-r--r--   net/core/datagram.c         |    2
-rw-r--r--   net/core/dev.c              | 1953
-rw-r--r--   net/core/dev_addr_lists.c   |   77
-rw-r--r--   net/core/dev_ioctl.c        |  576
-rw-r--r--   net/core/dst.c              |    1
-rw-r--r--   net/core/ethtool.c          |   48
-rw-r--r--   net/core/filter.c           |  152
-rw-r--r--   net/core/flow.c             |   17
-rw-r--r--   net/core/flow_dissector.c   |  173
-rw-r--r--   net/core/neighbour.c        |   40
-rw-r--r--   net/core/net-procfs.c       |  411
-rw-r--r--   net/core/net-sysfs.c        |  221
-rw-r--r--   net/core/net_namespace.c    |   57
-rw-r--r--   net/core/netpoll.c          |  715
-rw-r--r--   net/core/netprio_cgroup.c   |  262
-rw-r--r--   net/core/pktgen.c           |  250
-rw-r--r--   net/core/request_sock.c     |    2
-rw-r--r--   net/core/rtnetlink.c        |  417
-rw-r--r--   net/core/scm.c              |   11
-rw-r--r--   net/core/skbuff.c           |  193
-rw-r--r--   net/core/sock.c             |  110
-rw-r--r--   net/core/sock_diag.c        |   27
-rw-r--r--   net/core/sysctl_net_core.c  |   19
24 files changed, 3569 insertions(+), 2168 deletions(-)
diff --git a/net/core/Makefile b/net/core/Makefile
index 674641b13ae..b33b996f5dd 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,10 +9,11 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
 obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
-			sock_diag.o
+			sock_diag.o dev_ioctl.o
 
 obj-$(CONFIG_XFRM) += flow.o
 obj-y += net-sysfs.o
+obj-$(CONFIG_PROC_FS) += net-procfs.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
 obj-$(CONFIG_NET_DMA) += user_dma.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 0337e2b7686..368f9c3f9dc 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -187,7 +187,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 		skb_queue_walk(queue, skb) {
 			*peeked = skb->peeked;
 			if (flags & MSG_PEEK) {
-				if (*off >= skb->len) {
+				if (*off >= skb->len && skb->len) {
 					*off -= skb->len;
 					continue;
 				}
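The datagram.c hunk fixes peek-with-offset: a zero-length skb at the head of the queue used to satisfy *off >= skb->len without consuming any offset, looping forever. This path backs the SO_PEEK_OFF socket option; a minimal userspace sketch of that API (illustrative only, assuming a datagram socket on a kernel/libc that exposes SO_PEEK_OFF):

#include <stdio.h>
#include <sys/socket.h>

/* Sketch: peek at queued data starting "off" bytes in, without dequeueing. */
static void peek_at_offset(int fd, int off)
{
	char buf[2048];
	ssize_t n;

	/* later MSG_PEEK receives start "off" bytes into the receive queue */
	if (setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off)) < 0)
		return;
	n = recv(fd, buf, sizeof(buf), MSG_PEEK);
	if (n >= 0)
		printf("peeked %zd bytes at offset %d\n", n, off);
}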
diff --git a/net/core/dev.c b/net/core/dev.c
index 09cb3f6dc40..dffbef70cd3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -97,8 +97,6 @@
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
 #include <linux/stat.h>
 #include <net/dst.h>
 #include <net/pkt_sched.h>
@@ -106,12 +104,10 @@
 #include <net/xfrm.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
-#include <linux/kmod.h>
 #include <linux/module.h>
 #include <linux/netpoll.h>
 #include <linux/rcupdate.h>
 #include <linux/delay.h>
-#include <net/wext.h>
 #include <net/iw_handler.h>
 #include <asm/current.h>
 #include <linux/audit.h>
@@ -132,9 +128,7 @@
 #include <linux/pci.h>
 #include <linux/inetdevice.h>
 #include <linux/cpu_rmap.h>
-#include <linux/net_tstamp.h>
 #include <linux/static_key.h>
-#include <net/flow_keys.h>
 
 #include "net-sysfs.h"
@@ -144,40 +138,11 @@
 /* This should be increased if a protocol with a bigger head is added. */
 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 
-/*
- *	The list of packet types we will receive (as opposed to discard)
- *	and the routines to invoke.
- *
- *	Why 16. Because with 16 the only overlap we get on a hash of the
- *	low nibble of the protocol value is RARP/SNAP/X.25.
- *
- *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
- *             sure which should go first, but I bet it won't make much
- *             difference if we are running VLANs.  The good news is that
- *             this protocol won't be in the list unless compiled in, so
- *             the average user (w/out VLANs) will not be adversely affected.
- *             --BLG
- *
- *		0800	IP
- *		8100    802.1Q VLAN
- *		0001	802.3
- *		0002	AX.25
- *		0004	802.2
- *		8035	RARP
- *		0005	SNAP
- *		0805	X.25
- *		0806	ARP
- *		8137	IPX
- *		0009	Localtalk
- *		86DD	IPv6
- */
-
-#define PTYPE_HASH_SIZE	(16)
-#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
-
 static DEFINE_SPINLOCK(ptype_lock);
-static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-static struct list_head ptype_all __read_mostly;	/* Taps */
+static DEFINE_SPINLOCK(offload_lock);
+struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+struct list_head ptype_all __read_mostly;	/* Taps */
+static struct list_head offload_base __read_mostly;
 
 /*
  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
@@ -201,6 +166,8 @@ static struct list_head ptype_all __read_mostly;	/* Taps */
 DEFINE_RWLOCK(dev_base_lock);
 EXPORT_SYMBOL(dev_base_lock);
 
+seqcount_t devnet_rename_seq;
+
 static inline void dev_base_seq_inc(struct net *net)
 {
 	while (++net->dev_base_seq == 0);
@@ -470,6 +437,82 @@ void dev_remove_pack(struct packet_type *pt)
 }
 EXPORT_SYMBOL(dev_remove_pack);
 
+
+/**
+ *	dev_add_offload - register offload handlers
+ *	@po: protocol offload declaration
+ *
+ *	Add protocol offload handlers to the networking stack. The passed
+ *	&proto_offload is linked into kernel lists and may not be freed until
+ *	it has been removed from the kernel lists.
+ *
+ *	This call does not sleep therefore it can not
+ *	guarantee all CPU's that are in middle of receiving packets
+ *	will see the new offload handlers (until the next received packet).
+ */
+void dev_add_offload(struct packet_offload *po)
+{
+	struct list_head *head = &offload_base;
+
+	spin_lock(&offload_lock);
+	list_add_rcu(&po->list, head);
+	spin_unlock(&offload_lock);
+}
+EXPORT_SYMBOL(dev_add_offload);
+
+/**
+ *	__dev_remove_offload	 - remove offload handler
+ *	@po: packet offload declaration
+ *
+ *	Remove a protocol offload handler that was previously added to the
+ *	kernel offload handlers by dev_add_offload(). The passed &offload_type
+ *	is removed from the kernel lists and can be freed or reused once this
+ *	function returns.
+ *
+ *      The packet type might still be in use by receivers
+ *	and must not be freed until after all the CPU's have gone
+ *	through a quiescent state.
+ */
+void __dev_remove_offload(struct packet_offload *po)
+{
+	struct list_head *head = &offload_base;
+	struct packet_offload *po1;
+
+	spin_lock(&offload_lock);
+
+	list_for_each_entry(po1, head, list) {
+		if (po == po1) {
+			list_del_rcu(&po->list);
+			goto out;
+		}
+	}
+
+	pr_warn("dev_remove_offload: %p not found\n", po);
+out:
+	spin_unlock(&offload_lock);
+}
+EXPORT_SYMBOL(__dev_remove_offload);
+
+/**
+ *	dev_remove_offload	 - remove packet offload handler
+ *	@po: packet offload declaration
+ *
+ *	Remove a packet offload handler that was previously added to the kernel
+ *	offload handlers by dev_add_offload(). The passed &offload_type is
+ *	removed from the kernel lists and can be freed or reused once this
+ *	function returns.
+ *
+ *	This call sleeps to guarantee that no CPU is looking at the packet
+ *	type after return.
+ */
+void dev_remove_offload(struct packet_offload *po)
+{
+	__dev_remove_offload(po);
+
+	synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_offload);
+
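The new offload list splits the GSO/GRO callbacks out of struct packet_type. A minimal sketch of how a protocol registers against it, assuming hypothetical handler functions (the real ETH_P_IP registration lives in net/ipv4/af_inet.c):

/* Sketch: registering protocol offload handlers (handler names hypothetical). */
extern int my_gso_send_check(struct sk_buff *skb);			/* hypothetical */
extern struct sk_buff *my_gso_segment(struct sk_buff *skb,
				      netdev_features_t features);	/* hypothetical */
extern struct sk_buff **my_gro_receive(struct sk_buff **head,
				       struct sk_buff *skb);		/* hypothetical */
extern int my_gro_complete(struct sk_buff *skb);			/* hypothetical */

static struct packet_offload my_packet_offload __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.callbacks = {
		.gso_send_check	= my_gso_send_check,
		.gso_segment	= my_gso_segment,
		.gro_receive	= my_gro_receive,
		.gro_complete	= my_gro_complete,
	},
};

static int __init my_offload_init(void)
{
	dev_add_offload(&my_packet_offload);	/* non-sleeping, RCU insert */
	return 0;
}

static void my_offload_exit(void)
{
	/* sleeps in synchronize_net() so no CPU still sees the entry */
	dev_remove_offload(&my_packet_offload);
}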
 /******************************************************************************
 
 		      Device Boot-time Settings Routines
@@ -615,11 +658,10 @@ __setup("netdev=", netdev_boot_setup);
 
 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 {
-	struct hlist_node *p;
 	struct net_device *dev;
 	struct hlist_head *head = dev_name_hash(net, name);
 
-	hlist_for_each_entry(dev, p, head, name_hlist)
+	hlist_for_each_entry(dev, head, name_hlist)
 		if (!strncmp(dev->name, name, IFNAMSIZ))
 			return dev;
 
@@ -641,11 +683,10 @@ EXPORT_SYMBOL(__dev_get_by_name);
 
 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 {
-	struct hlist_node *p;
 	struct net_device *dev;
 	struct hlist_head *head = dev_name_hash(net, name);
 
-	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
+	hlist_for_each_entry_rcu(dev, head, name_hlist)
 		if (!strncmp(dev->name, name, IFNAMSIZ))
 			return dev;
 
@@ -692,11 +733,10 @@ EXPORT_SYMBOL(dev_get_by_name);
 
 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 {
-	struct hlist_node *p;
 	struct net_device *dev;
 	struct hlist_head *head = dev_index_hash(net, ifindex);
 
-	hlist_for_each_entry(dev, p, head, index_hlist)
+	hlist_for_each_entry(dev, head, index_hlist)
 		if (dev->ifindex == ifindex)
 			return dev;
 
@@ -717,11 +757,10 @@ EXPORT_SYMBOL(__dev_get_by_index);
 
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 {
-	struct hlist_node *p;
 	struct net_device *dev;
 	struct hlist_head *head = dev_index_hash(net, ifindex);
 
-	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
+	hlist_for_each_entry_rcu(dev, head, index_hlist)
 		if (dev->ifindex == ifindex)
 			return dev;
 
@@ -1013,22 +1052,31 @@ int dev_change_name(struct net_device *dev, const char *newname)
 	if (dev->flags & IFF_UP)
 		return -EBUSY;
 
-	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
+	write_seqcount_begin(&devnet_rename_seq);
+
+	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
+		write_seqcount_end(&devnet_rename_seq);
 		return 0;
+	}
 
 	memcpy(oldname, dev->name, IFNAMSIZ);
 
 	err = dev_get_valid_name(net, dev, newname);
-	if (err < 0)
+	if (err < 0) {
+		write_seqcount_end(&devnet_rename_seq);
 		return err;
+	}
 
 rollback:
 	ret = device_rename(&dev->dev, dev->name);
 	if (ret) {
 		memcpy(dev->name, oldname, IFNAMSIZ);
+		write_seqcount_end(&devnet_rename_seq);
 		return ret;
 	}
 
+	write_seqcount_end(&devnet_rename_seq);
+
 	write_lock_bh(&dev_base_lock);
 	hlist_del_rcu(&dev->name_hlist);
 	write_unlock_bh(&dev_base_lock);
@@ -1046,6 +1094,7 @@ rollback:
 		/* err >= 0 after dev_alloc_name() or stores the first errno */
 		if (err >= 0) {
 			err = ret;
+			write_seqcount_begin(&devnet_rename_seq);
 			memcpy(dev->name, oldname, IFNAMSIZ);
 			goto rollback;
 		} else {
@@ -1075,10 +1124,8 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 		return -EINVAL;
 
 	if (!len) {
-		if (dev->ifalias) {
-			kfree(dev->ifalias);
-			dev->ifalias = NULL;
-		}
+		kfree(dev->ifalias);
+		dev->ifalias = NULL;
 		return 0;
 	}
 
@@ -1139,36 +1186,6 @@ void netdev_notify_peers(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_notify_peers);
 
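devnet_rename_seq pairs with the write_seqcount_begin/end calls in dev_change_name() above so that lockless readers can detect a rename in progress and retry instead of copying a torn name. A sketch of the standard seqcount read side (the helper name is hypothetical; dev_ifname() in the new dev_ioctl.c is the in-tree reader):

/* Sketch: seqcount read-side retry against dev_change_name()
 * (idiomatic seqcount usage; helper name hypothetical). */
static void copy_dev_name(char *name, struct net_device *dev)
{
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&devnet_rename_seq);
		strncpy(name, dev->name, IFNAMSIZ);
		/* retry if a rename ran concurrently with the copy */
	} while (read_seqcount_retry(&devnet_rename_seq, seq));
}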
-/**
- *	dev_load 	- load a network module
- *	@net: the applicable net namespace
- *	@name: name of interface
- *
- *	If a network interface is not present and the process has suitable
- *	privileges this function loads the module. If module loading is not
- *	available in this kernel then it becomes a nop.
- */
-
-void dev_load(struct net *net, const char *name)
-{
-	struct net_device *dev;
-	int no_module;
-
-	rcu_read_lock();
-	dev = dev_get_by_name_rcu(net, name);
-	rcu_read_unlock();
-
-	no_module = !dev;
-	if (no_module && capable(CAP_NET_ADMIN))
-		no_module = request_module("netdev-%s", name);
-	if (no_module && capable(CAP_SYS_MODULE)) {
-		if (!request_module("%s", name))
-			pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
-				name);
-	}
-}
-EXPORT_SYMBOL(dev_load);
-
 static int __dev_open(struct net_device *dev)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
@@ -1179,6 +1196,14 @@ static int __dev_open(struct net_device *dev)
 	if (!netif_device_present(dev))
 		return -ENODEV;
 
+	/* Block netpoll from trying to do any rx path servicing.
+	 * If we don't do this there is a chance ndo_poll_controller
+	 * or ndo_poll may be running while we open the device
+	 */
+	ret = netpoll_rx_disable(dev);
+	if (ret)
+		return ret;
+
 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
 	ret = notifier_to_errno(ret);
 	if (ret)
@@ -1192,6 +1217,8 @@ static int __dev_open(struct net_device *dev)
 	if (!ret && ops->ndo_open)
 		ret = ops->ndo_open(dev);
 
+	netpoll_rx_enable(dev);
+
 	if (ret)
 		clear_bit(__LINK_STATE_START, &dev->state);
 	else {
@@ -1283,9 +1310,16 @@ static int __dev_close(struct net_device *dev)
 	int retval;
 	LIST_HEAD(single);
 
+	/* Temporarily disable netpoll until the interface is down */
+	retval = netpoll_rx_disable(dev);
+	if (retval)
+		return retval;
+
 	list_add(&dev->unreg_list, &single);
 	retval = __dev_close_many(&single);
 	list_del(&single);
+
+	netpoll_rx_enable(dev);
 	return retval;
 }
@@ -1321,14 +1355,22 @@ static int dev_close_many(struct list_head *head)
  */
 int dev_close(struct net_device *dev)
 {
+	int ret = 0;
 	if (dev->flags & IFF_UP) {
 		LIST_HEAD(single);
 
+		/* Block netpoll rx while the interface is going down */
+		ret = netpoll_rx_disable(dev);
+		if (ret)
+			return ret;
+
 		list_add(&dev->unreg_list, &single);
 		dev_close_many(&single);
 		list_del(&single);
+
+		netpoll_rx_enable(dev);
 	}
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(dev_close);
@@ -1533,57 +1575,6 @@ static inline void net_timestamp_set(struct sk_buff *skb)
 			__net_timestamp(SKB);		\
 	}						\
 
-static int net_hwtstamp_validate(struct ifreq *ifr)
-{
-	struct hwtstamp_config cfg;
-	enum hwtstamp_tx_types tx_type;
-	enum hwtstamp_rx_filters rx_filter;
-	int tx_type_valid = 0;
-	int rx_filter_valid = 0;
-
-	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
-		return -EFAULT;
-
-	if (cfg.flags) /* reserved for future extensions */
-		return -EINVAL;
-
-	tx_type = cfg.tx_type;
-	rx_filter = cfg.rx_filter;
-
-	switch (tx_type) {
-	case HWTSTAMP_TX_OFF:
-	case HWTSTAMP_TX_ON:
-	case HWTSTAMP_TX_ONESTEP_SYNC:
-		tx_type_valid = 1;
-		break;
-	}
-
-	switch (rx_filter) {
-	case HWTSTAMP_FILTER_NONE:
-	case HWTSTAMP_FILTER_ALL:
-	case HWTSTAMP_FILTER_SOME:
-	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
-	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
-	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
-	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
-	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
-	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
-	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
-	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
-	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
-	case HWTSTAMP_FILTER_PTP_V2_EVENT:
-	case HWTSTAMP_FILTER_PTP_V2_SYNC:
-	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
-		rx_filter_valid = 1;
-		break;
-	}
-
-	if (!tx_type_valid || !rx_filter_valid)
-		return -ERANGE;
-
-	return 0;
-}
-
 static inline bool is_skb_forwardable(struct net_device *dev,
 				      struct sk_buff *skb)
 {
@@ -1666,7 +1657,7 @@ static inline int deliver_skb(struct sk_buff *skb,
 
 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 {
-	if (ptype->af_packet_priv == NULL)
+	if (!ptype->af_packet_priv || !skb->sk)
 		return false;
 
 	if (ptype->id_match)
@@ -1769,6 +1760,230 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 	}
 }
 
+#ifdef CONFIG_XPS
+static DEFINE_MUTEX(xps_map_mutex);
+#define xmap_dereference(P)		\
+	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
+
+static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
+					int cpu, u16 index)
+{
+	struct xps_map *map = NULL;
+	int pos;
+
+	if (dev_maps)
+		map = xmap_dereference(dev_maps->cpu_map[cpu]);
+
+	for (pos = 0; map && pos < map->len; pos++) {
+		if (map->queues[pos] == index) {
+			if (map->len > 1) {
+				map->queues[pos] = map->queues[--map->len];
+			} else {
+				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
+				kfree_rcu(map, rcu);
+				map = NULL;
+			}
+			break;
+		}
+	}
+
+	return map;
+}
+
+static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+{
+	struct xps_dev_maps *dev_maps;
+	int cpu, i;
+	bool active = false;
+
+	mutex_lock(&xps_map_mutex);
+	dev_maps = xmap_dereference(dev->xps_maps);
+
+	if (!dev_maps)
+		goto out_no_maps;
+
+	for_each_possible_cpu(cpu) {
+		for (i = index; i < dev->num_tx_queues; i++) {
+			if (!remove_xps_queue(dev_maps, cpu, i))
+				break;
+		}
+		if (i == dev->num_tx_queues)
+			active = true;
+	}
+
+	if (!active) {
+		RCU_INIT_POINTER(dev->xps_maps, NULL);
+		kfree_rcu(dev_maps, rcu);
+	}
+
+	for (i = index; i < dev->num_tx_queues; i++)
+		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
+					     NUMA_NO_NODE);
+
+out_no_maps:
+	mutex_unlock(&xps_map_mutex);
+}
+
+static struct xps_map *expand_xps_map(struct xps_map *map,
+				      int cpu, u16 index)
+{
+	struct xps_map *new_map;
+	int alloc_len = XPS_MIN_MAP_ALLOC;
+	int i, pos;
+
+	for (pos = 0; map && pos < map->len; pos++) {
+		if (map->queues[pos] != index)
+			continue;
+		return map;
+	}
+
+	/* Need to add queue to this CPU's existing map */
+	if (map) {
+		if (pos < map->alloc_len)
+			return map;
+
+		alloc_len = map->alloc_len * 2;
+	}
+
+	/* Need to allocate new map to store queue on this CPU's map */
+	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+			       cpu_to_node(cpu));
+	if (!new_map)
+		return NULL;
+
+	for (i = 0; i < pos; i++)
+		new_map->queues[i] = map->queues[i];
+	new_map->alloc_len = alloc_len;
+	new_map->len = pos;
+
+	return new_map;
+}
+
+int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
+{
+	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+	struct xps_map *map, *new_map;
+	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
+	int cpu, numa_node_id = -2;
+	bool active = false;
+
+	mutex_lock(&xps_map_mutex);
+
+	dev_maps = xmap_dereference(dev->xps_maps);
+
+	/* allocate memory for queue storage */
+	for_each_online_cpu(cpu) {
+		if (!cpumask_test_cpu(cpu, mask))
+			continue;
+
+		if (!new_dev_maps)
+			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+		if (!new_dev_maps) {
+			mutex_unlock(&xps_map_mutex);
+			return -ENOMEM;
+		}
+
+		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+				 NULL;
+
+		map = expand_xps_map(map, cpu, index);
+		if (!map)
+			goto error;
+
+		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+	}
+
+	if (!new_dev_maps)
+		goto out_no_new_maps;
+
+	for_each_possible_cpu(cpu) {
+		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
+			/* add queue to CPU maps */
+			int pos = 0;
+
+			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+			while ((pos < map->len) && (map->queues[pos] != index))
+				pos++;
+
+			if (pos == map->len)
+				map->queues[map->len++] = index;
+#ifdef CONFIG_NUMA
+			if (numa_node_id == -2)
+				numa_node_id = cpu_to_node(cpu);
+			else if (numa_node_id != cpu_to_node(cpu))
+				numa_node_id = -1;
+#endif
+		} else if (dev_maps) {
+			/* fill in the new device map from the old device map */
+			map = xmap_dereference(dev_maps->cpu_map[cpu]);
+			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+		}
+
+	}
+
+	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+
+	/* Cleanup old maps */
+	if (dev_maps) {
+		for_each_possible_cpu(cpu) {
+			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+			map = xmap_dereference(dev_maps->cpu_map[cpu]);
+			if (map && map != new_map)
+				kfree_rcu(map, rcu);
+		}
+
+		kfree_rcu(dev_maps, rcu);
+	}
+
+	dev_maps = new_dev_maps;
+	active = true;
+
+out_no_new_maps:
+	/* update Tx queue numa node */
+	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+				     (numa_node_id >= 0) ? numa_node_id :
+				     NUMA_NO_NODE);
+
+	if (!dev_maps)
+		goto out_no_maps;
+
+	/* removes queue from unused CPUs */
+	for_each_possible_cpu(cpu) {
+		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
+			continue;
+
+		if (remove_xps_queue(dev_maps, cpu, index))
+			active = true;
+	}
+
+	/* free map if not active */
+	if (!active) {
+		RCU_INIT_POINTER(dev->xps_maps, NULL);
+		kfree_rcu(dev_maps, rcu);
+	}
+
+out_no_maps:
+	mutex_unlock(&xps_map_mutex);
+
+	return 0;
error:
+	/* remove any maps that we added */
+	for_each_possible_cpu(cpu) {
+		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+				 NULL;
+		if (new_map && new_map != map)
+			kfree(new_map);
+	}
+
+	mutex_unlock(&xps_map_mutex);
+
+	kfree(new_dev_maps);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(netif_set_xps_queue);
+
+#endif
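netif_set_xps_queue() exposes the XPS CPU-to-queue maps to drivers, which previously could only be configured through /sys/class/net/<dev>/queues/tx-<n>/xps_cpus. A hedged sketch of a driver pinning each TX queue to one CPU at setup time (the driver function name is hypothetical):

/* Sketch: map tx queue i to one online CPU at driver init
 * (function name hypothetical; return values ignored for brevity). */
static void my_setup_xps(struct net_device *dev)
{
	cpumask_var_t mask;
	int i;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		cpumask_clear(mask);
		cpumask_set_cpu(i % num_online_cpus(), mask);
		netif_set_xps_queue(dev, mask, i);	/* builds the per-CPU map */
	}
	free_cpumask_var(mask);
}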
 
 /*
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -1792,8 +2007,12 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 		if (dev->num_tc)
 			netif_setup_tc(dev, txq);
 
-		if (txq < dev->real_num_tx_queues)
+		if (txq < dev->real_num_tx_queues) {
 			qdisc_reset_all_tx_gt(dev, txq);
+#ifdef CONFIG_XPS
+			netif_reset_xps_queues_gt(dev, txq);
+#endif
+		}
 	}
 
 	dev->real_num_tx_queues = txq;
@@ -1958,6 +2177,15 @@ int skb_checksum_help(struct sk_buff *skb)
 		return -EINVAL;
 	}
 
+	/* Before computing a checksum, we should make sure no frag could
+	 * be modified by an external entity : checksum could be wrong.
+	 */
+	if (skb_has_shared_frag(skb)) {
+		ret = __skb_linearize(skb);
+		if (ret)
+			goto out;
+	}
+
 	offset = skb_checksum_start_offset(skb);
 	BUG_ON(offset >= skb_headlen(skb));
 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
@@ -1981,25 +2209,19 @@ out:
 EXPORT_SYMBOL(skb_checksum_help);
 
 /**
- *	skb_gso_segment - Perform segmentation on skb.
+ *	skb_mac_gso_segment - mac layer segmentation handler.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
- *
- *	This function segments the given skb and returns a list of segments.
- *
- *	It may return NULL if the skb requires no segmentation.  This is
- *	only possible when GSO is used for verifying header integrity.
  */
-struct sk_buff *skb_gso_segment(struct sk_buff *skb,
-	netdev_features_t features)
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+				    netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
-	struct packet_type *ptype;
+	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
-	int vlan_depth = ETH_HLEN;
-	int err;
 
 	while (type == htons(ETH_P_8021Q)) {
+		int vlan_depth = ETH_HLEN;
 		struct vlan_hdr *vh;
 
 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
@@ -2010,31 +2232,22 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
 		vlan_depth += VLAN_HLEN;
 	}
 
-	skb_reset_mac_header(skb);
-	skb->mac_len = skb->network_header - skb->mac_header;
 	__skb_pull(skb, skb->mac_len);
 
-	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
-		skb_warn_bad_offload(skb);
-
-		if (skb_header_cloned(skb) &&
-		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
-			return ERR_PTR(err);
-	}
-
 	rcu_read_lock();
-	list_for_each_entry_rcu(ptype,
-			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
+	list_for_each_entry_rcu(ptype, &offload_base, list) {
+		if (ptype->type == type && ptype->callbacks.gso_segment) {
 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
-				err = ptype->gso_send_check(skb);
+				int err;
+
+				err = ptype->callbacks.gso_send_check(skb);
 				segs = ERR_PTR(err);
 				if (err || skb_gso_ok(skb, features))
 					break;
 				__skb_push(skb, (skb->data -
 						 skb_network_header(skb)));
 			}
-			segs = ptype->gso_segment(skb, features);
+			segs = ptype->callbacks.gso_segment(skb, features);
 			break;
 		}
 	}
@@ -2044,7 +2257,50 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
 
 	return segs;
 }
-EXPORT_SYMBOL(skb_gso_segment);
+EXPORT_SYMBOL(skb_mac_gso_segment);
+
+
+/* openvswitch calls this on rx path, so we need a different check.
+ */
+static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
+{
+	if (tx_path)
+		return skb->ip_summed != CHECKSUM_PARTIAL;
+	else
+		return skb->ip_summed == CHECKSUM_NONE;
+}
+
+/**
+ *	__skb_gso_segment - Perform segmentation on skb.
+ *	@skb: buffer to segment
+ *	@features: features for the output path (see dev->features)
+ *	@tx_path: whether it is called in TX path
+ *
+ *	This function segments the given skb and returns a list of segments.
+ *
+ *	It may return NULL if the skb requires no segmentation.  This is
+ *	only possible when GSO is used for verifying header integrity.
+ */
+struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
+				  netdev_features_t features, bool tx_path)
+{
+	if (unlikely(skb_needs_check(skb, tx_path))) {
+		int err;
+
+		skb_warn_bad_offload(skb);
+
+		if (skb_header_cloned(skb) &&
+		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+			return ERR_PTR(err);
+	}
+
+	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+
+	return skb_mac_gso_segment(skb, features);
+}
+EXPORT_SYMBOL(__skb_gso_segment);
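With this split, skb_gso_segment() remains the TX-path entry point (a wrapper passing tx_path = true), while openvswitch can segment on the RX path without tripping skb_warn_bad_offload() on CHECKSUM_NONE packets. The classic caller pattern, sketched under those assumptions (the hardware xmit helper is hypothetical):

/* Sketch: segment a GSO skb and transmit the pieces one by one
 * (my_hw_xmit is a hypothetical placeholder). */
extern void my_hw_xmit(struct sk_buff *skb, struct net_device *dev);	/* hypothetical */

static int my_xmit_gso(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *segs, *nskb;

	segs = skb_gso_segment(skb, netif_skb_features(skb));
	if (IS_ERR(segs))
		return PTR_ERR(segs);

	consume_skb(skb);		/* the original superpacket is done */
	while (segs) {
		nskb = segs;
		segs = segs->next;
		nskb->next = NULL;
		my_hw_xmit(nskb, dev);
	}
	return 0;
}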
 
 /* Take action when hardware reception checksum errors are detected. */
 #ifdef CONFIG_BUG
@@ -2237,6 +2493,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			skb->vlan_tci = 0;
 		}
 
+		/* If encapsulation offload request, verify we are testing
+		 * hardware encapsulation features instead of standard
+		 * features for the netdev
+		 */
+		if (skb->encapsulation)
+			features &= dev->hw_enc_features;
+
 		if (netif_needs_gso(skb, features)) {
 			if (unlikely(dev_gso_segment(skb, features)))
 				goto out_kfree_skb;
@@ -2252,8 +2515,12 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			 * checksumming here.
 			 */
 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
-				skb_set_transport_header(skb,
-					skb_checksum_start_offset(skb));
+				if (skb->encapsulation)
+					skb_set_inner_transport_header(skb,
+						skb_checksum_start_offset(skb));
+				else
+					skb_set_transport_header(skb,
+						skb_checksum_start_offset(skb));
 				if (!(features & NETIF_F_ALL_CSUM) &&
 				     skb_checksum_help(skb))
 					goto out_kfree_skb;
@@ -2312,126 +2579,28 @@ out:
 	return rc;
 }
 
-static u32 hashrnd __read_mostly;
-
-/*
- * Returns a Tx hash based on the given packet descriptor a Tx queues' number
- * to be used as a distribution range.
- */
-u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
-		  unsigned int num_tx_queues)
-{
-	u32 hash;
-	u16 qoffset = 0;
-	u16 qcount = num_tx_queues;
-
-	if (skb_rx_queue_recorded(skb)) {
-		hash = skb_get_rx_queue(skb);
-		while (unlikely(hash >= num_tx_queues))
-			hash -= num_tx_queues;
-		return hash;
-	}
-
-	if (dev->num_tc) {
-		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
-		qoffset = dev->tc_to_txq[tc].offset;
-		qcount = dev->tc_to_txq[tc].count;
-	}
-
-	if (skb->sk && skb->sk->sk_hash)
-		hash = skb->sk->sk_hash;
-	else
-		hash = (__force u16) skb->protocol;
-	hash = jhash_1word(hash, hashrnd);
-
-	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
-}
-EXPORT_SYMBOL(__skb_tx_hash);
-
-static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
-{
-	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
-		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
-				     dev->name, queue_index,
-				     dev->real_num_tx_queues);
-		return 0;
-	}
-	return queue_index;
-}
-
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
-{
-#ifdef CONFIG_XPS
-	struct xps_dev_maps *dev_maps;
-	struct xps_map *map;
-	int queue_index = -1;
-
-	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_maps);
-	if (dev_maps) {
-		map = rcu_dereference(
-		    dev_maps->cpu_map[raw_smp_processor_id()]);
-		if (map) {
-			if (map->len == 1)
-				queue_index = map->queues[0];
-			else {
-				u32 hash;
-				if (skb->sk && skb->sk->sk_hash)
-					hash = skb->sk->sk_hash;
-				else
-					hash = (__force u16) skb->protocol ^
-					    skb->rxhash;
-				hash = jhash_1word(hash, hashrnd);
-				queue_index = map->queues[
-				    ((u64)hash * map->len) >> 32];
-			}
-			if (unlikely(queue_index >= dev->real_num_tx_queues))
-				queue_index = -1;
-		}
-	}
-	rcu_read_unlock();
-
-	return queue_index;
-#else
-	return -1;
-#endif
-}
-
-struct netdev_queue *netdev_pick_tx(struct net_device *dev,
-				    struct sk_buff *skb)
+static void qdisc_pkt_len_init(struct sk_buff *skb)
 {
-	int queue_index;
-	const struct net_device_ops *ops = dev->netdev_ops;
-
-	if (dev->real_num_tx_queues == 1)
-		queue_index = 0;
-	else if (ops->ndo_select_queue) {
-		queue_index = ops->ndo_select_queue(dev, skb);
-		queue_index = dev_cap_txqueue(dev, queue_index);
-	} else {
-		struct sock *sk = skb->sk;
-		queue_index = sk_tx_queue_get(sk);
+	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-		if (queue_index < 0 || skb->ooo_okay ||
-		    queue_index >= dev->real_num_tx_queues) {
-			int old_index = queue_index;
+	qdisc_skb_cb(skb)->pkt_len = skb->len;
 
-			queue_index = get_xps_queue(dev, skb);
-			if (queue_index < 0)
-				queue_index = skb_tx_hash(dev, skb);
+	/* To get more precise estimation of bytes sent on wire,
+	 * we add to pkt_len the headers size of all segments
+	 */
+	if (shinfo->gso_size)  {
+		unsigned int hdr_len;
 
-			if (queue_index != old_index && sk) {
-				struct dst_entry *dst =
-				    rcu_dereference_check(sk->sk_dst_cache, 1);
+		/* mac layer + network layer */
+		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
 
-				if (dst && skb_dst(skb) == dst)
-					sk_tx_queue_set(sk, queue_index);
-			}
-		}
+		/* + transport layer */
+		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+			hdr_len += tcp_hdrlen(skb);
+		else
+			hdr_len += sizeof(struct udphdr);
+		qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
 	}
-
-	skb_set_queue_mapping(skb, queue_index);
-	return netdev_get_tx_queue(dev, queue_index);
 }
 
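Worked example for qdisc_pkt_len_init() (illustrative numbers): a TSO skb carrying 4344 bytes of TCP payload with gso_size = 1448 has gso_segs = 3; if Ethernet + IPv4 + TCP headers total hdr_len = 66 bytes, then pkt_len = skb->len + (3 - 1) * 66 = 4410 + 132 = 4542 bytes, the real wire footprint of the three frames the NIC will generate, so byte-based qdiscs (TBF, HTB, fq_codel) no longer undercount GSO traffic.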
 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
@@ -2442,7 +2611,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	bool contended;
 	int rc;
 
-	qdisc_skb_cb(skb)->pkt_len = skb->len;
+	qdisc_pkt_len_init(skb);
 	qdisc_calculate_pkt_len(skb, q);
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
@@ -2565,6 +2734,8 @@ int dev_queue_xmit(struct sk_buff *skb)
 	struct Qdisc *q;
 	int rc = -ENOMEM;
 
+	skb_reset_mac_header(skb);
+
 	/* Disable soft irqs for various locks below. Also
 	 * stops preemption for RCU.
 	 */
@@ -2659,41 +2830,6 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }
 
-/*
- * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
- * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
- * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
- * if hash is a canonical 4-tuple hash over transport ports.
- */
-void __skb_get_rxhash(struct sk_buff *skb)
-{
-	struct flow_keys keys;
-	u32 hash;
-
-	if (!skb_flow_dissect(skb, &keys))
-		return;
-
-	if (keys.ports)
-		skb->l4_rxhash = 1;
-
-	/* get a consistent hash (same value on both flow directions) */
-	if (((__force u32)keys.dst < (__force u32)keys.src) ||
-	    (((__force u32)keys.dst == (__force u32)keys.src) &&
-	     ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
-		swap(keys.dst, keys.src);
-		swap(keys.port16[0], keys.port16[1]);
-	}
-
-	hash = jhash_3words((__force u32)keys.dst,
-			    (__force u32)keys.src,
-			    (__force u32)keys.ports, hashrnd);
-	if (!hash)
-		hash = 1;
-
-	skb->rxhash = hash;
-}
-EXPORT_SYMBOL(__skb_get_rxhash);
-
 #ifdef CONFIG_RPS
 
 /* One global table that all flow-based protocols share. */
@@ -2818,8 +2954,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		if (unlikely(tcpu != next_cpu) &&
 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
-		      rflow->last_qtail)) >= 0))
+		      rflow->last_qtail)) >= 0)) {
+			tcpu = next_cpu;
 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+		}
 
 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
 			*rflowp = rflow;
@@ -3218,7 +3356,7 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 	}
 }
 
-static int __netif_receive_skb(struct sk_buff *skb)
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 {
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
@@ -3227,24 +3365,11 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	bool deliver_exact = false;
 	int ret = NET_RX_DROP;
 	__be16 type;
-	unsigned long pflags = current->flags;
 
 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
 
 	trace_netif_receive_skb(skb);
 
-	/*
-	 * PFMEMALLOC skbs are special, they should
-	 * - be delivered to SOCK_MEMALLOC sockets only
-	 * - stay away from userspace
-	 * - have bounded memory usage
-	 *
-	 * Use PF_MEMALLOC as this saves us from propagating the allocation
-	 * context down to all allocation sites.
-	 */
-	if (sk_memalloc_socks() && skb_pfmemalloc(skb))
-		current->flags |= PF_MEMALLOC;
-
 	/* if we've gotten here through NAPI, check netpoll */
 	if (netpoll_receive_skb(skb))
 		goto out;
@@ -3252,7 +3377,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	orig_dev = skb->dev;
 
 	skb_reset_network_header(skb);
-	skb_reset_transport_header(skb);
+	if (!skb_transport_header_was_set(skb))
+		skb_reset_transport_header(skb);
 	skb_reset_mac_len(skb);
 
 	pt_prev = NULL;
@@ -3277,7 +3403,7 @@ another_round:
 	}
 #endif
 
-	if (sk_memalloc_socks() && skb_pfmemalloc(skb))
+	if (pfmemalloc)
 		goto skip_taps;
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -3296,8 +3422,7 @@ skip_taps:
 ncls:
 #endif
 
-	if (sk_memalloc_socks() && skb_pfmemalloc(skb)
-				&& !skb_pfmemalloc_protocol(skb))
+	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 		goto drop;
 
 	if (vlan_tx_tag_present(skb)) {
@@ -3319,6 +3444,7 @@ ncls:
 		}
 		switch (rx_handler(&skb)) {
 		case RX_HANDLER_CONSUMED:
+			ret = NET_RX_SUCCESS;
 			goto unlock;
 		case RX_HANDLER_ANOTHER:
 			goto another_round;
@@ -3367,7 +3493,31 @@ drop:
 unlock:
 	rcu_read_unlock();
 out:
-	tsk_restore_flags(current, pflags, PF_MEMALLOC);
+	return ret;
+}
+
+static int __netif_receive_skb(struct sk_buff *skb)
+{
+	int ret;
+
+	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
+		unsigned long pflags = current->flags;
+
+		/*
+		 * PFMEMALLOC skbs are special, they should
+		 * - be delivered to SOCK_MEMALLOC sockets only
+		 * - stay away from userspace
+		 * - have bounded memory usage
+		 *
+		 * Use PF_MEMALLOC as this saves us from propagating the allocation
+		 * context down to all allocation sites.
+		 */
+		current->flags |= PF_MEMALLOC;
+		ret = __netif_receive_skb_core(skb, true);
+		tsk_restore_flags(current, pflags, PF_MEMALLOC);
+	} else
+		ret = __netif_receive_skb_core(skb, false);
+
 	return ret;
 }
@@ -3444,11 +3594,13 @@ static void flush_backlog(void *arg)
 
 static int napi_gro_complete(struct sk_buff *skb)
 {
-	struct packet_type *ptype;
+	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
-	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
+	struct list_head *head = &offload_base;
 	int err = -ENOENT;
 
+	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
+
 	if (NAPI_GRO_CB(skb)->count == 1) {
 		skb_shinfo(skb)->gso_size = 0;
 		goto out;
@@ -3456,10 +3608,10 @@ static int napi_gro_complete(struct sk_buff *skb)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
-		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
+		if (ptype->type != type || !ptype->callbacks.gro_complete)
 			continue;
 
-		err = ptype->gro_complete(skb);
+		err = ptype->callbacks.gro_complete(skb);
 		break;
 	}
 	rcu_read_unlock();
@@ -3503,14 +3655,35 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
-enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+{
+	struct sk_buff *p;
+	unsigned int maclen = skb->dev->hard_header_len;
+
+	for (p = napi->gro_list; p; p = p->next) {
+		unsigned long diffs;
+
+		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+		diffs |= p->vlan_tci ^ skb->vlan_tci;
+		if (maclen == ETH_HLEN)
+			diffs |= compare_ether_header(skb_mac_header(p),
+						      skb_gro_mac_header(skb));
+		else if (!diffs)
+			diffs = memcmp(skb_mac_header(p),
+				       skb_gro_mac_header(skb),
+				       maclen);
+		NAPI_GRO_CB(p)->same_flow = !diffs;
+		NAPI_GRO_CB(p)->flush = 0;
+	}
+}
+
+static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff **pp = NULL;
-	struct packet_type *ptype;
+	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
-	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
+	struct list_head *head = &offload_base;
 	int same_flow;
-	int mac_len;
 	enum gro_result ret;
 
 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
@@ -3519,19 +3692,20 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
 		goto normal;
 
+	gro_list_prepare(napi, skb);
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
-		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
+		if (ptype->type != type || !ptype->callbacks.gro_receive)
 			continue;
 
 		skb_set_network_header(skb, skb_gro_offset(skb));
-		mac_len = skb->network_header - skb->mac_header;
-		skb->mac_len = mac_len;
+		skb_reset_mac_len(skb);
 		NAPI_GRO_CB(skb)->same_flow = 0;
 		NAPI_GRO_CB(skb)->flush = 0;
 		NAPI_GRO_CB(skb)->free = 0;
 
-		pp = ptype->gro_receive(&napi->gro_list, skb);
+		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
 		break;
 	}
 	rcu_read_unlock();
@@ -3594,34 +3768,9 @@ normal:
 	ret = GRO_NORMAL;
 	goto pull;
 }
-EXPORT_SYMBOL(dev_gro_receive);
-
-static inline gro_result_t
-__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
-{
-	struct sk_buff *p;
-	unsigned int maclen = skb->dev->hard_header_len;
-
-	for (p = napi->gro_list; p; p = p->next) {
-		unsigned long diffs;
-
-		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
-		diffs |= p->vlan_tci ^ skb->vlan_tci;
-		if (maclen == ETH_HLEN)
-			diffs |= compare_ether_header(skb_mac_header(p),
-						      skb_gro_mac_header(skb));
-		else if (!diffs)
-			diffs = memcmp(skb_mac_header(p),
-				       skb_gro_mac_header(skb),
-				       maclen);
-		NAPI_GRO_CB(p)->same_flow = !diffs;
-		NAPI_GRO_CB(p)->flush = 0;
-	}
-
-	return dev_gro_receive(napi, skb);
-}
 
-gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 {
 	switch (ret) {
 	case GRO_NORMAL:
@@ -3647,7 +3796,6 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 
 	return ret;
 }
-EXPORT_SYMBOL(napi_skb_finish);
 
 static void skb_gro_reset_offset(struct sk_buff *skb)
 {
@@ -3670,7 +3818,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	skb_gro_reset_offset(skb);
 
-	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
+	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
 }
 EXPORT_SYMBOL(napi_gro_receive);
 
@@ -3699,7 +3847,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(napi_get_frags);
 
-gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
+static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
 			       gro_result_t ret)
 {
 	switch (ret) {
@@ -3724,7 +3872,6 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
 
 	return ret;
 }
-EXPORT_SYMBOL(napi_frags_finish);
 
 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 {
@@ -3769,7 +3916,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
 	if (!skb)
 		return GRO_DROP;
 
-	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
+	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
}
 EXPORT_SYMBOL(napi_gro_frags);
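__napi_gro_receive() is folded into dev_gro_receive(), with the flow comparison hoisted into gro_list_prepare(); the driver-facing entry points are unchanged. For reference, the usual NAPI poll loop feeding GRO, sketched with a hypothetical rx-fetch helper:

/* Sketch: NAPI poll handing received frames to GRO
 * (my_fetch_rx_skb is a hypothetical driver helper). */
extern struct sk_buff *my_fetch_rx_skb(struct napi_struct *napi);	/* hypothetical */

static int my_poll(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int work = 0;

	while (work < budget && (skb = my_fetch_rx_skb(napi)) != NULL) {
		skb->protocol = eth_type_trans(skb, skb->dev);
		napi_gro_receive(napi, skb);	/* may merge into napi->gro_list */
		work++;
	}
	if (work < budget)
		napi_complete(napi);
	return work;
}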
@@ -3957,7 +4104,7 @@ static void net_rx_action(struct softirq_action *h)
 		 * Allow this to run for 2 jiffies since which will allow
 		 * an average latency of 1.5/HZ.
 		 */
-		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
+		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
 			goto softnet_break;
 
 		local_irq_enable();
@@ -4035,525 +4182,231 @@ softnet_break:
 	goto out;
 }
 
-static gifconf_func_t *gifconf_list[NPROTO];
-
-/**
- *	register_gifconf	-	register a SIOCGIF handler
- *	@family: Address family
- *	@gifconf: Function handler
- *
- *	Register protocol dependent address dumping routines. The handler
- *	that is passed must not be freed or reused until it has been replaced
- *	by another handler.
- */
-int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
-{
-	if (family >= NPROTO)
-		return -EINVAL;
-	gifconf_list[family] = gifconf;
-	return 0;
-}
-EXPORT_SYMBOL(register_gifconf);
-
-
-/*
- *	Map an interface index to its name (SIOCGIFNAME)
- */
-
-/*
- *	We need this ioctl for efficient implementation of the
- *	if_indextoname() function required by the IPv6 API.  Without
- *	it, we would have to search all the interfaces to find a
- *	match.  --pb
- */
-
-static int dev_ifname(struct net *net, struct ifreq __user *arg)
-{
+struct netdev_upper {
 	struct net_device *dev;
-	struct ifreq ifr;
-
-	/*
-	 *	Fetch the caller's info block.
-	 */
+	bool master;
+	struct list_head list;
+	struct rcu_head rcu;
+	struct list_head search_list;
+};
 
-	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
-		return -EFAULT;
+static void __append_search_uppers(struct list_head *search_list,
+				   struct net_device *dev)
+{
+	struct netdev_upper *upper;
 
-	rcu_read_lock();
-	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
-	if (!dev) {
-		rcu_read_unlock();
-		return -ENODEV;
+	list_for_each_entry(upper, &dev->upper_dev_list, list) {
+		/* check if this upper is not already in search list */
+		if (list_empty(&upper->search_list))
+			list_add_tail(&upper->search_list, search_list);
 	}
-
-	strcpy(ifr.ifr_name, dev->name);
-	rcu_read_unlock();
-
-	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
-		return -EFAULT;
-	return 0;
 }
 
-/*
- *	Perform a SIOCGIFCONF call. This structure will change
- *	size eventually, and there is nothing I can do about it.
- *	Thus we will need a 'compatibility mode'.
- */
-
-static int dev_ifconf(struct net *net, char __user *arg)
+static bool __netdev_search_upper_dev(struct net_device *dev,
+				      struct net_device *upper_dev)
 {
-	struct ifconf ifc;
-	struct net_device *dev;
-	char __user *pos;
-	int len;
-	int total;
-	int i;
-
-	/*
-	 *	Fetch the caller's info block.
-	 */
-
-	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
-		return -EFAULT;
+	LIST_HEAD(search_list);
+	struct netdev_upper *upper;
+	struct netdev_upper *tmp;
+	bool ret = false;
 
-	pos = ifc.ifc_buf;
-	len = ifc.ifc_len;
-
-	/*
-	 *	Loop over the interfaces, and write an info block for each.
-	 */
-
-	total = 0;
-	for_each_netdev(net, dev) {
-		for (i = 0; i < NPROTO; i++) {
-			if (gifconf_list[i]) {
-				int done;
-				if (!pos)
-					done = gifconf_list[i](dev, NULL, 0);
-				else
-					done = gifconf_list[i](dev, pos + total,
-							       len - total);
-				if (done < 0)
-					return -EFAULT;
-				total += done;
-			}
+	__append_search_uppers(&search_list, dev);
+	list_for_each_entry(upper, &search_list, search_list) {
+		if (upper->dev == upper_dev) {
+			ret = true;
+			break;
 		}
+		__append_search_uppers(&search_list, upper->dev);
 	}
-
-	/*
-	 *	All done.  Write the updated control block back to the caller.
-	 */
-	ifc.ifc_len = total;
-
-	/*
-	 * 	Both BSD and Solaris return 0 here, so we do too.
-	 */
-	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
+	list_for_each_entry_safe(upper, tmp, &search_list, search_list)
+		INIT_LIST_HEAD(&upper->search_list);
+	return ret;
 }
 
-#ifdef CONFIG_PROC_FS
-
-#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
-
-#define get_bucket(x) ((x) >> BUCKET_SPACE)
-#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
-#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
-
-static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
+static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
+						struct net_device *upper_dev)
 {
-	struct net *net = seq_file_net(seq);
-	struct net_device *dev;
-	struct hlist_node *p;
-	struct hlist_head *h;
-	unsigned int count = 0, offset = get_offset(*pos);
+	struct netdev_upper *upper;
 
-	h = &net->dev_name_head[get_bucket(*pos)];
-	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
-		if (++count == offset)
-			return dev;
+	list_for_each_entry(upper, &dev->upper_dev_list, list) {
+		if (upper->dev == upper_dev)
+			return upper;
 	}
-
 	return NULL;
 }
 
-static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
-{
-	struct net_device *dev;
-	unsigned int bucket;
-
-	do {
-		dev = dev_from_same_bucket(seq, pos);
-		if (dev)
-			return dev;
-
-		bucket = get_bucket(*pos) + 1;
-		*pos = set_bucket_offset(bucket, 1);
-	} while (bucket < NETDEV_HASHENTRIES);
-
-	return NULL;
-}
-
-/*
- *	This is invoked by the /proc filesystem handler to display a device
- *	in detail.
+/**
+ * netdev_has_upper_dev - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to specified upper device and return true
+ * in case it is. Note that this checks only immediate upper device,
+ * not through a complete stack of devices. The caller must hold the RTNL lock.
  */
-void *dev_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(RCU)
-{
-	rcu_read_lock();
-	if (!*pos)
-		return SEQ_START_TOKEN;
-
-	if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
-		return NULL;
-
-	return dev_from_bucket(seq, pos);
-}
-
-void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+bool netdev_has_upper_dev(struct net_device *dev,
+			  struct net_device *upper_dev)
 {
-	++*pos;
-	return dev_from_bucket(seq, pos);
-}
-
-void dev_seq_stop(struct seq_file *seq, void *v)
-	__releases(RCU)
-{
-	rcu_read_unlock();
-}
-
-static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
-{
-	struct rtnl_link_stats64 temp;
-	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+	ASSERT_RTNL();
 
-	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
-		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
-		   dev->name, stats->rx_bytes, stats->rx_packets,
-		   stats->rx_errors,
-		   stats->rx_dropped + stats->rx_missed_errors,
-		   stats->rx_fifo_errors,
-		   stats->rx_length_errors + stats->rx_over_errors +
-		    stats->rx_crc_errors + stats->rx_frame_errors,
-		   stats->rx_compressed, stats->multicast,
-		   stats->tx_bytes, stats->tx_packets,
-		   stats->tx_errors, stats->tx_dropped,
-		   stats->tx_fifo_errors, stats->collisions,
-		   stats->tx_carrier_errors +
-		    stats->tx_aborted_errors +
-		    stats->tx_window_errors +
-		    stats->tx_heartbeat_errors,
-		   stats->tx_compressed);
+	return __netdev_find_upper(dev, upper_dev);
 }
+EXPORT_SYMBOL(netdev_has_upper_dev);
 
-/*
- *	Called from the PROCfs module. This now uses the new arbitrary sized
- *	/proc/net interface to create /proc/net/dev
+/**
+ * netdev_has_any_upper_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to an upper device and return true in case
+ * it is. The caller must hold the RTNL lock.
  */
-static int dev_seq_show(struct seq_file *seq, void *v)
-{
-	if (v == SEQ_START_TOKEN)
-		seq_puts(seq, "Inter-|   Receive                            "
-			      "                    |  Transmit\n"
-			      " face |bytes    packets errs drop fifo frame "
-			      "compressed multicast|bytes    packets errs "
-			      "drop fifo colls carrier compressed\n");
-	else
-		dev_seq_printf_stats(seq, v);
-	return 0;
-}
-
-static struct softnet_data *softnet_get_online(loff_t *pos)
-{
-	struct softnet_data *sd = NULL;
-
-	while (*pos < nr_cpu_ids)
-		if (cpu_online(*pos)) {
-			sd = &per_cpu(softnet_data, *pos);
-			break;
-		} else
-			++*pos;
-	return sd;
-}
-
-static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
-{
-	return softnet_get_online(pos);
-}
-
-static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	++*pos;
-	return softnet_get_online(pos);
-}
-
-static void softnet_seq_stop(struct seq_file *seq, void *v)
+bool netdev_has_any_upper_dev(struct net_device *dev)
 {
-}
-
-static int softnet_seq_show(struct seq_file *seq, void *v)
-{
-	struct softnet_data *sd = v;
+	ASSERT_RTNL();
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
-		   sd->processed, sd->dropped, sd->time_squeeze, 0,
-		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps);
-	return 0;
+	return !list_empty(&dev->upper_dev_list);
 }
+EXPORT_SYMBOL(netdev_has_any_upper_dev);
 
-static const struct seq_operations dev_seq_ops = {
-	.start = dev_seq_start,
-	.next  = dev_seq_next,
-	.stop  = dev_seq_stop,
-	.show  = dev_seq_show,
-};
-
-static int dev_seq_open(struct inode *inode, struct file *file)
+/**
+ * netdev_master_upper_dev_get - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RTNL lock.
+ */
+struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 {
-	return seq_open_net(inode, file, &dev_seq_ops,
-			    sizeof(struct seq_net_private));
-}
+	struct netdev_upper *upper;
 
-static const struct file_operations dev_seq_fops = {
-	.owner	 = THIS_MODULE,
-	.open    = dev_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
+	ASSERT_RTNL();
 
-static const struct seq_operations softnet_seq_ops = {
-	.start = softnet_seq_start,
-	.next  = softnet_seq_next,
-	.stop  = softnet_seq_stop,
-	.show  = softnet_seq_show,
-};
+	if (list_empty(&dev->upper_dev_list))
+		return NULL;
 
-static int softnet_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &softnet_seq_ops);
+	upper = list_first_entry(&dev->upper_dev_list,
+				 struct netdev_upper, list);
+	if (likely(upper->master))
+		return upper->dev;
+	return NULL;
 }
+EXPORT_SYMBOL(netdev_master_upper_dev_get);
 
-static const struct file_operations softnet_seq_fops = {
-	.owner	 = THIS_MODULE,
-	.open    = softnet_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release,
-};
-
-static void *ptype_get_idx(loff_t pos)
+/**
+ * netdev_master_upper_dev_get_rcu - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RCU read lock.
+ */
+struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 {
-	struct packet_type *pt = NULL;
-	loff_t i = 0;
-	int t;
+	struct netdev_upper *upper;
 
-	list_for_each_entry_rcu(pt, &ptype_all, list) {
-		if (i == pos)
-			return pt;
-		++i;
-	}
-
-	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
-		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
-			if (i == pos)
-				return pt;
-			++i;
-		}
-	}
+	upper = list_first_or_null_rcu(&dev->upper_dev_list,
+				       struct netdev_upper, list);
+	if (upper && likely(upper->master))
+		return upper->dev;
 	return NULL;
 }
+EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 
-static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(RCU)
+static int __netdev_upper_dev_link(struct net_device *dev,
+				   struct net_device *upper_dev, bool master)
 {
-	rcu_read_lock();
-	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
-}
+	struct netdev_upper *upper;
 
-static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct packet_type *pt;
-	struct list_head *nxt;
-	int hash;
+	ASSERT_RTNL();
 
-	++*pos;
-	if (v == SEQ_START_TOKEN)
-		return ptype_get_idx(0);
+	if (dev == upper_dev)
+		return -EBUSY;
 
-	pt = v;
-	nxt = pt->list.next;
-	if (pt->type == htons(ETH_P_ALL)) {
-		if (nxt != &ptype_all)
-			goto found;
-		hash = 0;
-		nxt = ptype_base[0].next;
-	} else
-		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
+	/* To prevent loops, check if dev is not upper device to upper_dev. */
+	if (__netdev_search_upper_dev(upper_dev, dev))
+		return -EBUSY;
 
-	while (nxt == &ptype_base[hash]) {
-		if (++hash >= PTYPE_HASH_SIZE)
-			return NULL;
-		nxt = ptype_base[hash].next;
-	}
-found:
-	return list_entry(nxt, struct packet_type, list);
-}
+	if (__netdev_find_upper(dev, upper_dev))
+		return -EEXIST;
 
-static void ptype_seq_stop(struct seq_file *seq, void *v)
-	__releases(RCU)
-{
-	rcu_read_unlock();
-}
+	if (master && netdev_master_upper_dev_get(dev))
+		return -EBUSY;
 
-static int ptype_seq_show(struct seq_file *seq, void *v)
-{
-	struct packet_type *pt = v;
+	upper = kmalloc(sizeof(*upper), GFP_KERNEL);
+	if (!upper)
+		return -ENOMEM;
 
-	if (v == SEQ_START_TOKEN)
-		seq_puts(seq, "Type Device      Function\n");
-	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
-		if (pt->type == htons(ETH_P_ALL))
-			seq_puts(seq, "ALL ");
-		else
-			seq_printf(seq, "%04x", ntohs(pt->type));
+	upper->dev = upper_dev;
+	upper->master = master;
+	INIT_LIST_HEAD(&upper->search_list);
 
-		seq_printf(seq, " %-8s %pF\n",
-			   pt->dev ? pt->dev->name : "", pt->func);
-	}
+	/* Ensure that master upper link is always the first item in list. */
+	if (master)
+		list_add_rcu(&upper->list, &dev->upper_dev_list);
+	else
+		list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
+	dev_hold(upper_dev);
 
 	return 0;
 }
 
-static const struct seq_operations ptype_seq_ops = {
-	.start = ptype_seq_start,
-	.next  = ptype_seq_next,
-	.stop  = ptype_seq_stop,
-	.show  = ptype_seq_show,
-};
-
-static int ptype_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &ptype_seq_ops,
-			sizeof(struct seq_net_private));
-}
-
-static const struct file_operations ptype_seq_fops = {
-	.owner	 = THIS_MODULE,
-	.open    = ptype_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release_net,
-};
-
-
-static int __net_init dev_proc_net_init(struct net *net)
-{
-	int rc = -ENOMEM;
-
-	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
-		goto out;
-	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
-		goto out_dev;
-	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
-		goto out_softnet;
-
-	if (wext_proc_init(net))
-		goto out_ptype;
-	rc = 0;
-out:
-	return rc;
-out_ptype:
-	proc_net_remove(net, "ptype");
-out_softnet:
-	proc_net_remove(net, "softnet_stat");
-out_dev:
-	proc_net_remove(net, "dev");
-	goto out;
-}
-
-static void __net_exit dev_proc_net_exit(struct net *net)
-{
-	wext_proc_exit(net);
-
-	proc_net_remove(net, "ptype");
-	proc_net_remove(net, "softnet_stat");
-	proc_net_remove(net, "dev");
-}
-
-static struct pernet_operations __net_initdata dev_proc_ops = {
-	.init = dev_proc_net_init,
-	.exit = dev_proc_net_exit,
-};
-
-static int __init dev_proc_init(void)
+/**
+ * netdev_upper_dev_link - Add a link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Adds a link to device which is upper to this one. The caller must hold
+ * the RTNL lock. On a failure a negative errno code is returned.
+ * On success the reference counts are adjusted and the function
+ * returns zero.
+ */
+int netdev_upper_dev_link(struct net_device *dev,
+			  struct net_device *upper_dev)
 {
-	return register_pernet_subsys(&dev_proc_ops);
+	return __netdev_upper_dev_link(dev, upper_dev, false);
 }
-#else
-#define dev_proc_init() 0
-#endif	/* CONFIG_PROC_FS */
-
+EXPORT_SYMBOL(netdev_upper_dev_link);
 
 /**
- *	netdev_set_master	-	set up master pointer
- *	@slave: slave device
- *	@master: new master device
+ * netdev_master_upper_dev_link - Add a master link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
  *
- *	Changes the master device of the slave. Pass %NULL to break the
- *	bonding. The caller must hold the RTNL semaphore. On a failure
- *	a negative errno code is returned. On success the reference counts
- *	are adjusted and the function returns zero.
+ * Adds a link to device which is upper to this one. In this case, only
+ * one master upper device can be linked, although other non-master devices
+ * might be linked as well. The caller must hold the RTNL lock.
+ * On a failure a negative errno code is returned. On success the reference
+ * counts are adjusted and the function returns zero.
  */
-int netdev_set_master(struct net_device *slave, struct net_device *master)
+int netdev_master_upper_dev_link(struct net_device *dev,
+				 struct net_device *upper_dev)
 {
-	struct net_device *old = slave->master;
-
-	ASSERT_RTNL();
-
-	if (master) {
-		if (old)
-			return -EBUSY;
-		dev_hold(master);
-	}
-
-	slave->master = master;
-
-	if (old)
-		dev_put(old);
-	return 0;
+	return __netdev_upper_dev_link(dev, upper_dev, true);
 }
-EXPORT_SYMBOL(netdev_set_master);
+EXPORT_SYMBOL(netdev_master_upper_dev_link);
 
 /**
- *	netdev_set_bond_master	-	set up bonding master/slave pair
- *	@slave: slave device
- *	@master: new master device
+ * netdev_upper_dev_unlink - Removes a link to upper device
+ * @dev: device
+ * @upper_dev: new upper device
  *
- *	Changes the master device of the slave. Pass %NULL to break the
- *	bonding. The caller must hold the RTNL semaphore. On a failure
- *	a negative errno code is returned. On success %RTM_NEWLINK is sent
- *	to the routing socket and the function returns zero.
+ * Removes a link to device which is upper to this one. The caller must hold
+ * the RTNL lock.
*/ -int netdev_set_bond_master(struct net_device *slave, struct net_device *master) +void netdev_upper_dev_unlink(struct net_device *dev, +			     struct net_device *upper_dev)  { -	int err; +	struct netdev_upper *upper;  	ASSERT_RTNL(); -	err = netdev_set_master(slave, master); -	if (err) -		return err; -	if (master) -		slave->flags |= IFF_SLAVE; -	else -		slave->flags &= ~IFF_SLAVE; - -	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); -	return 0; +	upper = __netdev_find_upper(dev, upper_dev); +	if (!upper) +		return; +	list_del_rcu(&upper->list); +	dev_put(upper_dev); +	kfree_rcu(upper, rcu);  } -EXPORT_SYMBOL(netdev_set_bond_master); +EXPORT_SYMBOL(netdev_upper_dev_unlink);  static void dev_change_rx_flags(struct net_device *dev, int flags)  { @@ -4880,7 +4733,7 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)  	else  		dev->mtu = new_mtu; -	if (!err && dev->flags & IFF_UP) +	if (!err)  		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);  	return err;  } @@ -4916,372 +4769,33 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)  	if (!netif_device_present(dev))  		return -ENODEV;  	err = ops->ndo_set_mac_address(dev, sa); -	if (!err) -		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); +	if (err) +		return err; +	dev->addr_assign_type = NET_ADDR_SET; +	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);  	add_device_randomness(dev->dev_addr, dev->addr_len); -	return err; +	return 0;  }  EXPORT_SYMBOL(dev_set_mac_address); -/* - *	Perform the SIOCxIFxxx calls, inside rcu_read_lock() - */ -static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) -{ -	int err; -	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); - -	if (!dev) -		return -ENODEV; - -	switch (cmd) { -	case SIOCGIFFLAGS:	/* Get interface flags */ -		ifr->ifr_flags = (short) dev_get_flags(dev); -		return 0; - -	case SIOCGIFMETRIC:	/* Get the metric on the interface -				   (currently unused) */ -		ifr->ifr_metric = 0; -		return 0; - -	case SIOCGIFMTU:	/* Get the MTU of a device */ -		ifr->ifr_mtu = dev->mtu; -		return 0; - -	case SIOCGIFHWADDR: -		if (!dev->addr_len) -			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); -		else -			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, -			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); -		ifr->ifr_hwaddr.sa_family = dev->type; -		return 0; - -	case SIOCGIFSLAVE: -		err = -EINVAL; -		break; - -	case SIOCGIFMAP: -		ifr->ifr_map.mem_start = dev->mem_start; -		ifr->ifr_map.mem_end   = dev->mem_end; -		ifr->ifr_map.base_addr = dev->base_addr; -		ifr->ifr_map.irq       = dev->irq; -		ifr->ifr_map.dma       = dev->dma; -		ifr->ifr_map.port      = dev->if_port; -		return 0; - -	case SIOCGIFINDEX: -		ifr->ifr_ifindex = dev->ifindex; -		return 0; - -	case SIOCGIFTXQLEN: -		ifr->ifr_qlen = dev->tx_queue_len; -		return 0; - -	default: -		/* dev_ioctl() should ensure this case -		 * is never reached -		 */ -		WARN_ON(1); -		err = -ENOTTY; -		break; - -	} -	return err; -} - -/* - *	Perform the SIOCxIFxxx calls, inside rtnl_lock() - */ -static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) -{ -	int err; -	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); -	const struct net_device_ops *ops; - -	if (!dev) -		return -ENODEV; - -	ops = dev->netdev_ops; - -	switch (cmd) { -	case SIOCSIFFLAGS:	/* Set interface flags */ -		return dev_change_flags(dev, ifr->ifr_flags); - -	case SIOCSIFMETRIC:	/* Set the metric on the interface -				   (currently unused) */ -		return 
-EOPNOTSUPP; - -	case SIOCSIFMTU:	/* Set the MTU of a device */ -		return dev_set_mtu(dev, ifr->ifr_mtu); - -	case SIOCSIFHWADDR: -		return dev_set_mac_address(dev, &ifr->ifr_hwaddr); - -	case SIOCSIFHWBROADCAST: -		if (ifr->ifr_hwaddr.sa_family != dev->type) -			return -EINVAL; -		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, -		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); -		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); -		return 0; - -	case SIOCSIFMAP: -		if (ops->ndo_set_config) { -			if (!netif_device_present(dev)) -				return -ENODEV; -			return ops->ndo_set_config(dev, &ifr->ifr_map); -		} -		return -EOPNOTSUPP; - -	case SIOCADDMULTI: -		if (!ops->ndo_set_rx_mode || -		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) -			return -EINVAL; -		if (!netif_device_present(dev)) -			return -ENODEV; -		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); - -	case SIOCDELMULTI: -		if (!ops->ndo_set_rx_mode || -		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) -			return -EINVAL; -		if (!netif_device_present(dev)) -			return -ENODEV; -		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); - -	case SIOCSIFTXQLEN: -		if (ifr->ifr_qlen < 0) -			return -EINVAL; -		dev->tx_queue_len = ifr->ifr_qlen; -		return 0; - -	case SIOCSIFNAME: -		ifr->ifr_newname[IFNAMSIZ-1] = '\0'; -		return dev_change_name(dev, ifr->ifr_newname); - -	case SIOCSHWTSTAMP: -		err = net_hwtstamp_validate(ifr); -		if (err) -			return err; -		/* fall through */ - -	/* -	 *	Unknown or private ioctl -	 */ -	default: -		if ((cmd >= SIOCDEVPRIVATE && -		    cmd <= SIOCDEVPRIVATE + 15) || -		    cmd == SIOCBONDENSLAVE || -		    cmd == SIOCBONDRELEASE || -		    cmd == SIOCBONDSETHWADDR || -		    cmd == SIOCBONDSLAVEINFOQUERY || -		    cmd == SIOCBONDINFOQUERY || -		    cmd == SIOCBONDCHANGEACTIVE || -		    cmd == SIOCGMIIPHY || -		    cmd == SIOCGMIIREG || -		    cmd == SIOCSMIIREG || -		    cmd == SIOCBRADDIF || -		    cmd == SIOCBRDELIF || -		    cmd == SIOCSHWTSTAMP || -		    cmd == SIOCWANDEV) { -			err = -EOPNOTSUPP; -			if (ops->ndo_do_ioctl) { -				if (netif_device_present(dev)) -					err = ops->ndo_do_ioctl(dev, ifr, cmd); -				else -					err = -ENODEV; -			} -		} else -			err = -EINVAL; - -	} -	return err; -} - -/* - *	This function handles all "interface"-type I/O control requests. The actual - *	'doing' part of this is dev_ifsioc above. - */ -  /** - *	dev_ioctl	-	network device ioctl - *	@net: the applicable net namespace - *	@cmd: command to issue - *	@arg: pointer to a struct ifreq in user space + *	dev_change_carrier - Change device carrier + *	@dev: device + *	@new_carrier: new value   * - *	Issue ioctl functions to devices. This is normally called by the - *	user space syscall interfaces but can sometimes be useful for - *	other purposes. The return value is the return from the syscall if - *	positive or a negative errno code on error. + *	Change device carrier   */ - -int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int dev_change_carrier(struct net_device *dev, bool new_carrier)  { -	struct ifreq ifr; -	int ret; -	char *colon; - -	/* One special case: SIOCGIFCONF takes ifconf argument -	   and requires shared lock, because it sleeps writing -	   to user space. 
-	 */ - -	if (cmd == SIOCGIFCONF) { -		rtnl_lock(); -		ret = dev_ifconf(net, (char __user *) arg); -		rtnl_unlock(); -		return ret; -	} -	if (cmd == SIOCGIFNAME) -		return dev_ifname(net, (struct ifreq __user *)arg); - -	if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) -		return -EFAULT; - -	ifr.ifr_name[IFNAMSIZ-1] = 0; - -	colon = strchr(ifr.ifr_name, ':'); -	if (colon) -		*colon = 0; - -	/* -	 *	See which interface the caller is talking about. -	 */ - -	switch (cmd) { -	/* -	 *	These ioctl calls: -	 *	- can be done by all. -	 *	- atomic and do not require locking. -	 *	- return a value -	 */ -	case SIOCGIFFLAGS: -	case SIOCGIFMETRIC: -	case SIOCGIFMTU: -	case SIOCGIFHWADDR: -	case SIOCGIFSLAVE: -	case SIOCGIFMAP: -	case SIOCGIFINDEX: -	case SIOCGIFTXQLEN: -		dev_load(net, ifr.ifr_name); -		rcu_read_lock(); -		ret = dev_ifsioc_locked(net, &ifr, cmd); -		rcu_read_unlock(); -		if (!ret) { -			if (colon) -				*colon = ':'; -			if (copy_to_user(arg, &ifr, -					 sizeof(struct ifreq))) -				ret = -EFAULT; -		} -		return ret; - -	case SIOCETHTOOL: -		dev_load(net, ifr.ifr_name); -		rtnl_lock(); -		ret = dev_ethtool(net, &ifr); -		rtnl_unlock(); -		if (!ret) { -			if (colon) -				*colon = ':'; -			if (copy_to_user(arg, &ifr, -					 sizeof(struct ifreq))) -				ret = -EFAULT; -		} -		return ret; - -	/* -	 *	These ioctl calls: -	 *	- require superuser power. -	 *	- require strict serialization. -	 *	- return a value -	 */ -	case SIOCGMIIPHY: -	case SIOCGMIIREG: -	case SIOCSIFNAME: -		if (!capable(CAP_NET_ADMIN)) -			return -EPERM; -		dev_load(net, ifr.ifr_name); -		rtnl_lock(); -		ret = dev_ifsioc(net, &ifr, cmd); -		rtnl_unlock(); -		if (!ret) { -			if (colon) -				*colon = ':'; -			if (copy_to_user(arg, &ifr, -					 sizeof(struct ifreq))) -				ret = -EFAULT; -		} -		return ret; - -	/* -	 *	These ioctl calls: -	 *	- require superuser power. -	 *	- require strict serialization. -	 *	- do not return a value -	 */ -	case SIOCSIFFLAGS: -	case SIOCSIFMETRIC: -	case SIOCSIFMTU: -	case SIOCSIFMAP: -	case SIOCSIFHWADDR: -	case SIOCSIFSLAVE: -	case SIOCADDMULTI: -	case SIOCDELMULTI: -	case SIOCSIFHWBROADCAST: -	case SIOCSIFTXQLEN: -	case SIOCSMIIREG: -	case SIOCBONDENSLAVE: -	case SIOCBONDRELEASE: -	case SIOCBONDSETHWADDR: -	case SIOCBONDCHANGEACTIVE: -	case SIOCBRADDIF: -	case SIOCBRDELIF: -	case SIOCSHWTSTAMP: -		if (!capable(CAP_NET_ADMIN)) -			return -EPERM; -		/* fall through */ -	case SIOCBONDSLAVEINFOQUERY: -	case SIOCBONDINFOQUERY: -		dev_load(net, ifr.ifr_name); -		rtnl_lock(); -		ret = dev_ifsioc(net, &ifr, cmd); -		rtnl_unlock(); -		return ret; - -	case SIOCGIFMEM: -		/* Get the per device memory space. We can add this but -		 * currently do not support it */ -	case SIOCSIFMEM: -		/* Set the per device memory buffer space. -		 * Not applicable in our case */ -	case SIOCSIFLINK: -		return -ENOTTY; +	const struct net_device_ops *ops = dev->netdev_ops; -	/* -	 *	Unknown or private ioctl. 
-	 */ -	default: -		if (cmd == SIOCWANDEV || -		    (cmd >= SIOCDEVPRIVATE && -		     cmd <= SIOCDEVPRIVATE + 15)) { -			dev_load(net, ifr.ifr_name); -			rtnl_lock(); -			ret = dev_ifsioc(net, &ifr, cmd); -			rtnl_unlock(); -			if (!ret && copy_to_user(arg, &ifr, -						 sizeof(struct ifreq))) -				ret = -EFAULT; -			return ret; -		} -		/* Take care of Wireless Extensions */ -		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) -			return wext_handle_ioctl(net, &ifr, cmd, arg); -		return -ENOTTY; -	} +	if (!ops->ndo_change_carrier) +		return -EOPNOTSUPP; +	if (!netif_device_present(dev)) +		return -ENODEV; +	return ops->ndo_change_carrier(dev, new_carrier);  } - +EXPORT_SYMBOL(dev_change_carrier);  /**   *	dev_new_index	-	allocate an ifindex @@ -5369,11 +4883,15 @@ static void rollback_registered_many(struct list_head *head)  		if (dev->netdev_ops->ndo_uninit)  			dev->netdev_ops->ndo_uninit(dev); -		/* Notifier chain MUST detach us from master device. */ -		WARN_ON(dev->master); +		/* Notifier chain MUST detach us all upper devices. */ +		WARN_ON(netdev_has_any_upper_dev(dev));  		/* Remove entries from kobject tree */  		netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS +		/* Remove XPS queueing entries */ +		netif_reset_xps_queues_gt(dev, 0); +#endif  	}  	synchronize_net(); @@ -5551,10 +5069,9 @@ static int netif_alloc_rx_queues(struct net_device *dev)  	BUG_ON(count < 1);  	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); -	if (!rx) { -		pr_err("netdev: Unable to allocate %u rx queues\n", count); +	if (!rx)  		return -ENOMEM; -	} +  	dev->_rx = rx;  	for (i = 0; i < count; i++) @@ -5585,10 +5102,9 @@ static int netif_alloc_netdev_queues(struct net_device *dev)  	BUG_ON(count < 1);  	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); -	if (!tx) { -		pr_err("netdev: Unable to allocate %u tx queues\n", count); +	if (!tx)  		return -ENOMEM; -	} +  	dev->_tx = tx;  	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -5647,6 +5163,14 @@ int register_netdevice(struct net_device *dev)  		}  	} +	if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) && +	    (!dev->netdev_ops->ndo_vlan_rx_add_vid || +	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { +		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); +		ret = -EINVAL; +		goto err_uninit; +	} +  	ret = -EBUSY;  	if (!dev->ifindex)  		dev->ifindex = dev_new_index(net); @@ -5702,6 +5226,13 @@ int register_netdevice(struct net_device *dev)  	list_netdevice(dev);  	add_device_randomness(dev->dev_addr, dev->addr_len); +	/* If the device has permanent device address, driver should +	 * set dev_addr and also addr_assign_type should be set to +	 * NET_ADDR_PERM (default value). +	 */ +	if (dev->addr_assign_type == NET_ADDR_PERM) +		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); +  	/* Notify protocols, that a new device appeared. 
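 	 * (Interested subsystems receive this through a notifier_block
 	 *  registered with register_netdevice_notifier(); a minimal
 	 *  sketch with a hypothetical handler -- note that in this
 	 *  kernel the ptr argument is the net_device itself:
 	 *
 	 *	static int my_netdev_event(struct notifier_block *nb,
 	 *				   unsigned long event, void *ptr)
 	 *	{
 	 *		struct net_device *dev = ptr;
 	 *
 	 *		if (event == NETDEV_REGISTER)
 	 *			netdev_info(dev, "registered\n");
 	 *		return NOTIFY_DONE;
 	 *	}
 	 * )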
*/  	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);  	ret = notifier_to_errno(ret); @@ -6008,6 +5539,14 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)  static const struct ethtool_ops default_ethtool_ops; +void netdev_set_default_ethtool_ops(struct net_device *dev, +				    const struct ethtool_ops *ops) +{ +	if (dev->ethtool_ops == &default_ethtool_ops) +		dev->ethtool_ops = ops; +} +EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); +  /**   *	alloc_netdev_mqs - allocate network device   *	@sizeof_priv:	size of private data to allocate space for @@ -6052,10 +5591,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	alloc_size += NETDEV_ALIGN - 1;  	p = kzalloc(alloc_size, GFP_KERNEL); -	if (!p) { -		pr_err("alloc_netdev: Unable to allocate device\n"); +	if (!p)  		return NULL; -	}  	dev = PTR_ALIGN(p, NETDEV_ALIGN);  	dev->padded = (char *)dev - (char *)p; @@ -6078,6 +5615,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	INIT_LIST_HEAD(&dev->napi_list);  	INIT_LIST_HEAD(&dev->unreg_list);  	INIT_LIST_HEAD(&dev->link_watch_list); +	INIT_LIST_HEAD(&dev->upper_dev_list);  	dev->priv_flags = IFF_XMIT_DST_RELEASE;  	setup(dev); @@ -6264,7 +5802,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  		goto out;  	/* Ensure the device has been registrered */ -	err = -EINVAL;  	if (dev->reg_state != NETREG_REGISTERED)  		goto out; @@ -6319,6 +5856,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	dev_uc_flush(dev);  	dev_mc_flush(dev); +	/* Send a netdev-removed uevent to the old namespace */ +	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); +  	/* Actually switch the network namespace */  	dev_net_set(dev, net); @@ -6330,6 +5870,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  			dev->iflink = dev->ifindex;  	} +	/* Send a netdev-add uevent to the new namespace */ +	kobject_uevent(&dev->dev.kobj, KOBJ_ADD); +  	/* Fixup kobjects */  	err = device_rename(&dev->dev, dev->name);  	WARN_ON(err); @@ -6662,6 +6205,8 @@ static int __init net_dev_init(void)  	for (i = 0; i < PTYPE_HASH_SIZE; i++)  		INIT_LIST_HEAD(&ptype_base[i]); +	INIT_LIST_HEAD(&offload_base); +  	if (register_pernet_subsys(&netdev_net_ops))  		goto out; @@ -6714,19 +6259,9 @@ static int __init net_dev_init(void)  	hotcpu_notifier(dev_cpu_callback, 0);  	dst_init(); -	dev_mcast_init();  	rc = 0;  out:  	return rc;  }  subsys_initcall(net_dev_init); - -static int __init initialize_hashrnd(void) -{ -	get_random_bytes(&hashrnd, sizeof(hashrnd)); -	return 0; -} - -late_initcall_sync(initialize_hashrnd); - diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 87cc17db2d5..bd2eb9d3e36 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -15,7 +15,6 @@  #include <linux/rtnetlink.h>  #include <linux/export.h>  #include <linux/list.h> -#include <linux/proc_fs.h>  /*   * General list handling functions @@ -319,7 +318,8 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr,  	 */  	ha = list_first_entry(&dev->dev_addrs.list,  			      struct netdev_hw_addr, list); -	if (ha->addr == dev->dev_addr && ha->refcount == 1) +	if (!memcmp(ha->addr, addr, dev->addr_len) && +	    ha->type == addr_type && ha->refcount == 1)  		return -ENOENT;  	err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, @@ -726,76 +726,3 @@ void dev_mc_init(struct net_device *dev)  	__hw_addr_init(&dev->mc);  }  
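 /* Drivers manipulate the lists initialized here through dev_mc_add()
  * and dev_mc_del(); a minimal sketch, with a hypothetical group
  * address (the all-hosts multicast MAC):
  *
  *	static const u8 mc_addr[ETH_ALEN] =
  *		{ 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };
  *
  *	dev_mc_add(dev, mc_addr);
  */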
EXPORT_SYMBOL(dev_mc_init); - -#ifdef CONFIG_PROC_FS -#include <linux/seq_file.h> - -static int dev_mc_seq_show(struct seq_file *seq, void *v) -{ -	struct netdev_hw_addr *ha; -	struct net_device *dev = v; - -	if (v == SEQ_START_TOKEN) -		return 0; - -	netif_addr_lock_bh(dev); -	netdev_for_each_mc_addr(ha, dev) { -		int i; - -		seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, -			   dev->name, ha->refcount, ha->global_use); - -		for (i = 0; i < dev->addr_len; i++) -			seq_printf(seq, "%02x", ha->addr[i]); - -		seq_putc(seq, '\n'); -	} -	netif_addr_unlock_bh(dev); -	return 0; -} - -static const struct seq_operations dev_mc_seq_ops = { -	.start = dev_seq_start, -	.next  = dev_seq_next, -	.stop  = dev_seq_stop, -	.show  = dev_mc_seq_show, -}; - -static int dev_mc_seq_open(struct inode *inode, struct file *file) -{ -	return seq_open_net(inode, file, &dev_mc_seq_ops, -			    sizeof(struct seq_net_private)); -} - -static const struct file_operations dev_mc_seq_fops = { -	.owner	 = THIS_MODULE, -	.open    = dev_mc_seq_open, -	.read    = seq_read, -	.llseek  = seq_lseek, -	.release = seq_release_net, -}; - -#endif - -static int __net_init dev_mc_net_init(struct net *net) -{ -	if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops)) -		return -ENOMEM; -	return 0; -} - -static void __net_exit dev_mc_net_exit(struct net *net) -{ -	proc_net_remove(net, "dev_mcast"); -} - -static struct pernet_operations __net_initdata dev_mc_net_ops = { -	.init = dev_mc_net_init, -	.exit = dev_mc_net_exit, -}; - -void __init dev_mcast_init(void) -{ -	register_pernet_subsys(&dev_mc_net_ops); -} - diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c new file mode 100644 index 00000000000..6cc0481faad --- /dev/null +++ b/net/core/dev_ioctl.c @@ -0,0 +1,576 @@ +#include <linux/kmod.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/rtnetlink.h> +#include <linux/net_tstamp.h> +#include <linux/wireless.h> +#include <net/wext.h> + +/* + *	Map an interface index to its name (SIOCGIFNAME) + */ + +/* + *	We need this ioctl for efficient implementation of the + *	if_indextoname() function required by the IPv6 API.  Without + *	it, we would have to search all the interfaces to find a + *	match.  --pb + */ + +static int dev_ifname(struct net *net, struct ifreq __user *arg) +{ +	struct net_device *dev; +	struct ifreq ifr; +	unsigned seq; + +	/* +	 *	Fetch the caller's info block. +	 */ + +	if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) +		return -EFAULT; + +retry: +	seq = read_seqcount_begin(&devnet_rename_seq); +	rcu_read_lock(); +	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); +	if (!dev) { +		rcu_read_unlock(); +		return -ENODEV; +	} + +	strcpy(ifr.ifr_name, dev->name); +	rcu_read_unlock(); +	if (read_seqcount_retry(&devnet_rename_seq, seq)) +		goto retry; + +	if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) +		return -EFAULT; +	return 0; +} + +static gifconf_func_t *gifconf_list[NPROTO]; + +/** + *	register_gifconf	-	register a SIOCGIF handler + *	@family: Address family + *	@gifconf: Function handler + * + *	Register protocol dependent address dumping routines. The handler + *	that is passed must not be freed or reused until it has been replaced + *	by another handler. + */ +int register_gifconf(unsigned int family, gifconf_func_t *gifconf) +{ +	if (family >= NPROTO) +		return -EINVAL; +	gifconf_list[family] = gifconf; +	return 0; +} +EXPORT_SYMBOL(register_gifconf); + +/* + *	Perform a SIOCGIFCONF call. 
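+ *	(A userspace sketch -- hypothetical socket fd, error handling
+ *	omitted: pass ifc_buf == NULL first so the kernel reports the
+ *	required length in ifc_len, then call again with a real buffer:
+ *
+ *		struct ifconf ifc = { .ifc_len = 0, .ifc_buf = NULL };
+ *		ioctl(fd, SIOCGIFCONF, &ifc);
+ *		ifc.ifc_buf = malloc(ifc.ifc_len);
+ *		ioctl(fd, SIOCGIFCONF, &ifc);
+ *	)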
This structure will change + *	size eventually, and there is nothing I can do about it. + *	Thus we will need a 'compatibility mode'. + */ + +static int dev_ifconf(struct net *net, char __user *arg) +{ +	struct ifconf ifc; +	struct net_device *dev; +	char __user *pos; +	int len; +	int total; +	int i; + +	/* +	 *	Fetch the caller's info block. +	 */ + +	if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) +		return -EFAULT; + +	pos = ifc.ifc_buf; +	len = ifc.ifc_len; + +	/* +	 *	Loop over the interfaces, and write an info block for each. +	 */ + +	total = 0; +	for_each_netdev(net, dev) { +		for (i = 0; i < NPROTO; i++) { +			if (gifconf_list[i]) { +				int done; +				if (!pos) +					done = gifconf_list[i](dev, NULL, 0); +				else +					done = gifconf_list[i](dev, pos + total, +							       len - total); +				if (done < 0) +					return -EFAULT; +				total += done; +			} +		} +	} + +	/* +	 *	All done.  Write the updated control block back to the caller. +	 */ +	ifc.ifc_len = total; + +	/* +	 * 	Both BSD and Solaris return 0 here, so we do too. +	 */ +	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; +} + +/* + *	Perform the SIOCxIFxxx calls, inside rcu_read_lock() + */ +static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ +	int err; +	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); + +	if (!dev) +		return -ENODEV; + +	switch (cmd) { +	case SIOCGIFFLAGS:	/* Get interface flags */ +		ifr->ifr_flags = (short) dev_get_flags(dev); +		return 0; + +	case SIOCGIFMETRIC:	/* Get the metric on the interface +				   (currently unused) */ +		ifr->ifr_metric = 0; +		return 0; + +	case SIOCGIFMTU:	/* Get the MTU of a device */ +		ifr->ifr_mtu = dev->mtu; +		return 0; + +	case SIOCGIFHWADDR: +		if (!dev->addr_len) +			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); +		else +			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, +			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); +		ifr->ifr_hwaddr.sa_family = dev->type; +		return 0; + +	case SIOCGIFSLAVE: +		err = -EINVAL; +		break; + +	case SIOCGIFMAP: +		ifr->ifr_map.mem_start = dev->mem_start; +		ifr->ifr_map.mem_end   = dev->mem_end; +		ifr->ifr_map.base_addr = dev->base_addr; +		ifr->ifr_map.irq       = dev->irq; +		ifr->ifr_map.dma       = dev->dma; +		ifr->ifr_map.port      = dev->if_port; +		return 0; + +	case SIOCGIFINDEX: +		ifr->ifr_ifindex = dev->ifindex; +		return 0; + +	case SIOCGIFTXQLEN: +		ifr->ifr_qlen = dev->tx_queue_len; +		return 0; + +	default: +		/* dev_ioctl() should ensure this case +		 * is never reached +		 */ +		WARN_ON(1); +		err = -ENOTTY; +		break; + +	} +	return err; +} + +static int net_hwtstamp_validate(struct ifreq *ifr) +{ +	struct hwtstamp_config cfg; +	enum hwtstamp_tx_types tx_type; +	enum hwtstamp_rx_filters rx_filter; +	int tx_type_valid = 0; +	int rx_filter_valid = 0; + +	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) +		return -EFAULT; + +	if (cfg.flags) /* reserved for future extensions */ +		return -EINVAL; + +	tx_type = cfg.tx_type; +	rx_filter = cfg.rx_filter; + +	switch (tx_type) { +	case HWTSTAMP_TX_OFF: +	case HWTSTAMP_TX_ON: +	case HWTSTAMP_TX_ONESTEP_SYNC: +		tx_type_valid = 1; +		break; +	} + +	switch (rx_filter) { +	case HWTSTAMP_FILTER_NONE: +	case HWTSTAMP_FILTER_ALL: +	case HWTSTAMP_FILTER_SOME: +	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: +	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: +	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: +	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: +	case 
HWTSTAMP_FILTER_PTP_V2_L4_SYNC: +	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: +	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: +	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: +	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: +	case HWTSTAMP_FILTER_PTP_V2_EVENT: +	case HWTSTAMP_FILTER_PTP_V2_SYNC: +	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: +		rx_filter_valid = 1; +		break; +	} + +	if (!tx_type_valid || !rx_filter_valid) +		return -ERANGE; + +	return 0; +} + +/* + *	Perform the SIOCxIFxxx calls, inside rtnl_lock() + */ +static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ +	int err; +	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); +	const struct net_device_ops *ops; + +	if (!dev) +		return -ENODEV; + +	ops = dev->netdev_ops; + +	switch (cmd) { +	case SIOCSIFFLAGS:	/* Set interface flags */ +		return dev_change_flags(dev, ifr->ifr_flags); + +	case SIOCSIFMETRIC:	/* Set the metric on the interface +				   (currently unused) */ +		return -EOPNOTSUPP; + +	case SIOCSIFMTU:	/* Set the MTU of a device */ +		return dev_set_mtu(dev, ifr->ifr_mtu); + +	case SIOCSIFHWADDR: +		return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + +	case SIOCSIFHWBROADCAST: +		if (ifr->ifr_hwaddr.sa_family != dev->type) +			return -EINVAL; +		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, +		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); +		return 0; + +	case SIOCSIFMAP: +		if (ops->ndo_set_config) { +			if (!netif_device_present(dev)) +				return -ENODEV; +			return ops->ndo_set_config(dev, &ifr->ifr_map); +		} +		return -EOPNOTSUPP; + +	case SIOCADDMULTI: +		if (!ops->ndo_set_rx_mode || +		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) +			return -EINVAL; +		if (!netif_device_present(dev)) +			return -ENODEV; +		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + +	case SIOCDELMULTI: +		if (!ops->ndo_set_rx_mode || +		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) +			return -EINVAL; +		if (!netif_device_present(dev)) +			return -ENODEV; +		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + +	case SIOCSIFTXQLEN: +		if (ifr->ifr_qlen < 0) +			return -EINVAL; +		dev->tx_queue_len = ifr->ifr_qlen; +		return 0; + +	case SIOCSIFNAME: +		ifr->ifr_newname[IFNAMSIZ-1] = '\0'; +		return dev_change_name(dev, ifr->ifr_newname); + +	case SIOCSHWTSTAMP: +		err = net_hwtstamp_validate(ifr); +		if (err) +			return err; +		/* fall through */ + +	/* +	 *	Unknown or private ioctl +	 */ +	default: +		if ((cmd >= SIOCDEVPRIVATE && +		    cmd <= SIOCDEVPRIVATE + 15) || +		    cmd == SIOCBONDENSLAVE || +		    cmd == SIOCBONDRELEASE || +		    cmd == SIOCBONDSETHWADDR || +		    cmd == SIOCBONDSLAVEINFOQUERY || +		    cmd == SIOCBONDINFOQUERY || +		    cmd == SIOCBONDCHANGEACTIVE || +		    cmd == SIOCGMIIPHY || +		    cmd == SIOCGMIIREG || +		    cmd == SIOCSMIIREG || +		    cmd == SIOCBRADDIF || +		    cmd == SIOCBRDELIF || +		    cmd == SIOCSHWTSTAMP || +		    cmd == SIOCWANDEV) { +			err = -EOPNOTSUPP; +			if (ops->ndo_do_ioctl) { +				if (netif_device_present(dev)) +					err = ops->ndo_do_ioctl(dev, ifr, cmd); +				else +					err = -ENODEV; +			} +		} else +			err = -EINVAL; + +	} +	return err; +} + +/** + *	dev_load 	- load a network module + *	@net: the applicable net namespace + *	@name: name of interface + * + *	If a network interface is not present and the process has suitable + *	privileges this function loads the module. If module loading is not + *	available in this kernel then it becomes a nop. 
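+ *
+ *	A driver opts in to this auto-loading by declaring a module
+ *	alias for the interface names it provides, matching the
+ *	"netdev-%s" request above, e.g. (hypothetical name):
+ *
+ *		MODULE_ALIAS("netdev-mydev0");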
+ */ + +void dev_load(struct net *net, const char *name) +{ +	struct net_device *dev; +	int no_module; + +	rcu_read_lock(); +	dev = dev_get_by_name_rcu(net, name); +	rcu_read_unlock(); + +	no_module = !dev; +	if (no_module && capable(CAP_NET_ADMIN)) +		no_module = request_module("netdev-%s", name); +	if (no_module && capable(CAP_SYS_MODULE)) { +		if (!request_module("%s", name)) +			pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n", +				name); +	} +} +EXPORT_SYMBOL(dev_load); + +/* + *	This function handles all "interface"-type I/O control requests. The actual + *	'doing' part of this is dev_ifsioc above. + */ + +/** + *	dev_ioctl	-	network device ioctl + *	@net: the applicable net namespace + *	@cmd: command to issue + *	@arg: pointer to a struct ifreq in user space + * + *	Issue ioctl functions to devices. This is normally called by the + *	user space syscall interfaces but can sometimes be useful for + *	other purposes. The return value is the return from the syscall if + *	positive or a negative errno code on error. + */ + +int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ +	struct ifreq ifr; +	int ret; +	char *colon; + +	/* One special case: SIOCGIFCONF takes ifconf argument +	   and requires shared lock, because it sleeps writing +	   to user space. +	 */ + +	if (cmd == SIOCGIFCONF) { +		rtnl_lock(); +		ret = dev_ifconf(net, (char __user *) arg); +		rtnl_unlock(); +		return ret; +	} +	if (cmd == SIOCGIFNAME) +		return dev_ifname(net, (struct ifreq __user *)arg); + +	if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) +		return -EFAULT; + +	ifr.ifr_name[IFNAMSIZ-1] = 0; + +	colon = strchr(ifr.ifr_name, ':'); +	if (colon) +		*colon = 0; + +	/* +	 *	See which interface the caller is talking about. +	 */ + +	switch (cmd) { +	/* +	 *	These ioctl calls: +	 *	- can be done by all. +	 *	- atomic and do not require locking. +	 *	- return a value +	 */ +	case SIOCGIFFLAGS: +	case SIOCGIFMETRIC: +	case SIOCGIFMTU: +	case SIOCGIFHWADDR: +	case SIOCGIFSLAVE: +	case SIOCGIFMAP: +	case SIOCGIFINDEX: +	case SIOCGIFTXQLEN: +		dev_load(net, ifr.ifr_name); +		rcu_read_lock(); +		ret = dev_ifsioc_locked(net, &ifr, cmd); +		rcu_read_unlock(); +		if (!ret) { +			if (colon) +				*colon = ':'; +			if (copy_to_user(arg, &ifr, +					 sizeof(struct ifreq))) +				ret = -EFAULT; +		} +		return ret; + +	case SIOCETHTOOL: +		dev_load(net, ifr.ifr_name); +		rtnl_lock(); +		ret = dev_ethtool(net, &ifr); +		rtnl_unlock(); +		if (!ret) { +			if (colon) +				*colon = ':'; +			if (copy_to_user(arg, &ifr, +					 sizeof(struct ifreq))) +				ret = -EFAULT; +		} +		return ret; + +	/* +	 *	These ioctl calls: +	 *	- require superuser power. +	 *	- require strict serialization. +	 *	- return a value +	 */ +	case SIOCGMIIPHY: +	case SIOCGMIIREG: +	case SIOCSIFNAME: +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) +			return -EPERM; +		dev_load(net, ifr.ifr_name); +		rtnl_lock(); +		ret = dev_ifsioc(net, &ifr, cmd); +		rtnl_unlock(); +		if (!ret) { +			if (colon) +				*colon = ':'; +			if (copy_to_user(arg, &ifr, +					 sizeof(struct ifreq))) +				ret = -EFAULT; +		} +		return ret; + +	/* +	 *	These ioctl calls: +	 *	- require superuser power. +	 *	- require strict serialization. +	 *	- do not return a value +	 */ +	case SIOCSIFMAP: +	case SIOCSIFTXQLEN: +		if (!capable(CAP_NET_ADMIN)) +			return -EPERM; +		/* fall through */ +	/* +	 *	These ioctl calls: +	 *	- require local superuser power. 
+	 *	- require strict serialization. +	 *	- do not return a value +	 */ +	case SIOCSIFFLAGS: +	case SIOCSIFMETRIC: +	case SIOCSIFMTU: +	case SIOCSIFHWADDR: +	case SIOCSIFSLAVE: +	case SIOCADDMULTI: +	case SIOCDELMULTI: +	case SIOCSIFHWBROADCAST: +	case SIOCSMIIREG: +	case SIOCBONDENSLAVE: +	case SIOCBONDRELEASE: +	case SIOCBONDSETHWADDR: +	case SIOCBONDCHANGEACTIVE: +	case SIOCBRADDIF: +	case SIOCBRDELIF: +	case SIOCSHWTSTAMP: +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) +			return -EPERM; +		/* fall through */ +	case SIOCBONDSLAVEINFOQUERY: +	case SIOCBONDINFOQUERY: +		dev_load(net, ifr.ifr_name); +		rtnl_lock(); +		ret = dev_ifsioc(net, &ifr, cmd); +		rtnl_unlock(); +		return ret; + +	case SIOCGIFMEM: +		/* Get the per device memory space. We can add this but +		 * currently do not support it */ +	case SIOCSIFMEM: +		/* Set the per device memory buffer space. +		 * Not applicable in our case */ +	case SIOCSIFLINK: +		return -ENOTTY; + +	/* +	 *	Unknown or private ioctl. +	 */ +	default: +		if (cmd == SIOCWANDEV || +		    (cmd >= SIOCDEVPRIVATE && +		     cmd <= SIOCDEVPRIVATE + 15)) { +			dev_load(net, ifr.ifr_name); +			rtnl_lock(); +			ret = dev_ifsioc(net, &ifr, cmd); +			rtnl_unlock(); +			if (!ret && copy_to_user(arg, &ifr, +						 sizeof(struct ifreq))) +				ret = -EFAULT; +			return ret; +		} +		/* Take care of Wireless Extensions */ +		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) +			return wext_handle_ioctl(net, &ifr, cmd, arg); +		return -ENOTTY; +	} +} diff --git a/net/core/dst.c b/net/core/dst.c index ee6153e2cf4..35fd12f1a69 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -179,6 +179,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,  	dst_init_metrics(dst, dst_default_metrics, true);  	dst->expires = 0UL;  	dst->path = dst; +	dst->from = NULL;  #ifdef CONFIG_XFRM  	dst->xfrm = NULL;  #endif diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4d64cc2e3fa..3e9b2c3e30f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -77,6 +77,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]  	[NETIF_F_TSO_ECN_BIT] =          "tx-tcp-ecn-segmentation",  	[NETIF_F_TSO6_BIT] =             "tx-tcp6-segmentation",  	[NETIF_F_FSO_BIT] =              "tx-fcoe-segmentation", +	[NETIF_F_GSO_GRE_BIT] =		 "tx-gre-segmentation",  	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc",  	[NETIF_F_SCTP_CSUM_BIT] =        "tx-checksum-sctp", @@ -175,7 +176,7 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)  	if (sset == ETH_SS_FEATURES)  		return ARRAY_SIZE(netdev_features_strings); -	if (ops && ops->get_sset_count && ops->get_strings) +	if (ops->get_sset_count && ops->get_strings)  		return ops->get_sset_count(dev, sset);  	else  		return -EOPNOTSUPP; @@ -311,7 +312,7 @@ int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)  {  	ASSERT_RTNL(); -	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) +	if (!dev->ethtool_ops->get_settings)  		return -EOPNOTSUPP;  	memset(cmd, 0, sizeof(struct ethtool_cmd)); @@ -355,7 +356,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,  	memset(&info, 0, sizeof(info));  	info.cmd = ETHTOOL_GDRVINFO; -	if (ops && ops->get_drvinfo) { +	if (ops->get_drvinfo) {  		ops->get_drvinfo(dev, &info);  	} else if (dev->dev.parent && dev->dev.parent->driver) {  		strlcpy(info.bus_info, dev_name(dev->dev.parent), @@ -370,7 +371,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,  	 * this method of 
obtaining string set info is deprecated;  	 * Use ETHTOOL_GSSET_INFO instead.  	 */ -	if (ops && ops->get_sset_count) { +	if (ops->get_sset_count) {  		int rc;  		rc = ops->get_sset_count(dev, ETH_SS_TEST); @@ -383,9 +384,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,  		if (rc >= 0)  			info.n_priv_flags = rc;  	} -	if (ops && ops->get_regs_len) +	if (ops->get_regs_len)  		info.regdump_len = ops->get_regs_len(dev); -	if (ops && ops->get_eeprom_len) +	if (ops->get_eeprom_len)  		info.eedump_len = ops->get_eeprom_len(dev);  	if (copy_to_user(useraddr, &info, sizeof(info))) @@ -590,13 +591,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,  	struct ethtool_rxnfc rx_rings;  	u32 user_size, dev_size, i;  	u32 *indir; +	const struct ethtool_ops *ops = dev->ethtool_ops;  	int ret; -	if (!dev->ethtool_ops->get_rxfh_indir_size || -	    !dev->ethtool_ops->set_rxfh_indir || -	    !dev->ethtool_ops->get_rxnfc) +	if (!ops->get_rxfh_indir_size || !ops->set_rxfh_indir || +	    !ops->get_rxnfc)  		return -EOPNOTSUPP; -	dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + +	dev_size = ops->get_rxfh_indir_size(dev);  	if (dev_size == 0)  		return -EOPNOTSUPP; @@ -613,7 +615,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,  		return -ENOMEM;  	rx_rings.cmd = ETHTOOL_GRXRINGS; -	ret = dev->ethtool_ops->get_rxnfc(dev, &rx_rings, NULL); +	ret = ops->get_rxnfc(dev, &rx_rings, NULL);  	if (ret)  		goto out; @@ -639,7 +641,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,  		}  	} -	ret = dev->ethtool_ops->set_rxfh_indir(dev, indir); +	ret = ops->set_rxfh_indir(dev, indir);  out:  	kfree(indir); @@ -1082,9 +1084,10 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)  {  	struct ethtool_value id;  	static bool busy; +	const struct ethtool_ops *ops = dev->ethtool_ops;  	int rc; -	if (!dev->ethtool_ops->set_phys_id) +	if (!ops->set_phys_id)  		return -EOPNOTSUPP;  	if (busy) @@ -1093,7 +1096,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)  	if (copy_from_user(&id, useraddr, sizeof(id)))  		return -EFAULT; -	rc = dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); +	rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE);  	if (rc < 0)  		return rc; @@ -1118,7 +1121,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)  			i = n;  			do {  				rtnl_lock(); -				rc = dev->ethtool_ops->set_phys_id(dev, +				rc = ops->set_phys_id(dev,  				    (i & 1) ? 
ETHTOOL_ID_OFF : ETHTOOL_ID_ON);  				rtnl_unlock();  				if (rc) @@ -1133,7 +1136,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)  	dev_put(dev);  	busy = false; -	(void)dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); +	(void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE);  	return rc;  } @@ -1275,7 +1278,7 @@ static int ethtool_get_dump_flag(struct net_device *dev,  	struct ethtool_dump dump;  	const struct ethtool_ops *ops = dev->ethtool_ops; -	if (!dev->ethtool_ops->get_dump_flag) +	if (!ops->get_dump_flag)  		return -EOPNOTSUPP;  	if (copy_from_user(&dump, useraddr, sizeof(dump))) @@ -1299,8 +1302,7 @@ static int ethtool_get_dump_data(struct net_device *dev,  	const struct ethtool_ops *ops = dev->ethtool_ops;  	void *data = NULL; -	if (!dev->ethtool_ops->get_dump_data || -		!dev->ethtool_ops->get_dump_flag) +	if (!ops->get_dump_data || !ops->get_dump_flag)  		return -EOPNOTSUPP;  	if (copy_from_user(&dump, useraddr, sizeof(dump))) @@ -1346,13 +1348,9 @@ static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr)  	info.cmd = ETHTOOL_GET_TS_INFO;  	if (phydev && phydev->drv && phydev->drv->ts_info) { -  		err = phydev->drv->ts_info(phydev, &info); - -	} else if (dev->ethtool_ops && dev->ethtool_ops->get_ts_info) { - +	} else if (ops->get_ts_info) {  		err = ops->get_ts_info(dev, &info); -  	} else {  		info.so_timestamping =  			SOF_TIMESTAMPING_RX_SOFTWARE | @@ -1460,7 +1458,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	case ETHTOOL_GEEE:  		break;  	default: -		if (!capable(CAP_NET_ADMIN)) +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  			return -EPERM;  	} diff --git a/net/core/filter.c b/net/core/filter.c index 3d92ebb7fbc..2e20b55a783 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -39,6 +39,7 @@  #include <linux/reciprocal_div.h>  #include <linux/ratelimit.h>  #include <linux/seccomp.h> +#include <linux/if_vlan.h>  /* No hurry in this branch   * @@ -341,6 +342,12 @@ load_b:  		case BPF_S_ANC_CPU:  			A = raw_smp_processor_id();  			continue; +		case BPF_S_ANC_VLAN_TAG: +			A = vlan_tx_tag_get(skb); +			continue; +		case BPF_S_ANC_VLAN_TAG_PRESENT: +			A = !!vlan_tx_tag_present(skb); +			continue;  		case BPF_S_ANC_NLATTR: {  			struct nlattr *nla; @@ -525,6 +532,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)  		[BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X,  	};  	int pc; +	bool anc_found;  	if (flen == 0 || flen > BPF_MAXINSNS)  		return -EINVAL; @@ -585,8 +593,10 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)  		case BPF_S_LD_W_ABS:  		case BPF_S_LD_H_ABS:  		case BPF_S_LD_B_ABS: +			anc_found = false;  #define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE:	\  				code = BPF_S_ANC_##CODE;	\ +				anc_found = true;		\  				break  			switch (ftest->k) {  			ANCILLARY(PROTOCOL); @@ -600,7 +610,13 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)  			ANCILLARY(RXHASH);  			ANCILLARY(CPU);  			ANCILLARY(ALU_XOR_X); +			ANCILLARY(VLAN_TAG); +			ANCILLARY(VLAN_TAG_PRESENT);  			} + +			/* ancillary operation unknown or unsupported */ +			if (anc_found == false && ftest->k >= SKF_AD_OFF) +				return -EINVAL;  		}  		ftest->code = code;  	} @@ -705,6 +721,9 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)  	unsigned int fsize = sizeof(struct sock_filter) * fprog->len;  	int err; +	if (sock_flag(sk, SOCK_FILTER_LOCKED)) +		return -EPERM; +  	/* Make sure new filter is there and in the right amounts. 
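 	 * (fprog carries an array of struct sock_filter; e.g., with the
 	 *  VLAN ancillary loads added above, a classic-BPF program that
 	 *  accepts only VLAN-tagged frames could look like -- a sketch:
 	 *
 	 *	{ BPF_LD|BPF_B|BPF_ABS, 0, 0,
 	 *	  SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT },
 	 *	{ BPF_JMP|BPF_JEQ|BPF_K, 0, 1, 1 },	accept if A == 1
 	 *	{ BPF_RET|BPF_K, 0, 0, 0xffffffff },
 	 *	{ BPF_RET|BPF_K, 0, 0, 0 },		otherwise drop
 	 * )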
*/  	if (fprog->filter == NULL)  		return -EINVAL; @@ -741,6 +760,9 @@ int sk_detach_filter(struct sock *sk)  	int ret = -ENOENT;  	struct sk_filter *filter; +	if (sock_flag(sk, SOCK_FILTER_LOCKED)) +		return -EPERM; +  	filter = rcu_dereference_protected(sk->sk_filter,  					   sock_owned_by_user(sk));  	if (filter) { @@ -751,3 +773,133 @@ int sk_detach_filter(struct sock *sk)  	return ret;  }  EXPORT_SYMBOL_GPL(sk_detach_filter); + +static void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) +{ +	static const u16 decodes[] = { +		[BPF_S_ALU_ADD_K]	= BPF_ALU|BPF_ADD|BPF_K, +		[BPF_S_ALU_ADD_X]	= BPF_ALU|BPF_ADD|BPF_X, +		[BPF_S_ALU_SUB_K]	= BPF_ALU|BPF_SUB|BPF_K, +		[BPF_S_ALU_SUB_X]	= BPF_ALU|BPF_SUB|BPF_X, +		[BPF_S_ALU_MUL_K]	= BPF_ALU|BPF_MUL|BPF_K, +		[BPF_S_ALU_MUL_X]	= BPF_ALU|BPF_MUL|BPF_X, +		[BPF_S_ALU_DIV_X]	= BPF_ALU|BPF_DIV|BPF_X, +		[BPF_S_ALU_MOD_K]	= BPF_ALU|BPF_MOD|BPF_K, +		[BPF_S_ALU_MOD_X]	= BPF_ALU|BPF_MOD|BPF_X, +		[BPF_S_ALU_AND_K]	= BPF_ALU|BPF_AND|BPF_K, +		[BPF_S_ALU_AND_X]	= BPF_ALU|BPF_AND|BPF_X, +		[BPF_S_ALU_OR_K]	= BPF_ALU|BPF_OR|BPF_K, +		[BPF_S_ALU_OR_X]	= BPF_ALU|BPF_OR|BPF_X, +		[BPF_S_ALU_XOR_K]	= BPF_ALU|BPF_XOR|BPF_K, +		[BPF_S_ALU_XOR_X]	= BPF_ALU|BPF_XOR|BPF_X, +		[BPF_S_ALU_LSH_K]	= BPF_ALU|BPF_LSH|BPF_K, +		[BPF_S_ALU_LSH_X]	= BPF_ALU|BPF_LSH|BPF_X, +		[BPF_S_ALU_RSH_K]	= BPF_ALU|BPF_RSH|BPF_K, +		[BPF_S_ALU_RSH_X]	= BPF_ALU|BPF_RSH|BPF_X, +		[BPF_S_ALU_NEG]		= BPF_ALU|BPF_NEG, +		[BPF_S_LD_W_ABS]	= BPF_LD|BPF_W|BPF_ABS, +		[BPF_S_LD_H_ABS]	= BPF_LD|BPF_H|BPF_ABS, +		[BPF_S_LD_B_ABS]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_PROTOCOL]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_PKTTYPE]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_IFINDEX]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_NLATTR]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_NLATTR_NEST]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_MARK]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_QUEUE]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_HATYPE]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_RXHASH]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_CPU]		= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_ALU_XOR_X]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_VLAN_TAG]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN, +		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND, +		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND, +		[BPF_S_LD_B_IND]	= BPF_LD|BPF_B|BPF_IND, +		[BPF_S_LD_IMM]		= BPF_LD|BPF_IMM, +		[BPF_S_LDX_W_LEN]	= BPF_LDX|BPF_W|BPF_LEN, +		[BPF_S_LDX_B_MSH]	= BPF_LDX|BPF_B|BPF_MSH, +		[BPF_S_LDX_IMM]		= BPF_LDX|BPF_IMM, +		[BPF_S_MISC_TAX]	= BPF_MISC|BPF_TAX, +		[BPF_S_MISC_TXA]	= BPF_MISC|BPF_TXA, +		[BPF_S_RET_K]		= BPF_RET|BPF_K, +		[BPF_S_RET_A]		= BPF_RET|BPF_A, +		[BPF_S_ALU_DIV_K]	= BPF_ALU|BPF_DIV|BPF_K, +		[BPF_S_LD_MEM]		= BPF_LD|BPF_MEM, +		[BPF_S_LDX_MEM]		= BPF_LDX|BPF_MEM, +		[BPF_S_ST]		= BPF_ST, +		[BPF_S_STX]		= BPF_STX, +		[BPF_S_JMP_JA]		= BPF_JMP|BPF_JA, +		[BPF_S_JMP_JEQ_K]	= BPF_JMP|BPF_JEQ|BPF_K, +		[BPF_S_JMP_JEQ_X]	= BPF_JMP|BPF_JEQ|BPF_X, +		[BPF_S_JMP_JGE_K]	= BPF_JMP|BPF_JGE|BPF_K, +		[BPF_S_JMP_JGE_X]	= BPF_JMP|BPF_JGE|BPF_X, +		[BPF_S_JMP_JGT_K]	= BPF_JMP|BPF_JGT|BPF_K, +		[BPF_S_JMP_JGT_X]	= BPF_JMP|BPF_JGT|BPF_X, +		[BPF_S_JMP_JSET_K]	= BPF_JMP|BPF_JSET|BPF_K, +		[BPF_S_JMP_JSET_X]	= BPF_JMP|BPF_JSET|BPF_X, +	}; +	u16 code; + +	code = filt->code; + +	to->code = decodes[code]; +	to->jt = filt->jt; +	to->jf = filt->jf; + +	if (code == BPF_S_ALU_DIV_K) { +		/* +		 * When loaded this rule user gave us X, which was +		 * 
translated into R = r(X). Now we calculate the +		 * RR = r(R) and report it back. If next time this +		 * value is loaded and RRR = r(RR) is calculated +		 * then the R == RRR will be true. +		 * +		 * One exception. X == 1 translates into R == 0 and +		 * we can't calculate RR out of it with r(). +		 */ + +		if (filt->k == 0) +			to->k = 1; +		else +			to->k = reciprocal_value(filt->k); + +		BUG_ON(reciprocal_value(to->k) != filt->k); +	} else +		to->k = filt->k; +} + +int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) +{ +	struct sk_filter *filter; +	int i, ret; + +	lock_sock(sk); +	filter = rcu_dereference_protected(sk->sk_filter, +			sock_owned_by_user(sk)); +	ret = 0; +	if (!filter) +		goto out; +	ret = filter->len; +	if (!len) +		goto out; +	ret = -EINVAL; +	if (len < filter->len) +		goto out; + +	ret = -EFAULT; +	for (i = 0; i < filter->len; i++) { +		struct sock_filter fb; + +		sk_decode_filter(&filter->insns[i], &fb); +		if (copy_to_user(&ubuf[i], &fb, sizeof(fb))) +			goto out; +	} + +	ret = filter->len; +out: +	release_sock(sk); +	return ret; +} diff --git a/net/core/flow.c b/net/core/flow.c index e318c7e9804..c56ea6f7f6c 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -132,14 +132,14 @@ static void __flow_cache_shrink(struct flow_cache *fc,  				int shrink_to)  {  	struct flow_cache_entry *fle; -	struct hlist_node *entry, *tmp; +	struct hlist_node *tmp;  	LIST_HEAD(gc_list);  	int i, deleted = 0;  	for (i = 0; i < flow_cache_hash_size(fc); i++) {  		int saved = 0; -		hlist_for_each_entry_safe(fle, entry, tmp, +		hlist_for_each_entry_safe(fle, tmp,  					  &fcp->hash_table[i], u.hlist) {  			if (saved < shrink_to &&  			    flow_entry_valid(fle)) { @@ -211,7 +211,6 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,  	struct flow_cache *fc = &flow_cache_global;  	struct flow_cache_percpu *fcp;  	struct flow_cache_entry *fle, *tfle; -	struct hlist_node *entry;  	struct flow_cache_object *flo;  	size_t keysize;  	unsigned int hash; @@ -235,7 +234,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,  		flow_new_hash_rnd(fc, fcp);  	hash = flow_hash_code(fc, fcp, key, keysize); -	hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) { +	hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {  		if (tfle->net == net &&  		    tfle->family == family &&  		    tfle->dir == dir && @@ -286,7 +285,7 @@ nocache:  		else  			fle->genid--;  	} else { -		if (flo && !IS_ERR(flo)) +		if (!IS_ERR_OR_NULL(flo))  			flo->ops->delete(flo);  	}  ret_object: @@ -301,13 +300,13 @@ static void flow_cache_flush_tasklet(unsigned long data)  	struct flow_cache *fc = info->cache;  	struct flow_cache_percpu *fcp;  	struct flow_cache_entry *fle; -	struct hlist_node *entry, *tmp; +	struct hlist_node *tmp;  	LIST_HEAD(gc_list);  	int i, deleted = 0;  	fcp = this_cpu_ptr(fc->percpu);  	for (i = 0; i < flow_cache_hash_size(fc); i++) { -		hlist_for_each_entry_safe(fle, entry, tmp, +		hlist_for_each_entry_safe(fle, tmp,  					  &fcp->hash_table[i], u.hlist) {  			if (flow_entry_valid(fle))  				continue; @@ -327,11 +326,9 @@ static void flow_cache_flush_tasklet(unsigned long data)  static void flow_cache_flush_per_cpu(void *data)  {  	struct flow_flush_info *info = data; -	int cpu;  	struct tasklet_struct *tasklet; -	cpu = smp_processor_id(); -	tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; +	tasklet = this_cpu_ptr(&info->cache->percpu->flush_tasklet);  	tasklet->data = 
(unsigned long)info;  	tasklet_schedule(tasklet);  } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 466820b6e34..9d4c7201400 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -143,3 +143,176 @@ ipv6:  	return true;  }  EXPORT_SYMBOL(skb_flow_dissect); + +static u32 hashrnd __read_mostly; + +/* + * __skb_get_rxhash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value + * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb + * if hash is a canonical 4-tuple hash over transport ports. + */ +void __skb_get_rxhash(struct sk_buff *skb) +{ +	struct flow_keys keys; +	u32 hash; + +	if (!skb_flow_dissect(skb, &keys)) +		return; + +	if (keys.ports) +		skb->l4_rxhash = 1; + +	/* get a consistent hash (same value on both flow directions) */ +	if (((__force u32)keys.dst < (__force u32)keys.src) || +	    (((__force u32)keys.dst == (__force u32)keys.src) && +	     ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { +		swap(keys.dst, keys.src); +		swap(keys.port16[0], keys.port16[1]); +	} + +	hash = jhash_3words((__force u32)keys.dst, +			    (__force u32)keys.src, +			    (__force u32)keys.ports, hashrnd); +	if (!hash) +		hash = 1; + +	skb->rxhash = hash; +} +EXPORT_SYMBOL(__skb_get_rxhash); + +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, +		  unsigned int num_tx_queues) +{ +	u32 hash; +	u16 qoffset = 0; +	u16 qcount = num_tx_queues; + +	if (skb_rx_queue_recorded(skb)) { +		hash = skb_get_rx_queue(skb); +		while (unlikely(hash >= num_tx_queues)) +			hash -= num_tx_queues; +		return hash; +	} + +	if (dev->num_tc) { +		u8 tc = netdev_get_prio_tc_map(dev, skb->priority); +		qoffset = dev->tc_to_txq[tc].offset; +		qcount = dev->tc_to_txq[tc].count; +	} + +	if (skb->sk && skb->sk->sk_hash) +		hash = skb->sk->sk_hash; +	else +		hash = (__force u16) skb->protocol; +	hash = jhash_1word(hash, hashrnd); + +	return (u16) (((u64) hash * qcount) >> 32) + qoffset; +} +EXPORT_SYMBOL(__skb_tx_hash); + +static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ +	if (unlikely(queue_index >= dev->real_num_tx_queues)) { +		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", +				     dev->name, queue_index, +				     dev->real_num_tx_queues); +		return 0; +	} +	return queue_index; +} + +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS +	struct xps_dev_maps *dev_maps; +	struct xps_map *map; +	int queue_index = -1; + +	rcu_read_lock(); +	dev_maps = rcu_dereference(dev->xps_maps); +	if (dev_maps) { +		map = rcu_dereference( +		    dev_maps->cpu_map[raw_smp_processor_id()]); +		if (map) { +			if (map->len == 1) +				queue_index = map->queues[0]; +			else { +				u32 hash; +				if (skb->sk && skb->sk->sk_hash) +					hash = skb->sk->sk_hash; +				else +					hash = (__force u16) skb->protocol ^ +					    skb->rxhash; +				hash = jhash_1word(hash, hashrnd); +				queue_index = map->queues[ +				    ((u64)hash * map->len) >> 32]; +			} +			if (unlikely(queue_index >= dev->real_num_tx_queues)) +				queue_index = -1; +		} +	} +	rcu_read_unlock(); + +	return queue_index; +#else +	return -1; +#endif +} + +u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +{ +	struct sock *sk = skb->sk; +	int queue_index = 
sk_tx_queue_get(sk); + +	if (queue_index < 0 || skb->ooo_okay || +	    queue_index >= dev->real_num_tx_queues) { +		int new_index = get_xps_queue(dev, skb); +		if (new_index < 0) +			new_index = skb_tx_hash(dev, skb); + +		if (queue_index != new_index && sk) { +			struct dst_entry *dst = +				    rcu_dereference_check(sk->sk_dst_cache, 1); + +			if (dst && skb_dst(skb) == dst) +				sk_tx_queue_set(sk, queue_index); + +		} + +		queue_index = new_index; +	} + +	return queue_index; +} +EXPORT_SYMBOL(__netdev_pick_tx); + +struct netdev_queue *netdev_pick_tx(struct net_device *dev, +				    struct sk_buff *skb) +{ +	int queue_index = 0; + +	if (dev->real_num_tx_queues != 1) { +		const struct net_device_ops *ops = dev->netdev_ops; +		if (ops->ndo_select_queue) +			queue_index = ops->ndo_select_queue(dev, skb); +		else +			queue_index = __netdev_pick_tx(dev, skb); +		queue_index = dev_cap_txqueue(dev, queue_index); +	} + +	skb_set_queue_mapping(skb, queue_index); +	return netdev_get_tx_queue(dev, queue_index); +} + +static int __init initialize_hashrnd(void) +{ +	get_random_bytes(&hashrnd, sizeof(hashrnd)); +	return 0; +} + +late_initcall_sync(initialize_hashrnd); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 22571488730..3863b8f639c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -290,15 +290,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device  			goto out_entries;  	} -	if (tbl->entry_size) -		n = kzalloc(tbl->entry_size, GFP_ATOMIC); -	else { -		int sz = sizeof(*n) + tbl->key_len; - -		sz = ALIGN(sz, NEIGH_PRIV_ALIGN); -		sz += dev->neigh_priv_len; -		n = kzalloc(sz, GFP_ATOMIC); -	} +	n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);  	if (!n)  		goto out_entries; @@ -778,6 +770,9 @@ static void neigh_periodic_work(struct work_struct *work)  	nht = rcu_dereference_protected(tbl->nht,  					lockdep_is_held(&tbl->lock)); +	if (atomic_read(&tbl->entries) < tbl->gc_thresh1) +		goto out; +  	/*  	 *	periodically recompute ReachableTime from random function  	 */ @@ -832,6 +827,7 @@ next_elt:  		nht = rcu_dereference_protected(tbl->nht,  						lockdep_is_held(&tbl->lock));  	} +out:  	/* Cycle through all hash buckets every base_reachable_time/2 ticks.  	 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2  	 * base_reachable_time. 
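Worth a brief aside before the remaining neighbour.c hunks: the queue-selection helpers consolidated into flow_dissector.c above (__skb_tx_hash(), get_xps_queue()) both map a 32-bit hash onto a queue range as ((u64)hash * qcount) >> 32 rather than hash % qcount, trading the division for a multiply while keeping the distribution uniform. A standalone sketch of the trick in plain userspace C (values are arbitrary and the helper name is made up):

	#include <stdint.h>
	#include <stdio.h>

	/* Map a 32-bit hash uniformly onto [qoffset, qoffset + qcount)
	 * without a division: treat hash as a fraction of 2^32 and
	 * scale it by qcount. */
	static uint16_t pick_queue(uint32_t hash, uint16_t qcount,
				   uint16_t qoffset)
	{
		return (uint16_t)(((uint64_t)hash * qcount) >> 32) + qoffset;
	}

	int main(void)
	{
		/* e.g. 8 tx queues starting at offset 0 */
		printf("queue %u\n", pick_queue(0xdeadbeefu, 8, 0));
		return 0;
	}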
@@ -1542,6 +1538,12 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl)  	if (!tbl->nht || !tbl->phash_buckets)  		panic("cannot allocate neighbour cache hashes"); +	if (!tbl->entry_size) +		tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + +					tbl->key_len, NEIGH_PRIV_ALIGN); +	else +		WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); +  	rwlock_init(&tbl->lock);  	INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);  	schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); @@ -1787,8 +1789,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)  	    nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes) ||  	    /* approximative value for deprecated QUEUE_LEN (in packets) */  	    nla_put_u32(skb, NDTPA_QUEUE_LEN, -			DIV_ROUND_UP(parms->queue_len_bytes, -				     SKB_TRUESIZE(ETH_FRAME_LEN))) || +			parms->queue_len_bytes / SKB_TRUESIZE(ETH_FRAME_LEN)) ||  	    nla_put_u32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen) ||  	    nla_put_u32(skb, NDTPA_APP_PROBES, parms->app_probes) ||  	    nla_put_u32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes) || @@ -2770,6 +2771,8 @@ EXPORT_SYMBOL(neigh_app_ns);  #endif /* CONFIG_ARPD */  #ifdef CONFIG_SYSCTL +static int zero; +static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);  static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,  			   size_t *lenp, loff_t *ppos) @@ -2777,9 +2780,13 @@ static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,  	int size, ret;  	ctl_table tmp = *ctl; +	tmp.extra1 = &zero; +	tmp.extra2 = &unres_qlen_max;  	tmp.data = &size; -	size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN)); -	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + +	size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); +	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); +  	if (write && !ret)  		*(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);  	return ret; @@ -2865,7 +2872,8 @@ static struct neigh_sysctl_table {  			.procname	= "unres_qlen_bytes",  			.maxlen		= sizeof(int),  			.mode		= 0644, -			.proc_handler	= proc_dointvec, +			.extra1		= &zero, +			.proc_handler   = proc_dointvec_minmax,  		},  		[NEIGH_VAR_PROXY_QLEN] = {  			.procname	= "proxy_qlen", @@ -2987,6 +2995,10 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,  		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;  	} +	/* Don't export sysctls to unprivileged users */ +	if (neigh_parms_net(p)->user_ns != &init_user_ns) +		t->neigh_vars[0].procname = NULL; +  	snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",  		p_name, dev_name_source);  	t->sysctl_header = diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c new file mode 100644 index 00000000000..3174f1998ee --- /dev/null +++ b/net/core/net-procfs.c @@ -0,0 +1,411 @@ +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/wext.h> + +#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) + +#define get_bucket(x) ((x) >> BUCKET_SPACE) +#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) +#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) + +extern struct list_head ptype_all __read_mostly; +extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; + +static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +{ +	struct net *net = seq_file_net(seq); +	struct net_device *dev; +	struct hlist_head *h; +	unsigned int count = 0, 
offset = get_offset(*pos); + +	h = &net->dev_name_head[get_bucket(*pos)]; +	hlist_for_each_entry_rcu(dev, h, name_hlist) { +		if (++count == offset) +			return dev; +	} + +	return NULL; +} + +static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) +{ +	struct net_device *dev; +	unsigned int bucket; + +	do { +		dev = dev_from_same_bucket(seq, pos); +		if (dev) +			return dev; + +		bucket = get_bucket(*pos) + 1; +		*pos = set_bucket_offset(bucket, 1); +	} while (bucket < NETDEV_HASHENTRIES); + +	return NULL; +} + +/* + *	This is invoked by the /proc filesystem handler to display a device + *	in detail. + */ +static void *dev_seq_start(struct seq_file *seq, loff_t *pos) +	__acquires(RCU) +{ +	rcu_read_lock(); +	if (!*pos) +		return SEQ_START_TOKEN; + +	if (get_bucket(*pos) >= NETDEV_HASHENTRIES) +		return NULL; + +	return dev_from_bucket(seq, pos); +} + +static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	++*pos; +	return dev_from_bucket(seq, pos); +} + +static void dev_seq_stop(struct seq_file *seq, void *v) +	__releases(RCU) +{ +	rcu_read_unlock(); +} + +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +{ +	struct rtnl_link_stats64 temp; +	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + +	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " +		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", +		   dev->name, stats->rx_bytes, stats->rx_packets, +		   stats->rx_errors, +		   stats->rx_dropped + stats->rx_missed_errors, +		   stats->rx_fifo_errors, +		   stats->rx_length_errors + stats->rx_over_errors + +		    stats->rx_crc_errors + stats->rx_frame_errors, +		   stats->rx_compressed, stats->multicast, +		   stats->tx_bytes, stats->tx_packets, +		   stats->tx_errors, stats->tx_dropped, +		   stats->tx_fifo_errors, stats->collisions, +		   stats->tx_carrier_errors + +		    stats->tx_aborted_errors + +		    stats->tx_window_errors + +		    stats->tx_heartbeat_errors, +		   stats->tx_compressed); +} + +/* + *	Called from the PROCfs module. 
This now uses the new arbitrary sized + *	/proc/net interface to create /proc/net/dev + */ +static int dev_seq_show(struct seq_file *seq, void *v) +{ +	if (v == SEQ_START_TOKEN) +		seq_puts(seq, "Inter-|   Receive                            " +			      "                    |  Transmit\n" +			      " face |bytes    packets errs drop fifo frame " +			      "compressed multicast|bytes    packets errs " +			      "drop fifo colls carrier compressed\n"); +	else +		dev_seq_printf_stats(seq, v); +	return 0; +} + +static struct softnet_data *softnet_get_online(loff_t *pos) +{ +	struct softnet_data *sd = NULL; + +	while (*pos < nr_cpu_ids) +		if (cpu_online(*pos)) { +			sd = &per_cpu(softnet_data, *pos); +			break; +		} else +			++*pos; +	return sd; +} + +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +{ +	return softnet_get_online(pos); +} + +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	++*pos; +	return softnet_get_online(pos); +} + +static void softnet_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int softnet_seq_show(struct seq_file *seq, void *v) +{ +	struct softnet_data *sd = v; + +	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", +		   sd->processed, sd->dropped, sd->time_squeeze, 0, +		   0, 0, 0, 0, /* was fastroute */ +		   sd->cpu_collision, sd->received_rps); +	return 0; +} + +static const struct seq_operations dev_seq_ops = { +	.start = dev_seq_start, +	.next  = dev_seq_next, +	.stop  = dev_seq_stop, +	.show  = dev_seq_show, +}; + +static int dev_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open_net(inode, file, &dev_seq_ops, +			    sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = dev_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release_net, +}; + +static const struct seq_operations softnet_seq_ops = { +	.start = softnet_seq_start, +	.next  = softnet_seq_next, +	.stop  = softnet_seq_stop, +	.show  = softnet_seq_show, +}; + +static int softnet_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &softnet_seq_ops); +} + +static const struct file_operations softnet_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = softnet_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release, +}; + +static void *ptype_get_idx(loff_t pos) +{ +	struct packet_type *pt = NULL; +	loff_t i = 0; +	int t; + +	list_for_each_entry_rcu(pt, &ptype_all, list) { +		if (i == pos) +			return pt; +		++i; +	} + +	for (t = 0; t < PTYPE_HASH_SIZE; t++) { +		list_for_each_entry_rcu(pt, &ptype_base[t], list) { +			if (i == pos) +				return pt; +			++i; +		} +	} +	return NULL; +} + +static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) +	__acquires(RCU) +{ +	rcu_read_lock(); +	return *pos ? 
ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	struct packet_type *pt; +	struct list_head *nxt; +	int hash; + +	++*pos; +	if (v == SEQ_START_TOKEN) +		return ptype_get_idx(0); + +	pt = v; +	nxt = pt->list.next; +	if (pt->type == htons(ETH_P_ALL)) { +		if (nxt != &ptype_all) +			goto found; +		hash = 0; +		nxt = ptype_base[0].next; +	} else +		hash = ntohs(pt->type) & PTYPE_HASH_MASK; + +	while (nxt == &ptype_base[hash]) { +		if (++hash >= PTYPE_HASH_SIZE) +			return NULL; +		nxt = ptype_base[hash].next; +	} +found: +	return list_entry(nxt, struct packet_type, list); +} + +static void ptype_seq_stop(struct seq_file *seq, void *v) +	__releases(RCU) +{ +	rcu_read_unlock(); +} + +static int ptype_seq_show(struct seq_file *seq, void *v) +{ +	struct packet_type *pt = v; + +	if (v == SEQ_START_TOKEN) +		seq_puts(seq, "Type Device      Function\n"); +	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { +		if (pt->type == htons(ETH_P_ALL)) +			seq_puts(seq, "ALL "); +		else +			seq_printf(seq, "%04x", ntohs(pt->type)); + +		seq_printf(seq, " %-8s %pF\n", +			   pt->dev ? pt->dev->name : "", pt->func); +	} + +	return 0; +} + +static const struct seq_operations ptype_seq_ops = { +	.start = ptype_seq_start, +	.next  = ptype_seq_next, +	.stop  = ptype_seq_stop, +	.show  = ptype_seq_show, +}; + +static int ptype_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open_net(inode, file, &ptype_seq_ops, +			sizeof(struct seq_net_private)); +} + +static const struct file_operations ptype_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = ptype_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release_net, +}; + + +static int __net_init dev_proc_net_init(struct net *net) +{ +	int rc = -ENOMEM; + +	if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops)) +		goto out; +	if (!proc_create("softnet_stat", S_IRUGO, net->proc_net, +			 &softnet_seq_fops)) +		goto out_dev; +	if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops)) +		goto out_softnet; + +	if (wext_proc_init(net)) +		goto out_ptype; +	rc = 0; +out: +	return rc; +out_ptype: +	remove_proc_entry("ptype", net->proc_net); +out_softnet: +	remove_proc_entry("softnet_stat", net->proc_net); +out_dev: +	remove_proc_entry("dev", net->proc_net); +	goto out; +} + +static void __net_exit dev_proc_net_exit(struct net *net) +{ +	wext_proc_exit(net); + +	remove_proc_entry("ptype", net->proc_net); +	remove_proc_entry("softnet_stat", net->proc_net); +	remove_proc_entry("dev", net->proc_net); +} + +static struct pernet_operations __net_initdata dev_proc_ops = { +	.init = dev_proc_net_init, +	.exit = dev_proc_net_exit, +}; + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ +	struct netdev_hw_addr *ha; +	struct net_device *dev = v; + +	if (v == SEQ_START_TOKEN) +		return 0; + +	netif_addr_lock_bh(dev); +	netdev_for_each_mc_addr(ha, dev) { +		int i; + +		seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, +			   dev->name, ha->refcount, ha->global_use); + +		for (i = 0; i < dev->addr_len; i++) +			seq_printf(seq, "%02x", ha->addr[i]); + +		seq_putc(seq, '\n'); +	} +	netif_addr_unlock_bh(dev); +	return 0; +} + +static const struct seq_operations dev_mc_seq_ops = { +	.start = dev_seq_start, +	.next  = dev_seq_next, +	.stop  = dev_seq_stop, +	.show  = dev_mc_seq_show, +}; + +static int dev_mc_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open_net(inode, file, &dev_mc_seq_ops, +			    
sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_mc_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = dev_mc_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release_net, +}; + +static int __net_init dev_mc_net_init(struct net *net) +{ +	if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops)) +		return -ENOMEM; +	return 0; +} + +static void __net_exit dev_mc_net_exit(struct net *net) +{ +	remove_proc_entry("dev_mcast", net->proc_net); +} + +static struct pernet_operations __net_initdata dev_mc_net_ops = { +	.init = dev_mc_net_init, +	.exit = dev_mc_net_exit, +}; + +int __init dev_proc_init(void) +{ +	int ret = register_pernet_subsys(&dev_proc_ops); +	if (!ret) +		return register_pernet_subsys(&dev_mc_net_ops); +	return ret; +} diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bcf02f608cb..7427ab5e27d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -18,11 +18,10 @@  #include <net/sock.h>  #include <net/net_namespace.h>  #include <linux/rtnetlink.h> -#include <linux/wireless.h>  #include <linux/vmalloc.h>  #include <linux/export.h>  #include <linux/jiffies.h> -#include <net/wext.h> +#include <linux/pm_runtime.h>  #include "net-sysfs.h" @@ -73,11 +72,12 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,  			    const char *buf, size_t len,  			    int (*set)(struct net_device *, unsigned long))  { -	struct net_device *net = to_net_dev(dev); +	struct net_device *netdev = to_net_dev(dev); +	struct net *net = dev_net(netdev);  	unsigned long new;  	int ret = -EINVAL; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	ret = kstrtoul(buf, 0, &new); @@ -87,8 +87,8 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,  	if (!rtnl_trylock())  		return restart_syscall(); -	if (dev_isalive(net)) { -		if ((ret = (*set)(net, new)) == 0) +	if (dev_isalive(netdev)) { +		if ((ret = (*set)(netdev, new)) == 0)  			ret = len;  	}  	rtnl_unlock(); @@ -127,6 +127,19 @@ static ssize_t show_broadcast(struct device *dev,  	return -EINVAL;  } +static int change_carrier(struct net_device *net, unsigned long new_carrier) +{ +	if (!netif_running(net)) +		return -EINVAL; +	return dev_change_carrier(net, (bool) new_carrier); +} + +static ssize_t store_carrier(struct device *dev, struct device_attribute *attr, +			 const char *buf, size_t len) +{ +	return netdev_store(dev, attr, buf, len, change_carrier); +} +  static ssize_t show_carrier(struct device *dev,  			    struct device_attribute *attr, char *buf)  { @@ -264,6 +277,9 @@ static ssize_t store_tx_queue_len(struct device *dev,  				  struct device_attribute *attr,  				  const char *buf, size_t len)  { +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; +  	return netdev_store(dev, attr, buf, len, change_tx_queue_len);  } @@ -271,10 +287,11 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,  			     const char *buf, size_t len)  {  	struct net_device *netdev = to_net_dev(dev); +	struct net *net = dev_net(netdev);  	size_t count = len;  	ssize_t ret; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	/* ignore trailing newline */ @@ -328,7 +345,7 @@ static struct device_attribute net_class_attributes[] = {  	__ATTR(link_mode, S_IRUGO, show_link_mode, NULL),  	__ATTR(address, S_IRUGO, show_address, NULL),  	__ATTR(broadcast, S_IRUGO, show_broadcast, NULL), -	__ATTR(carrier, S_IRUGO, 
show_carrier, NULL), +	__ATTR(carrier, S_IRUGO | S_IWUSR, show_carrier, store_carrier),  	__ATTR(speed, S_IRUGO, show_speed, NULL),  	__ATTR(duplex, S_IRUGO, show_duplex, NULL),  	__ATTR(dormant, S_IRUGO, show_dormant, NULL), @@ -429,6 +446,17 @@ static struct attribute_group netstat_group = {  	.name  = "statistics",  	.attrs  = netstat_attrs,  }; + +#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) +static struct attribute *wireless_attrs[] = { +	NULL +}; + +static struct attribute_group wireless_group = { +	.name = "wireless", +	.attrs = wireless_attrs, +}; +#endif  #endif /* CONFIG_SYSFS */  #ifdef CONFIG_RPS @@ -975,68 +1003,14 @@ static ssize_t show_xps_map(struct netdev_queue *queue,  	return len;  } -static DEFINE_MUTEX(xps_map_mutex); -#define xmap_dereference(P)		\ -	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) - -static void xps_queue_release(struct netdev_queue *queue) -{ -	struct net_device *dev = queue->dev; -	struct xps_dev_maps *dev_maps; -	struct xps_map *map; -	unsigned long index; -	int i, pos, nonempty = 0; - -	index = get_netdev_queue_index(queue); - -	mutex_lock(&xps_map_mutex); -	dev_maps = xmap_dereference(dev->xps_maps); - -	if (dev_maps) { -		for_each_possible_cpu(i) { -			map = xmap_dereference(dev_maps->cpu_map[i]); -			if (!map) -				continue; - -			for (pos = 0; pos < map->len; pos++) -				if (map->queues[pos] == index) -					break; - -			if (pos < map->len) { -				if (map->len > 1) -					map->queues[pos] = -					    map->queues[--map->len]; -				else { -					RCU_INIT_POINTER(dev_maps->cpu_map[i], -					    NULL); -					kfree_rcu(map, rcu); -					map = NULL; -				} -			} -			if (map) -				nonempty = 1; -		} - -		if (!nonempty) { -			RCU_INIT_POINTER(dev->xps_maps, NULL); -			kfree_rcu(dev_maps, rcu); -		} -	} -	mutex_unlock(&xps_map_mutex); -} -  static ssize_t store_xps_map(struct netdev_queue *queue,  		      struct netdev_queue_attribute *attribute,  		      const char *buf, size_t len)  {  	struct net_device *dev = queue->dev; -	cpumask_var_t mask; -	int err, i, cpu, pos, map_len, alloc_len, need_set;  	unsigned long index; -	struct xps_map *map, *new_map; -	struct xps_dev_maps *dev_maps, *new_dev_maps; -	int nonempty = 0; -	int numa_node_id = -2; +	cpumask_var_t mask; +	int err;  	if (!capable(CAP_NET_ADMIN))  		return -EPERM; @@ -1052,105 +1026,11 @@ static ssize_t store_xps_map(struct netdev_queue *queue,  		return err;  	} -	new_dev_maps = kzalloc(max_t(unsigned int, -	    XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL); -	if (!new_dev_maps) { -		free_cpumask_var(mask); -		return -ENOMEM; -	} - -	mutex_lock(&xps_map_mutex); - -	dev_maps = xmap_dereference(dev->xps_maps); - -	for_each_possible_cpu(cpu) { -		map = dev_maps ? -			xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; -		new_map = map; -		if (map) { -			for (pos = 0; pos < map->len; pos++) -				if (map->queues[pos] == index) -					break; -			map_len = map->len; -			alloc_len = map->alloc_len; -		} else -			pos = map_len = alloc_len = 0; - -		need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu); -#ifdef CONFIG_NUMA -		if (need_set) { -			if (numa_node_id == -2) -				numa_node_id = cpu_to_node(cpu); -			else if (numa_node_id != cpu_to_node(cpu)) -				numa_node_id = -1; -		} -#endif -		if (need_set && pos >= map_len) { -			/* Need to add queue to this CPU's map */ -			if (map_len >= alloc_len) { -				alloc_len = alloc_len ? 
-				    2 * alloc_len : XPS_MIN_MAP_ALLOC; -				new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), -						       GFP_KERNEL, -						       cpu_to_node(cpu)); -				if (!new_map) -					goto error; -				new_map->alloc_len = alloc_len; -				for (i = 0; i < map_len; i++) -					new_map->queues[i] = map->queues[i]; -				new_map->len = map_len; -			} -			new_map->queues[new_map->len++] = index; -		} else if (!need_set && pos < map_len) { -			/* Need to remove queue from this CPU's map */ -			if (map_len > 1) -				new_map->queues[pos] = -				    new_map->queues[--new_map->len]; -			else -				new_map = NULL; -		} -		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map); -	} - -	/* Cleanup old maps */ -	for_each_possible_cpu(cpu) { -		map = dev_maps ? -			xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; -		if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map) -			kfree_rcu(map, rcu); -		if (new_dev_maps->cpu_map[cpu]) -			nonempty = 1; -	} - -	if (nonempty) { -		rcu_assign_pointer(dev->xps_maps, new_dev_maps); -	} else { -		kfree(new_dev_maps); -		RCU_INIT_POINTER(dev->xps_maps, NULL); -	} - -	if (dev_maps) -		kfree_rcu(dev_maps, rcu); - -	netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? numa_node_id : -					    NUMA_NO_NODE); - -	mutex_unlock(&xps_map_mutex); +	err = netif_set_xps_queue(dev, mask, index);  	free_cpumask_var(mask); -	return len; - -error: -	mutex_unlock(&xps_map_mutex); -	if (new_dev_maps) -		for_each_possible_cpu(i) -			kfree(rcu_dereference_protected( -				new_dev_maps->cpu_map[i], -				1)); -	kfree(new_dev_maps); -	free_cpumask_var(mask); -	return -ENOMEM; +	return err ? : len;  }  static struct netdev_queue_attribute xps_cpus_attribute = @@ -1169,10 +1049,6 @@ static void netdev_queue_release(struct kobject *kobj)  {  	struct netdev_queue *queue = to_netdev_queue(kobj); -#ifdef CONFIG_XPS -	xps_queue_release(queue); -#endif -  	memset(kobj, 0, sizeof(*kobj));  	dev_put(queue->dev);  } @@ -1320,7 +1196,6 @@ struct kobj_ns_type_operations net_ns_type_operations = {  };  EXPORT_SYMBOL_GPL(net_ns_type_operations); -#ifdef CONFIG_HOTPLUG  static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)  {  	struct net_device *dev = to_net_dev(d); @@ -1339,7 +1214,6 @@ static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)  exit:  	return retval;  } -#endif  /*   *	netdev_release -- destroy and free a dead device. 
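For context: after the two XPS hunks above, store_xps_map() collapses to a parse-and-delegate shape — check CAP_NET_ADMIN, parse the CPU mask, call netif_set_xps_queue(), and report the written length on success via the `err ? : len` idiom. A minimal sketch of calling the same helper directly (the wrapper name example_pin_tx_queue0 is illustrative only; the netif_set_xps_queue(dev, mask, index) signature is taken from the call site in the hunk above):

	/*
	 * Sketch only: pin TX queue 0 of @dev to the CPUs in @mask using
	 * the core helper that store_xps_map() now delegates to.  The
	 * helper subsumes the map allocation, RCU publication and NUMA
	 * node bookkeeping that the removed open-coded path (and
	 * xps_queue_release()) used to do by hand under xps_map_mutex.
	 */
	static int example_pin_tx_queue0(struct net_device *dev,
					 struct cpumask *mask)
	{
		return netif_set_xps_queue(dev, mask, 0);
	}
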
@@ -1368,9 +1242,7 @@ static struct class net_class = {  #ifdef CONFIG_SYSFS  	.dev_attrs = net_class_attributes,  #endif /* CONFIG_SYSFS */ -#ifdef CONFIG_HOTPLUG  	.dev_uevent = netdev_uevent, -#endif  	.ns_type = &net_ns_type_operations,  	.namespace = net_namespace,  }; @@ -1386,6 +1258,8 @@ void netdev_unregister_kobject(struct net_device * net)  	remove_queue_kobjects(net); +	pm_runtime_set_memalloc_noio(dev, false); +  	device_del(dev);  } @@ -1409,6 +1283,15 @@ int netdev_register_kobject(struct net_device *net)  		groups++;  	*groups++ = &netstat_group; + +#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) +	if (net->ieee80211_ptr) +		*groups++ = &wireless_group; +#if IS_ENABLED(CONFIG_WIRELESS_EXT) +	else if (net->wireless_handlers) +		*groups++ = &wireless_group; +#endif +#endif  #endif /* CONFIG_SYSFS */  	error = device_add(dev); @@ -1421,6 +1304,8 @@ int netdev_register_kobject(struct net_device *net)  		return error;  	} +	pm_runtime_set_memalloc_noio(dev, true); +  	return error;  } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 42f1e1c7514..80e271d9e64 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -13,6 +13,7 @@  #include <linux/proc_fs.h>  #include <linux/file.h>  #include <linux/export.h> +#include <linux/user_namespace.h>  #include <net/net_namespace.h>  #include <net/netns/generic.h> @@ -145,7 +146,7 @@ static void ops_free_list(const struct pernet_operations *ops,  /*   * setup_net runs the initializers for the network namespace object.   */ -static __net_init int setup_net(struct net *net) +static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)  {  	/* Must be called with net_mutex held */  	const struct pernet_operations *ops, *saved_ops; @@ -155,6 +156,7 @@ static __net_init int setup_net(struct net *net)  	atomic_set(&net->count, 1);  	atomic_set(&net->passive, 1);  	net->dev_base_seq = 1; +	net->user_ns = user_ns;  #ifdef NETNS_REFCNT_DEBUG  	atomic_set(&net->use_count, 0); @@ -232,7 +234,8 @@ void net_drop_ns(void *p)  		net_free(ns);  } -struct net *copy_net_ns(unsigned long flags, struct net *old_net) +struct net *copy_net_ns(unsigned long flags, +			struct user_namespace *user_ns, struct net *old_net)  {  	struct net *net;  	int rv; @@ -243,8 +246,11 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)  	net = net_alloc();  	if (!net)  		return ERR_PTR(-ENOMEM); + +	get_user_ns(user_ns); +  	mutex_lock(&net_mutex); -	rv = setup_net(net); +	rv = setup_net(net, user_ns);  	if (rv == 0) {  		rtnl_lock();  		list_add_tail_rcu(&net->list, &net_namespace_list); @@ -252,6 +258,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)  	}  	mutex_unlock(&net_mutex);  	if (rv < 0) { +		put_user_ns(user_ns);  		net_drop_ns(net);  		return ERR_PTR(rv);  	} @@ -308,6 +315,7 @@ static void cleanup_net(struct work_struct *work)  	/* Finally it is safe to free my network namespace structure */  	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {  		list_del_init(&net->exit_list); +		put_user_ns(net->user_ns);  		net_drop_ns(net);  	}  } @@ -336,7 +344,7 @@ struct net *get_net_ns_by_fd(int fd)  	if (IS_ERR(file))  		return ERR_CAST(file); -	ei = PROC_I(file->f_dentry->d_inode); +	ei = PROC_I(file_inode(file));  	if (ei->ns_ops == &netns_operations)  		net = get_net(ei->ns);  	else @@ -347,13 +355,6 @@ struct net *get_net_ns_by_fd(int fd)  }  #else -struct net *copy_net_ns(unsigned long flags, struct net *old_net) -{ -	if (flags & CLONE_NEWNET) 
-		return ERR_PTR(-EINVAL); -	return old_net; -} -  struct net *get_net_ns_by_fd(int fd)  {  	return ERR_PTR(-EINVAL); @@ -380,6 +381,21 @@ struct net *get_net_ns_by_pid(pid_t pid)  }  EXPORT_SYMBOL_GPL(get_net_ns_by_pid); +static __net_init int net_ns_net_init(struct net *net) +{ +	return proc_alloc_inum(&net->proc_inum); +} + +static __net_exit void net_ns_net_exit(struct net *net) +{ +	proc_free_inum(net->proc_inum); +} + +static struct pernet_operations __net_initdata net_ns_ops = { +	.init = net_ns_net_init, +	.exit = net_ns_net_exit, +}; +  static int __init net_ns_init(void)  {  	struct net_generic *ng; @@ -402,7 +418,7 @@ static int __init net_ns_init(void)  	rcu_assign_pointer(init_net.gen, ng);  	mutex_lock(&net_mutex); -	if (setup_net(&init_net)) +	if (setup_net(&init_net, &init_user_ns))  		panic("Could not setup the initial network namespace");  	rtnl_lock(); @@ -411,6 +427,8 @@ static int __init net_ns_init(void)  	mutex_unlock(&net_mutex); +	register_pernet_subsys(&net_ns_ops); +  	return 0;  } @@ -629,16 +647,29 @@ static void netns_put(void *ns)  static int netns_install(struct nsproxy *nsproxy, void *ns)  { +	struct net *net = ns; + +	if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || +	    !nsown_capable(CAP_SYS_ADMIN)) +		return -EPERM; +  	put_net(nsproxy->net_ns); -	nsproxy->net_ns = get_net(ns); +	nsproxy->net_ns = get_net(net);  	return 0;  } +static unsigned int netns_inum(void *ns) +{ +	struct net *net = ns; +	return net->proc_inum; +} +  const struct proc_ns_operations netns_operations = {  	.name		= "net",  	.type		= CLONE_NEWNET,  	.get		= netns_get,  	.put		= netns_put,  	.install	= netns_install, +	.inum		= netns_inum,  };  #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 77a0388fc3b..fa32899006a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -29,6 +29,9 @@  #include <linux/if_vlan.h>  #include <net/tcp.h>  #include <net/udp.h> +#include <net/addrconf.h> +#include <net/ndisc.h> +#include <net/ip6_checksum.h>  #include <asm/unaligned.h>  #include <trace/events/napi.h> @@ -44,6 +47,8 @@ static struct sk_buff_head skb_pool;  static atomic_t trapped; +static struct srcu_struct netpoll_srcu; +  #define USEC_PER_POLL	50  #define NETPOLL_RX_ENABLED  1  #define NETPOLL_RX_DROP     2 @@ -55,7 +60,8 @@ static atomic_t trapped;  	 MAX_UDP_CHUNK)  static void zap_completion_queue(void); -static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo); +static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo); +static void netpoll_async_cleanup(struct work_struct *work);  static unsigned int carrier_timeout = 4;  module_param(carrier_timeout, uint, 0644); @@ -181,13 +187,13 @@ static void poll_napi(struct net_device *dev)  	}  } -static void service_arp_queue(struct netpoll_info *npi) +static void service_neigh_queue(struct netpoll_info *npi)  {  	if (npi) {  		struct sk_buff *skb; -		while ((skb = skb_dequeue(&npi->arp_tx))) -			netpoll_arp_reply(skb, npi); +		while ((skb = skb_dequeue(&npi->neigh_tx))) +			netpoll_neigh_reply(skb, npi);  	}  } @@ -196,35 +202,76 @@ static void netpoll_poll_dev(struct net_device *dev)  	const struct net_device_ops *ops;  	struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); -	if (!dev || !netif_running(dev)) +	/* Don't do any rx activity if the dev_lock mutex is held +	 * the dev_open/close paths use this to block netpoll activity +	 * while changing device state +	 */ +	if (!mutex_trylock(&ni->dev_lock)) +		return; + +	if (!netif_running(dev)) { +		
mutex_unlock(&ni->dev_lock);  		return; +	}  	ops = dev->netdev_ops; -	if (!ops->ndo_poll_controller) +	if (!ops->ndo_poll_controller) { +		mutex_unlock(&ni->dev_lock);  		return; +	}  	/* Process pending work on NIC */  	ops->ndo_poll_controller(dev);  	poll_napi(dev); +	mutex_unlock(&ni->dev_lock); +  	if (dev->flags & IFF_SLAVE) {  		if (ni) { -			struct net_device *bond_dev = dev->master; +			struct net_device *bond_dev;  			struct sk_buff *skb; -			struct netpoll_info *bond_ni = rcu_dereference_bh(bond_dev->npinfo); -			while ((skb = skb_dequeue(&ni->arp_tx))) { +			struct netpoll_info *bond_ni; + +			bond_dev = netdev_master_upper_dev_get_rcu(dev); +			bond_ni = rcu_dereference_bh(bond_dev->npinfo); +			while ((skb = skb_dequeue(&ni->neigh_tx))) {  				skb->dev = bond_dev; -				skb_queue_tail(&bond_ni->arp_tx, skb); +				skb_queue_tail(&bond_ni->neigh_tx, skb);  			}  		}  	} -	service_arp_queue(ni); +	service_neigh_queue(ni);  	zap_completion_queue();  } +int netpoll_rx_disable(struct net_device *dev) +{ +	struct netpoll_info *ni; +	int idx; +	might_sleep(); +	idx = srcu_read_lock(&netpoll_srcu); +	ni = srcu_dereference(dev->npinfo, &netpoll_srcu); +	if (ni) +		mutex_lock(&ni->dev_lock); +	srcu_read_unlock(&netpoll_srcu, idx); +	return 0; +} +EXPORT_SYMBOL(netpoll_rx_disable); + +void netpoll_rx_enable(struct net_device *dev) +{ +	struct netpoll_info *ni; +	rcu_read_lock(); +	ni = rcu_dereference(dev->npinfo); +	if (ni) +		mutex_unlock(&ni->dev_lock); +	rcu_read_unlock(); +} +EXPORT_SYMBOL(netpoll_rx_enable); +  static void refill_skbs(void)  {  	struct sk_buff *skb; @@ -381,9 +428,14 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)  	struct iphdr *iph;  	struct ethhdr *eth;  	static atomic_t ip_ident; +	struct ipv6hdr *ip6h;  	udp_len = len + sizeof(*udph); -	ip_len = udp_len + sizeof(*iph); +	if (np->ipv6) +		ip_len = udp_len + sizeof(*ip6h); +	else +		ip_len = udp_len + sizeof(*iph); +  	total_len = ip_len + LL_RESERVED_SPACE(np->dev);  	skb = find_skb(np, total_len + np->dev->needed_tailroom, @@ -400,34 +452,66 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)  	udph->source = htons(np->local_port);  	udph->dest = htons(np->remote_port);  	udph->len = htons(udp_len); -	udph->check = 0; -	udph->check = csum_tcpudp_magic(np->local_ip, -					np->remote_ip, -					udp_len, IPPROTO_UDP, -					csum_partial(udph, udp_len, 0)); -	if (udph->check == 0) -		udph->check = CSUM_MANGLED_0; -	skb_push(skb, sizeof(*iph)); -	skb_reset_network_header(skb); -	iph = ip_hdr(skb); +	if (np->ipv6) { +		udph->check = 0; +		udph->check = csum_ipv6_magic(&np->local_ip.in6, +					      &np->remote_ip.in6, +					      udp_len, IPPROTO_UDP, +					      csum_partial(udph, udp_len, 0)); +		if (udph->check == 0) +			udph->check = CSUM_MANGLED_0; + +		skb_push(skb, sizeof(*ip6h)); +		skb_reset_network_header(skb); +		ip6h = ipv6_hdr(skb); + +		/* ip6h->version = 6; ip6h->priority = 0; */ +		put_unaligned(0x60, (unsigned char *)ip6h); +		ip6h->flow_lbl[0] = 0; +		ip6h->flow_lbl[1] = 0; +		ip6h->flow_lbl[2] = 0; + +		ip6h->payload_len = htons(sizeof(struct udphdr) + len); +		ip6h->nexthdr = IPPROTO_UDP; +		ip6h->hop_limit = 32; +		ip6h->saddr = np->local_ip.in6; +		ip6h->daddr = np->remote_ip.in6; + +		eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); +		skb_reset_mac_header(skb); +		skb->protocol = eth->h_proto = htons(ETH_P_IPV6); +	} else { +		udph->check = 0; +		udph->check = csum_tcpudp_magic(np->local_ip.ip, +						np->remote_ip.ip, +						udp_len, IPPROTO_UDP, +					
	csum_partial(udph, udp_len, 0)); +		if (udph->check == 0) +			udph->check = CSUM_MANGLED_0; + +		skb_push(skb, sizeof(*iph)); +		skb_reset_network_header(skb); +		iph = ip_hdr(skb); + +		/* iph->version = 4; iph->ihl = 5; */ +		put_unaligned(0x45, (unsigned char *)iph); +		iph->tos      = 0; +		put_unaligned(htons(ip_len), &(iph->tot_len)); +		iph->id       = htons(atomic_inc_return(&ip_ident)); +		iph->frag_off = 0; +		iph->ttl      = 64; +		iph->protocol = IPPROTO_UDP; +		iph->check    = 0; +		put_unaligned(np->local_ip.ip, &(iph->saddr)); +		put_unaligned(np->remote_ip.ip, &(iph->daddr)); +		iph->check    = ip_fast_csum((unsigned char *)iph, iph->ihl); -	/* iph->version = 4; iph->ihl = 5; */ -	put_unaligned(0x45, (unsigned char *)iph); -	iph->tos      = 0; -	put_unaligned(htons(ip_len), &(iph->tot_len)); -	iph->id       = htons(atomic_inc_return(&ip_ident)); -	iph->frag_off = 0; -	iph->ttl      = 64; -	iph->protocol = IPPROTO_UDP; -	iph->check    = 0; -	put_unaligned(np->local_ip, &(iph->saddr)); -	put_unaligned(np->remote_ip, &(iph->daddr)); -	iph->check    = ip_fast_csum((unsigned char *)iph, iph->ihl); +		eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); +		skb_reset_mac_header(skb); +		skb->protocol = eth->h_proto = htons(ETH_P_IP); +	} -	eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); -	skb_reset_mac_header(skb); -	skb->protocol = eth->h_proto = htons(ETH_P_IP);  	memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN);  	memcpy(eth->h_dest, np->remote_mac, ETH_ALEN); @@ -437,18 +521,16 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)  }  EXPORT_SYMBOL(netpoll_send_udp); -static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo) +static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo)  { -	struct arphdr *arp; -	unsigned char *arp_ptr; -	int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; +	int size, type = ARPOP_REPLY;  	__be32 sip, tip;  	unsigned char *sha;  	struct sk_buff *send_skb;  	struct netpoll *np, *tmp;  	unsigned long flags;  	int hlen, tlen; -	int hits = 0; +	int hits = 0, proto;  	if (list_empty(&npinfo->rx_np))  		return; @@ -466,94 +548,214 @@ static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo)  	if (!hits)  		return; -	/* No arp on this interface */ -	if (skb->dev->flags & IFF_NOARP) -		return; +	proto = ntohs(eth_hdr(skb)->h_proto); +	if (proto == ETH_P_IP) { +		struct arphdr *arp; +		unsigned char *arp_ptr; +		/* No arp on this interface */ +		if (skb->dev->flags & IFF_NOARP) +			return; -	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) -		return; +		if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) +			return; -	skb_reset_network_header(skb); -	skb_reset_transport_header(skb); -	arp = arp_hdr(skb); +		skb_reset_network_header(skb); +		skb_reset_transport_header(skb); +		arp = arp_hdr(skb); -	if ((arp->ar_hrd != htons(ARPHRD_ETHER) && -	     arp->ar_hrd != htons(ARPHRD_IEEE802)) || -	    arp->ar_pro != htons(ETH_P_IP) || -	    arp->ar_op != htons(ARPOP_REQUEST)) -		return; +		if ((arp->ar_hrd != htons(ARPHRD_ETHER) && +		     arp->ar_hrd != htons(ARPHRD_IEEE802)) || +		    arp->ar_pro != htons(ETH_P_IP) || +		    arp->ar_op != htons(ARPOP_REQUEST)) +			return; -	arp_ptr = (unsigned char *)(arp+1); -	/* save the location of the src hw addr */ -	sha = arp_ptr; -	arp_ptr += skb->dev->addr_len; -	memcpy(&sip, arp_ptr, 4); -	arp_ptr += 4; -	/* If we actually cared about dst hw addr, -	   it would get copied here */ -	arp_ptr += skb->dev->addr_len; -	memcpy(&tip, arp_ptr, 4); +		
arp_ptr = (unsigned char *)(arp+1); +		/* save the location of the src hw addr */ +		sha = arp_ptr; +		arp_ptr += skb->dev->addr_len; +		memcpy(&sip, arp_ptr, 4); +		arp_ptr += 4; +		/* If we actually cared about dst hw addr, +		   it would get copied here */ +		arp_ptr += skb->dev->addr_len; +		memcpy(&tip, arp_ptr, 4); -	/* Should we ignore arp? */ -	if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) -		return; +		/* Should we ignore arp? */ +		if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) +			return; -	size = arp_hdr_len(skb->dev); +		size = arp_hdr_len(skb->dev); -	spin_lock_irqsave(&npinfo->rx_lock, flags); -	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { -		if (tip != np->local_ip) -			continue; +		spin_lock_irqsave(&npinfo->rx_lock, flags); +		list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { +			if (tip != np->local_ip.ip) +				continue; -		hlen = LL_RESERVED_SPACE(np->dev); -		tlen = np->dev->needed_tailroom; -		send_skb = find_skb(np, size + hlen + tlen, hlen); -		if (!send_skb) -			continue; +			hlen = LL_RESERVED_SPACE(np->dev); +			tlen = np->dev->needed_tailroom; +			send_skb = find_skb(np, size + hlen + tlen, hlen); +			if (!send_skb) +				continue; -		skb_reset_network_header(send_skb); -		arp = (struct arphdr *) skb_put(send_skb, size); -		send_skb->dev = skb->dev; -		send_skb->protocol = htons(ETH_P_ARP); +			skb_reset_network_header(send_skb); +			arp = (struct arphdr *) skb_put(send_skb, size); +			send_skb->dev = skb->dev; +			send_skb->protocol = htons(ETH_P_ARP); -		/* Fill the device header for the ARP frame */ -		if (dev_hard_header(send_skb, skb->dev, ptype, -				    sha, np->dev->dev_addr, -				    send_skb->len) < 0) { -			kfree_skb(send_skb); -			continue; +			/* Fill the device header for the ARP frame */ +			if (dev_hard_header(send_skb, skb->dev, ETH_P_ARP, +					    sha, np->dev->dev_addr, +					    send_skb->len) < 0) { +				kfree_skb(send_skb); +				continue; +			} + +			/* +			 * Fill out the arp protocol part. +			 * +			 * we only support ethernet device type, +			 * which (according to RFC 1390) should +			 * always equal 1 (Ethernet). +			 */ + +			arp->ar_hrd = htons(np->dev->type); +			arp->ar_pro = htons(ETH_P_IP); +			arp->ar_hln = np->dev->addr_len; +			arp->ar_pln = 4; +			arp->ar_op = htons(type); + +			arp_ptr = (unsigned char *)(arp + 1); +			memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); +			arp_ptr += np->dev->addr_len; +			memcpy(arp_ptr, &tip, 4); +			arp_ptr += 4; +			memcpy(arp_ptr, sha, np->dev->addr_len); +			arp_ptr += np->dev->addr_len; +			memcpy(arp_ptr, &sip, 4); + +			netpoll_send_skb(np, send_skb); + +			/* If there are several rx_hooks for the same address, +			   we're fine by sending a single reply */ +			break;  		} +		spin_unlock_irqrestore(&npinfo->rx_lock, flags); +	} else if( proto == ETH_P_IPV6) { +#if IS_ENABLED(CONFIG_IPV6) +		struct nd_msg *msg; +		u8 *lladdr = NULL; +		struct ipv6hdr *hdr; +		struct icmp6hdr *icmp6h; +		const struct in6_addr *saddr; +		const struct in6_addr *daddr; +		struct inet6_dev *in6_dev = NULL; +		struct in6_addr *target; -		/* -		 * Fill out the arp protocol part. -		 * -		 * we only support ethernet device type, -		 * which (according to RFC 1390) should -		 * always equal 1 (Ethernet). 
-		 */ +		in6_dev = in6_dev_get(skb->dev); +		if (!in6_dev || !in6_dev->cnf.accept_ra) +			return; -		arp->ar_hrd = htons(np->dev->type); -		arp->ar_pro = htons(ETH_P_IP); -		arp->ar_hln = np->dev->addr_len; -		arp->ar_pln = 4; -		arp->ar_op = htons(type); +		if (!pskb_may_pull(skb, skb->len)) +			return; -		arp_ptr = (unsigned char *)(arp + 1); -		memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); -		arp_ptr += np->dev->addr_len; -		memcpy(arp_ptr, &tip, 4); -		arp_ptr += 4; -		memcpy(arp_ptr, sha, np->dev->addr_len); -		arp_ptr += np->dev->addr_len; -		memcpy(arp_ptr, &sip, 4); +		msg = (struct nd_msg *)skb_transport_header(skb); + +		__skb_push(skb, skb->data - skb_transport_header(skb)); + +		if (ipv6_hdr(skb)->hop_limit != 255) +			return; +		if (msg->icmph.icmp6_code != 0) +			return; +		if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) +			return; -		netpoll_send_skb(np, send_skb); +		saddr = &ipv6_hdr(skb)->saddr; +		daddr = &ipv6_hdr(skb)->daddr; -		/* If there are several rx_hooks for the same address, -		   we're fine by sending a single reply */ -		break; +		size = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + +		spin_lock_irqsave(&npinfo->rx_lock, flags); +		list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { +			if (!ipv6_addr_equal(daddr, &np->local_ip.in6)) +				continue; + +			hlen = LL_RESERVED_SPACE(np->dev); +			tlen = np->dev->needed_tailroom; +			send_skb = find_skb(np, size + hlen + tlen, hlen); +			if (!send_skb) +				continue; + +			send_skb->protocol = htons(ETH_P_IPV6); +			send_skb->dev = skb->dev; + +			skb_reset_network_header(send_skb); +			skb_put(send_skb, sizeof(struct ipv6hdr)); +			hdr = ipv6_hdr(send_skb); + +			*(__be32*)hdr = htonl(0x60000000); + +			hdr->payload_len = htons(size); +			hdr->nexthdr = IPPROTO_ICMPV6; +			hdr->hop_limit = 255; +			hdr->saddr = *saddr; +			hdr->daddr = *daddr; + +			send_skb->transport_header = send_skb->tail; +			skb_put(send_skb, size); + +			icmp6h = (struct icmp6hdr *)skb_transport_header(skb); +			icmp6h->icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; +			icmp6h->icmp6_router = 0; +			icmp6h->icmp6_solicited = 1; +			target = (struct in6_addr *)(skb_transport_header(send_skb) + sizeof(struct icmp6hdr)); +			*target = msg->target; +			icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, size, +							      IPPROTO_ICMPV6, +							      csum_partial(icmp6h, +									   size, 0)); + +			if (dev_hard_header(send_skb, skb->dev, ETH_P_IPV6, +					    lladdr, np->dev->dev_addr, +					    send_skb->len) < 0) { +				kfree_skb(send_skb); +				continue; +			} + +			netpoll_send_skb(np, send_skb); + +			/* If there are several rx_hooks for the same address, +			   we're fine by sending a single reply */ +			break; +		} +		spin_unlock_irqrestore(&npinfo->rx_lock, flags); +#endif  	} -	spin_unlock_irqrestore(&npinfo->rx_lock, flags); +} + +static bool pkt_is_ns(struct sk_buff *skb) +{ +	struct nd_msg *msg; +	struct ipv6hdr *hdr; + +	if (skb->protocol != htons(ETH_P_ARP)) +		return false; +	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg))) +		return false; + +	msg = (struct nd_msg *)skb_transport_header(skb); +	__skb_push(skb, skb->data - skb_transport_header(skb)); +	hdr = ipv6_hdr(skb); + +	if (hdr->nexthdr != IPPROTO_ICMPV6) +		return false; +	if (hdr->hop_limit != 255) +		return false; +	if (msg->icmph.icmp6_code != 0) +		return false; +	if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) +		return false; + +	return true;  }  int __netpoll_rx(struct sk_buff *skb, struct 
netpoll_info *npinfo) @@ -571,9 +773,11 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)  		goto out;  	/* check if netpoll clients need ARP */ -	if (skb->protocol == htons(ETH_P_ARP) && -	    atomic_read(&trapped)) { -		skb_queue_tail(&npinfo->arp_tx, skb); +	if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) { +		skb_queue_tail(&npinfo->neigh_tx, skb); +		return 1; +	} else if (pkt_is_ns(skb) && atomic_read(&trapped)) { +		skb_queue_tail(&npinfo->neigh_tx, skb);  		return 1;  	} @@ -584,60 +788,100 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)  	}  	proto = ntohs(eth_hdr(skb)->h_proto); -	if (proto != ETH_P_IP) +	if (proto != ETH_P_IP && proto != ETH_P_IPV6)  		goto out;  	if (skb->pkt_type == PACKET_OTHERHOST)  		goto out;  	if (skb_shared(skb))  		goto out; -	if (!pskb_may_pull(skb, sizeof(struct iphdr))) -		goto out; -	iph = (struct iphdr *)skb->data; -	if (iph->ihl < 5 || iph->version != 4) -		goto out; -	if (!pskb_may_pull(skb, iph->ihl*4)) -		goto out; -	iph = (struct iphdr *)skb->data; -	if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) -		goto out; +	if (proto == ETH_P_IP) { +		if (!pskb_may_pull(skb, sizeof(struct iphdr))) +			goto out; +		iph = (struct iphdr *)skb->data; +		if (iph->ihl < 5 || iph->version != 4) +			goto out; +		if (!pskb_may_pull(skb, iph->ihl*4)) +			goto out; +		iph = (struct iphdr *)skb->data; +		if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) +			goto out; -	len = ntohs(iph->tot_len); -	if (skb->len < len || len < iph->ihl*4) -		goto out; +		len = ntohs(iph->tot_len); +		if (skb->len < len || len < iph->ihl*4) +			goto out; -	/* -	 * Our transport medium may have padded the buffer out. -	 * Now We trim to the true length of the frame. -	 */ -	if (pskb_trim_rcsum(skb, len)) -		goto out; +		/* +		 * Our transport medium may have padded the buffer out. +		 * Now We trim to the true length of the frame. 
+		 */ +		if (pskb_trim_rcsum(skb, len)) +			goto out; -	iph = (struct iphdr *)skb->data; -	if (iph->protocol != IPPROTO_UDP) -		goto out; +		iph = (struct iphdr *)skb->data; +		if (iph->protocol != IPPROTO_UDP) +			goto out; -	len -= iph->ihl*4; -	uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); -	ulen = ntohs(uh->len); +		len -= iph->ihl*4; +		uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); +		ulen = ntohs(uh->len); -	if (ulen != len) -		goto out; -	if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) -		goto out; +		if (ulen != len) +			goto out; +		if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) +			goto out; +		list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { +			if (np->local_ip.ip && np->local_ip.ip != iph->daddr) +				continue; +			if (np->remote_ip.ip && np->remote_ip.ip != iph->saddr) +				continue; +			if (np->local_port && np->local_port != ntohs(uh->dest)) +				continue; -	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { -		if (np->local_ip && np->local_ip != iph->daddr) -			continue; -		if (np->remote_ip && np->remote_ip != iph->saddr) -			continue; -		if (np->local_port && np->local_port != ntohs(uh->dest)) -			continue; +			np->rx_hook(np, ntohs(uh->source), +				       (char *)(uh+1), +				       ulen - sizeof(struct udphdr)); +			hits++; +		} +	} else { +#if IS_ENABLED(CONFIG_IPV6) +		const struct ipv6hdr *ip6h; -		np->rx_hook(np, ntohs(uh->source), -			       (char *)(uh+1), -			       ulen - sizeof(struct udphdr)); -		hits++; +		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) +			goto out; +		ip6h = (struct ipv6hdr *)skb->data; +		if (ip6h->version != 6) +			goto out; +		len = ntohs(ip6h->payload_len); +		if (!len) +			goto out; +		if (len + sizeof(struct ipv6hdr) > skb->len) +			goto out; +		if (pskb_trim_rcsum(skb, len + sizeof(struct ipv6hdr))) +			goto out; +		ip6h = ipv6_hdr(skb); +		if (!pskb_may_pull(skb, sizeof(struct udphdr))) +			goto out; +		uh = udp_hdr(skb); +		ulen = ntohs(uh->len); +		if (ulen != skb->len) +			goto out; +		if (udp6_csum_init(skb, uh, IPPROTO_UDP)) +			goto out; +		list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { +			if (!ipv6_addr_equal(&np->local_ip.in6, &ip6h->daddr)) +				continue; +			if (!ipv6_addr_equal(&np->remote_ip.in6, &ip6h->saddr)) +				continue; +			if (np->local_port && np->local_port != ntohs(uh->dest)) +				continue; + +			np->rx_hook(np, ntohs(uh->source), +				       (char *)(uh+1), +				       ulen - sizeof(struct udphdr)); +			hits++; +		} +#endif  	}  	if (!hits) @@ -658,23 +902,51 @@ out:  void netpoll_print_options(struct netpoll *np)  {  	np_info(np, "local port %d\n", np->local_port); -	np_info(np, "local IP %pI4\n", &np->local_ip); +	if (np->ipv6) +		np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); +	else +		np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);  	np_info(np, "interface '%s'\n", np->dev_name);  	np_info(np, "remote port %d\n", np->remote_port); -	np_info(np, "remote IP %pI4\n", &np->remote_ip); +	if (np->ipv6) +		np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); +	else +		np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip);  	np_info(np, "remote ethernet address %pM\n", np->remote_mac);  }  EXPORT_SYMBOL(netpoll_print_options); +static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) +{ +	const char *end; + +	if (!strchr(str, ':') && +	    in4_pton(str, -1, (void *)addr, -1, &end) > 0) { +		if (!*end) +			return 0; +	} +	if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { +#if 
IS_ENABLED(CONFIG_IPV6) +		if (!*end) +			return 1; +#else +		return -1; +#endif +	} +	return -1; +} +  int netpoll_parse_options(struct netpoll *np, char *opt)  {  	char *cur=opt, *delim; +	int ipv6;  	if (*cur != '@') {  		if ((delim = strchr(cur, '@')) == NULL)  			goto parse_failed;  		*delim = 0; -		np->local_port = simple_strtol(cur, NULL, 10); +		if (kstrtou16(cur, 10, &np->local_port)) +			goto parse_failed;  		cur = delim;  	}  	cur++; @@ -683,7 +955,11 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  		if ((delim = strchr(cur, '/')) == NULL)  			goto parse_failed;  		*delim = 0; -		np->local_ip = in_aton(cur); +		ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); +		if (ipv6 < 0) +			goto parse_failed; +		else +			np->ipv6 = (bool)ipv6;  		cur = delim;  	}  	cur++; @@ -705,7 +981,8 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  		*delim = 0;  		if (*cur == ' ' || *cur == '\t')  			np_info(np, "warning: whitespace is not allowed\n"); -		np->remote_port = simple_strtol(cur, NULL, 10); +		if (kstrtou16(cur, 10, &np->remote_port)) +			goto parse_failed;  		cur = delim;  	}  	cur++; @@ -714,7 +991,13 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  	if ((delim = strchr(cur, '/')) == NULL)  		goto parse_failed;  	*delim = 0; -	np->remote_ip = in_aton(cur); +	ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); +	if (ipv6 < 0) +		goto parse_failed; +	else if (np->ipv6 != (bool)ipv6) +		goto parse_failed; +	else +		np->ipv6 = (bool)ipv6;  	cur = delim + 1;  	if (*cur != 0) { @@ -742,6 +1025,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)  	np->dev = ndev;  	strlcpy(np->dev_name, ndev->name, IFNAMSIZ); +	INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);  	if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||  	    !ndev->netdev_ops->ndo_poll_controller) { @@ -762,7 +1046,8 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)  		INIT_LIST_HEAD(&npinfo->rx_np);  		spin_lock_init(&npinfo->rx_lock); -		skb_queue_head_init(&npinfo->arp_tx); +		mutex_init(&npinfo->dev_lock); +		skb_queue_head_init(&npinfo->neigh_tx);  		skb_queue_head_init(&npinfo->txq);  		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); @@ -775,7 +1060,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)  				goto free_npinfo;  		}  	} else { -		npinfo = ndev->npinfo; +		npinfo = rtnl_dereference(ndev->npinfo);  		atomic_inc(&npinfo->refcnt);  	} @@ -806,14 +1091,19 @@ int netpoll_setup(struct netpoll *np)  	struct in_device *in_dev;  	int err; -	if (np->dev_name) -		ndev = dev_get_by_name(&init_net, np->dev_name); +	rtnl_lock(); +	if (np->dev_name) { +		struct net *net = current->nsproxy->net_ns; +		ndev = __dev_get_by_name(net, np->dev_name); +	}  	if (!ndev) {  		np_err(np, "%s doesn't exist, aborting\n", np->dev_name); -		return -ENODEV; +		err = -ENODEV; +		goto unlock;  	} +	dev_hold(ndev); -	if (ndev->master) { +	if (netdev_master_upper_dev_get(ndev)) {  		np_err(np, "%s is a slave device, aborting\n", np->dev_name);  		err = -EBUSY;  		goto put; @@ -824,15 +1114,14 @@ int netpoll_setup(struct netpoll *np)  		np_info(np, "device %s not up yet, forcing it\n", np->dev_name); -		rtnl_lock();  		err = dev_open(ndev); -		rtnl_unlock();  		if (err) {  			np_err(np, "failed to open %s\n", ndev->name);  			goto put;  		} +		rtnl_unlock();  		atleast = jiffies + HZ/10;  		atmost = jiffies + carrier_timeout * HZ;  		while (!netif_carrier_ok(ndev)) { @@ -852,39 +1141,70 @@ int netpoll_setup(struct netpoll *np) 
 			np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n");  			msleep(4000);  		} +		rtnl_lock();  	} -	if (!np->local_ip) { -		rcu_read_lock(); -		in_dev = __in_dev_get_rcu(ndev); +	if (!np->local_ip.ip) { +		if (!np->ipv6) { +			in_dev = __in_dev_get_rtnl(ndev); + +			if (!in_dev || !in_dev->ifa_list) { +				np_err(np, "no IP address for %s, aborting\n", +				       np->dev_name); +				err = -EDESTADDRREQ; +				goto put; +			} + +			np->local_ip.ip = in_dev->ifa_list->ifa_local; +			np_info(np, "local IP %pI4\n", &np->local_ip.ip); +		} else { +#if IS_ENABLED(CONFIG_IPV6) +			struct inet6_dev *idev; -		if (!in_dev || !in_dev->ifa_list) { -			rcu_read_unlock(); -			np_err(np, "no IP address for %s, aborting\n", -			       np->dev_name);  			err = -EDESTADDRREQ; +			idev = __in6_dev_get(ndev); +			if (idev) { +				struct inet6_ifaddr *ifp; + +				read_lock_bh(&idev->lock); +				list_for_each_entry(ifp, &idev->addr_list, if_list) { +					if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) +						continue; +					np->local_ip.in6 = ifp->addr; +					err = 0; +					break; +				} +				read_unlock_bh(&idev->lock); +			} +			if (err) { +				np_err(np, "no IPv6 address for %s, aborting\n", +				       np->dev_name); +				goto put; +			} else +				np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); +#else +			np_err(np, "IPv6 is not supported %s, aborting\n", +			       np->dev_name); +			err = -EINVAL;  			goto put; +#endif  		} - -		np->local_ip = in_dev->ifa_list->ifa_local; -		rcu_read_unlock(); -		np_info(np, "local IP %pI4\n", &np->local_ip);  	}  	/* fill up the skb queue */  	refill_skbs(); -	rtnl_lock();  	err = __netpoll_setup(np, ndev, GFP_KERNEL); -	rtnl_unlock(); -  	if (err)  		goto put; +	rtnl_unlock();  	return 0;  put:  	dev_put(ndev); +unlock: +	rtnl_unlock();  	return err;  }  EXPORT_SYMBOL(netpoll_setup); @@ -892,6 +1212,7 @@ EXPORT_SYMBOL(netpoll_setup);  static int __init netpoll_init(void)  {  	skb_queue_head_init(&skb_pool); +	init_srcu_struct(&netpoll_srcu);  	return 0;  }  core_initcall(netpoll_init); @@ -901,7 +1222,7 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)  	struct netpoll_info *npinfo =  			container_of(rcu_head, struct netpoll_info, rcu); -	skb_queue_purge(&npinfo->arp_tx); +	skb_queue_purge(&npinfo->neigh_tx);  	skb_queue_purge(&npinfo->txq);  	/* we can't call cancel_delayed_work_sync here, as we are in softirq */ @@ -919,7 +1240,11 @@ void __netpoll_cleanup(struct netpoll *np)  	struct netpoll_info *npinfo;  	unsigned long flags; -	npinfo = np->dev->npinfo; +	/* rtnl_dereference would be preferable here but +	 * rcu_cleanup_netpoll path can put us in here safely without +	 * holding the rtnl, so plain rcu_dereference it is +	 */ +	npinfo = rtnl_dereference(np->dev->npinfo);  	if (!npinfo)  		return; @@ -931,6 +1256,8 @@ void __netpoll_cleanup(struct netpoll *np)  		spin_unlock_irqrestore(&npinfo->rx_lock, flags);  	} +	synchronize_srcu(&netpoll_srcu); +  	if (atomic_dec_and_test(&npinfo->refcnt)) {  		const struct net_device_ops *ops; @@ -938,25 +1265,27 @@ void __netpoll_cleanup(struct netpoll *np)  		if (ops->ndo_netpoll_cleanup)  			ops->ndo_netpoll_cleanup(np->dev); -		RCU_INIT_POINTER(np->dev->npinfo, NULL); +		rcu_assign_pointer(np->dev->npinfo, NULL);  		call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);  	}  }  EXPORT_SYMBOL_GPL(__netpoll_cleanup); -static void rcu_cleanup_netpoll(struct rcu_head *rcu_head) +static void netpoll_async_cleanup(struct work_struct *work)  { -	struct netpoll *np = 
container_of(rcu_head, struct netpoll, rcu); +	struct netpoll *np = container_of(work, struct netpoll, cleanup_work); +	rtnl_lock();  	__netpoll_cleanup(np); +	rtnl_unlock();  	kfree(np);  } -void __netpoll_free_rcu(struct netpoll *np) +void __netpoll_free_async(struct netpoll *np)  { -	call_rcu_bh(&np->rcu, rcu_cleanup_netpoll); +	schedule_work(&np->cleanup_work);  } -EXPORT_SYMBOL_GPL(__netpoll_free_rcu); +EXPORT_SYMBOL_GPL(__netpoll_free_async);  void netpoll_cleanup(struct netpoll *np)  { diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 79285a36035..0777d0aa18c 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -27,11 +27,7 @@  #include <linux/fdtable.h> -#define PRIOIDX_SZ 128 - -static unsigned long prioidx_map[PRIOIDX_SZ]; -static DEFINE_SPINLOCK(prioidx_map_lock); -static atomic_t max_prioidx = ATOMIC_INIT(0); +#define PRIOMAP_MIN_SZ		128  static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)  { @@ -39,136 +35,155 @@ static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgr  			    struct cgroup_netprio_state, css);  } -static int get_prioidx(u32 *prio) +/* + * Extend @dev->priomap so that it's large enough to accomodate + * @target_idx.  @dev->priomap.priomap_len > @target_idx after successful + * return.  Must be called under rtnl lock. + */ +static int extend_netdev_table(struct net_device *dev, u32 target_idx)  { -	unsigned long flags; -	u32 prioidx; +	struct netprio_map *old, *new; +	size_t new_sz, new_len; -	spin_lock_irqsave(&prioidx_map_lock, flags); -	prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); -	if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) { -		spin_unlock_irqrestore(&prioidx_map_lock, flags); -		return -ENOSPC; -	} -	set_bit(prioidx, prioidx_map); -	if (atomic_read(&max_prioidx) < prioidx) -		atomic_set(&max_prioidx, prioidx); -	spin_unlock_irqrestore(&prioidx_map_lock, flags); -	*prio = prioidx; -	return 0; -} - -static void put_prioidx(u32 idx) -{ -	unsigned long flags; +	/* is the existing priomap large enough? */ +	old = rtnl_dereference(dev->priomap); +	if (old && old->priomap_len > target_idx) +		return 0; -	spin_lock_irqsave(&prioidx_map_lock, flags); -	clear_bit(idx, prioidx_map); -	spin_unlock_irqrestore(&prioidx_map_lock, flags); -} +	/* +	 * Determine the new size.  Let's keep it power-of-two.  We start +	 * from PRIOMAP_MIN_SZ and double it until it's large enough to +	 * accommodate @target_idx. +	 */ +	new_sz = PRIOMAP_MIN_SZ; +	while (true) { +		new_len = (new_sz - offsetof(struct netprio_map, priomap)) / +			sizeof(new->priomap[0]); +		if (new_len > target_idx) +			break; +		new_sz *= 2; +		/* overflowed? 
*/ +		if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) +			return -ENOSPC; +	} -static int extend_netdev_table(struct net_device *dev, u32 new_len) -{ -	size_t new_size = sizeof(struct netprio_map) + -			   ((sizeof(u32) * new_len)); -	struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); -	struct netprio_map *old_priomap; +	/* allocate & copy */ +	new = kzalloc(new_sz, GFP_KERNEL); +	if (!new) +		return -ENOMEM; -	old_priomap  = rtnl_dereference(dev->priomap); +	if (old) +		memcpy(new->priomap, old->priomap, +		       old->priomap_len * sizeof(old->priomap[0])); -	if (!new_priomap) { -		pr_warn("Unable to alloc new priomap!\n"); -		return -ENOMEM; -	} +	new->priomap_len = new_len; -	if (old_priomap) -		memcpy(new_priomap->priomap, old_priomap->priomap, -		       old_priomap->priomap_len * -		       sizeof(old_priomap->priomap[0])); +	/* install the new priomap */ +	rcu_assign_pointer(dev->priomap, new); +	if (old) +		kfree_rcu(old, rcu); +	return 0; +} -	new_priomap->priomap_len = new_len; +/** + * netprio_prio - return the effective netprio of a cgroup-net_device pair + * @cgrp: cgroup part of the target pair + * @dev: net_device part of the target pair + * + * Should be called under RCU read or rtnl lock. + */ +static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev) +{ +	struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); -	rcu_assign_pointer(dev->priomap, new_priomap); -	if (old_priomap) -		kfree_rcu(old_priomap, rcu); +	if (map && cgrp->id < map->priomap_len) +		return map->priomap[cgrp->id];  	return 0;  } -static int write_update_netdev_table(struct net_device *dev) +/** + * netprio_set_prio - set netprio on a cgroup-net_device pair + * @cgrp: cgroup part of the target pair + * @dev: net_device part of the target pair + * @prio: prio to set + * + * Set netprio to @prio on @cgrp-@dev pair.  Should be called under rtnl + * lock and may fail under memory pressure for non-zero @prio. + */ +static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev, +			    u32 prio)  { -	int ret = 0; -	u32 max_len;  	struct netprio_map *map; +	int ret; -	max_len = atomic_read(&max_prioidx) + 1; +	/* avoid extending priomap for zero writes */  	map = rtnl_dereference(dev->priomap); -	if (!map || map->priomap_len < max_len) -		ret = extend_netdev_table(dev, max_len); +	if (!prio && (!map || map->priomap_len <= cgrp->id)) +		return 0; -	return ret; +	ret = extend_netdev_table(dev, cgrp->id); +	if (ret) +		return ret; + +	map = rtnl_dereference(dev->priomap); +	map->priomap[cgrp->id] = prio; +	return 0;  } -static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) +static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)  {  	struct cgroup_netprio_state *cs; -	int ret = -EINVAL;  	cs = kzalloc(sizeof(*cs), GFP_KERNEL);  	if (!cs)  		return ERR_PTR(-ENOMEM); -	if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) -		goto out; - -	ret = get_prioidx(&cs->prioidx); -	if (ret < 0) { -		pr_warn("No space in priority index array\n"); -		goto out; -	} -  	return &cs->css; -out: -	kfree(cs); -	return ERR_PTR(ret);  } -static void cgrp_destroy(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup *cgrp)  { -	struct cgroup_netprio_state *cs; +	struct cgroup *parent = cgrp->parent;  	struct net_device *dev; -	struct netprio_map *map; +	int ret = 0; + +	if (!parent) +		return 0; -	cs = cgrp_netprio_state(cgrp);  	rtnl_lock(); +	/* +	 * Inherit prios from the parent.  
As all prios are set during +	 * onlining, there is no need to clear them on offline. +	 */  	for_each_netdev(&init_net, dev) { -		map = rtnl_dereference(dev->priomap); -		if (map && cs->prioidx < map->priomap_len) -			map->priomap[cs->prioidx] = 0; +		u32 prio = netprio_prio(parent, dev); + +		ret = netprio_set_prio(cgrp, dev, prio); +		if (ret) +			break;  	}  	rtnl_unlock(); -	put_prioidx(cs->prioidx); -	kfree(cs); +	return ret; +} + +static void cgrp_css_free(struct cgroup *cgrp) +{ +	kfree(cgrp_netprio_state(cgrp));  }  static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)  { -	return (u64)cgrp_netprio_state(cgrp)->prioidx; +	return cgrp->id;  }  static int read_priomap(struct cgroup *cont, struct cftype *cft,  			struct cgroup_map_cb *cb)  {  	struct net_device *dev; -	u32 prioidx = cgrp_netprio_state(cont)->prioidx; -	u32 priority; -	struct netprio_map *map;  	rcu_read_lock(); -	for_each_netdev_rcu(&init_net, dev) { -		map = rcu_dereference(dev->priomap); -		priority = (map && prioidx < map->priomap_len) ? map->priomap[prioidx] : 0; -		cb->fill(cb, dev->name, priority); -	} +	for_each_netdev_rcu(&init_net, dev) +		cb->fill(cb, dev->name, netprio_prio(cont, dev));  	rcu_read_unlock();  	return 0;  } @@ -176,66 +191,24 @@ static int read_priomap(struct cgroup *cont, struct cftype *cft,  static int write_priomap(struct cgroup *cgrp, struct cftype *cft,  			 const char *buffer)  { -	char *devname = kstrdup(buffer, GFP_KERNEL); -	int ret = -EINVAL; -	u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; -	unsigned long priority; -	char *priostr; +	char devname[IFNAMSIZ + 1];  	struct net_device *dev; -	struct netprio_map *map; - -	if (!devname) -		return -ENOMEM; - -	/* -	 * Minimally sized valid priomap string -	 */ -	if (strlen(devname) < 3) -		goto out_free_devname; - -	priostr = strstr(devname, " "); -	if (!priostr) -		goto out_free_devname; - -	/* -	 *Separate the devname from the associated priority -	 *and advance the priostr pointer to the priority value -	 */ -	*priostr = '\0'; -	priostr++; - -	/* -	 * If the priostr points to NULL, we're at the end of the passed -	 * in string, and its not a valid write -	 */ -	if (*priostr == '\0') -		goto out_free_devname; - -	ret = kstrtoul(priostr, 10, &priority); -	if (ret < 0) -		goto out_free_devname; +	u32 prio; +	int ret; -	ret = -ENODEV; +	if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) +		return -EINVAL;  	dev = dev_get_by_name(&init_net, devname);  	if (!dev) -		goto out_free_devname; +		return -ENODEV;  	rtnl_lock(); -	ret = write_update_netdev_table(dev); -	if (ret < 0) -		goto out_put_dev; -	map = rtnl_dereference(dev->priomap); -	if (map) -		map->priomap[prioidx] = priority; +	ret = netprio_set_prio(cgrp, dev, prio); -out_put_dev:  	rtnl_unlock();  	dev_put(dev); - -out_free_devname: -	kfree(devname);  	return ret;  } @@ -248,7 +221,7 @@ static int update_netprio(const void *v, struct file *file, unsigned n)  	return 0;  } -void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  {  	struct task_struct *p;  	void *v; @@ -276,22 +249,13 @@ static struct cftype ss_files[] = {  struct cgroup_subsys net_prio_subsys = {  	.name		= "net_prio", -	.create		= cgrp_create, -	.destroy	= cgrp_destroy, +	.css_alloc	= cgrp_css_alloc, +	.css_online	= cgrp_css_online, +	.css_free	= cgrp_css_free,  	.attach		= net_prio_attach,  	.subsys_id	= net_prio_subsys_id,  	.base_cftypes	= ss_files,  	.module		= THIS_MODULE, - -	/* -	
 * net_prio has artificial limit on the number of cgroups and -	 * disallows nesting making it impossible to co-mount it with other -	 * hierarchical subsystems.  Remove the artificially low PRIOIDX_SZ -	 * limit and properly nest configuration such that children follow -	 * their parents' configurations by default and are allowed to -	 * override and remove the following. -	 */ -	.broken_hierarchy = true,  };  static int netprio_device_event(struct notifier_block *unused, diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d1dc14c2aac..6048fc1da1c 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -164,6 +164,7 @@  #ifdef CONFIG_XFRM  #include <net/xfrm.h>  #endif +#include <net/netns/generic.h>  #include <asm/byteorder.h>  #include <linux/rcupdate.h>  #include <linux/bitops.h> @@ -212,7 +213,6 @@  #define PKTGEN_MAGIC 0xbe9be955  #define PG_PROC_DIR "pktgen"  #define PGCTRL	    "pgctrl" -static struct proc_dir_entry *pg_proc_dir;  #define MAX_CFLOWS  65536 @@ -397,7 +397,15 @@ struct pktgen_hdr {  	__be32 tv_usec;  }; -static bool pktgen_exiting __read_mostly; + +static int pg_net_id __read_mostly; + +struct pktgen_net { +	struct net		*net; +	struct proc_dir_entry	*proc_dir; +	struct list_head	pktgen_threads; +	bool			pktgen_exiting; +};  struct pktgen_thread {  	spinlock_t if_lock;		/* for list of devices */ @@ -414,25 +422,12 @@ struct pktgen_thread {  	wait_queue_head_t queue;  	struct completion start_done; +	struct pktgen_net *net;  };  #define REMOVE 1  #define FIND   0 -static inline ktime_t ktime_now(void) -{ -	struct timespec ts; -	ktime_get_ts(&ts); - -	return timespec_to_ktime(ts); -} - -/* This works even if 32 bit because of careful byte order choice */ -static inline int ktime_lt(const ktime_t cmp1, const ktime_t cmp2) -{ -	return cmp1.tv64 < cmp2.tv64; -} -  static const char version[] =  	"Packet Generator for packet performance testing. 
"  	"Version: " VERSION "\n"; @@ -442,9 +437,9 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);  static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,  					  const char *ifname, bool exact);  static int pktgen_device_event(struct notifier_block *, unsigned long, void *); -static void pktgen_run_all_threads(void); -static void pktgen_reset_all_threads(void); -static void pktgen_stop_all_threads_ifs(void); +static void pktgen_run_all_threads(struct pktgen_net *pn); +static void pktgen_reset_all_threads(struct pktgen_net *pn); +static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn);  static void pktgen_stop(struct pktgen_thread *t);  static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); @@ -456,7 +451,6 @@ static int pg_clone_skb_d  __read_mostly;  static int debug  __read_mostly;  static DEFINE_MUTEX(pktgen_thread_lock); -static LIST_HEAD(pktgen_threads);  static struct notifier_block pktgen_notifier_block = {  	.notifier_call = pktgen_device_event, @@ -478,6 +472,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,  {  	int err = 0;  	char data[128]; +	struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);  	if (!capable(CAP_NET_ADMIN)) {  		err = -EPERM; @@ -494,13 +489,13 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,  	data[count - 1] = 0;	/* Make string */  	if (!strcmp(data, "stop")) -		pktgen_stop_all_threads_ifs(); +		pktgen_stop_all_threads_ifs(pn);  	else if (!strcmp(data, "start")) -		pktgen_run_all_threads(); +		pktgen_run_all_threads(pn);  	else if (!strcmp(data, "reset")) -		pktgen_reset_all_threads(); +		pktgen_reset_all_threads(pn);  	else  		pr_warning("Unknown command: %s\n", data); @@ -675,7 +670,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  	seq_puts(seq, "\n");  	/* not really stopped, more like last-running-at */ -	stopped = pkt_dev->running ? ktime_now() : pkt_dev->stopped_at; +	stopped = pkt_dev->running ? 
ktime_get() : pkt_dev->stopped_at;  	idle = pkt_dev->idle_acc;  	do_div(idle, NSEC_PER_USEC); @@ -1795,10 +1790,13 @@ static ssize_t pktgen_thread_write(struct file *file,  			return -EFAULT;  		i += len;  		mutex_lock(&pktgen_thread_lock); -		pktgen_add_device(t, f); +		ret = pktgen_add_device(t, f);  		mutex_unlock(&pktgen_thread_lock); -		ret = count; -		sprintf(pg_result, "OK: add_device=%s", f); +		if (!ret) { +			ret = count; +			sprintf(pg_result, "OK: add_device=%s", f); +		} else +			sprintf(pg_result, "ERROR: can not add device %s", f);  		goto out;  	} @@ -1838,13 +1836,14 @@ static const struct file_operations pktgen_thread_fops = {  };  /* Think find or remove for NN */ -static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) +static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn, +					      const char *ifname, int remove)  {  	struct pktgen_thread *t;  	struct pktgen_dev *pkt_dev = NULL;  	bool exact = (remove == FIND); -	list_for_each_entry(t, &pktgen_threads, th_list) { +	list_for_each_entry(t, &pn->pktgen_threads, th_list) {  		pkt_dev = pktgen_find_dev(t, ifname, exact);  		if (pkt_dev) {  			if (remove) { @@ -1862,7 +1861,7 @@ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove)  /*   * mark a device for removal   */ -static void pktgen_mark_device(const char *ifname) +static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname)  {  	struct pktgen_dev *pkt_dev = NULL;  	const int max_tries = 10, msec_per_try = 125; @@ -1873,7 +1872,7 @@ static void pktgen_mark_device(const char *ifname)  	while (1) { -		pkt_dev = __pktgen_NN_threads(ifname, REMOVE); +		pkt_dev = __pktgen_NN_threads(pn, ifname, REMOVE);  		if (pkt_dev == NULL)  			break;	/* success */ @@ -1894,21 +1893,21 @@ static void pktgen_mark_device(const char *ifname)  	mutex_unlock(&pktgen_thread_lock);  } -static void pktgen_change_name(struct net_device *dev) +static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *dev)  {  	struct pktgen_thread *t; -	list_for_each_entry(t, &pktgen_threads, th_list) { +	list_for_each_entry(t, &pn->pktgen_threads, th_list) {  		struct pktgen_dev *pkt_dev;  		list_for_each_entry(pkt_dev, &t->if_list, list) {  			if (pkt_dev->odev != dev)  				continue; -			remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); +			remove_proc_entry(pkt_dev->entry->name, pn->proc_dir);  			pkt_dev->entry = proc_create_data(dev->name, 0600, -							  pg_proc_dir, +							  pn->proc_dir,  							  &pktgen_if_fops,  							  pkt_dev);  			if (!pkt_dev->entry) @@ -1923,8 +1922,9 @@ static int pktgen_device_event(struct notifier_block *unused,  			       unsigned long event, void *ptr)  {  	struct net_device *dev = ptr; +	struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id); -	if (!net_eq(dev_net(dev), &init_net) || pktgen_exiting) +	if (pn->pktgen_exiting)  		return NOTIFY_DONE;  	/* It is OK that we do not hold the group lock right now, @@ -1933,18 +1933,19 @@ static int pktgen_device_event(struct notifier_block *unused,  	switch (event) {  	case NETDEV_CHANGENAME: -		pktgen_change_name(dev); +		pktgen_change_name(pn, dev);  		break;  	case NETDEV_UNREGISTER: -		pktgen_mark_device(dev->name); +		pktgen_mark_device(pn, dev->name);  		break;  	}  	return NOTIFY_DONE;  } -static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, +static struct net_device *pktgen_dev_get_by_name(const struct pktgen_net *pn, +						 struct pktgen_dev *pkt_dev,  						 const char *ifname)  
{  	char b[IFNAMSIZ+5]; @@ -1958,13 +1959,14 @@ static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev,  	}  	b[i] = 0; -	return dev_get_by_name(&init_net, b); +	return dev_get_by_name(pn->net, b);  }  /* Associate pktgen_dev with a device. */ -static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) +static int pktgen_setup_dev(const struct pktgen_net *pn, +			    struct pktgen_dev *pkt_dev, const char *ifname)  {  	struct net_device *odev;  	int err; @@ -1975,7 +1977,7 @@ static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname)  		pkt_dev->odev = NULL;  	} -	odev = pktgen_dev_get_by_name(pkt_dev, ifname); +	odev = pktgen_dev_get_by_name(pn, pkt_dev, ifname);  	if (!odev) {  		pr_err("no such netdevice: \"%s\"\n", ifname);  		return -ENODEV; @@ -2141,12 +2143,12 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)  		return;  	} -	start_time = ktime_now(); +	start_time = ktime_get();  	if (remaining < 100000) {  		/* for small delays (<100us), just loop until limit is reached */  		do { -			end_time = ktime_now(); -		} while (ktime_lt(end_time, spin_until)); +			end_time = ktime_get(); +		} while (ktime_compare(end_time, spin_until) < 0);  	} else {  		/* see do_nanosleep */  		hrtimer_init_sleeper(&t, current); @@ -2162,7 +2164,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)  			hrtimer_cancel(&t.timer);  		} while (t.task && pkt_dev->running && !signal_pending(current));  		__set_current_state(TASK_RUNNING); -		end_time = ktime_now(); +		end_time = ktime_get();  	}  	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); @@ -2217,9 +2219,10 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)  static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)  {  	struct xfrm_state *x = pkt_dev->flows[flow].x; +	struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);  	if (!x) {  		/*slow path: we dont already have xfrm_state*/ -		x = xfrm_stateonly_find(&init_net, DUMMY_MARK, +		x = xfrm_stateonly_find(pn->net, DUMMY_MARK,  					(xfrm_address_t *)&pkt_dev->cur_daddr,  					(xfrm_address_t *)&pkt_dev->cur_saddr,  					AF_INET, @@ -2427,11 +2430,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  		}  	} else {		/* IPV6 * */ -		if (pkt_dev->min_in6_daddr.s6_addr32[0] == 0 && -		    pkt_dev->min_in6_daddr.s6_addr32[1] == 0 && -		    pkt_dev->min_in6_daddr.s6_addr32[2] == 0 && -		    pkt_dev->min_in6_daddr.s6_addr32[3] == 0) ; -		else { +		if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) {  			int i;  			/* Only random destinations yet */ @@ -2916,8 +2915,7 @@ static void pktgen_run(struct pktgen_thread *t)  			pktgen_clear_counters(pkt_dev);  			pkt_dev->running = 1;	/* Cranke yeself! 
*/  			pkt_dev->skb = NULL; -			pkt_dev->started_at = -				pkt_dev->next_tx = ktime_now(); +			pkt_dev->started_at = pkt_dev->next_tx = ktime_get();  			set_pkt_overhead(pkt_dev); @@ -2931,7 +2929,7 @@ static void pktgen_run(struct pktgen_thread *t)  		t->control &= ~(T_STOP);  } -static void pktgen_stop_all_threads_ifs(void) +static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn)  {  	struct pktgen_thread *t; @@ -2939,7 +2937,7 @@ static void pktgen_stop_all_threads_ifs(void)  	mutex_lock(&pktgen_thread_lock); -	list_for_each_entry(t, &pktgen_threads, th_list) +	list_for_each_entry(t, &pn->pktgen_threads, th_list)  		t->control |= T_STOP;  	mutex_unlock(&pktgen_thread_lock); @@ -2975,28 +2973,28 @@ signal:  	return 0;  } -static int pktgen_wait_all_threads_run(void) +static int pktgen_wait_all_threads_run(struct pktgen_net *pn)  {  	struct pktgen_thread *t;  	int sig = 1;  	mutex_lock(&pktgen_thread_lock); -	list_for_each_entry(t, &pktgen_threads, th_list) { +	list_for_each_entry(t, &pn->pktgen_threads, th_list) {  		sig = pktgen_wait_thread_run(t);  		if (sig == 0)  			break;  	}  	if (sig == 0) -		list_for_each_entry(t, &pktgen_threads, th_list) +		list_for_each_entry(t, &pn->pktgen_threads, th_list)  			t->control |= (T_STOP);  	mutex_unlock(&pktgen_thread_lock);  	return sig;  } -static void pktgen_run_all_threads(void) +static void pktgen_run_all_threads(struct pktgen_net *pn)  {  	struct pktgen_thread *t; @@ -3004,7 +3002,7 @@ static void pktgen_run_all_threads(void)  	mutex_lock(&pktgen_thread_lock); -	list_for_each_entry(t, &pktgen_threads, th_list) +	list_for_each_entry(t, &pn->pktgen_threads, th_list)  		t->control |= (T_RUN);  	mutex_unlock(&pktgen_thread_lock); @@ -3012,10 +3010,10 @@ static void pktgen_run_all_threads(void)  	/* Propagate thread->control  */  	schedule_timeout_interruptible(msecs_to_jiffies(125)); -	pktgen_wait_all_threads_run(); +	pktgen_wait_all_threads_run(pn);  } -static void pktgen_reset_all_threads(void) +static void pktgen_reset_all_threads(struct pktgen_net *pn)  {  	struct pktgen_thread *t; @@ -3023,7 +3021,7 @@ static void pktgen_reset_all_threads(void)  	mutex_lock(&pktgen_thread_lock); -	list_for_each_entry(t, &pktgen_threads, th_list) +	list_for_each_entry(t, &pn->pktgen_threads, th_list)  		t->control |= (T_REMDEVALL);  	mutex_unlock(&pktgen_thread_lock); @@ -3031,7 +3029,7 @@ static void pktgen_reset_all_threads(void)  	/* Propagate thread->control  */  	schedule_timeout_interruptible(msecs_to_jiffies(125)); -	pktgen_wait_all_threads_run(); +	pktgen_wait_all_threads_run(pn);  }  static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) @@ -3076,7 +3074,7 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)  	kfree_skb(pkt_dev->skb);  	pkt_dev->skb = NULL; -	pkt_dev->stopped_at = ktime_now(); +	pkt_dev->stopped_at = ktime_get();  	pkt_dev->running = 0;  	show_results(pkt_dev, nr_frags); @@ -3095,7 +3093,7 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t)  			continue;  		if (best == NULL)  			best = pkt_dev; -		else if (ktime_lt(pkt_dev->next_tx, best->next_tx)) +		else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0)  			best = pkt_dev;  	}  	if_unlock(t); @@ -3173,21 +3171,19 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)  static void pktgen_rem_thread(struct pktgen_thread *t)  {  	/* Remove from the thread list */ - -	remove_proc_entry(t->tsk->comm, pg_proc_dir); - +	remove_proc_entry(t->tsk->comm, t->net->proc_dir);  }  static void pktgen_resched(struct pktgen_dev *pkt_dev)  { -	
ktime_t idle_start = ktime_now(); +	ktime_t idle_start = ktime_get();  	schedule(); -	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); +	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));  }  static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)  { -	ktime_t idle_start = ktime_now(); +	ktime_t idle_start = ktime_get();  	while (atomic_read(&(pkt_dev->skb->users)) != 1) {  		if (signal_pending(current)) @@ -3198,7 +3194,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)  		else  			cpu_relax();  	} -	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); +	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));  }  static void pktgen_xmit(struct pktgen_dev *pkt_dev) @@ -3220,7 +3216,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  	 * "never transmit"  	 */  	if (unlikely(pkt_dev->delay == ULLONG_MAX)) { -		pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX); +		pkt_dev->next_tx = ktime_add_ns(ktime_get(), ULONG_MAX);  		return;  	} @@ -3321,7 +3317,7 @@ static int pktgen_thread_worker(void *arg)  		pkt_dev = next_to_run(t);  		if (unlikely(!pkt_dev && t->control == 0)) { -			if (pktgen_exiting) +			if (t->net->pktgen_exiting)  				break;  			wait_event_interruptible_timeout(t->queue,  							 t->control != 0, @@ -3443,7 +3439,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)  	/* We don't allow a device to be on several threads */ -	pkt_dev = __pktgen_NN_threads(ifname, FIND); +	pkt_dev = __pktgen_NN_threads(t->net, ifname, FIND);  	if (pkt_dev) {  		pr_err("ERROR: interface already used\n");  		return -EBUSY; @@ -3478,13 +3474,13 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)  	pkt_dev->svlan_id = 0xffff;  	pkt_dev->node = -1; -	err = pktgen_setup_dev(pkt_dev, ifname); +	err = pktgen_setup_dev(t->net, pkt_dev, ifname);  	if (err)  		goto out1;  	if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)  		pkt_dev->clone_skb = pg_clone_skb_d; -	pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir, +	pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir,  					  &pktgen_if_fops, pkt_dev);  	if (!pkt_dev->entry) {  		pr_err("cannot create %s/%s procfs entry\n", @@ -3509,7 +3505,7 @@ out1:  	return err;  } -static int __init pktgen_create_thread(int cpu) +static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)  {  	struct pktgen_thread *t;  	struct proc_dir_entry *pe; @@ -3527,7 +3523,7 @@ static int __init pktgen_create_thread(int cpu)  	INIT_LIST_HEAD(&t->if_list); -	list_add_tail(&t->th_list, &pktgen_threads); +	list_add_tail(&t->th_list, &pn->pktgen_threads);  	init_completion(&t->start_done);  	p = kthread_create_on_node(pktgen_thread_worker, @@ -3543,7 +3539,7 @@ static int __init pktgen_create_thread(int cpu)  	kthread_bind(p, cpu);  	t->tsk = p; -	pe = proc_create_data(t->tsk->comm, 0600, pg_proc_dir, +	pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,  			      &pktgen_thread_fops, t);  	if (!pe) {  		pr_err("cannot create %s/%s procfs entry\n", @@ -3554,6 +3550,7 @@ static int __init pktgen_create_thread(int cpu)  		return -EINVAL;  	} +	t->net = pn;  	wake_up_process(p);  	wait_for_completion(&t->start_done); @@ -3579,6 +3576,7 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t,  static int pktgen_remove_device(struct pktgen_thread *t,  				struct pktgen_dev *pkt_dev)  { +	struct pktgen_net *pn = t->net;  	pr_debug("remove_device pkt_dev=%p\n", pkt_dev); @@ -3599,7 +3597,7 @@ 
static int pktgen_remove_device(struct pktgen_thread *t,  	_rem_dev_from_if_list(t, pkt_dev);  	if (pkt_dev->entry) -		remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); +		remove_proc_entry(pkt_dev->entry->name, pn->proc_dir);  #ifdef CONFIG_XFRM  	free_SAs(pkt_dev); @@ -3611,63 +3609,63 @@ static int pktgen_remove_device(struct pktgen_thread *t,  	return 0;  } -static int __init pg_init(void) +static int __net_init pg_net_init(struct net *net)  { -	int cpu; +	struct pktgen_net *pn = net_generic(net, pg_net_id);  	struct proc_dir_entry *pe; -	int ret = 0; +	int cpu, ret = 0; -	pr_info("%s", version); - -	pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net); -	if (!pg_proc_dir) +	pn->net = net; +	INIT_LIST_HEAD(&pn->pktgen_threads); +	pn->pktgen_exiting = false; +	pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net); +	if (!pn->proc_dir) { +		pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR);  		return -ENODEV; - -	pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops); +	} +	pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops);  	if (pe == NULL) { -		pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL); +		pr_err("cannot create %s procfs entry\n", PGCTRL);  		ret = -EINVAL; -		goto remove_dir; +		goto remove;  	} -	register_netdevice_notifier(&pktgen_notifier_block); -  	for_each_online_cpu(cpu) {  		int err; -		err = pktgen_create_thread(cpu); +		err = pktgen_create_thread(cpu, pn);  		if (err) -			pr_warning("WARNING: Cannot create thread for cpu %d (%d)\n", +			pr_warn("Cannot create thread for cpu %d (%d)\n",  				   cpu, err);  	} -	if (list_empty(&pktgen_threads)) { -		pr_err("ERROR: Initialization failed for all threads\n"); +	if (list_empty(&pn->pktgen_threads)) { +		pr_err("Initialization failed for all threads\n");  		ret = -ENODEV; -		goto unregister; +		goto remove_entry;  	}  	return 0; - unregister: -	unregister_netdevice_notifier(&pktgen_notifier_block); -	remove_proc_entry(PGCTRL, pg_proc_dir); - remove_dir: -	proc_net_remove(&init_net, PG_PROC_DIR); +remove_entry: +	remove_proc_entry(PGCTRL, pn->proc_dir); +remove: +	remove_proc_entry(PG_PROC_DIR, pn->net->proc_net);  	return ret;  } -static void __exit pg_cleanup(void) +static void __net_exit pg_net_exit(struct net *net)  { +	struct pktgen_net *pn = net_generic(net, pg_net_id);  	struct pktgen_thread *t;  	struct list_head *q, *n;  	LIST_HEAD(list);  	/* Stop all interfaces & threads */ -	pktgen_exiting = true; +	pn->pktgen_exiting = true;  	mutex_lock(&pktgen_thread_lock); -	list_splice_init(&pktgen_threads, &list); +	list_splice_init(&pn->pktgen_threads, &list);  	mutex_unlock(&pktgen_thread_lock);  	list_for_each_safe(q, n, &list) { @@ -3677,12 +3675,36 @@ static void __exit pg_cleanup(void)  		kfree(t);  	} -	/* Un-register us from receiving netdevice events */ -	unregister_netdevice_notifier(&pktgen_notifier_block); +	remove_proc_entry(PGCTRL, pn->proc_dir); +	remove_proc_entry(PG_PROC_DIR, pn->net->proc_net); +} + +static struct pernet_operations pg_net_ops = { +	.init = pg_net_init, +	.exit = pg_net_exit, +	.id   = &pg_net_id, +	.size = sizeof(struct pktgen_net), +}; + +static int __init pg_init(void) +{ +	int ret = 0; + +	pr_info("%s", version); +	ret = register_pernet_subsys(&pg_net_ops); +	if (ret) +		return ret; +	ret = register_netdevice_notifier(&pktgen_notifier_block); +	if (ret) +		unregister_pernet_subsys(&pg_net_ops); + +	return ret; +} -	/* Clean up proc file system */ -	remove_proc_entry(PGCTRL, pg_proc_dir); -	proc_net_remove(&init_net, PG_PROC_DIR); +static void __exit 
pg_cleanup(void) +{ +	unregister_netdevice_notifier(&pktgen_notifier_block); +	unregister_pernet_subsys(&pg_net_ops);  }  module_init(pg_init); diff --git a/net/core/request_sock.c b/net/core/request_sock.c index c31d9e8668c..4425148d2b5 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -186,8 +186,6 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,  	struct fastopen_queue *fastopenq =  	    inet_csk(lsk)->icsk_accept_queue.fastopenq; -	BUG_ON(!spin_is_locked(&sk->sk_lock.slock) && !sock_owned_by_user(sk)); -  	tcp_sk(sk)->fastopen_rsk = NULL;  	spin_lock_bh(&fastopenq->lock);  	fastopenq->qlen--; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 76d4c2c3c89..a585d45cc9d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -128,7 +128,7 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)  	if (tab == NULL || tab[msgindex].doit == NULL)  		tab = rtnl_msg_handlers[PF_UNSPEC]; -	return tab ? tab[msgindex].doit : NULL; +	return tab[msgindex].doit;  }  static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) @@ -143,7 +143,7 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)  	if (tab == NULL || tab[msgindex].dumpit == NULL)  		tab = rtnl_msg_handlers[PF_UNSPEC]; -	return tab ? tab[msgindex].dumpit : NULL; +	return tab[msgindex].dumpit;  }  static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex) @@ -158,7 +158,7 @@ static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex)  	if (tab == NULL || tab[msgindex].calcit == NULL)  		tab = rtnl_msg_handlers[PF_UNSPEC]; -	return tab ? tab[msgindex].calcit : NULL; +	return tab[msgindex].calcit;  }  /** @@ -780,6 +780,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,  	       + nla_total_size(4) /* IFLA_MTU */  	       + nla_total_size(4) /* IFLA_LINK */  	       + nla_total_size(4) /* IFLA_MASTER */ +	       + nla_total_size(1) /* IFLA_CARRIER */  	       + nla_total_size(4) /* IFLA_PROMISCUITY */  	       + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */  	       + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ @@ -879,6 +880,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  	const struct rtnl_link_stats64 *stats;  	struct nlattr *attr, *af_spec;  	struct rtnl_af_ops *af_ops; +	struct net_device *upper_dev = netdev_master_upper_dev_get(dev);  	ASSERT_RTNL();  	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -907,8 +909,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  #endif  	    (dev->ifindex != dev->iflink &&  	     nla_put_u32(skb, IFLA_LINK, dev->iflink)) || -	    (dev->master && -	     nla_put_u32(skb, IFLA_MASTER, dev->master->ifindex)) || +	    (upper_dev && +	     nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) || +	    nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||  	    (dev->qdisc &&  	     nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||  	    (dev->ifalias && @@ -976,6 +979,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  			 * report anything.  			 
*/  			ivi.spoofchk = -1; +			memset(ivi.mac, 0, sizeof(ivi.mac));  			if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))  				break;  			vf_mac.vf = @@ -1057,7 +1061,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)  	int idx = 0, s_idx;  	struct net_device *dev;  	struct hlist_head *head; -	struct hlist_node *node;  	struct nlattr *tb[IFLA_MAX+1];  	u32 ext_filter_mask = 0; @@ -1077,7 +1080,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)  	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {  		idx = 0;  		head = &net->dev_index_head[h]; -		hlist_for_each_entry_rcu(dev, node, head, index_hlist) { +		hlist_for_each_entry_rcu(dev, head, index_hlist) {  			if (idx < s_idx)  				goto cont;  			if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, @@ -1108,6 +1111,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {  	[IFLA_MTU]		= { .type = NLA_U32 },  	[IFLA_LINK]		= { .type = NLA_U32 },  	[IFLA_MASTER]		= { .type = NLA_U32 }, +	[IFLA_CARRIER]		= { .type = NLA_U8 },  	[IFLA_TXQLEN]		= { .type = NLA_U32 },  	[IFLA_WEIGHT]		= { .type = NLA_U32 },  	[IFLA_OPERSTATE]	= { .type = NLA_U8 }, @@ -1270,16 +1274,16 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)  static int do_set_master(struct net_device *dev, int ifindex)  { -	struct net_device *master_dev; +	struct net_device *upper_dev = netdev_master_upper_dev_get(dev);  	const struct net_device_ops *ops;  	int err; -	if (dev->master) { -		if (dev->master->ifindex == ifindex) +	if (upper_dev) { +		if (upper_dev->ifindex == ifindex)  			return 0; -		ops = dev->master->netdev_ops; +		ops = upper_dev->netdev_ops;  		if (ops->ndo_del_slave) { -			err = ops->ndo_del_slave(dev->master, dev); +			err = ops->ndo_del_slave(upper_dev, dev);  			if (err)  				return err;  		} else { @@ -1288,12 +1292,12 @@ static int do_set_master(struct net_device *dev, int ifindex)  	}  	if (ifindex) { -		master_dev = __dev_get_by_index(dev_net(dev), ifindex); -		if (!master_dev) +		upper_dev = __dev_get_by_index(dev_net(dev), ifindex); +		if (!upper_dev)  			return -EINVAL; -		ops = master_dev->netdev_ops; +		ops = upper_dev->netdev_ops;  		if (ops->ndo_add_slave) { -			err = ops->ndo_add_slave(master_dev, dev); +			err = ops->ndo_add_slave(upper_dev, dev);  			if (err)  				return err;  		} else { @@ -1307,7 +1311,6 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  		      struct nlattr **tb, char *ifname, int modified)  {  	const struct net_device_ops *ops = dev->netdev_ops; -	int send_addr_notify = 0;  	int err;  	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) { @@ -1316,6 +1319,10 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  			err = PTR_ERR(net);  			goto errout;  		} +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { +			err = -EPERM; +			goto errout; +		}  		err = dev_change_net_namespace(dev, net, ifname);  		put_net(net);  		if (err) @@ -1356,16 +1363,6 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  		struct sockaddr *sa;  		int len; -		if (!ops->ndo_set_mac_address) { -			err = -EOPNOTSUPP; -			goto errout; -		} - -		if (!netif_device_present(dev)) { -			err = -ENODEV; -			goto errout; -		} -  		len = sizeof(sa_family_t) + dev->addr_len;  		sa = kmalloc(len, GFP_KERNEL);  		if (!sa) { @@ -1375,13 +1372,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  		sa->sa_family = dev->type;  		memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),  		       dev->addr_len); -		err = 
ops->ndo_set_mac_address(dev, sa); +		err = dev_set_mac_address(dev, sa);  		kfree(sa);  		if (err)  			goto errout; -		send_addr_notify = 1;  		modified = 1; -		add_device_randomness(dev->dev_addr, dev->addr_len);  	}  	if (tb[IFLA_MTU]) { @@ -1418,7 +1413,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  	if (tb[IFLA_BROADCAST]) {  		nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); -		send_addr_notify = 1; +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);  	}  	if (ifm->ifi_flags || ifm->ifi_change) { @@ -1434,6 +1429,13 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  		modified = 1;  	} +	if (tb[IFLA_CARRIER]) { +		err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); +		if (err) +			goto errout; +		modified = 1; +	} +  	if (tb[IFLA_TXQLEN])  		dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); @@ -1532,9 +1534,6 @@ errout:  		net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",  				     dev->name); -	if (send_addr_notify) -		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); -  	return err;  } @@ -1638,7 +1637,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)  }  EXPORT_SYMBOL(rtnl_configure_link); -struct net_device *rtnl_create_link(struct net *src_net, struct net *net, +struct net_device *rtnl_create_link(struct net *net,  	char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])  {  	int err; @@ -1668,9 +1667,11 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net,  	if (tb[IFLA_MTU])  		dev->mtu = nla_get_u32(tb[IFLA_MTU]); -	if (tb[IFLA_ADDRESS]) +	if (tb[IFLA_ADDRESS]) {  		memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),  				nla_len(tb[IFLA_ADDRESS])); +		dev->addr_assign_type = NET_ADDR_SET; +	}  	if (tb[IFLA_BROADCAST])  		memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),  				nla_len(tb[IFLA_BROADCAST])); @@ -1836,7 +1837,7 @@ replay:  		if (IS_ERR(dest_net))  			return PTR_ERR(dest_net); -		dev = rtnl_create_link(net, dest_net, ifname, ops, tb); +		dev = rtnl_create_link(dest_net, ifname, ops, tb);  		if (IS_ERR(dev)) {  			err = PTR_ERR(dev);  			goto out; @@ -1988,6 +1989,7 @@ errout:  	if (err < 0)  		rtnl_set_sk_err(net, RTNLGRP_LINK, err);  } +EXPORT_SYMBOL(rtmsg_ifinfo);  static int nlmsg_populate_fdb_fill(struct sk_buff *skb,  				   struct net_device *dev, @@ -2050,7 +2052,6 @@ errout:  static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  {  	struct net *net = sock_net(skb->sk); -	struct net_device *master = NULL;  	struct ndmsg *ndm;  	struct nlattr *tb[NDA_MAX+1];  	struct net_device *dev; @@ -2089,10 +2090,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  	/* Support fdb on master device the net/bridge default case */  	if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&  	    (dev->priv_flags & IFF_BRIDGE_PORT)) { -		master = dev->master; -		err = master->netdev_ops->ndo_fdb_add(ndm, tb, -						      dev, addr, -						      nlh->nlmsg_flags); +		struct net_device *br_dev = netdev_master_upper_dev_get(dev); +		const struct net_device_ops *ops = br_dev->netdev_ops; + +		err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags);  		if (err)  			goto out;  		else @@ -2118,13 +2119,17 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  {  	struct net *net = sock_net(skb->sk);  	struct ndmsg *ndm; -	struct 
nlattr *llattr; +	struct nlattr *tb[NDA_MAX+1];  	struct net_device *dev;  	int err = -EINVAL;  	__u8 *addr; -	if (nlmsg_len(nlh) < sizeof(*ndm)) -		return -EINVAL; +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; + +	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); +	if (err < 0) +		return err;  	ndm = nlmsg_data(nlh);  	if (ndm->ndm_ifindex == 0) { @@ -2138,22 +2143,27 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  		return -ENODEV;  	} -	llattr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_LLADDR); -	if (llattr == NULL || nla_len(llattr) != ETH_ALEN) { -		pr_info("PF_BRIGDE: RTM_DELNEIGH with invalid address\n"); +	if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { +		pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n"); +		return -EINVAL; +	} + +	addr = nla_data(tb[NDA_LLADDR]); +	if (!is_valid_ether_addr(addr)) { +		pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ether address\n");  		return -EINVAL;  	} -	addr = nla_data(llattr);  	err = -EOPNOTSUPP;  	/* Support fdb on master device the net/bridge default case */  	if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&  	    (dev->priv_flags & IFF_BRIDGE_PORT)) { -		struct net_device *master = dev->master; +		struct net_device *br_dev = netdev_master_upper_dev_get(dev); +		const struct net_device_ops *ops = br_dev->netdev_ops; -		if (master->netdev_ops->ndo_fdb_del) -			err = master->netdev_ops->ndo_fdb_del(ndm, dev, addr); +		if (ops->ndo_fdb_del) +			err = ops->ndo_fdb_del(ndm, tb, dev, addr);  		if (err)  			goto out; @@ -2163,7 +2173,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  	/* Embedded bridge, macvlan, and any other device support */  	if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_del) { -		err = dev->netdev_ops->ndo_fdb_del(ndm, dev, addr); +		err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr);  		if (!err) {  			rtnl_fdb_notify(dev, addr, RTM_DELNEIGH); @@ -2192,7 +2202,8 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,  			goto skip;  		err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, -					      portid, seq, 0, NTF_SELF); +					      portid, seq, +					      RTM_NEWNEIGH, NTF_SELF);  		if (err < 0)  			return err;  skip: @@ -2236,9 +2247,11 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)  	rcu_read_lock();  	for_each_netdev_rcu(net, dev) {  		if (dev->priv_flags & IFF_BRIDGE_PORT) { -			struct net_device *master = dev->master; -			const struct net_device_ops *ops = master->netdev_ops; +			struct net_device *br_dev; +			const struct net_device_ops *ops; +			br_dev = netdev_master_upper_dev_get(dev); +			ops = br_dev->netdev_ops;  			if (ops->ndo_fdb_dump)  				idx = ops->ndo_fdb_dump(skb, cb, dev, idx);  		} @@ -2252,6 +2265,292 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)  	return skb->len;  } +int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, +			    struct net_device *dev, u16 mode) +{ +	struct nlmsghdr *nlh; +	struct ifinfomsg *ifm; +	struct nlattr *br_afspec; +	u8 operstate = netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN; +	struct net_device *br_dev = netdev_master_upper_dev_get(dev); + +	nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI); +	if (nlh == NULL) +		return -EMSGSIZE; + +	ifm = nlmsg_data(nlh); +	ifm->ifi_family = AF_BRIDGE; +	ifm->__ifi_pad = 0; +	ifm->ifi_type = dev->type; +	ifm->ifi_index = dev->ifindex; +	ifm->ifi_flags = dev_get_flags(dev); +	ifm->ifi_change = 0; + + +	if (nla_put_string(skb, IFLA_IFNAME, dev->name) || +	    nla_put_u32(skb, IFLA_MTU, dev->mtu) || +	    nla_put_u8(skb, IFLA_OPERSTATE, operstate) || +	    (br_dev && +	     nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || +	    (dev->addr_len && +	     nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || +	    (dev->ifindex != dev->iflink && +	     nla_put_u32(skb, IFLA_LINK, dev->iflink))) +		goto nla_put_failure; + +	br_afspec = nla_nest_start(skb, IFLA_AF_SPEC); +	if (!br_afspec) +		goto nla_put_failure; + +	if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) || +	    nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { +		nla_nest_cancel(skb, br_afspec); +		goto nla_put_failure; +	} +	nla_nest_end(skb, br_afspec); + +	return nlmsg_end(skb, nlh); +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -EMSGSIZE; +} +EXPORT_SYMBOL(ndo_dflt_bridge_getlink); + +static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct net *net = sock_net(skb->sk); +	struct net_device *dev; +	int idx = 0; +	u32 portid = NETLINK_CB(cb->skb).portid; +	u32 seq = cb->nlh->nlmsg_seq; +	struct nlattr *extfilt; +	u32 filter_mask = 0; + +	extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct rtgenmsg), +				  IFLA_EXT_MASK); +	if (extfilt) +		filter_mask = nla_get_u32(extfilt); + +	rcu_read_lock(); +	for_each_netdev_rcu(net, dev) { +		const struct net_device_ops *ops = dev->netdev_ops; +		struct net_device *br_dev = netdev_master_upper_dev_get(dev); + +		if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { +			if (idx >= cb->args[0] && +			    br_dev->netdev_ops->ndo_bridge_getlink( +				    skb, portid, seq, dev, filter_mask) < 0) +				break; +			idx++; +		} + +		if (ops->ndo_bridge_getlink) { +			if (idx >= cb->args[0] && +			    ops->ndo_bridge_getlink(skb, portid, seq, dev, +						    filter_mask) < 0) +				break; +			idx++; +		} +	} +	rcu_read_unlock(); +	cb->args[0] = idx; + +	return skb->len; +} + +static inline size_t bridge_nlmsg_size(void) +{ +	return NLMSG_ALIGN(sizeof(struct ifinfomsg)) +		+ nla_total_size(IFNAMSIZ)	/* IFLA_IFNAME */ +		+ nla_total_size(MAX_ADDR_LEN)	/* IFLA_ADDRESS */ +		+ nla_total_size(sizeof(u32))	/* IFLA_MASTER */ +		+ nla_total_size(sizeof(u32))	/* IFLA_MTU */ +		+ nla_total_size(sizeof(u32))	/* IFLA_LINK */ +		+ nla_total_size(sizeof(u32))	/* IFLA_OPERSTATE */ +		+ nla_total_size(sizeof(u8))	/* IFLA_PROTINFO */ +		+ nla_total_size(sizeof(struct nlattr))	/* IFLA_AF_SPEC */ +		+ nla_total_size(sizeof(u16))	/* IFLA_BRIDGE_FLAGS */ +		+ nla_total_size(sizeof(u16));	/* IFLA_BRIDGE_MODE */ +} + +static int rtnl_bridge_notify(struct net_device *dev, u16 flags) +{ +	struct net *net = dev_net(dev); +	struct net_device *br_dev = netdev_master_upper_dev_get(dev); +	struct sk_buff *skb; +	int err = -EOPNOTSUPP; + +	skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); +	if (!skb) { +		err = -ENOMEM; +		goto errout; +	} + +	if ((!flags || (flags & BRIDGE_FLAGS_MASTER)) && +	    br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { +		err = br_dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); +		if (err < 0) +			goto errout; +	} + +	if ((flags 
& BRIDGE_FLAGS_SELF) && +	    dev->netdev_ops->ndo_bridge_getlink) { +		err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); +		if (err < 0) +			goto errout; +	} + +	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); +	return 0; +errout: +	WARN_ON(err == -EMSGSIZE); +	kfree_skb(skb); +	rtnl_set_sk_err(net, RTNLGRP_LINK, err); +	return err; +} + +static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, +			       void *arg) +{ +	struct net *net = sock_net(skb->sk); +	struct ifinfomsg *ifm; +	struct net_device *dev; +	struct nlattr *br_spec, *attr = NULL; +	int rem, err = -EOPNOTSUPP; +	u16 oflags, flags = 0; +	bool have_flags = false; + +	if (nlmsg_len(nlh) < sizeof(*ifm)) +		return -EINVAL; + +	ifm = nlmsg_data(nlh); +	if (ifm->ifi_family != AF_BRIDGE) +		return -EPFNOSUPPORT; + +	dev = __dev_get_by_index(net, ifm->ifi_index); +	if (!dev) { +		pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); +		return -ENODEV; +	} + +	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); +	if (br_spec) { +		nla_for_each_nested(attr, br_spec, rem) { +			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { +				have_flags = true; +				flags = nla_get_u16(attr); +				break; +			} +		} +	} + +	oflags = flags; + +	if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { +		struct net_device *br_dev = netdev_master_upper_dev_get(dev); + +		if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) { +			err = -EOPNOTSUPP; +			goto out; +		} + +		err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh); +		if (err) +			goto out; + +		flags &= ~BRIDGE_FLAGS_MASTER; +	} + +	if ((flags & BRIDGE_FLAGS_SELF)) { +		if (!dev->netdev_ops->ndo_bridge_setlink) +			err = -EOPNOTSUPP; +		else +			err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh); + +		if (!err) +			flags &= ~BRIDGE_FLAGS_SELF; +	} + +	if (have_flags) +		memcpy(nla_data(attr), &flags, sizeof(flags)); +	/* Generate event to notify upper layer of bridge change */ +	if (!err) +		err = rtnl_bridge_notify(dev, oflags); +out: +	return err; +} + +static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, +			       void *arg) +{ +	struct net *net = sock_net(skb->sk); +	struct ifinfomsg *ifm; +	struct net_device *dev; +	struct nlattr *br_spec, *attr = NULL; +	int rem, err = -EOPNOTSUPP; +	u16 oflags, flags = 0; +	bool have_flags = false; + +	if (nlmsg_len(nlh) < sizeof(*ifm)) +		return -EINVAL; + +	ifm = nlmsg_data(nlh); +	if (ifm->ifi_family != AF_BRIDGE) +		return -EPFNOSUPPORT; + +	dev = __dev_get_by_index(net, ifm->ifi_index); +	if (!dev) { +		pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); +		return -ENODEV; +	} + +	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); +	if (br_spec) { +		nla_for_each_nested(attr, br_spec, rem) { +			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { +				have_flags = true; +				flags = nla_get_u16(attr); +				break; +			} +		} +	} + +	oflags = flags; + +	if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { +		struct net_device *br_dev = netdev_master_upper_dev_get(dev); + +		if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) { +			err = -EOPNOTSUPP; +			goto out; +		} + +		err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh); +		if (err) +			goto out; + +		flags &= ~BRIDGE_FLAGS_MASTER; +	} + +	if ((flags & BRIDGE_FLAGS_SELF)) { +		if (!dev->netdev_ops->ndo_bridge_dellink) +			err = -EOPNOTSUPP; +		else +			err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + +		if (!err) +			flags &= ~BRIDGE_FLAGS_SELF; +	} + +	if (have_flags) +		memcpy(nla_data(attr), 
&flags, sizeof(flags)); +	/* Generate event to notify upper layer of bridge change */ +	if (!err) +		err = rtnl_bridge_notify(dev, oflags); +out: +	return err; +} +  /* Protected by RTNL sempahore.  */  static struct rtattr **rta_buf;  static int rtattr_max; @@ -2282,7 +2581,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	sz_idx = type>>2;  	kind = type&3; -	if (kind != 2 && !capable(CAP_NET_ADMIN)) +	if (kind != 2 && !ns_capable(net->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { @@ -2433,5 +2732,9 @@ void __init rtnetlink_init(void)  	rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);  	rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);  	rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL); + +	rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL); +	rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL); +	rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);  } diff --git a/net/core/scm.c b/net/core/scm.c index ab570841a53..905dcc6ad1e 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -35,6 +35,7 @@  #include <net/sock.h>  #include <net/compat.h>  #include <net/scm.h> +#include <net/cls_cgroup.h>  /* @@ -51,11 +52,11 @@ static __inline__ int scm_check_creds(struct ucred *creds)  	if (!uid_valid(uid) || !gid_valid(gid))  		return -EINVAL; -	if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && +	if ((creds->pid == task_tgid_vnr(current) || nsown_capable(CAP_SYS_ADMIN)) &&  	    ((uid_eq(uid, cred->uid)   || uid_eq(uid, cred->euid) || -	      uid_eq(uid, cred->suid)) || capable(CAP_SETUID)) && +	      uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) &&  	    ((gid_eq(gid, cred->gid)   || gid_eq(gid, cred->egid) || -	      gid_eq(gid, cred->sgid)) || capable(CAP_SETGID))) { +	      gid_eq(gid, cred->sgid)) || nsown_capable(CAP_SETGID))) {  	       return 0;  	}  	return -EPERM; @@ -302,8 +303,10 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)  		}  		/* Bump the usage count and install the file. */  		sock = sock_from_file(fp[i], &err); -		if (sock) +		if (sock) {  			sock_update_netprioidx(sock->sk, current); +			sock_update_classid(sock->sk, current); +		}  		fd_install(new_fd, get_file(fp[i]));  	} diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4007c1437fd..33245ef54c3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -104,47 +104,37 @@ static const struct pipe_buf_operations sock_pipe_buf_ops = {  	.get = sock_pipe_buf_get,  }; -/* - *	Keep out-of-line to prevent kernel bloat. - *	__builtin_return_address is not used because it is not always - *	reliable. - */ -  /** - *	skb_over_panic	- 	private function - *	@skb: buffer - *	@sz: size - *	@here: address + *	skb_panic - private function for out-of-line support + *	@skb:	buffer + *	@sz:	size + *	@addr:	address + *	@msg:	skb_over_panic or skb_under_panic   * - *	Out of line support code for skb_put(). Not user callable. + *	Out-of-line support for skb_put() and skb_push(). + *	Called via the wrapper skb_over_panic() or skb_under_panic(). + *	Keep out of line to prevent kernel bloat. + *	__builtin_return_address is not used because it is not always reliable.   
*/ -static void skb_over_panic(struct sk_buff *skb, int sz, void *here) +static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, +		      const char msg[])  {  	pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", -		 __func__, here, skb->len, sz, skb->head, skb->data, +		 msg, addr, skb->len, sz, skb->head, skb->data,  		 (unsigned long)skb->tail, (unsigned long)skb->end,  		 skb->dev ? skb->dev->name : "<NULL>");  	BUG();  } -/** - *	skb_under_panic	- 	private function - *	@skb: buffer - *	@sz: size - *	@here: address - * - *	Out of line support code for skb_push(). Not user callable. - */ - -static void skb_under_panic(struct sk_buff *skb, int sz, void *here) +static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)  { -	pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", -		 __func__, here, skb->len, sz, skb->head, skb->data, -		 (unsigned long)skb->tail, (unsigned long)skb->end, -		 skb->dev ? skb->dev->name : "<NULL>"); -	BUG(); +	skb_panic(skb, sz, addr, __func__);  } +static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) +{ +	skb_panic(skb, sz, addr, __func__); +}  /*   * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells @@ -155,8 +145,9 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)   */  #define kmalloc_reserve(size, gfp, node, pfmemalloc) \  	 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) -void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip, -			 bool *pfmemalloc) + +static void *__kmalloc_reserve(size_t size, gfp_t flags, int node, +			       unsigned long ip, bool *pfmemalloc)  {  	void *obj;  	bool ret_pfmemalloc = false; @@ -259,6 +250,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,  	skb->end = skb->tail + size;  #ifdef NET_SKBUFF_DATA_USES_OFFSET  	skb->mac_header = ~0U; +	skb->transport_header = ~0U;  #endif  	/* make sure we initialize shinfo sequentially */ @@ -327,6 +319,7 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)  	skb->end = skb->tail + size;  #ifdef NET_SKBUFF_DATA_USES_OFFSET  	skb->mac_header = ~0U; +	skb->transport_header = ~0U;  #endif  	/* make sure we initialize shinfo sequentially */ @@ -348,10 +341,6 @@ struct netdev_alloc_cache {  };  static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); -#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768) -#define NETDEV_FRAG_PAGE_MAX_SIZE  (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER) -#define NETDEV_PAGECNT_MAX_BIAS	   NETDEV_FRAG_PAGE_MAX_SIZE -  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)  {  	struct netdev_alloc_cache *nc; @@ -519,7 +508,7 @@ static void skb_release_data(struct sk_buff *skb)  			uarg = skb_shinfo(skb)->destructor_arg;  			if (uarg->callback) -				uarg->callback(uarg); +				uarg->callback(uarg, true);  		}  		if (skb_has_frag_list(skb)) @@ -635,6 +624,26 @@ void kfree_skb(struct sk_buff *skb)  EXPORT_SYMBOL(kfree_skb);  /** + *	skb_tx_error - report an sk_buff xmit error + *	@skb: buffer that triggered an error + * + *	Report xmit error if a device callback is tracking this skb. + *	skb must be freed afterwards. 
+ */ +void skb_tx_error(struct sk_buff *skb) +{ +	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { +		struct ubuf_info *uarg; + +		uarg = skb_shinfo(skb)->destructor_arg; +		if (uarg->callback) +			uarg->callback(uarg, false); +		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; +	} +} +EXPORT_SYMBOL(skb_tx_error); + +/**   *	consume_skb - free an skbuff   *	@skb: buffer to free   * @@ -662,11 +671,14 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)  	new->transport_header	= old->transport_header;  	new->network_header	= old->network_header;  	new->mac_header		= old->mac_header; +	new->inner_transport_header = old->inner_transport_header; +	new->inner_network_header = old->inner_network_header;  	skb_dst_copy(new, old);  	new->rxhash		= old->rxhash;  	new->ooo_okay		= old->ooo_okay;  	new->l4_rxhash		= old->l4_rxhash;  	new->no_fcs		= old->no_fcs; +	new->encapsulation	= old->encapsulation;  #ifdef CONFIG_XFRM  	new->sp			= secpath_get(old->sp);  #endif @@ -797,7 +809,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)  	for (i = 0; i < num_frags; i++)  		skb_frag_unref(skb, i); -	uarg->callback(uarg); +	uarg->callback(uarg, false);  	/* skb frags point to kernel buffers */  	for (i = num_frags - 1; i >= 0; i--) { @@ -872,6 +884,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)  	new->network_header   += offset;  	if (skb_mac_header_was_set(new))  		new->mac_header	      += offset; +	new->inner_transport_header += offset; +	new->inner_network_header   += offset;  #endif  	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;  	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; @@ -1069,6 +1083,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,  	skb->network_header   += off;  	if (skb_mac_header_was_set(skb))  		skb->mac_header += off; +	skb->inner_transport_header += off; +	skb->inner_network_header += off;  	/* Only adjust this if it actually is csum_start rather than csum */  	if (skb->ip_summed == CHECKSUM_PARTIAL)  		skb->csum_start += nhead; @@ -1168,6 +1184,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,  	n->network_header   += off;  	if (skb_mac_header_was_set(skb))  		n->mac_header += off; +	n->inner_transport_header += off; +	n->inner_network_header	   += off;  #endif  	return n; @@ -1620,7 +1638,7 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)  static struct page *linear_to_page(struct page *page, unsigned int *len,  				   unsigned int *offset, -				   struct sk_buff *skb, struct sock *sk) +				   struct sock *sk)  {  	struct page_frag *pfrag = sk_page_frag(sk); @@ -1653,14 +1671,14 @@ static bool spd_can_coalesce(const struct splice_pipe_desc *spd,  static bool spd_fill_page(struct splice_pipe_desc *spd,  			  struct pipe_inode_info *pipe, struct page *page,  			  unsigned int *len, unsigned int offset, -			  struct sk_buff *skb, bool linear, +			  bool linear,  			  struct sock *sk)  {  	if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))  		return true;  	if (linear) { -		page = linear_to_page(page, len, &offset, skb, sk); +		page = linear_to_page(page, len, &offset, sk);  		if (!page)  			return true;  	} @@ -1677,23 +1695,9 @@ static bool spd_fill_page(struct splice_pipe_desc *spd,  	return false;  } -static inline void __segment_seek(struct page **page, unsigned int *poff, -				  unsigned int *plen, unsigned int off) -{ -	unsigned long n; - -	*poff += off; -	n = *poff / PAGE_SIZE; -	if (n) -		*page = nth_page(*page, n); - -	*poff = 
*poff % PAGE_SIZE; -	*plen -= off; -} -  static bool __splice_segment(struct page *page, unsigned int poff,  			     unsigned int plen, unsigned int *off, -			     unsigned int *len, struct sk_buff *skb, +			     unsigned int *len,  			     struct splice_pipe_desc *spd, bool linear,  			     struct sock *sk,  			     struct pipe_inode_info *pipe) @@ -1708,23 +1712,19 @@ static bool __splice_segment(struct page *page, unsigned int poff,  	}  	/* ignore any bits we already processed */ -	if (*off) { -		__segment_seek(&page, &poff, &plen, *off); -		*off = 0; -	} +	poff += *off; +	plen -= *off; +	*off = 0;  	do {  		unsigned int flen = min(*len, plen); -		/* the linear region may spread across several pages  */ -		flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - -		if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) +		if (spd_fill_page(spd, pipe, page, &flen, poff, +				  linear, sk))  			return true; - -		__segment_seek(&page, &poff, &plen, flen); +		poff += flen; +		plen -= flen;  		*len -= flen; -  	} while (*len && plen);  	return false; @@ -1748,7 +1748,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,  	if (__splice_segment(virt_to_page(skb->data),  			     (unsigned long) skb->data & (PAGE_SIZE - 1),  			     skb_headlen(skb), -			     offset, len, skb, spd, +			     offset, len, spd,  			     skb_head_is_locked(skb),  			     sk, pipe))  		return true; @@ -1761,7 +1761,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,  		if (__splice_segment(skb_frag_page(f),  				     f->page_offset, skb_frag_size(f), -				     offset, len, skb, spd, false, sk, pipe)) +				     offset, len, spd, false, sk, pipe))  			return true;  	} @@ -2326,6 +2326,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)  {  	int pos = skb_headlen(skb); +	skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;  	if (len < pos)	/* Split line is inside header. */  		skb_split_inside_header(skb, skb1, len, pos);  	else		/* Second chunk has no header, nothing to copy. */ @@ -2657,48 +2658,37 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,  					int len, int odd, struct sk_buff *skb),  			void *from, int length)  { -	int frg_cnt = 0; -	skb_frag_t *frag = NULL; -	struct page *page = NULL; -	int copy, left; +	int frg_cnt = skb_shinfo(skb)->nr_frags; +	int copy;  	int offset = 0;  	int ret; +	struct page_frag *pfrag = ¤t->task_frag;  	do {  		/* Return error if we don't have space for new frag */ -		frg_cnt = skb_shinfo(skb)->nr_frags;  		if (frg_cnt >= MAX_SKB_FRAGS) -			return -EFAULT; +			return -EMSGSIZE; -		/* allocate a new page for next frag */ -		page = alloc_pages(sk->sk_allocation, 0); - -		/* If alloc_page fails just return failure and caller will -		 * free previous allocated pages by doing kfree_skb() -		 */ -		if (page == NULL) +		if (!sk_page_frag_refill(sk, pfrag))  			return -ENOMEM; -		/* initialize the next frag */ -		skb_fill_page_desc(skb, frg_cnt, page, 0, 0); -		skb->truesize += PAGE_SIZE; -		atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); - -		/* get the new initialized frag */ -		frg_cnt = skb_shinfo(skb)->nr_frags; -		frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; -  		/* copy the user data to page */ -		left = PAGE_SIZE - frag->page_offset; -		copy = (length > left)? 
left : length; +		copy = min_t(int, length, pfrag->size - pfrag->offset); -		ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag), -			    offset, copy, 0, skb); +		ret = getfrag(from, page_address(pfrag->page) + pfrag->offset, +			      offset, copy, 0, skb);  		if (ret < 0)  			return -EFAULT;  		/* copy was successful so update the size parameters */ -		skb_frag_size_add(frag, copy); +		skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset, +				   copy); +		frg_cnt++; +		pfrag->offset += copy; +		get_page(pfrag->page); + +		skb->truesize += copy; +		atomic_add(copy, &sk->sk_wmem_alloc);  		skb->len += copy;  		skb->data_len += copy;  		offset += copy; @@ -2748,6 +2738,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)  	unsigned int mss = skb_shinfo(skb)->gso_size;  	unsigned int doffset = skb->data - skb_mac_header(skb);  	unsigned int offset = doffset; +	unsigned int tnl_hlen = skb_tnl_header_len(skb);  	unsigned int headroom;  	unsigned int len;  	int sg = !!(features & NETIF_F_SG); @@ -2824,7 +2815,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)  		skb_set_network_header(nskb, skb->mac_len);  		nskb->transport_header = (nskb->network_header +  					  skb_network_header_len(skb)); -		skb_copy_from_linear_data(skb, nskb->data, doffset); + +		skb_copy_from_linear_data_offset(skb, -tnl_hlen, +						 nskb->data - tnl_hlen, +						 doffset + tnl_hlen);  		if (fskb != skb_shinfo(skb)->frag_list)  			continue; @@ -2842,6 +2836,8 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)  		skb_copy_from_linear_data_offset(skb, offset,  						 skb_put(nskb, hsize), hsize); +		skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; +  		while (pos < offset + len && i < nfrags) {  			*frag = skb_shinfo(skb)->frags[i];  			__skb_frag_ref(frag); @@ -2999,12 +2995,11 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)  	memcpy(skb_mac_header(nskb), skb_mac_header(p),  	       p->data - skb_mac_header(p)); -	*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);  	skb_shinfo(nskb)->frag_list = p;  	skb_shinfo(nskb)->gso_size = pinfo->gso_size;  	pinfo->gso_size = 0;  	skb_header_release(p); -	nskb->prev = p; +	NAPI_GRO_CB(nskb)->last = p;  	nskb->data_len += p->len;  	nskb->truesize += p->truesize; @@ -3030,8 +3025,8 @@ merge:  	__skb_pull(skb, offset); -	p->prev->next = skb; -	p->prev = skb; +	NAPI_GRO_CB(p)->last->next = skb; +	NAPI_GRO_CB(p)->last = skb;  	skb_header_release(skb);  done: diff --git a/net/core/sock.c b/net/core/sock.c index 8a146cfcc36..b261a797774 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -186,8 +186,10 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)  static struct lock_class_key af_family_keys[AF_MAX];  static struct lock_class_key af_family_slock_keys[AF_MAX]; +#if defined(CONFIG_MEMCG_KMEM)  struct static_key memcg_socket_limit_enabled;  EXPORT_SYMBOL(memcg_socket_limit_enabled); +#endif  /*   * Make lock validator output more readable. (we pre-construct these @@ -505,7 +507,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)  }  EXPORT_SYMBOL(sk_dst_check); -static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) +static int sock_setbindtodevice(struct sock *sk, char __user *optval, +				int optlen)  {  	int ret = -ENOPROTOOPT;  #ifdef CONFIG_NETDEVICES @@ -515,7 +518,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)  	/* Sorry... 
@@ -2748,6 +2738,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
 	unsigned int mss = skb_shinfo(skb)->gso_size;
 	unsigned int doffset = skb->data - skb_mac_header(skb);
 	unsigned int offset = doffset;
+	unsigned int tnl_hlen = skb_tnl_header_len(skb);
 	unsigned int headroom;
 	unsigned int len;
 	int sg = !!(features & NETIF_F_SG);
@@ -2824,7 +2815,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
 		skb_set_network_header(nskb, skb->mac_len);
 		nskb->transport_header = (nskb->network_header +
 					  skb_network_header_len(skb));
-		skb_copy_from_linear_data(skb, nskb->data, doffset);
+
+		skb_copy_from_linear_data_offset(skb, -tnl_hlen,
+						 nskb->data - tnl_hlen,
+						 doffset + tnl_hlen);

 		if (fskb != skb_shinfo(skb)->frag_list)
 			continue;
@@ -2842,6 +2836,8 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
 		skb_copy_from_linear_data_offset(skb, offset,
 						 skb_put(nskb, hsize), hsize);

+		skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
+
 		while (pos < offset + len && i < nfrags) {
 			*frag = skb_shinfo(skb)->frags[i];
 			__skb_frag_ref(frag);
@@ -2999,12 +2995,11 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	memcpy(skb_mac_header(nskb), skb_mac_header(p),
 	       p->data - skb_mac_header(p));

-	*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
 	skb_shinfo(nskb)->frag_list = p;
 	skb_shinfo(nskb)->gso_size = pinfo->gso_size;
 	pinfo->gso_size = 0;
 	skb_header_release(p);
-	nskb->prev = p;
+	NAPI_GRO_CB(nskb)->last = p;

 	nskb->data_len += p->len;
 	nskb->truesize += p->truesize;
@@ -3030,8 +3025,8 @@ merge:

 	__skb_pull(skb, offset);

-	p->prev->next = skb;
-	p->prev = skb;
+	NAPI_GRO_CB(p)->last->next = skb;
+	NAPI_GRO_CB(p)->last = skb;
 	skb_header_release(skb);

 done:
diff --git a/net/core/sock.c b/net/core/sock.c
index 8a146cfcc36..b261a797774 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -186,8 +186,10 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 static struct lock_class_key af_family_keys[AF_MAX];
 static struct lock_class_key af_family_slock_keys[AF_MAX];

+#if defined(CONFIG_MEMCG_KMEM)
 struct static_key memcg_socket_limit_enabled;
 EXPORT_SYMBOL(memcg_socket_limit_enabled);
+#endif

 /*
  * Make lock validator output more readable. (we pre-construct these
@@ -505,7 +507,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 }
 EXPORT_SYMBOL(sk_dst_check);

-static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
+static int sock_setbindtodevice(struct sock *sk, char __user *optval,
+				int optlen)
 {
 	int ret = -ENOPROTOOPT;
 #ifdef CONFIG_NETDEVICES
@@ -515,7 +518,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)

 	/* Sorry... */
 	ret = -EPERM;
-	if (!capable(CAP_NET_RAW))
+	if (!ns_capable(net->user_ns, CAP_NET_RAW))
 		goto out;

 	ret = -EINVAL;
@@ -562,6 +565,59 @@ out:
 	return ret;
 }

+static int sock_getbindtodevice(struct sock *sk, char __user *optval,
+				int __user *optlen, int len)
+{
+	int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+	struct net *net = sock_net(sk);
+	struct net_device *dev;
+	char devname[IFNAMSIZ];
+	unsigned seq;
+
+	if (sk->sk_bound_dev_if == 0) {
+		len = 0;
+		goto zero;
+	}
+
+	ret = -EINVAL;
+	if (len < IFNAMSIZ)
+		goto out;
+
+retry:
+	seq = read_seqcount_begin(&devnet_rename_seq);
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+	ret = -ENODEV;
+	if (!dev) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	strcpy(devname, dev->name);
+	rcu_read_unlock();
+	if (read_seqcount_retry(&devnet_rename_seq, seq))
+		goto retry;
+
+	len = strlen(devname) + 1;
+
+	ret = -EFAULT;
+	if (copy_to_user(optval, devname, len))
+		goto out;
+
+zero:
+	ret = -EFAULT;
+	if (put_user(len, optlen))
+		goto out;
+
+	ret = 0;
+
+out:
+#endif
+
+	return ret;
+}
+
 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 {
 	if (valbool)
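
Aside (not part of the patch): sock_getbindtodevice() snapshots the device name under rcu_read_lock() and then consults devnet_rename_seq, retrying rather than returning a name torn by a concurrent rename. The same reader/writer shape, modeled in user space with C11 atomics; names are mine and the memory ordering is simplified relative to a real seqlock:

#include <stdatomic.h>
#include <string.h>

#define NAMESZ 16
static _Atomic unsigned int rename_seq;	/* even = stable, odd = writer active */
static char dev_name[NAMESZ];

/* Writer: bump to odd, mutate, bump back to even. */
static void rename_dev(const char *newname)
{
	atomic_fetch_add_explicit(&rename_seq, 1, memory_order_release);
	strncpy(dev_name, newname, NAMESZ - 1);
	atomic_fetch_add_explicit(&rename_seq, 1, memory_order_release);
}

/* Reader: retry until a copy was made with no writer in between,
 * the same shape as the read_seqcount_begin()/read_seqcount_retry()
 * loop in sock_getbindtodevice(). */
static void read_dev(char *out)
{
	unsigned int seq;

	do {
		do {	/* wait out an in-progress writer (odd count) */
			seq = atomic_load_explicit(&rename_seq,
						   memory_order_acquire);
		} while (seq & 1);
		memcpy(out, dev_name, NAMESZ);
	} while (atomic_load_explicit(&rename_seq,
				      memory_order_acquire) != seq);
}

int main(void)
{
	char name[NAMESZ];

	rename_dev("eth0");
	read_dev(name);
	return name[0] != 'e';
}
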
@@ -589,7 +645,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 	 */

 	if (optname == SO_BINDTODEVICE)
-		return sock_bindtodevice(sk, optval, optlen);
+		return sock_setbindtodevice(sk, optval, optlen);

 	if (optlen < sizeof(int))
 		return -EINVAL;
@@ -611,6 +667,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 	case SO_REUSEADDR:
 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 		break;
+	case SO_REUSEPORT:
+		sk->sk_reuseport = valbool;
+		break;
 	case SO_TYPE:
 	case SO_PROTOCOL:
 	case SO_DOMAIN:
@@ -696,7 +755,8 @@ set_rcvbuf:
 		break;

 	case SO_PRIORITY:
-		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
+		if ((val >= 0 && val <= 6) ||
+		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 			sk->sk_priority = val;
 		else
 			ret = -EPERM;
@@ -806,6 +866,13 @@ set_rcvbuf:
 		ret = sk_detach_filter(sk);
 		break;

+	case SO_LOCK_FILTER:
+		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
+			ret = -EPERM;
+		else
+			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
+		break;
+
 	case SO_PASSSEC:
 		if (valbool)
 			set_bit(SOCK_PASSSEC, &sock->flags);
@@ -813,7 +880,7 @@ set_rcvbuf:
 			clear_bit(SOCK_PASSSEC, &sock->flags);
 		break;
 	case SO_MARK:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 			ret = -EPERM;
 		else
 			sk->sk_mark = val;
@@ -910,6 +977,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sk->sk_reuse;
 		break;

+	case SO_REUSEPORT:
+		v.val = sk->sk_reuseport;
+		break;
+
 	case SO_KEEPALIVE:
 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
 		break;
@@ -1074,6 +1145,21 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 	case SO_NOFCS:
 		v.val = sock_flag(sk, SOCK_NOFCS);
 		break;
+
+	case SO_BINDTODEVICE:
+		return sock_getbindtodevice(sk, optval, optlen, len);
+
+	case SO_GET_FILTER:
+		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+		if (len < 0)
+			return len;
+
+		goto lenout;
+
+	case SO_LOCK_FILTER:
+		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
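
Aside (not part of the patch): the socket-option changes above are user-visible, so they can be exercised from a plain program. A minimal check, assuming headers new enough to define SO_REUSEPORT (the fallback value below is the asm-generic one) and CAP_NET_RAW for the device bind:

#include <stdio.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

#ifndef SO_REUSEPORT
#define SO_REUSEPORT 15		/* asm-generic value; for older headers */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1;
	char name[IFNAMSIZ];
	socklen_t len = sizeof(name);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* SO_REUSEPORT: new boolean toggle added by this series. */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) < 0)
		perror("SO_REUSEPORT");

	/* SO_BINDTODEVICE used to be set-only; with sock_getbindtodevice()
	 * it can now be read back. Binding needs CAP_NET_RAW. */
	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "lo", 3) < 0)
		perror("bind to lo");
	if (getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, name, &len) == 0)
		printf("bound to %s\n", name);

	close(fd);
	return 0;
}
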
@@ -1214,13 +1300,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)

 #ifdef CONFIG_CGROUPS
 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
-void sock_update_classid(struct sock *sk)
+void sock_update_classid(struct sock *sk, struct task_struct *task)
 {
 	u32 classid;

-	rcu_read_lock();  /* doing current task, which cannot vanish. */
-	classid = task_cls_classid(current);
-	rcu_read_unlock();
+	classid = task_cls_classid(task);
 	if (classid != sk->sk_classid)
 		sk->sk_classid = classid;
 }
@@ -1263,7 +1347,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		sock_net_set(sk, get_net(net));
 		atomic_set(&sk->sk_wmem_alloc, 1);

-		sock_update_classid(sk);
+		sock_update_classid(sk, current);
 		sock_update_netprioidx(sk, current);
 	}

@@ -2148,7 +2232,7 @@ EXPORT_SYMBOL(sk_reset_timer);

 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
 {
-	if (timer_pending(timer) && del_timer(timer))
+	if (del_timer(timer))
 		__sock_put(sk);
 }
 EXPORT_SYMBOL(sk_stop_timer);
@@ -2754,7 +2838,7 @@ static const struct file_operations proto_seq_fops = {

 static __net_init int proto_init_net(struct net *net)
 {
-	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
+	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
 		return -ENOMEM;

 	return 0;
@@ -2762,7 +2846,7 @@ static __net_init int proto_init_net(struct net *net)

 static __net_exit void proto_exit_net(struct net *net)
 {
-	proc_net_remove(net, "protocols");
+	remove_proc_entry("protocols", net->proc_net);
 }
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 602cd637182..a29e90cf36b 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -97,21 +97,6 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld)
 }
 EXPORT_SYMBOL_GPL(sock_diag_unregister);

-static const inline struct sock_diag_handler *sock_diag_lock_handler(int family)
-{
-	if (sock_diag_handlers[family] == NULL)
-		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-				NETLINK_SOCK_DIAG, family);
-
-	mutex_lock(&sock_diag_table_mutex);
-	return sock_diag_handlers[family];
-}
-
-static inline void sock_diag_unlock_handler(const struct sock_diag_handler *h)
-{
-	mutex_unlock(&sock_diag_table_mutex);
-}
-
 static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	int err;
@@ -121,12 +106,20 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (nlmsg_len(nlh) < sizeof(*req))
 		return -EINVAL;

-	hndl = sock_diag_lock_handler(req->sdiag_family);
+	if (req->sdiag_family >= AF_MAX)
+		return -EINVAL;
+
+	if (sock_diag_handlers[req->sdiag_family] == NULL)
+		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+				NETLINK_SOCK_DIAG, req->sdiag_family);
+
+	mutex_lock(&sock_diag_table_mutex);
+	hndl = sock_diag_handlers[req->sdiag_family];
 	if (hndl == NULL)
 		err = -ENOENT;
 	else
 		err = hndl->dump(skb, nlh);
-	sock_diag_unlock_handler(hndl);
+	mutex_unlock(&sock_diag_table_mutex);

 	return err;
 }
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a7c36845b12..cfdb46ab3a7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -20,6 +20,8 @@
 #include <net/sock.h>
 #include <net/net_ratelimit.h>

+static int one = 1;
+
 #ifdef CONFIG_RPS
 static int rps_sock_flow_sysctl(ctl_table *table, int write,
 				void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -92,28 +94,32 @@ static struct ctl_table net_core_table[] = {
 		.data		= &sysctl_wmem_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
 	},
 	{
 		.procname	= "rmem_max",
 		.data		= &sysctl_rmem_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
 	},
 	{
 		.procname	= "wmem_default",
 		.data		= &sysctl_wmem_default,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
 	},
 	{
 		.procname	= "rmem_default",
 		.data		= &sysctl_rmem_default,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
 	},
 	{
 		.procname	= "dev_weight",
@@ -216,6 +222,11 @@ static __net_init int sysctl_core_net_init(struct net *net)
 			goto err_dup;

 		tbl[0].data = &net->core.sysctl_somaxconn;
+
+		/* Don't export any sysctls to unprivileged users */
+		if (net->user_ns != &init_user_ns) {
+			tbl[0].procname = NULL;
+		}
 	}

 	net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
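
Aside (not part of the patch): switching the four buffer sysctls to proc_dointvec_minmax with .extra1 = &one means writes below 1 should now be rejected rather than stored. A quick root-only probe of that expected behaviour:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* needs root; after this patch the write should fail with EINVAL */
	int fd = open("/proc/sys/net/core/rmem_max", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "0\n", 2) < 0)
		printf("write 0 rejected: %s\n", strerror(errno));
	else
		printf("write 0 accepted (pre-patch behaviour)\n");
	close(fd);
	return 0;
}
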