diff options
Diffstat (limited to 'net/core')
| -rw-r--r-- | net/core/dev.c | 244 | ||||
| -rw-r--r-- | net/core/ethtool.c | 2 | ||||
| -rw-r--r-- | net/core/filter.c | 139 | ||||
| -rw-r--r-- | net/core/flow.c | 4 | ||||
| -rw-r--r-- | net/core/neighbour.c | 20 | ||||
| -rw-r--r-- | net/core/net-sysfs.c | 21 | ||||
| -rw-r--r-- | net/core/net_namespace.c | 55 | ||||
| -rw-r--r-- | net/core/netpoll.c | 6 | ||||
| -rw-r--r-- | net/core/netprio_cgroup.c | 262 | ||||
| -rw-r--r-- | net/core/pktgen.c | 47 | ||||
| -rw-r--r-- | net/core/rtnetlink.c | 230 | ||||
| -rw-r--r-- | net/core/scm.c | 6 | ||||
| -rw-r--r-- | net/core/skbuff.c | 34 | ||||
| -rw-r--r-- | net/core/sock.c | 84 | ||||
| -rw-r--r-- | net/core/sysctl_net_core.c | 5 | 
15 files changed, 862 insertions, 297 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index e5942bf45a6..f64e439b4a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -176,8 +176,10 @@  #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)  static DEFINE_SPINLOCK(ptype_lock); +static DEFINE_SPINLOCK(offload_lock);  static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;  static struct list_head ptype_all __read_mostly;	/* Taps */ +static struct list_head offload_base __read_mostly;  /*   * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -201,6 +203,8 @@ static struct list_head ptype_all __read_mostly;	/* Taps */  DEFINE_RWLOCK(dev_base_lock);  EXPORT_SYMBOL(dev_base_lock); +seqcount_t devnet_rename_seq; +  static inline void dev_base_seq_inc(struct net *net)  {  	while (++net->dev_base_seq == 0); @@ -470,6 +474,82 @@ void dev_remove_pack(struct packet_type *pt)  }  EXPORT_SYMBOL(dev_remove_pack); + +/** + *	dev_add_offload - register offload handlers + *	@po: protocol offload declaration + * + *	Add protocol offload handlers to the networking stack. The passed + *	&proto_offload is linked into kernel lists and may not be freed until + *	it has been removed from the kernel lists. + * + *	This call does not sleep therefore it can not + *	guarantee all CPU's that are in middle of receiving packets + *	will see the new offload handlers (until the next received packet). + */ +void dev_add_offload(struct packet_offload *po) +{ +	struct list_head *head = &offload_base; + +	spin_lock(&offload_lock); +	list_add_rcu(&po->list, head); +	spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(dev_add_offload); + +/** + *	__dev_remove_offload	 - remove offload handler + *	@po: packet offload declaration + * + *	Remove a protocol offload handler that was previously added to the + *	kernel offload handlers by dev_add_offload(). The passed &offload_type + *	is removed from the kernel lists and can be freed or reused once this + *	function returns. 
+ * + *      The packet type might still be in use by receivers + *	and must not be freed until after all the CPU's have gone + *	through a quiescent state. + */ +void __dev_remove_offload(struct packet_offload *po) +{ +	struct list_head *head = &offload_base; +	struct packet_offload *po1; + +	spin_lock(&offload_lock); + +	list_for_each_entry(po1, head, list) { +		if (po == po1) { +			list_del_rcu(&po->list); +			goto out; +		} +	} + +	pr_warn("dev_remove_offload: %p not found\n", po); +out: +	spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(__dev_remove_offload); + +/** + *	dev_remove_offload	 - remove packet offload handler + *	@po: packet offload declaration + * + *	Remove a packet offload handler that was previously added to the kernel + *	offload handlers by dev_add_offload(). The passed &offload_type is + *	removed from the kernel lists and can be freed or reused once this + *	function returns. + * + *	This call sleeps to guarantee that no CPU is looking at the packet + *	type after return. 
+ */ +void dev_remove_offload(struct packet_offload *po) +{ +	__dev_remove_offload(po); + +	synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_offload); +  /******************************************************************************  		      Device Boot-time Settings Routines @@ -1013,22 +1093,31 @@ int dev_change_name(struct net_device *dev, const char *newname)  	if (dev->flags & IFF_UP)  		return -EBUSY; -	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) +	write_seqcount_begin(&devnet_rename_seq); + +	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { +		write_seqcount_end(&devnet_rename_seq);  		return 0; +	}  	memcpy(oldname, dev->name, IFNAMSIZ);  	err = dev_get_valid_name(net, dev, newname); -	if (err < 0) +	if (err < 0) { +		write_seqcount_end(&devnet_rename_seq);  		return err; +	}  rollback:  	ret = device_rename(&dev->dev, dev->name);  	if (ret) {  		memcpy(dev->name, oldname, IFNAMSIZ); +		write_seqcount_end(&devnet_rename_seq);  		return ret;  	} +	write_seqcount_end(&devnet_rename_seq); +  	write_lock_bh(&dev_base_lock);  	hlist_del_rcu(&dev->name_hlist);  	write_unlock_bh(&dev_base_lock); @@ -1046,6 +1135,7 @@ rollback:  		/* err >= 0 after dev_alloc_name() or stores the first errno */  		if (err >= 0) {  			err = ret; +			write_seqcount_begin(&devnet_rename_seq);  			memcpy(dev->name, oldname, IFNAMSIZ);  			goto rollback;  		} else { @@ -1075,10 +1165,8 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)  		return -EINVAL;  	if (!len) { -		if (dev->ifalias) { -			kfree(dev->ifalias); -			dev->ifalias = NULL; -		} +		kfree(dev->ifalias); +		dev->ifalias = NULL;  		return 0;  	} @@ -1994,7 +2082,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,  	netdev_features_t features)  {  	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); -	struct packet_type *ptype; +	struct packet_offload *ptype;  	__be16 type = skb->protocol;  	int vlan_depth = ETH_HLEN;  	int err; @@ -2023,18 +2111,17 @@ struct sk_buff *skb_gso_segment(struct 
sk_buff *skb,  	}  	rcu_read_lock(); -	list_for_each_entry_rcu(ptype, -			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { -		if (ptype->type == type && !ptype->dev && ptype->gso_segment) { +	list_for_each_entry_rcu(ptype, &offload_base, list) { +		if (ptype->type == type && ptype->callbacks.gso_segment) {  			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { -				err = ptype->gso_send_check(skb); +				err = ptype->callbacks.gso_send_check(skb);  				segs = ERR_PTR(err);  				if (err || skb_gso_ok(skb, features))  					break;  				__skb_push(skb, (skb->data -  						 skb_network_header(skb)));  			} -			segs = ptype->gso_segment(skb, features); +			segs = ptype->callbacks.gso_segment(skb, features);  			break;  		}  	} @@ -2237,6 +2324,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,  			skb->vlan_tci = 0;  		} +		/* If encapsulation offload request, verify we are testing +		 * hardware encapsulation features instead of standard +		 * features for the netdev +		 */ +		if (skb->encapsulation) +			features &= dev->hw_enc_features; +  		if (netif_needs_gso(skb, features)) {  			if (unlikely(dev_gso_segment(skb, features)))  				goto out_kfree_skb; @@ -2252,8 +2346,12 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,  			 * checksumming here.  			 
*/  			if (skb->ip_summed == CHECKSUM_PARTIAL) { -				skb_set_transport_header(skb, -					skb_checksum_start_offset(skb)); +				if (skb->encapsulation) +					skb_set_inner_transport_header(skb, +						skb_checksum_start_offset(skb)); +				else +					skb_set_transport_header(skb, +						skb_checksum_start_offset(skb));  				if (!(features & NETIF_F_ALL_CSUM) &&  				     skb_checksum_help(skb))  					goto out_kfree_skb; @@ -3446,9 +3544,9 @@ static void flush_backlog(void *arg)  static int napi_gro_complete(struct sk_buff *skb)  { -	struct packet_type *ptype; +	struct packet_offload *ptype;  	__be16 type = skb->protocol; -	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; +	struct list_head *head = &offload_base;  	int err = -ENOENT;  	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); @@ -3460,10 +3558,10 @@ static int napi_gro_complete(struct sk_buff *skb)  	rcu_read_lock();  	list_for_each_entry_rcu(ptype, head, list) { -		if (ptype->type != type || ptype->dev || !ptype->gro_complete) +		if (ptype->type != type || !ptype->callbacks.gro_complete)  			continue; -		err = ptype->gro_complete(skb); +		err = ptype->callbacks.gro_complete(skb);  		break;  	}  	rcu_read_unlock(); @@ -3507,12 +3605,34 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)  }  EXPORT_SYMBOL(napi_gro_flush); -enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) +{ +	struct sk_buff *p; +	unsigned int maclen = skb->dev->hard_header_len; + +	for (p = napi->gro_list; p; p = p->next) { +		unsigned long diffs; + +		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; +		diffs |= p->vlan_tci ^ skb->vlan_tci; +		if (maclen == ETH_HLEN) +			diffs |= compare_ether_header(skb_mac_header(p), +						      skb_gro_mac_header(skb)); +		else if (!diffs) +			diffs = memcmp(skb_mac_header(p), +				       skb_gro_mac_header(skb), +				       maclen); +		
NAPI_GRO_CB(p)->same_flow = !diffs; +		NAPI_GRO_CB(p)->flush = 0; +	} +} + +static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  {  	struct sk_buff **pp = NULL; -	struct packet_type *ptype; +	struct packet_offload *ptype;  	__be16 type = skb->protocol; -	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; +	struct list_head *head = &offload_base;  	int same_flow;  	int mac_len;  	enum gro_result ret; @@ -3523,9 +3643,11 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  	if (skb_is_gso(skb) || skb_has_frag_list(skb))  		goto normal; +	gro_list_prepare(napi, skb); +  	rcu_read_lock();  	list_for_each_entry_rcu(ptype, head, list) { -		if (ptype->type != type || ptype->dev || !ptype->gro_receive) +		if (ptype->type != type || !ptype->callbacks.gro_receive)  			continue;  		skb_set_network_header(skb, skb_gro_offset(skb)); @@ -3535,7 +3657,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  		NAPI_GRO_CB(skb)->flush = 0;  		NAPI_GRO_CB(skb)->free = 0; -		pp = ptype->gro_receive(&napi->gro_list, skb); +		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);  		break;  	}  	rcu_read_unlock(); @@ -3598,34 +3720,9 @@ normal:  	ret = GRO_NORMAL;  	goto pull;  } -EXPORT_SYMBOL(dev_gro_receive); - -static inline gro_result_t -__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) -{ -	struct sk_buff *p; -	unsigned int maclen = skb->dev->hard_header_len; -	for (p = napi->gro_list; p; p = p->next) { -		unsigned long diffs; -		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; -		diffs |= p->vlan_tci ^ skb->vlan_tci; -		if (maclen == ETH_HLEN) -			diffs |= compare_ether_header(skb_mac_header(p), -						      skb_gro_mac_header(skb)); -		else if (!diffs) -			diffs = memcmp(skb_mac_header(p), -				       skb_gro_mac_header(skb), -				       maclen); -		NAPI_GRO_CB(p)->same_flow = !diffs; -		NAPI_GRO_CB(p)->flush = 0; -	} - -	return 
dev_gro_receive(napi, skb); -} - -gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)  {  	switch (ret) {  	case GRO_NORMAL: @@ -3651,7 +3748,6 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)  	return ret;  } -EXPORT_SYMBOL(napi_skb_finish);  static void skb_gro_reset_offset(struct sk_buff *skb)  { @@ -3674,7 +3770,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  {  	skb_gro_reset_offset(skb); -	return napi_skb_finish(__napi_gro_receive(napi, skb), skb); +	return napi_skb_finish(dev_gro_receive(napi, skb), skb);  }  EXPORT_SYMBOL(napi_gro_receive); @@ -3703,7 +3799,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)  }  EXPORT_SYMBOL(napi_get_frags); -gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, +static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,  			       gro_result_t ret)  {  	switch (ret) { @@ -3728,7 +3824,6 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,  	return ret;  } -EXPORT_SYMBOL(napi_frags_finish);  static struct sk_buff *napi_frags_skb(struct napi_struct *napi)  { @@ -3773,7 +3868,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)  	if (!skb)  		return GRO_DROP; -	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); +	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));  }  EXPORT_SYMBOL(napi_gro_frags); @@ -4075,6 +4170,7 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg)  {  	struct net_device *dev;  	struct ifreq ifr; +	unsigned seq;  	/*  	 *	Fetch the caller's info block. 
@@ -4083,6 +4179,8 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg)  	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))  		return -EFAULT; +retry: +	seq = read_seqcount_begin(&devnet_rename_seq);  	rcu_read_lock();  	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);  	if (!dev) { @@ -4092,6 +4190,8 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg)  	strcpy(ifr.ifr_name, dev->name);  	rcu_read_unlock(); +	if (read_seqcount_retry(&devnet_rename_seq, seq)) +		goto retry;  	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))  		return -EFAULT; @@ -4884,7 +4984,7 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)  	else  		dev->mtu = new_mtu; -	if (!err && dev->flags & IFF_UP) +	if (!err)  		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);  	return err;  } @@ -5204,7 +5304,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)  	case SIOCGMIIPHY:  	case SIOCGMIIREG:  	case SIOCSIFNAME: -		if (!capable(CAP_NET_ADMIN)) +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  			return -EPERM;  		dev_load(net, ifr.ifr_name);  		rtnl_lock(); @@ -5225,16 +5325,25 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)  	 *	- require strict serialization.  	 *	- do not return a value  	 */ +	case SIOCSIFMAP: +	case SIOCSIFTXQLEN: +		if (!capable(CAP_NET_ADMIN)) +			return -EPERM; +		/* fall through */ +	/* +	 *	These ioctl calls: +	 *	- require local superuser power. +	 *	- require strict serialization. 
+	 *	- do not return a value +	 */  	case SIOCSIFFLAGS:  	case SIOCSIFMETRIC:  	case SIOCSIFMTU: -	case SIOCSIFMAP:  	case SIOCSIFHWADDR:  	case SIOCSIFSLAVE:  	case SIOCADDMULTI:  	case SIOCDELMULTI:  	case SIOCSIFHWBROADCAST: -	case SIOCSIFTXQLEN:  	case SIOCSMIIREG:  	case SIOCBONDENSLAVE:  	case SIOCBONDRELEASE: @@ -5243,7 +5352,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)  	case SIOCBRADDIF:  	case SIOCBRDELIF:  	case SIOCSHWTSTAMP: -		if (!capable(CAP_NET_ADMIN)) +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  			return -EPERM;  		/* fall through */  	case SIOCBONDSLAVEINFOQUERY: @@ -6012,6 +6121,14 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)  static const struct ethtool_ops default_ethtool_ops; +void netdev_set_default_ethtool_ops(struct net_device *dev, +				    const struct ethtool_ops *ops) +{ +	if (dev->ethtool_ops == &default_ethtool_ops) +		dev->ethtool_ops = ops; +} +EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); +  /**   *	alloc_netdev_mqs - allocate network device   *	@sizeof_priv:	size of private data to allocate space for @@ -6268,7 +6385,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  		goto out;  	/* Ensure the device has been registrered */ -	err = -EINVAL;  	if (dev->reg_state != NETREG_REGISTERED)  		goto out; @@ -6323,6 +6439,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	dev_uc_flush(dev);  	dev_mc_flush(dev); +	/* Send a netdev-removed uevent to the old namespace */ +	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); +  	/* Actually switch the network namespace */  	dev_net_set(dev, net); @@ -6334,6 +6453,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  			dev->iflink = dev->ifindex;  	} +	/* Send a netdev-add uevent to the new namespace */ +	kobject_uevent(&dev->dev.kobj, KOBJ_ADD); +  	/* Fixup kobjects */  	err = device_rename(&dev->dev, dev->name);  	WARN_ON(err); 
@@ -6666,6 +6788,8 @@ static int __init net_dev_init(void)  	for (i = 0; i < PTYPE_HASH_SIZE; i++)  		INIT_LIST_HEAD(&ptype_base[i]); +	INIT_LIST_HEAD(&offload_base); +  	if (register_pernet_subsys(&netdev_net_ops))  		goto out; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4d64cc2e3fa..a8705432e4b 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1460,7 +1460,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	case ETHTOOL_GEEE:  		break;  	default: -		if (!capable(CAP_NET_ADMIN)) +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  			return -EPERM;  	} diff --git a/net/core/filter.c b/net/core/filter.c index 3d92ebb7fbc..c23543cba13 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -39,6 +39,7 @@  #include <linux/reciprocal_div.h>  #include <linux/ratelimit.h>  #include <linux/seccomp.h> +#include <linux/if_vlan.h>  /* No hurry in this branch   * @@ -341,6 +342,12 @@ load_b:  		case BPF_S_ANC_CPU:  			A = raw_smp_processor_id();  			continue; +		case BPF_S_ANC_VLAN_TAG: +			A = vlan_tx_tag_get(skb); +			continue; +		case BPF_S_ANC_VLAN_TAG_PRESENT: +			A = !!vlan_tx_tag_present(skb); +			continue;  		case BPF_S_ANC_NLATTR: {  			struct nlattr *nla; @@ -600,6 +607,8 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)  			ANCILLARY(RXHASH);  			ANCILLARY(CPU);  			ANCILLARY(ALU_XOR_X); +			ANCILLARY(VLAN_TAG); +			ANCILLARY(VLAN_TAG_PRESENT);  			}  		}  		ftest->code = code; @@ -751,3 +760,133 @@ int sk_detach_filter(struct sock *sk)  	return ret;  }  EXPORT_SYMBOL_GPL(sk_detach_filter); + +static void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) +{ +	static const u16 decodes[] = { +		[BPF_S_ALU_ADD_K]	= BPF_ALU|BPF_ADD|BPF_K, +		[BPF_S_ALU_ADD_X]	= BPF_ALU|BPF_ADD|BPF_X, +		[BPF_S_ALU_SUB_K]	= BPF_ALU|BPF_SUB|BPF_K, +		[BPF_S_ALU_SUB_X]	= BPF_ALU|BPF_SUB|BPF_X, +		[BPF_S_ALU_MUL_K]	= BPF_ALU|BPF_MUL|BPF_K, +		[BPF_S_ALU_MUL_X]	= BPF_ALU|BPF_MUL|BPF_X, +		[BPF_S_ALU_DIV_X]	= 
BPF_ALU|BPF_DIV|BPF_X, +		[BPF_S_ALU_MOD_K]	= BPF_ALU|BPF_MOD|BPF_K, +		[BPF_S_ALU_MOD_X]	= BPF_ALU|BPF_MOD|BPF_X, +		[BPF_S_ALU_AND_K]	= BPF_ALU|BPF_AND|BPF_K, +		[BPF_S_ALU_AND_X]	= BPF_ALU|BPF_AND|BPF_X, +		[BPF_S_ALU_OR_K]	= BPF_ALU|BPF_OR|BPF_K, +		[BPF_S_ALU_OR_X]	= BPF_ALU|BPF_OR|BPF_X, +		[BPF_S_ALU_XOR_K]	= BPF_ALU|BPF_XOR|BPF_K, +		[BPF_S_ALU_XOR_X]	= BPF_ALU|BPF_XOR|BPF_X, +		[BPF_S_ALU_LSH_K]	= BPF_ALU|BPF_LSH|BPF_K, +		[BPF_S_ALU_LSH_X]	= BPF_ALU|BPF_LSH|BPF_X, +		[BPF_S_ALU_RSH_K]	= BPF_ALU|BPF_RSH|BPF_K, +		[BPF_S_ALU_RSH_X]	= BPF_ALU|BPF_RSH|BPF_X, +		[BPF_S_ALU_NEG]		= BPF_ALU|BPF_NEG, +		[BPF_S_LD_W_ABS]	= BPF_LD|BPF_W|BPF_ABS, +		[BPF_S_LD_H_ABS]	= BPF_LD|BPF_H|BPF_ABS, +		[BPF_S_LD_B_ABS]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_PROTOCOL]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_PKTTYPE]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_IFINDEX]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_NLATTR]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_NLATTR_NEST]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_MARK]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_QUEUE]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_HATYPE]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_RXHASH]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_CPU]		= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_ALU_XOR_X]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_VLAN_TAG]	= BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, +		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN, +		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND, +		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND, +		[BPF_S_LD_B_IND]	= BPF_LD|BPF_B|BPF_IND, +		[BPF_S_LD_IMM]		= BPF_LD|BPF_IMM, +		[BPF_S_LDX_W_LEN]	= BPF_LDX|BPF_W|BPF_LEN, +		[BPF_S_LDX_B_MSH]	= BPF_LDX|BPF_B|BPF_MSH, +		[BPF_S_LDX_IMM]		= BPF_LDX|BPF_IMM, +		[BPF_S_MISC_TAX]	= BPF_MISC|BPF_TAX, +		[BPF_S_MISC_TXA]	= BPF_MISC|BPF_TXA, +		[BPF_S_RET_K]		= BPF_RET|BPF_K, +		[BPF_S_RET_A]		= BPF_RET|BPF_A, +		[BPF_S_ALU_DIV_K]	= BPF_ALU|BPF_DIV|BPF_K, +		[BPF_S_LD_MEM]		= BPF_LD|BPF_MEM, +		
[BPF_S_LDX_MEM]		= BPF_LDX|BPF_MEM, +		[BPF_S_ST]		= BPF_ST, +		[BPF_S_STX]		= BPF_STX, +		[BPF_S_JMP_JA]		= BPF_JMP|BPF_JA, +		[BPF_S_JMP_JEQ_K]	= BPF_JMP|BPF_JEQ|BPF_K, +		[BPF_S_JMP_JEQ_X]	= BPF_JMP|BPF_JEQ|BPF_X, +		[BPF_S_JMP_JGE_K]	= BPF_JMP|BPF_JGE|BPF_K, +		[BPF_S_JMP_JGE_X]	= BPF_JMP|BPF_JGE|BPF_X, +		[BPF_S_JMP_JGT_K]	= BPF_JMP|BPF_JGT|BPF_K, +		[BPF_S_JMP_JGT_X]	= BPF_JMP|BPF_JGT|BPF_X, +		[BPF_S_JMP_JSET_K]	= BPF_JMP|BPF_JSET|BPF_K, +		[BPF_S_JMP_JSET_X]	= BPF_JMP|BPF_JSET|BPF_X, +	}; +	u16 code; + +	code = filt->code; + +	to->code = decodes[code]; +	to->jt = filt->jt; +	to->jf = filt->jf; + +	if (code == BPF_S_ALU_DIV_K) { +		/* +		 * When loaded this rule user gave us X, which was +		 * translated into R = r(X). Now we calculate the +		 * RR = r(R) and report it back. If next time this +		 * value is loaded and RRR = r(RR) is calculated +		 * then the R == RRR will be true. +		 * +		 * One exception. X == 1 translates into R == 0 and +		 * we can't calculate RR out of it with r(). 
+		 */ + +		if (filt->k == 0) +			to->k = 1; +		else +			to->k = reciprocal_value(filt->k); + +		BUG_ON(reciprocal_value(to->k) != filt->k); +	} else +		to->k = filt->k; +} + +int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) +{ +	struct sk_filter *filter; +	int i, ret; + +	lock_sock(sk); +	filter = rcu_dereference_protected(sk->sk_filter, +			sock_owned_by_user(sk)); +	ret = 0; +	if (!filter) +		goto out; +	ret = filter->len; +	if (!len) +		goto out; +	ret = -EINVAL; +	if (len < filter->len) +		goto out; + +	ret = -EFAULT; +	for (i = 0; i < filter->len; i++) { +		struct sock_filter fb; + +		sk_decode_filter(&filter->insns[i], &fb); +		if (copy_to_user(&ubuf[i], &fb, sizeof(fb))) +			goto out; +	} + +	ret = filter->len; +out: +	release_sock(sk); +	return ret; +} diff --git a/net/core/flow.c b/net/core/flow.c index e318c7e9804..b0901ee5a00 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -327,11 +327,9 @@ static void flow_cache_flush_tasklet(unsigned long data)  static void flow_cache_flush_per_cpu(void *data)  {  	struct flow_flush_info *info = data; -	int cpu;  	struct tasklet_struct *tasklet; -	cpu = smp_processor_id(); -	tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; +	tasklet = this_cpu_ptr(&info->cache->percpu->flush_tasklet);  	tasklet->data = (unsigned long)info;  	tasklet_schedule(tasklet);  } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 22571488730..c815f285e5a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1787,8 +1787,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)  	    nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes) ||  	    /* approximative value for deprecated QUEUE_LEN (in packets) */  	    nla_put_u32(skb, NDTPA_QUEUE_LEN, -			DIV_ROUND_UP(parms->queue_len_bytes, -				     SKB_TRUESIZE(ETH_FRAME_LEN))) || +			parms->queue_len_bytes / SKB_TRUESIZE(ETH_FRAME_LEN)) ||  	    nla_put_u32(skb, NDTPA_PROXY_QLEN, 
parms->proxy_qlen) ||  	    nla_put_u32(skb, NDTPA_APP_PROBES, parms->app_probes) ||  	    nla_put_u32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes) || @@ -2770,6 +2769,8 @@ EXPORT_SYMBOL(neigh_app_ns);  #endif /* CONFIG_ARPD */  #ifdef CONFIG_SYSCTL +static int zero; +static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);  static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,  			   size_t *lenp, loff_t *ppos) @@ -2777,9 +2778,13 @@ static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,  	int size, ret;  	ctl_table tmp = *ctl; +	tmp.extra1 = &zero; +	tmp.extra2 = &unres_qlen_max;  	tmp.data = &size; -	size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN)); -	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + +	size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); +	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); +  	if (write && !ret)  		*(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);  	return ret; @@ -2865,7 +2870,8 @@ static struct neigh_sysctl_table {  			.procname	= "unres_qlen_bytes",  			.maxlen		= sizeof(int),  			.mode		= 0644, -			.proc_handler	= proc_dointvec, +			.extra1		= &zero, +			.proc_handler   = proc_dointvec_minmax,  		},  		[NEIGH_VAR_PROXY_QLEN] = {  			.procname	= "proxy_qlen", @@ -2987,6 +2993,10 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,  		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;  	} +	/* Don't export sysctls to unprivileged users */ +	if (neigh_parms_net(p)->user_ns != &init_user_ns) +		t->neigh_vars[0].procname = NULL; +  	snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",  		p_name, dev_name_source);  	t->sysctl_header = diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 017a8bacfb2..28c5f5aa7ca 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -18,11 +18,9 @@  #include <net/sock.h>  #include <net/net_namespace.h>  #include <linux/rtnetlink.h> -#include 
<linux/wireless.h>  #include <linux/vmalloc.h>  #include <linux/export.h>  #include <linux/jiffies.h> -#include <net/wext.h>  #include "net-sysfs.h" @@ -73,11 +71,12 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,  			    const char *buf, size_t len,  			    int (*set)(struct net_device *, unsigned long))  { -	struct net_device *net = to_net_dev(dev); +	struct net_device *netdev = to_net_dev(dev); +	struct net *net = dev_net(netdev);  	unsigned long new;  	int ret = -EINVAL; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	ret = kstrtoul(buf, 0, &new); @@ -87,8 +86,8 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,  	if (!rtnl_trylock())  		return restart_syscall(); -	if (dev_isalive(net)) { -		if ((ret = (*set)(net, new)) == 0) +	if (dev_isalive(netdev)) { +		if ((ret = (*set)(netdev, new)) == 0)  			ret = len;  	}  	rtnl_unlock(); @@ -264,6 +263,9 @@ static ssize_t store_tx_queue_len(struct device *dev,  				  struct device_attribute *attr,  				  const char *buf, size_t len)  { +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; +  	return netdev_store(dev, attr, buf, len, change_tx_queue_len);  } @@ -271,10 +273,11 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,  			     const char *buf, size_t len)  {  	struct net_device *netdev = to_net_dev(dev); +	struct net *net = dev_net(netdev);  	size_t count = len;  	ssize_t ret; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	/* ignore trailing newline */ @@ -1331,7 +1334,6 @@ struct kobj_ns_type_operations net_ns_type_operations = {  };  EXPORT_SYMBOL_GPL(net_ns_type_operations); -#ifdef CONFIG_HOTPLUG  static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)  {  	struct net_device *dev = to_net_dev(d); @@ -1350,7 +1352,6 @@ static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)  exit:  	
return retval;  } -#endif  /*   *	netdev_release -- destroy and free a dead device. @@ -1379,9 +1380,7 @@ static struct class net_class = {  #ifdef CONFIG_SYSFS  	.dev_attrs = net_class_attributes,  #endif /* CONFIG_SYSFS */ -#ifdef CONFIG_HOTPLUG  	.dev_uevent = netdev_uevent, -#endif  	.ns_type = &net_ns_type_operations,  	.namespace = net_namespace,  }; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 42f1e1c7514..8acce01b6da 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -13,6 +13,7 @@  #include <linux/proc_fs.h>  #include <linux/file.h>  #include <linux/export.h> +#include <linux/user_namespace.h>  #include <net/net_namespace.h>  #include <net/netns/generic.h> @@ -145,7 +146,7 @@ static void ops_free_list(const struct pernet_operations *ops,  /*   * setup_net runs the initializers for the network namespace object.   */ -static __net_init int setup_net(struct net *net) +static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)  {  	/* Must be called with net_mutex held */  	const struct pernet_operations *ops, *saved_ops; @@ -155,6 +156,7 @@ static __net_init int setup_net(struct net *net)  	atomic_set(&net->count, 1);  	atomic_set(&net->passive, 1);  	net->dev_base_seq = 1; +	net->user_ns = user_ns;  #ifdef NETNS_REFCNT_DEBUG  	atomic_set(&net->use_count, 0); @@ -232,7 +234,8 @@ void net_drop_ns(void *p)  		net_free(ns);  } -struct net *copy_net_ns(unsigned long flags, struct net *old_net) +struct net *copy_net_ns(unsigned long flags, +			struct user_namespace *user_ns, struct net *old_net)  {  	struct net *net;  	int rv; @@ -243,8 +246,11 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)  	net = net_alloc();  	if (!net)  		return ERR_PTR(-ENOMEM); + +	get_user_ns(user_ns); +  	mutex_lock(&net_mutex); -	rv = setup_net(net); +	rv = setup_net(net, user_ns);  	if (rv == 0) {  		rtnl_lock();  		list_add_tail_rcu(&net->list, &net_namespace_list); @@ -252,6 +258,7 @@ struct net 
*copy_net_ns(unsigned long flags, struct net *old_net)  	}  	mutex_unlock(&net_mutex);  	if (rv < 0) { +		put_user_ns(user_ns);  		net_drop_ns(net);  		return ERR_PTR(rv);  	} @@ -308,6 +315,7 @@ static void cleanup_net(struct work_struct *work)  	/* Finally it is safe to free my network namespace structure */  	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {  		list_del_init(&net->exit_list); +		put_user_ns(net->user_ns);  		net_drop_ns(net);  	}  } @@ -347,13 +355,6 @@ struct net *get_net_ns_by_fd(int fd)  }  #else -struct net *copy_net_ns(unsigned long flags, struct net *old_net) -{ -	if (flags & CLONE_NEWNET) -		return ERR_PTR(-EINVAL); -	return old_net; -} -  struct net *get_net_ns_by_fd(int fd)  {  	return ERR_PTR(-EINVAL); @@ -380,6 +381,21 @@ struct net *get_net_ns_by_pid(pid_t pid)  }  EXPORT_SYMBOL_GPL(get_net_ns_by_pid); +static __net_init int net_ns_net_init(struct net *net) +{ +	return proc_alloc_inum(&net->proc_inum); +} + +static __net_exit void net_ns_net_exit(struct net *net) +{ +	proc_free_inum(net->proc_inum); +} + +static struct pernet_operations __net_initdata net_ns_ops = { +	.init = net_ns_net_init, +	.exit = net_ns_net_exit, +}; +  static int __init net_ns_init(void)  {  	struct net_generic *ng; @@ -402,7 +418,7 @@ static int __init net_ns_init(void)  	rcu_assign_pointer(init_net.gen, ng);  	mutex_lock(&net_mutex); -	if (setup_net(&init_net)) +	if (setup_net(&init_net, &init_user_ns))  		panic("Could not setup the initial network namespace");  	rtnl_lock(); @@ -411,6 +427,8 @@ static int __init net_ns_init(void)  	mutex_unlock(&net_mutex); +	register_pernet_subsys(&net_ns_ops); +  	return 0;  } @@ -629,16 +647,29 @@ static void netns_put(void *ns)  static int netns_install(struct nsproxy *nsproxy, void *ns)  { +	struct net *net = ns; + +	if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || +	    !nsown_capable(CAP_SYS_ADMIN)) +		return -EPERM; +  	put_net(nsproxy->net_ns); -	nsproxy->net_ns = get_net(ns); +	nsproxy->net_ns = 
get_net(net);  	return 0;  } +static unsigned int netns_inum(void *ns) +{ +	struct net *net = ns; +	return net->proc_inum; +} +  const struct proc_ns_operations netns_operations = {  	.name		= "net",  	.type		= CLONE_NEWNET,  	.get		= netns_get,  	.put		= netns_put,  	.install	= netns_install, +	.inum		= netns_inum,  };  #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 77a0388fc3b..3151acf5ec1 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -674,7 +674,8 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  		if ((delim = strchr(cur, '@')) == NULL)  			goto parse_failed;  		*delim = 0; -		np->local_port = simple_strtol(cur, NULL, 10); +		if (kstrtou16(cur, 10, &np->local_port)) +			goto parse_failed;  		cur = delim;  	}  	cur++; @@ -705,7 +706,8 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  		*delim = 0;  		if (*cur == ' ' || *cur == '\t')  			np_info(np, "warning: whitespace is not allowed\n"); -		np->remote_port = simple_strtol(cur, NULL, 10); +		if (kstrtou16(cur, 10, &np->remote_port)) +			goto parse_failed;  		cur = delim;  	}  	cur++; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 79285a36035..5e67defe2cb 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -27,11 +27,7 @@  #include <linux/fdtable.h> -#define PRIOIDX_SZ 128 - -static unsigned long prioidx_map[PRIOIDX_SZ]; -static DEFINE_SPINLOCK(prioidx_map_lock); -static atomic_t max_prioidx = ATOMIC_INIT(0); +#define PRIOMAP_MIN_SZ		128  static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)  { @@ -39,136 +35,157 @@ static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgr  			    struct cgroup_netprio_state, css);  } -static int get_prioidx(u32 *prio) -{ -	unsigned long flags; -	u32 prioidx; - -	spin_lock_irqsave(&prioidx_map_lock, flags); -	prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); -	if (prioidx == sizeof(unsigned long) * 
PRIOIDX_SZ) { -		spin_unlock_irqrestore(&prioidx_map_lock, flags); -		return -ENOSPC; -	} -	set_bit(prioidx, prioidx_map); -	if (atomic_read(&max_prioidx) < prioidx) -		atomic_set(&max_prioidx, prioidx); -	spin_unlock_irqrestore(&prioidx_map_lock, flags); -	*prio = prioidx; -	return 0; -} - -static void put_prioidx(u32 idx) +/* + * Extend @dev->priomap so that it's large enough to accommodate + * @target_idx.  @dev->priomap.priomap_len > @target_idx after successful + * return.  Must be called under rtnl lock. + */ +static int extend_netdev_table(struct net_device *dev, u32 target_idx)  { -	unsigned long flags; +	struct netprio_map *old, *new; +	size_t new_sz, new_len; -	spin_lock_irqsave(&prioidx_map_lock, flags); -	clear_bit(idx, prioidx_map); -	spin_unlock_irqrestore(&prioidx_map_lock, flags); -} - -static int extend_netdev_table(struct net_device *dev, u32 new_len) -{ -	size_t new_size = sizeof(struct netprio_map) + -			   ((sizeof(u32) * new_len)); -	struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); -	struct netprio_map *old_priomap; +	/* is the existing priomap large enough? */ +	old = rtnl_dereference(dev->priomap); +	if (old && old->priomap_len > target_idx) +		return 0; -	old_priomap  = rtnl_dereference(dev->priomap); +	/* +	 * Determine the new size.  Let's keep it power-of-two.  We start +	 * from PRIOMAP_MIN_SZ and double it until it's large enough to +	 * accommodate @target_idx. +	 */ +	new_sz = PRIOMAP_MIN_SZ; +	while (true) { +		new_len = (new_sz - offsetof(struct netprio_map, priomap)) / +			sizeof(new->priomap[0]); +		if (new_len > target_idx) +			break; +		new_sz *= 2; +		/* overflowed? 
*/ +		if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) +			return -ENOSPC; +	} -	if (!new_priomap) { +	/* allocate & copy */ +	new = kzalloc(new_sz, GFP_KERNEL); +	if (!new) {  		pr_warn("Unable to alloc new priomap!\n");  		return -ENOMEM;  	} -	if (old_priomap) -		memcpy(new_priomap->priomap, old_priomap->priomap, -		       old_priomap->priomap_len * -		       sizeof(old_priomap->priomap[0])); +	if (old) +		memcpy(new->priomap, old->priomap, +		       old->priomap_len * sizeof(old->priomap[0])); -	new_priomap->priomap_len = new_len; +	new->priomap_len = new_len; -	rcu_assign_pointer(dev->priomap, new_priomap); -	if (old_priomap) -		kfree_rcu(old_priomap, rcu); +	/* install the new priomap */ +	rcu_assign_pointer(dev->priomap, new); +	if (old) +		kfree_rcu(old, rcu);  	return 0;  } -static int write_update_netdev_table(struct net_device *dev) +/** + * netprio_prio - return the effective netprio of a cgroup-net_device pair + * @cgrp: cgroup part of the target pair + * @dev: net_device part of the target pair + * + * Should be called under RCU read or rtnl lock. + */ +static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev) +{ +	struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); + +	if (map && cgrp->id < map->priomap_len) +		return map->priomap[cgrp->id]; +	return 0; +} + +/** + * netprio_set_prio - set netprio on a cgroup-net_device pair + * @cgrp: cgroup part of the target pair + * @dev: net_device part of the target pair + * @prio: prio to set + * + * Set netprio to @prio on @cgrp-@dev pair.  Should be called under rtnl + * lock and may fail under memory pressure for non-zero @prio. 
+ */ +static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev, +			    u32 prio)  { -	int ret = 0; -	u32 max_len;  	struct netprio_map *map; +	int ret; -	max_len = atomic_read(&max_prioidx) + 1; +	/* avoid extending priomap for zero writes */  	map = rtnl_dereference(dev->priomap); -	if (!map || map->priomap_len < max_len) -		ret = extend_netdev_table(dev, max_len); +	if (!prio && (!map || map->priomap_len <= cgrp->id)) +		return 0; -	return ret; +	ret = extend_netdev_table(dev, cgrp->id); +	if (ret) +		return ret; + +	map = rtnl_dereference(dev->priomap); +	map->priomap[cgrp->id] = prio; +	return 0;  } -static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) +static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)  {  	struct cgroup_netprio_state *cs; -	int ret = -EINVAL;  	cs = kzalloc(sizeof(*cs), GFP_KERNEL);  	if (!cs)  		return ERR_PTR(-ENOMEM); -	if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) -		goto out; - -	ret = get_prioidx(&cs->prioidx); -	if (ret < 0) { -		pr_warn("No space in priority index array\n"); -		goto out; -	} -  	return &cs->css; -out: -	kfree(cs); -	return ERR_PTR(ret);  } -static void cgrp_destroy(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup *cgrp)  { -	struct cgroup_netprio_state *cs; +	struct cgroup *parent = cgrp->parent;  	struct net_device *dev; -	struct netprio_map *map; +	int ret = 0; + +	if (!parent) +		return 0; -	cs = cgrp_netprio_state(cgrp);  	rtnl_lock(); +	/* +	 * Inherit prios from the parent.  As all prios are set during +	 * onlining, there is no need to clear them on offline. 
+	 */  	for_each_netdev(&init_net, dev) { -		map = rtnl_dereference(dev->priomap); -		if (map && cs->prioidx < map->priomap_len) -			map->priomap[cs->prioidx] = 0; +		u32 prio = netprio_prio(parent, dev); + +		ret = netprio_set_prio(cgrp, dev, prio); +		if (ret) +			break;  	}  	rtnl_unlock(); -	put_prioidx(cs->prioidx); -	kfree(cs); +	return ret; +} + +static void cgrp_css_free(struct cgroup *cgrp) +{ +	kfree(cgrp_netprio_state(cgrp));  }  static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)  { -	return (u64)cgrp_netprio_state(cgrp)->prioidx; +	return cgrp->id;  }  static int read_priomap(struct cgroup *cont, struct cftype *cft,  			struct cgroup_map_cb *cb)  {  	struct net_device *dev; -	u32 prioidx = cgrp_netprio_state(cont)->prioidx; -	u32 priority; -	struct netprio_map *map;  	rcu_read_lock(); -	for_each_netdev_rcu(&init_net, dev) { -		map = rcu_dereference(dev->priomap); -		priority = (map && prioidx < map->priomap_len) ? map->priomap[prioidx] : 0; -		cb->fill(cb, dev->name, priority); -	} +	for_each_netdev_rcu(&init_net, dev) +		cb->fill(cb, dev->name, netprio_prio(cont, dev));  	rcu_read_unlock();  	return 0;  } @@ -176,66 +193,24 @@ static int read_priomap(struct cgroup *cont, struct cftype *cft,  static int write_priomap(struct cgroup *cgrp, struct cftype *cft,  			 const char *buffer)  { -	char *devname = kstrdup(buffer, GFP_KERNEL); -	int ret = -EINVAL; -	u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; -	unsigned long priority; -	char *priostr; +	char devname[IFNAMSIZ + 1];  	struct net_device *dev; -	struct netprio_map *map; - -	if (!devname) -		return -ENOMEM; - -	/* -	 * Minimally sized valid priomap string -	 */ -	if (strlen(devname) < 3) -		goto out_free_devname; - -	priostr = strstr(devname, " "); -	if (!priostr) -		goto out_free_devname; - -	/* -	 *Separate the devname from the associated priority -	 *and advance the priostr pointer to the priority value -	 */ -	*priostr = '\0'; -	priostr++; - -	/* -	 * If the priostr points to NULL, 
we're at the end of the passed -	 * in string, and its not a valid write -	 */ -	if (*priostr == '\0') -		goto out_free_devname; - -	ret = kstrtoul(priostr, 10, &priority); -	if (ret < 0) -		goto out_free_devname; +	u32 prio; +	int ret; -	ret = -ENODEV; +	if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) +		return -EINVAL;  	dev = dev_get_by_name(&init_net, devname);  	if (!dev) -		goto out_free_devname; +		return -ENODEV;  	rtnl_lock(); -	ret = write_update_netdev_table(dev); -	if (ret < 0) -		goto out_put_dev; -	map = rtnl_dereference(dev->priomap); -	if (map) -		map->priomap[prioidx] = priority; +	ret = netprio_set_prio(cgrp, dev, prio); -out_put_dev:  	rtnl_unlock();  	dev_put(dev); - -out_free_devname: -	kfree(devname);  	return ret;  } @@ -248,7 +223,7 @@ static int update_netprio(const void *v, struct file *file, unsigned n)  	return 0;  } -void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  {  	struct task_struct *p;  	void *v; @@ -276,22 +251,13 @@ static struct cftype ss_files[] = {  struct cgroup_subsys net_prio_subsys = {  	.name		= "net_prio", -	.create		= cgrp_create, -	.destroy	= cgrp_destroy, +	.css_alloc	= cgrp_css_alloc, +	.css_online	= cgrp_css_online, +	.css_free	= cgrp_css_free,  	.attach		= net_prio_attach,  	.subsys_id	= net_prio_subsys_id,  	.base_cftypes	= ss_files,  	.module		= THIS_MODULE, - -	/* -	 * net_prio has artificial limit on the number of cgroups and -	 * disallows nesting making it impossible to co-mount it with other -	 * hierarchical subsystems.  Remove the artificially low PRIOIDX_SZ -	 * limit and properly nest configuration such that children follow -	 * their parents' configurations by default and are allowed to -	 * override and remove the following. 
-	 */ -	.broken_hierarchy = true,  };  static int netprio_device_event(struct notifier_block *unused, diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d1dc14c2aac..b29dacf900f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -419,20 +419,6 @@ struct pktgen_thread {  #define REMOVE 1  #define FIND   0 -static inline ktime_t ktime_now(void) -{ -	struct timespec ts; -	ktime_get_ts(&ts); - -	return timespec_to_ktime(ts); -} - -/* This works even if 32 bit because of careful byte order choice */ -static inline int ktime_lt(const ktime_t cmp1, const ktime_t cmp2) -{ -	return cmp1.tv64 < cmp2.tv64; -} -  static const char version[] =  	"Packet Generator for packet performance testing. "  	"Version: " VERSION "\n"; @@ -675,7 +661,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  	seq_puts(seq, "\n");  	/* not really stopped, more like last-running-at */ -	stopped = pkt_dev->running ? ktime_now() : pkt_dev->stopped_at; +	stopped = pkt_dev->running ? ktime_get() : pkt_dev->stopped_at;  	idle = pkt_dev->idle_acc;  	do_div(idle, NSEC_PER_USEC); @@ -2141,12 +2127,12 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)  		return;  	} -	start_time = ktime_now(); +	start_time = ktime_get();  	if (remaining < 100000) {  		/* for small delays (<100us), just loop until limit is reached */  		do { -			end_time = ktime_now(); -		} while (ktime_lt(end_time, spin_until)); +			end_time = ktime_get(); +		} while (ktime_compare(end_time, spin_until) < 0);  	} else {  		/* see do_nanosleep */  		hrtimer_init_sleeper(&t, current); @@ -2162,7 +2148,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)  			hrtimer_cancel(&t.timer);  		} while (t.task && pkt_dev->running && !signal_pending(current));  		__set_current_state(TASK_RUNNING); -		end_time = ktime_now(); +		end_time = ktime_get();  	}  	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); @@ -2427,11 +2413,7 @@ static void mod_cur_headers(struct pktgen_dev 
*pkt_dev)  		}  	} else {		/* IPV6 * */ -		if (pkt_dev->min_in6_daddr.s6_addr32[0] == 0 && -		    pkt_dev->min_in6_daddr.s6_addr32[1] == 0 && -		    pkt_dev->min_in6_daddr.s6_addr32[2] == 0 && -		    pkt_dev->min_in6_daddr.s6_addr32[3] == 0) ; -		else { +		if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) {  			int i;  			/* Only random destinations yet */ @@ -2916,8 +2898,7 @@ static void pktgen_run(struct pktgen_thread *t)  			pktgen_clear_counters(pkt_dev);  			pkt_dev->running = 1;	/* Cranke yeself! */  			pkt_dev->skb = NULL; -			pkt_dev->started_at = -				pkt_dev->next_tx = ktime_now(); +			pkt_dev->started_at = pkt_dev->next_tx = ktime_get();  			set_pkt_overhead(pkt_dev); @@ -3076,7 +3057,7 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)  	kfree_skb(pkt_dev->skb);  	pkt_dev->skb = NULL; -	pkt_dev->stopped_at = ktime_now(); +	pkt_dev->stopped_at = ktime_get();  	pkt_dev->running = 0;  	show_results(pkt_dev, nr_frags); @@ -3095,7 +3076,7 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t)  			continue;  		if (best == NULL)  			best = pkt_dev; -		else if (ktime_lt(pkt_dev->next_tx, best->next_tx)) +		else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0)  			best = pkt_dev;  	}  	if_unlock(t); @@ -3180,14 +3161,14 @@ static void pktgen_rem_thread(struct pktgen_thread *t)  static void pktgen_resched(struct pktgen_dev *pkt_dev)  { -	ktime_t idle_start = ktime_now(); +	ktime_t idle_start = ktime_get();  	schedule(); -	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); +	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));  }  static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)  { -	ktime_t idle_start = ktime_now(); +	ktime_t idle_start = ktime_get();  	while (atomic_read(&(pkt_dev->skb->users)) != 1) {  		if (signal_pending(current)) @@ -3198,7 +3179,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)  		else  			cpu_relax();  	} -	pkt_dev->idle_acc += 
ktime_to_ns(ktime_sub(ktime_now(), idle_start)); +	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));  }  static void pktgen_xmit(struct pktgen_dev *pkt_dev) @@ -3220,7 +3201,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  	 * "never transmit"  	 */  	if (unlikely(pkt_dev->delay == ULLONG_MAX)) { -		pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX); +		pkt_dev->next_tx = ktime_add_ns(ktime_get(), ULONG_MAX);  		return;  	} diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fad649ae4de..1868625af25 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -128,7 +128,7 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)  	if (tab == NULL || tab[msgindex].doit == NULL)  		tab = rtnl_msg_handlers[PF_UNSPEC]; -	return tab ? tab[msgindex].doit : NULL; +	return tab[msgindex].doit;  }  static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) @@ -143,7 +143,7 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)  	if (tab == NULL || tab[msgindex].dumpit == NULL)  		tab = rtnl_msg_handlers[PF_UNSPEC]; -	return tab ? tab[msgindex].dumpit : NULL; +	return tab[msgindex].dumpit;  }  static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex) @@ -158,7 +158,7 @@ static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex)  	if (tab == NULL || tab[msgindex].calcit == NULL)  		tab = rtnl_msg_handlers[PF_UNSPEC]; -	return tab ? 
tab[msgindex].calcit : NULL; +	return tab[msgindex].calcit;  }  /** @@ -1316,6 +1316,10 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  			err = PTR_ERR(net);  			goto errout;  		} +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { +			err = -EPERM; +			goto errout; +		}  		err = dev_change_net_namespace(dev, net, ifname);  		put_net(net);  		if (err) @@ -1638,7 +1642,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)  }  EXPORT_SYMBOL(rtnl_configure_link); -struct net_device *rtnl_create_link(struct net *src_net, struct net *net, +struct net_device *rtnl_create_link(struct net *net,  	char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])  {  	int err; @@ -1836,7 +1840,7 @@ replay:  		if (IS_ERR(dest_net))  			return PTR_ERR(dest_net); -		dev = rtnl_create_link(net, dest_net, ifname, ops, tb); +		dev = rtnl_create_link(dest_net, ifname, ops, tb);  		if (IS_ERR(dev)) {  			err = PTR_ERR(dev);  			goto out; @@ -2057,6 +2061,9 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  	u8 *addr;  	int err; +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; +  	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);  	if (err < 0)  		return err; @@ -2123,6 +2130,9 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  	int err = -EINVAL;  	__u8 *addr; +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; +  	if (nlmsg_len(nlh) < sizeof(*ndm))  		return -EINVAL; @@ -2253,6 +2263,211 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)  	return skb->len;  } +int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, +			    struct net_device *dev, u16 mode) +{ +	struct nlmsghdr *nlh; +	struct ifinfomsg *ifm; +	struct nlattr *br_afspec; +	u8 operstate = netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN; + +	nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI); +	if (nlh == NULL) +		return -EMSGSIZE; + +	ifm = nlmsg_data(nlh); +	ifm->ifi_family = AF_BRIDGE; +	ifm->__ifi_pad = 0; +	ifm->ifi_type = dev->type; +	ifm->ifi_index = dev->ifindex; +	ifm->ifi_flags = dev_get_flags(dev); +	ifm->ifi_change = 0; + + +	if (nla_put_string(skb, IFLA_IFNAME, dev->name) || +	    nla_put_u32(skb, IFLA_MTU, dev->mtu) || +	    nla_put_u8(skb, IFLA_OPERSTATE, operstate) || +	    (dev->master && +	     nla_put_u32(skb, IFLA_MASTER, dev->master->ifindex)) || +	    (dev->addr_len && +	     nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || +	    (dev->ifindex != dev->iflink && +	     nla_put_u32(skb, IFLA_LINK, dev->iflink))) +		goto nla_put_failure; + +	br_afspec = nla_nest_start(skb, IFLA_AF_SPEC); +	if (!br_afspec) +		goto nla_put_failure; + +	if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) || +	    nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { +		nla_nest_cancel(skb, br_afspec); +		goto nla_put_failure; +	} +	nla_nest_end(skb, br_afspec); + +	return nlmsg_end(skb, nlh); +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -EMSGSIZE; +} +EXPORT_SYMBOL(ndo_dflt_bridge_getlink); + +static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct net *net = sock_net(skb->sk); +	struct net_device *dev; +	int idx = 0; +	u32 portid = NETLINK_CB(cb->skb).portid; +	u32 seq = cb->nlh->nlmsg_seq; + +	rcu_read_lock(); +	for_each_netdev_rcu(net, dev) { +		const struct net_device_ops *ops = dev->netdev_ops; +		struct net_device *master = dev->master; + +		if (master && master->netdev_ops->ndo_bridge_getlink) { +			if (idx >= cb->args[0] && +			    master->netdev_ops->ndo_bridge_getlink( +				    skb, portid, seq, dev) < 0) +				break; +			idx++; +		} + +		if (ops->ndo_bridge_getlink) { +			if (idx >= cb->args[0] && +			    ops->ndo_bridge_getlink(skb, portid, seq, dev) < 0) +				break; +			idx++; +		} +	
} +	rcu_read_unlock(); +	cb->args[0] = idx; + +	return skb->len; +} + +static inline size_t bridge_nlmsg_size(void) +{ +	return NLMSG_ALIGN(sizeof(struct ifinfomsg)) +		+ nla_total_size(IFNAMSIZ)	/* IFLA_IFNAME */ +		+ nla_total_size(MAX_ADDR_LEN)	/* IFLA_ADDRESS */ +		+ nla_total_size(sizeof(u32))	/* IFLA_MASTER */ +		+ nla_total_size(sizeof(u32))	/* IFLA_MTU */ +		+ nla_total_size(sizeof(u32))	/* IFLA_LINK */ +		+ nla_total_size(sizeof(u32))	/* IFLA_OPERSTATE */ +		+ nla_total_size(sizeof(u8))	/* IFLA_PROTINFO */ +		+ nla_total_size(sizeof(struct nlattr))	/* IFLA_AF_SPEC */ +		+ nla_total_size(sizeof(u16))	/* IFLA_BRIDGE_FLAGS */ +		+ nla_total_size(sizeof(u16));	/* IFLA_BRIDGE_MODE */ +} + +static int rtnl_bridge_notify(struct net_device *dev, u16 flags) +{ +	struct net *net = dev_net(dev); +	struct net_device *master = dev->master; +	struct sk_buff *skb; +	int err = -EOPNOTSUPP; + +	skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); +	if (!skb) { +		err = -ENOMEM; +		goto errout; +	} + +	if ((!flags || (flags & BRIDGE_FLAGS_MASTER)) && +	    master && master->netdev_ops->ndo_bridge_getlink) { +		err = master->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev); +		if (err < 0) +			goto errout; +	} + +	if ((flags & BRIDGE_FLAGS_SELF) && +	    dev->netdev_ops->ndo_bridge_getlink) { +		err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev); +		if (err < 0) +			goto errout; +	} + +	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); +	return 0; +errout: +	WARN_ON(err == -EMSGSIZE); +	kfree_skb(skb); +	rtnl_set_sk_err(net, RTNLGRP_LINK, err); +	return err; +} + +static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, +			       void *arg) +{ +	struct net *net = sock_net(skb->sk); +	struct ifinfomsg *ifm; +	struct net_device *dev; +	struct nlattr *br_spec, *attr = NULL; +	int rem, err = -EOPNOTSUPP; +	u16 oflags, flags = 0; +	bool have_flags = false; + +	if (nlmsg_len(nlh) < sizeof(*ifm)) +		return -EINVAL; + +	ifm = nlmsg_data(nlh); +	if 
(ifm->ifi_family != AF_BRIDGE) +		return -EPFNOSUPPORT; + +	dev = __dev_get_by_index(net, ifm->ifi_index); +	if (!dev) { +		pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); +		return -ENODEV; +	} + +	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); +	if (br_spec) { +		nla_for_each_nested(attr, br_spec, rem) { +			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { +				have_flags = true; +				flags = nla_get_u16(attr); +				break; +			} +		} +	} + +	oflags = flags; + +	if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { +		if (!dev->master || +		    !dev->master->netdev_ops->ndo_bridge_setlink) { +			err = -EOPNOTSUPP; +			goto out; +		} + +		err = dev->master->netdev_ops->ndo_bridge_setlink(dev, nlh); +		if (err) +			goto out; + +		flags &= ~BRIDGE_FLAGS_MASTER; +	} + +	if ((flags & BRIDGE_FLAGS_SELF)) { +		if (!dev->netdev_ops->ndo_bridge_setlink) +			err = -EOPNOTSUPP; +		else +			err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh); + +		if (!err) +			flags &= ~BRIDGE_FLAGS_SELF; +	} + +	if (have_flags) +		memcpy(nla_data(attr), &flags, sizeof(flags)); +	/* Generate event to notify upper layer of bridge change */ +	if (!err) +		err = rtnl_bridge_notify(dev, oflags); +out: +	return err; +} +  /* Protected by RTNL sempahore.  
*/  static struct rtattr **rta_buf;  static int rtattr_max; @@ -2283,7 +2498,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	sz_idx = type>>2;  	kind = type&3; -	if (kind != 2 && !capable(CAP_NET_ADMIN)) +	if (kind != 2 && !ns_capable(net->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { @@ -2434,5 +2649,8 @@ void __init rtnetlink_init(void)  	rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);  	rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);  	rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL); + +	rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL); +	rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);  } diff --git a/net/core/scm.c b/net/core/scm.c index ab570841a53..57fb1ee6649 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -51,11 +51,11 @@ static __inline__ int scm_check_creds(struct ucred *creds)  	if (!uid_valid(uid) || !gid_valid(gid))  		return -EINVAL; -	if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && +	if ((creds->pid == task_tgid_vnr(current) || nsown_capable(CAP_SYS_ADMIN)) &&  	    ((uid_eq(uid, cred->uid)   || uid_eq(uid, cred->euid) || -	      uid_eq(uid, cred->suid)) || capable(CAP_SETUID)) && +	      uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) &&  	    ((gid_eq(gid, cred->gid)   || gid_eq(gid, cred->egid) || -	      gid_eq(gid, cred->sgid)) || capable(CAP_SETGID))) { +	      gid_eq(gid, cred->sgid)) || nsown_capable(CAP_SETGID))) {  	       return 0;  	}  	return -EPERM; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3f0636cd76c..3ab989b0de4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -519,7 +519,7 @@ static void skb_release_data(struct sk_buff *skb)  			uarg = skb_shinfo(skb)->destructor_arg;  			if (uarg->callback) -				uarg->callback(uarg); +				uarg->callback(uarg, true);  		}  		if (skb_has_frag_list(skb)) @@ -635,6 
+635,26 @@ void kfree_skb(struct sk_buff *skb)  EXPORT_SYMBOL(kfree_skb);  /** + *	skb_tx_error - report an sk_buff xmit error + *	@skb: buffer that triggered an error + * + *	Report xmit error if a device callback is tracking this skb. + *	skb must be freed afterwards. + */ +void skb_tx_error(struct sk_buff *skb) +{ +	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { +		struct ubuf_info *uarg; + +		uarg = skb_shinfo(skb)->destructor_arg; +		if (uarg->callback) +			uarg->callback(uarg, false); +		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; +	} +} +EXPORT_SYMBOL(skb_tx_error); + +/**   *	consume_skb - free an skbuff   *	@skb: buffer to free   * @@ -662,11 +682,14 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)  	new->transport_header	= old->transport_header;  	new->network_header	= old->network_header;  	new->mac_header		= old->mac_header; +	new->inner_transport_header = old->inner_transport_header; +	new->inner_network_header = old->inner_network_header;  	skb_dst_copy(new, old);  	new->rxhash		= old->rxhash;  	new->ooo_okay		= old->ooo_okay;  	new->l4_rxhash		= old->l4_rxhash;  	new->no_fcs		= old->no_fcs; +	new->encapsulation	= old->encapsulation;  #ifdef CONFIG_XFRM  	new->sp			= secpath_get(old->sp);  #endif @@ -797,7 +820,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)  	for (i = 0; i < num_frags; i++)  		skb_frag_unref(skb, i); -	uarg->callback(uarg); +	uarg->callback(uarg, false);  	/* skb frags point to kernel buffers */  	for (i = num_frags - 1; i >= 0; i--) { @@ -872,6 +895,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)  	new->network_header   += offset;  	if (skb_mac_header_was_set(new))  		new->mac_header	      += offset; +	new->inner_transport_header += offset; +	new->inner_network_header   += offset;  #endif  	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;  	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; @@ -1069,6 +1094,8 @@ int 
pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,  	skb->network_header   += off;  	if (skb_mac_header_was_set(skb))  		skb->mac_header += off; +	skb->inner_transport_header += off; +	skb->inner_network_header += off;  	/* Only adjust this if it actually is csum_start rather than csum */  	if (skb->ip_summed == CHECKSUM_PARTIAL)  		skb->csum_start += nhead; @@ -1168,6 +1195,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,  	n->network_header   += off;  	if (skb_mac_header_was_set(skb))  		n->mac_header += off; +	n->inner_transport_header += off; +	n->inner_network_header	   += off;  #endif  	return n; @@ -2999,7 +3028,6 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)  	memcpy(skb_mac_header(nskb), skb_mac_header(p),  	       p->data - skb_mac_header(p)); -	*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);  	skb_shinfo(nskb)->frag_list = p;  	skb_shinfo(nskb)->gso_size = pinfo->gso_size;  	pinfo->gso_size = 0; diff --git a/net/core/sock.c b/net/core/sock.c index 8a146cfcc36..bc131d41968 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -505,7 +505,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)  }  EXPORT_SYMBOL(sk_dst_check); -static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) +static int sock_setbindtodevice(struct sock *sk, char __user *optval, +				int optlen)  {  	int ret = -ENOPROTOOPT;  #ifdef CONFIG_NETDEVICES @@ -515,7 +516,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)  	/* Sorry... 
*/  	ret = -EPERM; -	if (!capable(CAP_NET_RAW)) +	if (!ns_capable(net->user_ns, CAP_NET_RAW))  		goto out;  	ret = -EINVAL; @@ -562,6 +563,59 @@ out:  	return ret;  } +static int sock_getbindtodevice(struct sock *sk, char __user *optval, +				int __user *optlen, int len) +{ +	int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES +	struct net *net = sock_net(sk); +	struct net_device *dev; +	char devname[IFNAMSIZ]; +	unsigned seq; + +	if (sk->sk_bound_dev_if == 0) { +		len = 0; +		goto zero; +	} + +	ret = -EINVAL; +	if (len < IFNAMSIZ) +		goto out; + +retry: +	seq = read_seqcount_begin(&devnet_rename_seq); +	rcu_read_lock(); +	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); +	ret = -ENODEV; +	if (!dev) { +		rcu_read_unlock(); +		goto out; +	} + +	strcpy(devname, dev->name); +	rcu_read_unlock(); +	if (read_seqcount_retry(&devnet_rename_seq, seq)) +		goto retry; + +	len = strlen(devname) + 1; + +	ret = -EFAULT; +	if (copy_to_user(optval, devname, len)) +		goto out; + +zero: +	ret = -EFAULT; +	if (put_user(len, optlen)) +		goto out; + +	ret = 0; + +out: +#endif + +	return ret; +} +  static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)  {  	if (valbool) @@ -589,7 +643,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,  	 */  	if (optname == SO_BINDTODEVICE) -		return sock_bindtodevice(sk, optval, optlen); +		return sock_setbindtodevice(sk, optval, optlen);  	if (optlen < sizeof(int))  		return -EINVAL; @@ -696,7 +750,8 @@ set_rcvbuf:  		break;  	case SO_PRIORITY: -		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) +		if ((val >= 0 && val <= 6) || +		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  			sk->sk_priority = val;  		else  			ret = -EPERM; @@ -813,7 +868,7 @@ set_rcvbuf:  			clear_bit(SOCK_PASSSEC, &sock->flags);  		break;  	case SO_MARK: -		if (!capable(CAP_NET_ADMIN)) +		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  			ret = -EPERM;  		else  			sk->sk_mark = val; @@ -1074,6 +1129,17 @@ int 
sock_getsockopt(struct socket *sock, int level, int optname,  	case SO_NOFCS:  		v.val = sock_flag(sk, SOCK_NOFCS);  		break; + +	case SO_BINDTODEVICE: +		return sock_getbindtodevice(sk, optval, optlen, len); + +	case SO_GET_FILTER: +		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); +		if (len < 0) +			return len; + +		goto lenout; +  	default:  		return -ENOPROTOOPT;  	} @@ -1214,13 +1280,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)  #ifdef CONFIG_CGROUPS  #if IS_ENABLED(CONFIG_NET_CLS_CGROUP) -void sock_update_classid(struct sock *sk) +void sock_update_classid(struct sock *sk, struct task_struct *task)  {  	u32 classid; -	rcu_read_lock();  /* doing current task, which cannot vanish. */ -	classid = task_cls_classid(current); -	rcu_read_unlock(); +	classid = task_cls_classid(task);  	if (classid != sk->sk_classid)  		sk->sk_classid = classid;  } @@ -1263,7 +1327,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,  		sock_net_set(sk, get_net(net));  		atomic_set(&sk->sk_wmem_alloc, 1); -		sock_update_classid(sk); +		sock_update_classid(sk, current);  		sock_update_netprioidx(sk, current);  	} diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a7c36845b12..d1b08045a9d 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -216,6 +216,11 @@ static __net_init int sysctl_core_net_init(struct net *net)  			goto err_dup;  		tbl[0].data = &net->core.sysctl_somaxconn; + +		/* Don't export any sysctls to unprivileged users */ +		if (net->user_ns != &init_user_ns) { +			tbl[0].procname = NULL; +		}  	}  	net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);  |