Diffstat (limited to 'net/core/sock.c')
-rw-r--r--	net/core/sock.c	203
1 file changed, 144 insertions, 59 deletions
diff --git a/net/core/sock.c b/net/core/sock.c
index 4ed7b1d12f5..002939cfc06 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -111,6 +111,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/user_namespace.h>
+#include <linux/jump_label.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -125,6 +126,7 @@
 #include <net/xfrm.h>
 #include <linux/ipsec.h>
 #include <net/cls_cgroup.h>
+#include <net/netprio_cgroup.h>
 
 #include <linux/filter.h>
 
@@ -134,6 +136,46 @@
 #include <net/tcp.h>
 #endif
 
+static DEFINE_MUTEX(proto_list_mutex);
+static LIST_HEAD(proto_list);
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	struct proto *proto;
+	int ret = 0;
+
+	mutex_lock(&proto_list_mutex);
+	list_for_each_entry(proto, &proto_list, node) {
+		if (proto->init_cgroup) {
+			ret = proto->init_cgroup(cgrp, ss);
+			if (ret)
+				goto out;
+		}
+	}
+
+	mutex_unlock(&proto_list_mutex);
+	return ret;
+out:
+	list_for_each_entry_continue_reverse(proto, &proto_list, node)
+		if (proto->destroy_cgroup)
+			proto->destroy_cgroup(cgrp, ss);
+	mutex_unlock(&proto_list_mutex);
+	return ret;
+}
+
+void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	struct proto *proto;
+
+	mutex_lock(&proto_list_mutex);
+	list_for_each_entry_reverse(proto, &proto_list, node)
+		if (proto->destroy_cgroup)
+			proto->destroy_cgroup(cgrp, ss);
+	mutex_unlock(&proto_list_mutex);
+}
+#endif
+
 /*
  * Each address family might have different locking rules, so we have
  * one slock key per address family:
@@ -141,6 +183,9 @@
 static struct lock_class_key af_family_keys[AF_MAX];
 static struct lock_class_key af_family_slock_keys[AF_MAX];
 
+struct jump_label_key memcg_socket_limit_enabled;
+EXPORT_SYMBOL(memcg_socket_limit_enabled);
+
 /*
  * Make lock validator output more readable. (we pre-construct these
  * strings build-time, so that runtime initialization of socket
@@ -221,10 +266,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 EXPORT_SYMBOL(sysctl_optmem_max);
 
-#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
+#if defined(CONFIG_CGROUPS)
+#if !defined(CONFIG_NET_CLS_CGROUP)
 int net_cls_subsys_id = -1;
 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
 #endif
+#if !defined(CONFIG_NETPRIO_CGROUP)
+int net_prio_subsys_id = -1;
+EXPORT_SYMBOL_GPL(net_prio_subsys_id);
+#endif
+#endif
 
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 {
@@ -269,14 +320,14 @@ static void sock_warn_obsolete_bsdism(const char *name)
 	}
 }
 
-static void sock_disable_timestamp(struct sock *sk, int flag)
+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+
+static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 {
-	if (sock_flag(sk, flag)) {
-		sock_reset_flag(sk, flag);
-		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
-		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
+	if (sk->sk_flags & flags) {
+		sk->sk_flags &= ~flags;
+		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 			net_disable_timestamp();
-		}
 	}
 }
 
@@ -288,11 +339,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	unsigned long flags;
 	struct sk_buff_head *list = &sk->sk_receive_queue;
 
-	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
-	   number of warnings when compiling with -W --ANK
-	 */
-	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
-	    (unsigned)sk->sk_rcvbuf) {
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 		atomic_inc(&sk->sk_drops);
 		trace_sock_rcvqueue_full(sk, skb);
 		return -ENOMEM;
@@ -682,7 +729,7 @@ set_rcvbuf:
 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
 		else
 			sock_disable_timestamp(sk,
-					       SOCK_TIMESTAMPING_RX_SOFTWARE);
+					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 				  val & SOF_TIMESTAMPING_SOFTWARE);
 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
@@ -740,6 +787,11 @@ set_rcvbuf:
 	case SO_RXQ_OVFL:
 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 		break;
+
+	case SO_WIFI_STATUS:
+		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -961,6 +1013,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
 		break;
 
+	case SO_WIFI_STATUS:
+		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -1111,6 +1167,18 @@ void sock_update_classid(struct sock *sk)
 		sk->sk_classid = classid;
 }
 EXPORT_SYMBOL(sock_update_classid);
+
+void sock_update_netprioidx(struct sock *sk)
+{
+	struct cgroup_netprio_state *state;
+	if (in_interrupt())
+		return;
+	rcu_read_lock();
+	state = task_netprio_state(current);
+	sk->sk_cgrp_prioidx = state ? state->prioidx : 0;
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(sock_update_netprioidx);
 #endif
 
 /**
@@ -1138,6 +1206,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		atomic_set(&sk->sk_wmem_alloc, 1);
 
 		sock_update_classid(sk);
+		sock_update_netprioidx(sk);
 	}
 
 	return sk;
@@ -1158,8 +1227,7 @@ static void __sk_free(struct sock *sk)
 		RCU_INIT_POINTER(sk->sk_filter, NULL);
 	}
 
-	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
-	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
+	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
 
 	if (atomic_read(&sk->sk_omem_alloc))
 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
@@ -1204,7 +1272,14 @@ void sk_release_kernel(struct sock *sk)
 }
 EXPORT_SYMBOL(sk_release_kernel);
 
-struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+/**
+ *	sk_clone_lock - clone a socket, and lock its clone
+ *	@sk: the socket to clone
+ *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 {
 	struct sock *newsk;
 
@@ -1288,16 +1363,15 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		newsk->sk_wq = NULL;
 
 		if (newsk->sk_prot->sockets_allocated)
-			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
+			sk_sockets_allocated_inc(newsk);
 
-		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
-		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
 			net_enable_timestamp();
 	}
 out:
 	return newsk;
 }
-EXPORT_SYMBOL_GPL(sk_clone);
+EXPORT_SYMBOL_GPL(sk_clone_lock);
 
 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
@@ -1677,30 +1751,34 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 	struct proto *prot = sk->sk_prot;
 	int amt = sk_mem_pages(size);
 	long allocated;
+	int parent_status = UNDER_LIMIT;
 
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-	allocated = atomic_long_add_return(amt, prot->memory_allocated);
+
+	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
 
 	/* Under limit. */
-	if (allocated <= prot->sysctl_mem[0]) {
-		if (prot->memory_pressure && *prot->memory_pressure)
-			*prot->memory_pressure = 0;
+	if (parent_status == UNDER_LIMIT &&
+			allocated <= sk_prot_mem_limits(sk, 0)) {
+		sk_leave_memory_pressure(sk);
 		return 1;
 	}
 
-	/* Under pressure. */
-	if (allocated > prot->sysctl_mem[1])
-		if (prot->enter_memory_pressure)
-			prot->enter_memory_pressure(sk);
+	/* Under pressure. (we or our parents) */
+	if ((parent_status > SOFT_LIMIT) ||
+			allocated > sk_prot_mem_limits(sk, 1))
+		sk_enter_memory_pressure(sk);
 
-	/* Over hard limit. */
-	if (allocated > prot->sysctl_mem[2])
+	/* Over hard limit (we or our parents) */
+	if ((parent_status == OVER_LIMIT) ||
+			(allocated > sk_prot_mem_limits(sk, 2)))
 		goto suppress_allocation;
 
 	/* guarantee minimum buffer size under pressure */
 	if (kind == SK_MEM_RECV) {
 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
 			return 1;
+
 	} else { /* SK_MEM_SEND */
 		if (sk->sk_type == SOCK_STREAM) {
 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
@@ -1710,13 +1788,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 				return 1;
 	}
 
-	if (prot->memory_pressure) {
+	if (sk_has_memory_pressure(sk)) {
 		int alloc;
 
-		if (!*prot->memory_pressure)
+		if (!sk_under_memory_pressure(sk))
 			return 1;
-		alloc = percpu_counter_read_positive(prot->sockets_allocated);
-		if (prot->sysctl_mem[2] > alloc *
+		alloc = sk_sockets_allocated_read_positive(sk);
+		if (sk_prot_mem_limits(sk, 2) > alloc *
 		    sk_mem_pages(sk->sk_wmem_queued +
 				 atomic_read(&sk->sk_rmem_alloc) +
 				 sk->sk_forward_alloc))
@@ -1739,7 +1817,9 @@ suppress_allocation:
 
 	/* Alas. Undo changes. */
 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-	atomic_long_sub(amt, prot->memory_allocated);
+
+	sk_memory_allocated_sub(sk, amt, parent_status);
+
 	return 0;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1750,15 +1830,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
  */
 void __sk_mem_reclaim(struct sock *sk)
 {
-	struct proto *prot = sk->sk_prot;
-
-	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
-		   prot->memory_allocated);
+	sk_memory_allocated_sub(sk,
+				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0);
 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
 
-	if (prot->memory_pressure && *prot->memory_pressure &&
-	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
-		*prot->memory_pressure = 0;
+	if (sk_under_memory_pressure(sk) &&
+	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
+		sk_leave_memory_pressure(sk);
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
@@ -2129,16 +2207,15 @@ EXPORT_SYMBOL(sock_get_timestampns);
 void sock_enable_timestamp(struct sock *sk, int flag)
 {
 	if (!sock_flag(sk, flag)) {
+		unsigned long previous_flags = sk->sk_flags;
+
 		sock_set_flag(sk, flag);
 		/*
 		 * we just set one of the two flags which require net
 		 * time stamping, but time stamping might have been on
 		 * already because of the other one
 		 */
-		if (!sock_flag(sk,
-				flag == SOCK_TIMESTAMP ?
-				SOCK_TIMESTAMPING_RX_SOFTWARE :
-				SOCK_TIMESTAMP))
+		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
 			net_enable_timestamp();
 	}
 }
@@ -2250,9 +2327,6 @@ void sk_common_release(struct sock *sk)
 }
 EXPORT_SYMBOL(sk_common_release);
 
-static DEFINE_RWLOCK(proto_list_lock);
-static LIST_HEAD(proto_list);
-
 #ifdef CONFIG_PROC_FS
 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
 struct prot_inuse {
@@ -2401,10 +2475,10 @@ int proto_register(struct proto *prot, int alloc_slab)
 		}
 	}
 
-	write_lock(&proto_list_lock);
+	mutex_lock(&proto_list_mutex);
 	list_add(&prot->node, &proto_list);
 	assign_proto_idx(prot);
-	write_unlock(&proto_list_lock);
+	mutex_unlock(&proto_list_mutex);
 	return 0;
 
 out_free_timewait_sock_slab_name:
@@ -2427,10 +2501,10 @@ EXPORT_SYMBOL(proto_register);
 
 void proto_unregister(struct proto *prot)
 {
-	write_lock(&proto_list_lock);
+	mutex_lock(&proto_list_mutex);
 	release_proto_idx(prot);
 	list_del(&prot->node);
-	write_unlock(&proto_list_lock);
+	mutex_unlock(&proto_list_mutex);
 
 	if (prot->slab != NULL) {
 		kmem_cache_destroy(prot->slab);
@@ -2453,9 +2527,9 @@ EXPORT_SYMBOL(proto_unregister);
 
 #ifdef CONFIG_PROC_FS
 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(proto_list_lock)
+	__acquires(proto_list_mutex)
 {
-	read_lock(&proto_list_lock);
+	mutex_lock(&proto_list_mutex);
 	return seq_list_start_head(&proto_list, *pos);
 }
 
@@ -2465,25 +2539,36 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void proto_seq_stop(struct seq_file *seq, void *v)
-	__releases(proto_list_lock)
+	__releases(proto_list_mutex)
 {
-	read_unlock(&proto_list_lock);
+	mutex_unlock(&proto_list_mutex);
 }
 
 static char proto_method_implemented(const void *method)
 {
 	return method == NULL ? 'n' : 'y';
 }
+static long sock_prot_memory_allocated(struct proto *proto)
+{
+	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
+}
+
+static char *sock_prot_memory_pressure(struct proto *proto)
+{
+	return proto->memory_pressure != NULL ?
+	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
+}
 
 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
 {
+
 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
 		   proto->name,
 		   proto->obj_size,
 		   sock_prot_inuse_get(seq_file_net(seq), proto),
-		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
-		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+		   sock_prot_memory_allocated(proto),
+		   sock_prot_memory_pressure(proto),
 		   proto->max_header,
 		   proto->slab == NULL ? "no" : "yes",
 		   module_name(proto->owner),
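Note: the kernel-doc added above for sk_clone_lock() says the clone is returned locked and that the caller must unlock it, even on the caller's own error paths. A minimal usage sketch of that calling convention, not taken from this commit (the wrapper name and the init step are illustrative; only sk_clone_lock() and bh_unlock_sock() come from the diff):

static struct sock *my_clone_helper(struct sock *sk)	/* hypothetical caller */
{
	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);

	if (!newsk)
		return NULL;	/* allocation failed: no clone exists, nothing to unlock */

	/* ... protocol-specific initialisation of newsk would go here;
	 * if it fails, newsk must still be unlocked before it is released ...
	 */

	bh_unlock_sock(newsk);	/* caller unlocks, success or failure */
	return newsk;
}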
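Note: mem_cgroup_sockets_init() above follows a common unwind idiom: on failure, list_for_each_entry_continue_reverse() walks back over only the protocols whose init_cgroup() already succeeded (the failing entry itself is skipped), invoking destroy_cgroup() in reverse order. A hedged sketch of the same pattern with illustrative names (my_item, my_init and my_undo are not kernel APIs):

static int init_all(struct list_head *head)
{
	struct my_item *it;	/* assumed to embed a 'struct list_head node' */
	int ret = 0;

	list_for_each_entry(it, head, node) {
		ret = my_init(it);
		if (ret)
			goto undo;	/* 'it' itself was never initialised */
	}
	return 0;
undo:
	/* revisit only the predecessors of 'it', newest first */
	list_for_each_entry_continue_reverse(it, head, node)
		my_undo(it);
	return ret;
}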