diff options
Diffstat (limited to 'net/ipv4/tcp.c')
| -rw-r--r-- | net/ipv4/tcp.c | 363 | 
1 files changed, 335 insertions, 28 deletions
/*
 * Translate a duration in seconds into a retransmit count.
 *
 * The first retransmit interval is @timeout; every following interval is
 * double the previous one, capped at @rto_max.  The result is the number
 * of intervals needed for their sum to reach @seconds, limited to 255 so
 * that it fits the u8 return type.  A non-positive @seconds yields 0.
 */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	int step = timeout;	/* length of the current interval */
	int elapsed = timeout;	/* sum of all intervals so far */
	u8 count;

	if (seconds <= 0)
		return 0;

	for (count = 1; elapsed < seconds && count < 255; count++) {
		step <<= 1;
		if (step > rto_max)
			step = rto_max;
		elapsed += step;
	}
	return count;
}

/*
 * Translate a retransmit count back into seconds.
 *
 * Sums @retrans intervals, starting at @timeout and doubling each time
 * up to the @rto_max cap.  A count of 0 yields 0.
 */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int step = timeout;	/* length of the current interval */
	int total;
	unsigned int i;

	if (retrans == 0)
		return 0;

	total = timeout;
	for (i = 1; i < retrans; i++) {
		step <<= 1;
		if (step > rto_max)
			step = rto_max;
		total += step;
	}
	return total;
}
*/ -			if (before(*seq, TCP_SKB_CB(skb)->seq)) { -				printk(KERN_INFO "recvmsg bug: copied %X " -				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); +			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), +			     KERN_INFO "recvmsg bug: copied %X " +				       "seq %X rcvnxt %X fl %X\n", *seq, +				       TCP_SKB_CB(skb)->seq, tp->rcv_nxt, +				       flags))  				break; -			} +  			offset = *seq - TCP_SKB_CB(skb)->seq;  			if (tcp_hdr(skb)->syn)  				offset--; @@ -1405,7 +1447,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  				goto found_ok_skb;  			if (tcp_hdr(skb)->fin)  				goto found_fin_ok; -			WARN_ON(!(flags & MSG_PEEK)); +			WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: " +					"copied %X seq %X rcvnxt %X fl %X\n", +					*seq, TCP_SKB_CB(skb)->seq, +					tp->rcv_nxt, flags);  		}  		/* Well, if we have backlog, try to process it now yet. */ @@ -1998,7 +2043,7 @@ int tcp_disconnect(struct sock *sk, int flags)  	__skb_queue_purge(&sk->sk_async_wait_queue);  #endif -	inet->dport = 0; +	inet->inet_dport = 0;  	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))  		inet_reset_saddr(sk); @@ -2015,6 +2060,7 @@ int tcp_disconnect(struct sock *sk, int flags)  	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;  	tp->snd_cwnd_cnt = 0;  	tp->bytes_acked = 0; +	tp->window_clamp = 0;  	tcp_set_ca_state(sk, TCP_CA_Open);  	tcp_clear_retrans(tp);  	inet_csk_delack_init(sk); @@ -2022,7 +2068,7 @@ int tcp_disconnect(struct sock *sk, int flags)  	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));  	__sk_dst_reset(sk); -	WARN_ON(inet->num && !icsk->icsk_bind_hash); +	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);  	sk->sk_error_report(sk);  	return err; @@ -2039,8 +2085,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  	int val;  	int err = 0; -	/* This is a string value all the others are int's */ -	if (optname == TCP_CONGESTION) { +	/* These are data/string values, all the others are ints */ +	switch (optname) { +	case TCP_CONGESTION: {  		char 
name[TCP_CA_NAME_MAX];  		if (optlen < 1) @@ -2057,6 +2104,93 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  		release_sock(sk);  		return err;  	} +	case TCP_COOKIE_TRANSACTIONS: { +		struct tcp_cookie_transactions ctd; +		struct tcp_cookie_values *cvp = NULL; + +		if (sizeof(ctd) > optlen) +			return -EINVAL; +		if (copy_from_user(&ctd, optval, sizeof(ctd))) +			return -EFAULT; + +		if (ctd.tcpct_used > sizeof(ctd.tcpct_value) || +		    ctd.tcpct_s_data_desired > TCP_MSS_DESIRED) +			return -EINVAL; + +		if (ctd.tcpct_cookie_desired == 0) { +			/* default to global value */ +		} else if ((0x1 & ctd.tcpct_cookie_desired) || +			   ctd.tcpct_cookie_desired > TCP_COOKIE_MAX || +			   ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) { +			return -EINVAL; +		} + +		if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) { +			/* Supercedes all other values */ +			lock_sock(sk); +			if (tp->cookie_values != NULL) { +				kref_put(&tp->cookie_values->kref, +					 tcp_cookie_values_release); +				tp->cookie_values = NULL; +			} +			tp->rx_opt.cookie_in_always = 0; /* false */ +			tp->rx_opt.cookie_out_never = 1; /* true */ +			release_sock(sk); +			return err; +		} + +		/* Allocate ancillary memory before locking. +		 */ +		if (ctd.tcpct_used > 0 || +		    (tp->cookie_values == NULL && +		     (sysctl_tcp_cookie_size > 0 || +		      ctd.tcpct_cookie_desired > 0 || +		      ctd.tcpct_s_data_desired > 0))) { +			cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used, +				      GFP_KERNEL); +			if (cvp == NULL) +				return -ENOMEM; +		} +		lock_sock(sk); +		tp->rx_opt.cookie_in_always = +			(TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags); +		tp->rx_opt.cookie_out_never = 0; /* false */ + +		if (tp->cookie_values != NULL) { +			if (cvp != NULL) { +				/* Changed values are recorded by a changed +				 * pointer, ensuring the cookie will differ, +				 * without separately hashing each value later. 
+				 */ +				kref_put(&tp->cookie_values->kref, +					 tcp_cookie_values_release); +				kref_init(&cvp->kref); +				tp->cookie_values = cvp; +			} else { +				cvp = tp->cookie_values; +			} +		} +		if (cvp != NULL) { +			cvp->cookie_desired = ctd.tcpct_cookie_desired; + +			if (ctd.tcpct_used > 0) { +				memcpy(cvp->s_data_payload, ctd.tcpct_value, +				       ctd.tcpct_used); +				cvp->s_data_desired = ctd.tcpct_used; +				cvp->s_data_constant = 1; /* true */ +			} else { +				/* No constant payload data. */ +				cvp->s_data_desired = ctd.tcpct_s_data_desired; +				cvp->s_data_constant = 0; /* false */ +			} +		} +		release_sock(sk); +		return err; +	} +	default: +		/* fallthru */ +		break; +	};  	if (optlen < sizeof(int))  		return -EINVAL; @@ -2163,16 +2297,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  		break;  	case TCP_DEFER_ACCEPT: -		icsk->icsk_accept_queue.rskq_defer_accept = 0; -		if (val > 0) { -			/* Translate value in seconds to number of -			 * retransmits */ -			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && -			       val > ((TCP_TIMEOUT_INIT / HZ) << -				       icsk->icsk_accept_queue.rskq_defer_accept)) -				icsk->icsk_accept_queue.rskq_defer_accept++; -			icsk->icsk_accept_queue.rskq_defer_accept++; -		} +		/* Translate value in seconds to number of retransmits */ +		icsk->icsk_accept_queue.rskq_defer_accept = +			secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, +					TCP_RTO_MAX / HZ);  		break;  	case TCP_WINDOW_CLAMP: @@ -2353,8 +2481,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,  			val = (val ? : sysctl_tcp_fin_timeout) / HZ;  		break;  	case TCP_DEFER_ACCEPT: -		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 
/*
 * Each Responder maintains up to two secret values concurrently for
 * efficient secret rollover.  Each secret value has 4 states:
 *
 * Generating.  (tcp_secret_generating != tcp_secret_primary)
 *    Generates new Responder-Cookies, but not yet used for primary
 *    verification.  This is a short-term state, typically lasting only
 *    one round trip time (RTT).
 *
 * Primary.  (tcp_secret_generating == tcp_secret_primary)
 *    Used both for generation and primary verification.
 *
 * Retiring.  (tcp_secret_retiring != tcp_secret_secondary)
 *    Used for verification, until the first failure that can be
 *    verified by the newer Generating secret.  At that time, this
 *    cookie's state is changed to Secondary, and the Generating
 *    cookie's state is changed to Primary.  This is a short-term state,
 *    typically lasting only one round trip time (RTT).
 *
 * Secondary.  (tcp_secret_retiring == tcp_secret_secondary)
 *    Used for secondary verification, after primary verification
 *    failures.  This state lasts no more than twice the Maximum Segment
 *    Lifetime (2MSL).  Then, the secret is discarded.
 */
struct tcp_cookie_secret {
	/* The secret is divided into two parts.  The digest part is the
	 * equivalent of previously hashing a secret and saving the state,
	 * and serves as an initialization vector (IV).  The message part
	 * serves as the trailing secret.
	 *
	 * The array holds COOKIE_WORKSPACE_WORDS u32 words; its size in
	 * bytes is sizeof(secrets), not COOKIE_WORKSPACE_WORDS.
	 */
	u32				secrets[COOKIE_WORKSPACE_WORDS];
	/* jiffies value at or after which tcp_cookie_generator() treats
	 * this secret as due for refresh (compared with time_after_eq()).
	 */
	unsigned long			expires;
};

/* Offsets added to the current jiffies value when
 * tcp_cookie_generator() sets a secret's expiry.
 */
#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
#define TCP_SECRET_LIFE (HZ * 600)

/* Static storage for the two secrets; the pointers below refer only to
 * these two objects.
 */
static struct tcp_cookie_secret tcp_secret_one;
static struct tcp_cookie_secret tcp_secret_two;

/* Essentially a circular list, without dynamic allocation.
 * tcp_init() starts with generating == primary == &tcp_secret_one and
 * retiring == secondary == &tcp_secret_two.
 */
static struct tcp_cookie_secret *tcp_secret_generating;
static struct tcp_cookie_secret *tcp_secret_primary;
static struct tcp_cookie_secret *tcp_secret_retiring;
static struct tcp_cookie_secret *tcp_secret_secondary;

/* Held (spin_lock_bh) by tcp_cookie_generator() while it refreshes a
 * secret and updates the pointers above.
 */
static DEFINE_SPINLOCK(tcp_secret_locker);
+ */ +static inline u32 tcp_cookie_work(const u32 *ws, const int n) +{ +	return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])]; +} + +/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed. + * Called in softirq context. + * Returns: 0 for success. + */ +int tcp_cookie_generator(u32 *bakery) +{ +	unsigned long jiffy = jiffies; + +	if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) { +		spin_lock_bh(&tcp_secret_locker); +		if (!time_after_eq(jiffy, tcp_secret_generating->expires)) { +			/* refreshed by another */ +			memcpy(bakery, +			       &tcp_secret_generating->secrets[0], +			       COOKIE_WORKSPACE_WORDS); +		} else { +			/* still needs refreshing */ +			get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS); + +			/* The first time, paranoia assumes that the +			 * randomization function isn't as strong.  But, +			 * this secret initialization is delayed until +			 * the last possible moment (packet arrival). +			 * Although that time is observable, it is +			 * unpredictably variable.  Mash in the most +			 * volatile clock bits available, and expire the +			 * secret extra quickly. 
+			 */ +			if (unlikely(tcp_secret_primary->expires == +				     tcp_secret_secondary->expires)) { +				struct timespec tv; + +				getnstimeofday(&tv); +				bakery[COOKIE_DIGEST_WORDS+0] ^= +					(u32)tv.tv_nsec; + +				tcp_secret_secondary->expires = jiffy +					+ TCP_SECRET_1MSL +					+ (0x0f & tcp_cookie_work(bakery, 0)); +			} else { +				tcp_secret_secondary->expires = jiffy +					+ TCP_SECRET_LIFE +					+ (0xff & tcp_cookie_work(bakery, 1)); +				tcp_secret_primary->expires = jiffy +					+ TCP_SECRET_2MSL +					+ (0x1f & tcp_cookie_work(bakery, 2)); +			} +			memcpy(&tcp_secret_secondary->secrets[0], +			       bakery, COOKIE_WORKSPACE_WORDS); + +			rcu_assign_pointer(tcp_secret_generating, +					   tcp_secret_secondary); +			rcu_assign_pointer(tcp_secret_retiring, +					   tcp_secret_primary); +			/* +			 * Neither call_rcu() nor synchronize_rcu() needed. +			 * Retiring data is not freed.  It is replaced after +			 * further (locked) pointer updates, and a quiet time +			 * (minimum 1MSL, maximum LIFE - 2MSL). +			 */ +		} +		spin_unlock_bh(&tcp_secret_locker); +	} else { +		rcu_read_lock_bh(); +		memcpy(bakery, +		       &rcu_dereference(tcp_secret_generating)->secrets[0], +		       COOKIE_WORKSPACE_WORDS); +		rcu_read_unlock_bh(); +	} +	return 0; +} +EXPORT_SYMBOL(tcp_cookie_generator); +  void tcp_done(struct sock *sk)  {  	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) @@ -2843,6 +3141,7 @@ void __init tcp_init(void)  	struct sk_buff *skb = NULL;  	unsigned long nr_pages, limit;  	int order, i, max_share; +	unsigned long jiffy = jiffies;  	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -2865,11 +3164,10 @@ void __init tcp_init(void)  					(totalram_pages >= 128 * 1024) ?  					13 : 15,  					0, -					&tcp_hashinfo.ehash_size,  					NULL, +					&tcp_hashinfo.ehash_mask,  					thash_entries ? 
0 : 512 * 1024); -	tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; -	for (i = 0; i < tcp_hashinfo.ehash_size; i++) { +	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {  		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);  		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);  	} @@ -2878,7 +3176,7 @@ void __init tcp_init(void)  	tcp_hashinfo.bhash =  		alloc_large_system_hash("TCP bind",  					sizeof(struct inet_bind_hashbucket), -					tcp_hashinfo.ehash_size, +					tcp_hashinfo.ehash_mask + 1,  					(totalram_pages >= 128 * 1024) ?  					13 : 15,  					0, @@ -2933,10 +3231,19 @@ void __init tcp_init(void)  	sysctl_tcp_rmem[2] = max(87380, max_share);  	printk(KERN_INFO "TCP: Hash tables configured " -	       "(established %d bind %d)\n", -	       tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); +	       "(established %u bind %u)\n", +	       tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);  	tcp_register_congestion_control(&tcp_reno); + +	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); +	memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets)); +	tcp_secret_one.expires = jiffy; /* past due */ +	tcp_secret_two.expires = jiffy; /* past due */ +	tcp_secret_generating = &tcp_secret_one; +	tcp_secret_primary = &tcp_secret_one; +	tcp_secret_retiring = &tcp_secret_two; +	tcp_secret_secondary = &tcp_secret_two;  }  EXPORT_SYMBOL(tcp_close);  |