| author | Jerry Chu <hkchu@google.com> | 2012-08-31 12:29:13 +0000 | 
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2012-08-31 20:02:19 -0400 | 
| commit | 168a8f58059a22feb9e9a2dcc1b8053dbbbc12ef (patch) | |
| tree | 0d5b9181b840c9b6b08b1452004f0746e8eebab8 /net/ipv4/tcp_ipv4.c | |
| parent | 8336886f786fdacbc19b719c1f7ea91eb70706d4 (diff) | |
tcp: TCP Fast Open Server - main code path
This patch adds the main processing path to complete the TFO server
patches.
A TFO request (i.e., SYN+data packet with a TFO cookie option) first
gets processed in tcp_v4_conn_request(). If it passes the various TFO
checks by tcp_fastopen_check(), a child socket will be created right
away to be accepted by applications, rather than waiting for the 3WHS
to finish.
In addition to the use of the TFO cookie, a simple max_qlen based scheme
is put in place to fend off spoofed TFO attacks.
When a valid ACK comes back to tcp_rcv_state_process(), it will cause
the state of the child socket to switch from either TCP_SYN_RECV to
TCP_ESTABLISHED, or TCP_FIN_WAIT1 to TCP_FIN_WAIT2. At this time
retransmission will resume for any unack'ed (data, FIN,...) segments.
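For context, a minimal user-space sketch (not part of this patch) of a listener opting into server-side TFO. It assumes the TCP_FASTOPEN socket option and the net.ipv4.tcp_fastopen server-enable bit added elsewhere in the TFO server series; the qlen value handed to setsockopt() is what tcp_fastopen_check() later enforces as fastopenq->max_qlen.

```c
/* Sketch of a TFO-enabled listener (assumes TCP_FASTOPEN from the TFO
 * listener patches; the server must also be enabled via the
 * net.ipv4.tcp_fastopen sysctl).
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef TCP_FASTOPEN
#define TCP_FASTOPEN 23		/* may be missing from older libc headers */
#endif

int main(void)
{
	struct sockaddr_in addr;
	int qlen = 16;		/* becomes fastopenq->max_qlen */
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);

	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("socket/bind");
		return 1;
	}
	/* Request TFO on this listener before listen(). */
	if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0)
		perror("setsockopt(TCP_FASTOPEN)");
	if (listen(fd, 128) < 0) {
		perror("listen");
		return 1;
	}
	/* accept() may now return child sockets created directly from
	 * SYN+data, before the 3WHS completes.
	 */
	close(fd);
	return 0;
}
```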
Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 265 | 
1 file changed, 251 insertions(+), 14 deletions(-)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bb148dee1ed..e64abed249c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -352,6 +352,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 	const int code = icmp_hdr(icmp_skb)->code;
 	struct sock *sk;
 	struct sk_buff *skb;
+	struct request_sock *req;
 	__u32 seq;
 	__u32 remaining;
 	int err;
@@ -394,9 +395,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 
 	icsk = inet_csk(sk);
 	tp = tcp_sk(sk);
+	req = tp->fastopen_rsk;
 	seq = ntohl(th->seq);
 	if (sk->sk_state != TCP_LISTEN &&
-	    !between(seq, tp->snd_una, tp->snd_nxt)) {
+	    !between(seq, tp->snd_una, tp->snd_nxt) &&
+	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
+		/* For a Fast Open socket, allow seq to be snt_isn. */
 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 		goto out;
 	}
@@ -435,6 +439,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		    !icsk->icsk_backoff)
 			break;
 
+		/* XXX (TFO) - revisit the following logic for TFO */
+
 		if (sock_owned_by_user(sk))
 			break;
 
@@ -466,6 +472,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		goto out;
 	}
 
+	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
+	 * than following the TCP_SYN_RECV case and closing the socket,
+	 * we ignore the ICMP error and keep trying like a fully established
+	 * socket. Is this the right thing to do?
+	 */
+	if (req && req->sk == NULL)
+		goto out;
+
 	switch (sk->sk_state) {
 		struct request_sock *req, **prev;
 	case TCP_LISTEN:
@@ -498,7 +512,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 
 	case TCP_SYN_SENT:
 	case TCP_SYN_RECV:  /* Cannot happen.
-			       It can f.e. if SYNs crossed.
+			       It can f.e. if SYNs crossed,
+			       or Fast Open.
 			     */
 		if (!sock_owned_by_user(sk)) {
 			sk->sk_err = err;
@@ -809,8 +824,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req)
 {
-	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
-			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
+	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+	 */
+	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
+			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
 			req->ts_recent,
 			0,
 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -1272,6 +1291,178 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 };
 #endif
 
+static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
+			       struct request_sock *req,
+			       struct tcp_fastopen_cookie *foc,
+			       struct tcp_fastopen_cookie *valid_foc)
+{
+	bool skip_cookie = false;
+	struct fastopen_queue *fastopenq;
+
+	if (likely(!fastopen_cookie_present(foc))) {
+		/* See include/net/tcp.h for the meaning of these knobs */
+		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
+		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
+		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
+			skip_cookie = true; /* no cookie to validate */
+		else
+			return false;
+	}
+	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
+	/* A FO option is present; bump the counter. */
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
+
+	/* Make sure the listener has enabled fastopen, and we don't
+	 * exceed the max # of pending TFO requests allowed before trying
+	 * to validating the cookie in order to avoid burning CPU cycles
+	 * unnecessarily.
+	 *
+	 * XXX (TFO) - The implication of checking the max_qlen before
+	 * processing a cookie request is that clients can't differentiate
+	 * between qlen overflow causing Fast Open to be disabled
+	 * temporarily vs a server not supporting Fast Open at all.
+	 */
+	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
+	    fastopenq == NULL || fastopenq->max_qlen == 0)
+		return false;
+
+	if (fastopenq->qlen >= fastopenq->max_qlen) {
+		struct request_sock *req1;
+		spin_lock(&fastopenq->lock);
+		req1 = fastopenq->rskq_rst_head;
+		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
+			spin_unlock(&fastopenq->lock);
+			NET_INC_STATS_BH(sock_net(sk),
+			    LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
+			foc->len = -1;
+			return false;
+		}
+		fastopenq->rskq_rst_head = req1->dl_next;
+		fastopenq->qlen--;
+		spin_unlock(&fastopenq->lock);
+		reqsk_free(req1);
+	}
+	if (skip_cookie) {
+		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		return true;
+	}
+	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
+		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
+			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
+			    memcmp(&foc->val[0], &valid_foc->val[0],
+			    TCP_FASTOPEN_COOKIE_SIZE) != 0)
+				return false;
+			valid_foc->len = -1;
+		}
+		/* Acknowledge the data received from the peer. */
+		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		return true;
+	} else if (foc->len == 0) { /* Client requesting a cookie */
+		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+		NET_INC_STATS_BH(sock_net(sk),
+		    LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+	} else {
+		/* Client sent a cookie with wrong size. Treat it
+		 * the same as invalid and return a valid one.
+		 */
+		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+	}
+	return false;
+}
+
+static int tcp_v4_conn_req_fastopen(struct sock *sk,
+				    struct sk_buff *skb,
+				    struct sk_buff *skb_synack,
+				    struct request_sock *req,
+				    struct request_values *rvp)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct sock *child;
+
+	req->retrans = 0;
+	req->sk = NULL;
+
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+	if (child == NULL) {
+		NET_INC_STATS_BH(sock_net(sk),
+				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+		kfree_skb(skb_synack);
+		return -1;
+	}
+	ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
+			ireq->rmt_addr, ireq->opt);
+	/* XXX (TFO) - is it ok to ignore error and continue? */
+
+	spin_lock(&queue->fastopenq->lock);
+	queue->fastopenq->qlen++;
+	spin_unlock(&queue->fastopenq->lock);
+
+	/* Initialize the child socket. Have to fix some values to take
+	 * into account the child is a Fast Open socket and is created
+	 * only out of the bits carried in the SYN packet.
+	 */
+	tp = tcp_sk(child);
+
+	tp->fastopen_rsk = req;
+	/* Do a hold on the listner sk so that if the listener is being
+	 * closed, the child that has been accepted can live on and still
+	 * access listen_lock.
+	 */
+	sock_hold(sk);
+	tcp_rsk(req)->listener = sk;
+
+	/* RFC1323: The window in SYN & SYN/ACK segments is never
+	 * scaled. So correct it appropriately.
+	 */
+	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+
+	/* Activate the retrans timer so that SYNACK can be retransmitted.
+	 * The request socket is not added to the SYN table of the parent
+	 * because it's been added to the accept queue directly.
+	 */
+	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+	    TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+
+	/* Add the child socket directly into the accept queue */
+	inet_csk_reqsk_queue_add(sk, req, child);
+
+	/* Now finish processing the fastopen child socket. */
+	inet_csk(child)->icsk_af_ops->rebuild_header(child);
+	tcp_init_congestion_control(child);
+	tcp_mtup_init(child);
+	tcp_init_buffer_space(child);
+	tcp_init_metrics(child);
+
+	/* Queue the data carried in the SYN packet. We need to first
+	 * bump skb's refcnt because the caller will attempt to free it.
+	 *
+	 * XXX (TFO) - we honor a zero-payload TFO request for now.
+	 * (Any reason not to?)
+	 */
+	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
+		/* Don't queue the skb if there is no payload in SYN.
+		 * XXX (TFO) - How about SYN+FIN?
+		 */
+		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+	} else {
+		skb = skb_get(skb);
+		skb_dst_drop(skb);
+		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
+		skb_set_owner_r(skb, child);
+		__skb_queue_tail(&child->sk_receive_queue, skb);
+		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+	}
+	sk->sk_data_ready(sk, 0);
+	bh_unlock_sock(child);
+	sock_put(child);
+	WARN_ON(req->sk == NULL);
+	return 0;
+}
+
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_extend_values tmp_ext;
@@ -1285,6 +1476,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	__be32 daddr = ip_hdr(skb)->daddr;
 	__u32 isn = TCP_SKB_CB(skb)->when;
 	bool want_cookie = false;
+	struct flowi4 fl4;
+	struct tcp_fastopen_cookie foc = { .len = -1 };
+	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
+	struct sk_buff *skb_synack;
+	int do_fastopen;
 
 	/* Never answer to SYNs send to broadcast or multicast */
 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1319,7 +1515,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
-	tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
+	tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
+	    want_cookie ? NULL : &foc);
 
 	if (tmp_opt.cookie_plus > 0 &&
 	    tmp_opt.saw_tstamp &&
@@ -1377,8 +1574,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
 		req->cookie_ts = tmp_opt.tstamp_ok;
 	} else if (!isn) {
-		struct flowi4 fl4;
-
 		/* VJ's idea. We save last timestamp seen
 		 * from the destination in peer table, when entering
 		 * state TIME-WAIT, and check against it before
@@ -1419,14 +1614,52 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 
-	if (tcp_v4_send_synack(sk, dst, req,
-			       (struct request_values *)&tmp_ext,
-			       skb_get_queue_mapping(skb),
-			       want_cookie) ||
-	    want_cookie)
+	if (dst == NULL) {
+		dst = inet_csk_route_req(sk, &fl4, req);
+		if (dst == NULL)
+			goto drop_and_free;
+	}
+	do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
+
+	/* We don't call tcp_v4_send_synack() directly because we need
+	 * to make sure a child socket can be created successfully before
+	 * sending back synack!
+	 *
+	 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
+	 * (or better yet, call tcp_send_synack() in the child context
+	 * directly, but will have to fix bunch of other code first)
+	 * after syn_recv_sock() except one will need to first fix the
+	 * latter to remove its dependency on the current implementation
+	 * of tcp_v4_send_synack()->tcp_select_initial_window().
+	 */
+	skb_synack = tcp_make_synack(sk, dst, req,
+	    (struct request_values *)&tmp_ext,
+	    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
+
+	if (skb_synack) {
+		__tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
+		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
+	} else
+		goto drop_and_free;
+
+	if (likely(!do_fastopen)) {
+		int err;
+		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
+		     ireq->rmt_addr, ireq->opt);
+		err = net_xmit_eval(err);
+		if (err || want_cookie)
+			goto drop_and_free;
+
+		tcp_rsk(req)->listener = NULL;
+		/* Add the request_sock to the SYN table */
+		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		if (fastopen_cookie_present(&foc) && foc.len != 0)
+			NET_INC_STATS_BH(sock_net(sk),
+			    LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
+	    (struct request_values *)&tmp_ext))
 		goto drop_and_free;
 
-	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	return 0;
 
 drop_and_release:
@@ -1977,6 +2210,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 			 tcp_cookie_values_release);
 		tp->cookie_values = NULL;
 	}
+	BUG_ON(tp->fastopen_rsk != NULL);
 
 	/* If socket is aborted during connect operation */
 	tcp_free_fastopen_req(tp);
@@ -2425,6 +2659,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct inet_sock *inet = inet_sk(sk);
+	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
 	__be32 dest = inet->inet_daddr;
 	__be32 src = inet->inet_rcv_saddr;
 	__u16 destp = ntohs(inet->inet_dport);
@@ -2469,7 +2704,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 		jiffies_to_clock_t(icsk->icsk_ack.ato),
 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
 		tp->snd_cwnd,
-		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
+		sk->sk_state == TCP_LISTEN ?
+		    (fastopenq ? fastopenq->max_qlen : 0) :
+		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
 		len);
 }
 
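To exercise the new server path end to end, a client has to put data on the SYN. Below is a minimal sketch, assuming the client-side MSG_FASTOPEN sendto() flag added by the earlier Fast Open client patches: sendto() on an unconnected socket emits the SYN carrying the cookie option and (once a cookie is cached) the payload, which is exactly the packet that tcp_v4_conn_request() and tcp_fastopen_check() above turn into a child socket before the 3WHS completes.

```c
/* Client-side counterpart (sketch, assumes MSG_FASTOPEN from the Fast
 * Open client patches). 192.0.2.1 and port 8080 are placeholder values.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000	/* may be missing from older libc headers */
#endif

int main(void)
{
	static const char req[] = "GET / HTTP/1.0\r\n\r\n";
	struct sockaddr_in srv;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	memset(&srv, 0, sizeof(srv));
	srv.sin_family = AF_INET;
	srv.sin_port = htons(8080);
	inet_pton(AF_INET, "192.0.2.1", &srv.sin_addr);

	/* No connect(): the data rides on the SYN when a cookie is cached,
	 * otherwise the kernel falls back to a cookie-requesting handshake
	 * and sends the data afterwards.
	 */
	if (sendto(fd, req, sizeof(req) - 1, MSG_FASTOPEN,
		   (struct sockaddr *)&srv, sizeof(srv)) < 0)
		perror("sendto(MSG_FASTOPEN)");
	close(fd);
	return 0;
}
```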