Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 143
1 file changed, 59 insertions, 84 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0298f80681f..cad73b7dfef 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -66,6 +66,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
+#include <net/dst.h>
 #include <net/tcp.h>
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
@@ -113,8 +114,6 @@ int sysctl_tcp_abc __read_mostly;
 #define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
 #define FLAG_ANY_PROGRESS	(FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
 
-#define IsSackFrto() (sysctl_tcp_frto == 0x2)
-
 #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
 #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
 
@@ -605,7 +604,7 @@ static u32 tcp_rto_min(struct sock *sk)
 	u32 rto_min = TCP_RTO_MIN;
 
 	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
-		rto_min = dst->metrics[RTAX_RTO_MIN - 1];
+		rto_min = dst_metric(dst, RTAX_RTO_MIN);
 	return rto_min;
 }
 
@@ -769,7 +768,7 @@ void tcp_update_metrics(struct sock *sk)
 				dst->metrics[RTAX_RTTVAR - 1] = m;
 			else
 				dst->metrics[RTAX_RTTVAR-1] -=
-					(dst->metrics[RTAX_RTTVAR-1] - m)>>2;
+					(dst_metric(dst, RTAX_RTTVAR) - m)>>2;
 		}
 
 		if (tp->snd_ssthresh >= 0xFFFF) {
@@ -788,21 +787,21 @@ void tcp_update_metrics(struct sock *sk)
 				dst->metrics[RTAX_SSTHRESH-1] =
 					max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
 			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1;
+				dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1;
 		} else {
 			/* Else slow start did not finish, cwnd is non-sense,
 			   ssthresh may be also invalid.
 			 */
 			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1;
-			if (dst->metrics[RTAX_SSTHRESH-1] &&
+				dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1;
+			if (dst_metric(dst, RTAX_SSTHRESH) &&
 			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
-			    tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1])
+			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
 				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
 		}
 
 		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
-			if (dst->metrics[RTAX_REORDERING-1] < tp->reordering &&
+			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
 			    tp->reordering != sysctl_tcp_reordering)
 				dst->metrics[RTAX_REORDERING-1] = tp->reordering;
 		}
@@ -1172,8 +1171,8 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
 			   struct tcp_sack_block_wire *sp, int num_sacks,
 			   u32 prior_snd_una)
 {
-	u32 start_seq_0 = ntohl(get_unaligned(&sp[0].start_seq));
-	u32 end_seq_0 = ntohl(get_unaligned(&sp[0].end_seq));
+	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
+	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
 	int dup_sack = 0;
 
 	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
@@ -1181,8 +1180,8 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
 		tcp_dsack_seen(tp);
 		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
 	} else if (num_sacks > 1) {
-		u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq));
-		u32 start_seq_1 = ntohl(get_unaligned(&sp[1].start_seq));
+		u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
+		u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
 
 		if (!after(end_seq_0, end_seq_1) &&
 		    !before(start_seq_0, start_seq_1)) {
@@ -1393,9 +1392,9 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
 
 	if (before(next_dup->start_seq, skip_to_seq)) {
 		skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count);
-		tcp_sacktag_walk(skb, sk, NULL,
-				 next_dup->start_seq, next_dup->end_seq,
-				 1, fack_count, reord, flag);
+		skb = tcp_sacktag_walk(skb, sk, NULL,
+				     next_dup->start_seq, next_dup->end_seq,
+				     1, fack_count, reord, flag);
 	}
 
 	return skb;
@@ -1453,8 +1452,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
 	for (i = 0; i < num_sacks; i++) {
 		int dup_sack = !i && found_dup_sack;
 
-		sp[used_sacks].start_seq = ntohl(get_unaligned(&sp_wire[i].start_seq));
-		sp[used_sacks].end_seq = ntohl(get_unaligned(&sp_wire[i].end_seq));
+		sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
+		sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
 
 		if (!tcp_is_sackblock_valid(tp, dup_sack,
 					    sp[used_sacks].start_seq,
@@ -1685,6 +1684,11 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
 	tp->sacked_out = 0;
 }
 
+static int tcp_is_sackfrto(const struct tcp_sock *tp)
+{
+	return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp);
+}
+
 /* F-RTO can only be used if TCP has never retransmitted anything other than
  * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
  */
@@ -1701,7 +1705,7 @@ int tcp_use_frto(struct sock *sk)
 	if (icsk->icsk_mtup.probe_size)
 		return 0;
 
-	if (IsSackFrto())
+	if (tcp_is_sackfrto(tp))
 		return 1;
 
 	/* Avoid expensive walking of rexmit queue if possible */
@@ -1791,7 +1795,7 @@ void tcp_enter_frto(struct sock *sk)
 	/* Earlier loss recovery underway (see RFC4138; Appendix B).
 	 * The last condition is necessary at least in tp->frto_counter case.
 	 */
-	if (IsSackFrto() && (tp->frto_counter ||
+	if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
 	    ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
 	    after(tp->high_seq, tp->snd_una)) {
 		tp->frto_highmark = tp->high_seq;
@@ -1838,9 +1842,16 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 		}
 
-		/* Don't lost mark skbs that were fwd transmitted after RTO */
-		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) &&
-		    !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) {
+		/* Marking forward transmissions that were made after RTO lost
+		 * can cause unnecessary retransmissions in some scenarios,
+		 * SACK blocks will mitigate that in some but not in all cases.
+		 * We used to not mark them but it was causing break-ups with
+		 * receivers that do only in-order receival.
+		 *
+		 * TODO: we could detect presence of such receiver and select
+		 * different behavior per flow.
+		 */
+		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
 		}
@@ -1856,7 +1867,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
 	tp->reordering = min_t(unsigned int, tp->reordering,
 			       sysctl_tcp_reordering);
 	tcp_set_ca_state(sk, TCP_CA_Loss);
-	tp->high_seq = tp->frto_highmark;
+	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
 
 	tcp_clear_retrans_hints_partial(tp);
@@ -2472,28 +2483,34 @@ static inline void tcp_complete_cwr(struct sock *sk)
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
 
+static void tcp_try_keep_open(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int state = TCP_CA_Open;
+
+	if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
+		state = TCP_CA_Disorder;
+
+	if (inet_csk(sk)->icsk_ca_state != state) {
+		tcp_set_ca_state(sk, state);
+		tp->high_seq = tp->snd_nxt;
+	}
+}
+
 static void tcp_try_to_open(struct sock *sk, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tcp_verify_left_out(tp);
 
-	if (tp->retrans_out == 0)
+	if (!tp->frto_counter && tp->retrans_out == 0)
 		tp->retrans_stamp = 0;
 
 	if (flag & FLAG_ECE)
 		tcp_enter_cwr(sk, 1);
 
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
-		int state = TCP_CA_Open;
-
-		if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
-			state = TCP_CA_Disorder;
-
-		if (inet_csk(sk)->icsk_ca_state != state) {
-			tcp_set_ca_state(sk, state);
-			tp->high_seq = tp->snd_nxt;
-		}
+		tcp_try_keep_open(sk);
 		tcp_moderate_cwnd(tp);
 	} else {
 		tcp_cwnd_down(sk, flag);
@@ -3123,7 +3140,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
 		return 1;
 	}
 
-	if (!IsSackFrto() || tcp_is_reno(tp)) {
+	if (!tcp_is_sackfrto(tp)) {
 		/* RFC4138 shortcoming in step 2; should also have case c):
 		 * ACK isn't duplicate nor advances window, e.g., opposite dir
 		 * data, winupdate
@@ -3299,8 +3316,11 @@ no_queue:
 	return 1;
 
 old_ack:
-	if (TCP_SKB_CB(skb)->sacked)
+	if (TCP_SKB_CB(skb)->sacked) {
 		tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+		if (icsk->icsk_ca_state == TCP_CA_Open)
+			tcp_try_keep_open(sk);
+	}
 
 uninteresting_ack:
 	SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
@@ -3340,7 +3360,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 			switch (opcode) {
 			case TCPOPT_MSS:
 				if (opsize == TCPOLEN_MSS && th->syn && !estab) {
-					u16 in_mss = ntohs(get_unaligned((__be16 *)ptr));
+					u16 in_mss = get_unaligned_be16(ptr);
 					if (in_mss) {
 						if (opt_rx->user_mss &&
 						    opt_rx->user_mss < in_mss)
@@ -3369,8 +3389,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				    ((estab && opt_rx->tstamp_ok) ||
 				     (!estab && sysctl_tcp_timestamps))) {
 					opt_rx->saw_tstamp = 1;
-					opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr));
-					opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4)));
+					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
+					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
 				}
 				break;
 			case TCPOPT_SACK_PERM:
@@ -4521,49 +4541,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
 	}
 }
 
-static int tcp_defer_accept_check(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (tp->defer_tcp_accept.request) {
-		int queued_data =  tp->rcv_nxt - tp->copied_seq;
-		int hasfin =  !skb_queue_empty(&sk->sk_receive_queue) ?
-			tcp_hdr((struct sk_buff *)
-				sk->sk_receive_queue.prev)->fin : 0;
-
-		if (queued_data && hasfin)
-			queued_data--;
-
-		if (queued_data &&
-		    tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) {
-			if (sock_flag(sk, SOCK_KEEPOPEN)) {
-				inet_csk_reset_keepalive_timer(sk,
-							       keepalive_time_when(tp));
-			} else {
-				inet_csk_delete_keepalive_timer(sk);
-			}
-
-			inet_csk_reqsk_queue_add(
-				tp->defer_tcp_accept.listen_sk,
-				tp->defer_tcp_accept.request,
-				sk);
-
-			tp->defer_tcp_accept.listen_sk->sk_data_ready(
-				tp->defer_tcp_accept.listen_sk, 0);
-
-			sock_put(tp->defer_tcp_accept.listen_sk);
-			sock_put(sk);
-			tp->defer_tcp_accept.listen_sk = NULL;
-			tp->defer_tcp_accept.request = NULL;
-		} else if (hasfin ||
-			   tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) {
-			tcp_reset(sk);
-			return -1;
-		}
-	}
-	return 0;
-}
-
 static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4924,8 +4901,6 @@ step5:
 
 	tcp_data_snd_check(sk);
 	tcp_ack_snd_check(sk);
-
-	tcp_defer_accept_check(sk);
 	return 0;
 
 csum_error:
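For reference, the two mechanical conversions running through the hunks above are: dst_metric(dst, RTAX_FOO), which is the accessor form of the open-coded dst->metrics[RTAX_FOO - 1] read, and get_unaligned_be32()/get_unaligned_be16(), which fetch a big-endian value from a possibly unaligned pointer and so fold the older ntohl(get_unaligned(...)) / ntohs(get_unaligned(...)) pairs into one call. The short userspace sketch below only illustrates that byte-order behaviour; read_be32() is a local stand-in for the kernel helper, not the kernel implementation.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Local stand-in for the kernel's get_unaligned_be32(): assemble four
 * bytes from a possibly unaligned pointer in big-endian (network) order. */
static uint32_t read_be32(const void *p)
{
	const uint8_t *b = p;

	return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
	       ((uint32_t)b[2] << 8)  |  (uint32_t)b[3];
}

int main(void)
{
	/* A made-up option buffer: the value 0x01020304 stored in network
	 * byte order starting at an odd, i.e. unaligned, offset. */
	uint8_t buf[8] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x00, 0x00, 0x00 };

	printf("0x%08" PRIx32 "\n", read_be32(buf + 1));	/* 0x01020304 */
	return 0;
}

Built and run, this prints 0x01020304 regardless of host endianness, which is the property the SACK block and timestamp option parsing above relies on.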