diff options
Diffstat (limited to 'net/ipv4/route.c')
| -rw-r--r-- | net/ipv4/route.c | 206 | 
1 files changed, 168 insertions, 38 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0c74da8a047..94cdbc55ca7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -91,6 +91,7 @@  #include <linux/rcupdate.h>  #include <linux/times.h>  #include <linux/slab.h> +#include <linux/prefetch.h>  #include <net/dst.h>  #include <net/net_namespace.h>  #include <net/protocol.h> @@ -112,7 +113,7 @@  #include <net/secure_seq.h>  #define RT_FL_TOS(oldflp4) \ -    ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) +	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))  #define IP_MAX_MTU	0xFFF0 @@ -120,6 +121,7 @@  static int ip_rt_max_size;  static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT; +static int ip_rt_gc_interval __read_mostly  = 60 * HZ;  static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;  static int ip_rt_redirect_number __read_mostly	= 9;  static int ip_rt_redirect_load __read_mostly	= HZ / 50; @@ -131,6 +133,10 @@ static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;  static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;  static int ip_rt_min_advmss __read_mostly	= 256;  static int rt_chain_length_max __read_mostly	= 20; +static int redirect_genid; + +static struct delayed_work expires_work; +static unsigned long expires_ljiffies;  /*   *	Interface to generic destination cache. @@ -138,7 +144,7 @@ static int rt_chain_length_max __read_mostly	= 20;  static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);  static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst); -static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst); +static unsigned int	 ipv4_mtu(const struct dst_entry *dst);  static void		 ipv4_dst_destroy(struct dst_entry *dst);  static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);  static void		 ipv4_link_failure(struct sk_buff *skb); @@ -193,7 +199,7 @@ static struct dst_ops ipv4_dst_ops = {  	.gc =			rt_garbage_collect,  	.check =		ipv4_dst_check,  	.default_advmss =	ipv4_default_advmss, -	.default_mtu =		ipv4_default_mtu, +	.mtu =			ipv4_mtu,  	.cow_metrics =		ipv4_cow_metrics,  	.destroy =		ipv4_dst_destroy,  	.ifdown =		ipv4_dst_ifdown, @@ -416,9 +422,13 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)  	else {  		struct rtable *r = v;  		struct neighbour *n; -		int len; +		int len, HHUptod; +		rcu_read_lock();  		n = dst_get_neighbour(&r->dst); +		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; +		rcu_read_unlock(); +  		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"  			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",  			r->dst.dev ? r->dst.dev->name : "*", @@ -432,7 +442,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)  			      dst_metric(&r->dst, RTAX_RTTVAR)),  			r->rt_key_tos,  			-1, -			(n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0, +			HHUptod,  			r->rt_spec_dst, &len);  		seq_printf(seq, "%*s\n", 127 - len, ""); @@ -825,6 +835,97 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)  	return ONE;  } +static void rt_check_expire(void) +{ +	static unsigned int rover; +	unsigned int i = rover, goal; +	struct rtable *rth; +	struct rtable __rcu **rthp; +	unsigned long samples = 0; +	unsigned long sum = 0, sum2 = 0; +	unsigned long delta; +	u64 mult; + +	delta = jiffies - expires_ljiffies; +	expires_ljiffies = jiffies; +	mult = ((u64)delta) << rt_hash_log; +	if (ip_rt_gc_timeout > 1) +		do_div(mult, ip_rt_gc_timeout); +	goal = (unsigned int)mult; +	if (goal > rt_hash_mask) +		goal = rt_hash_mask + 1; +	for (; goal > 0; goal--) { +		unsigned long tmo = ip_rt_gc_timeout; +		unsigned long length; + +		i = (i + 1) & rt_hash_mask; +		rthp = &rt_hash_table[i].chain; + +		if (need_resched()) +			cond_resched(); + +		samples++; + +		if (rcu_dereference_raw(*rthp) == NULL) +			continue; +		length = 0; +		spin_lock_bh(rt_hash_lock_addr(i)); +		while ((rth = rcu_dereference_protected(*rthp, +					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { +			prefetch(rth->dst.rt_next); +			if (rt_is_expired(rth)) { +				*rthp = rth->dst.rt_next; +				rt_free(rth); +				continue; +			} +			if (rth->dst.expires) { +				/* Entry is expired even if it is in use */ +				if (time_before_eq(jiffies, rth->dst.expires)) { +nofree: +					tmo >>= 1; +					rthp = &rth->dst.rt_next; +					/* +					 * We only count entries on +					 * a chain with equal hash inputs once +					 * so that entries for different QOS +					 * levels, and other non-hash input +					 * attributes don't unfairly skew +					 * the length computation +					 */ +					length += has_noalias(rt_hash_table[i].chain, rth); +					continue; +				} +			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) +				goto nofree; + +			/* Cleanup aged off entries. */ +			*rthp = rth->dst.rt_next; +			rt_free(rth); +		} +		spin_unlock_bh(rt_hash_lock_addr(i)); +		sum += length; +		sum2 += length*length; +	} +	if (samples) { +		unsigned long avg = sum / samples; +		unsigned long sd = int_sqrt(sum2 / samples - avg*avg); +		rt_chain_length_max = max_t(unsigned long, +					ip_rt_gc_elasticity, +					(avg + 4*sd) >> FRACT_BITS); +	} +	rover = i; +} + +/* + * rt_worker_func() is run in process context. + * we call rt_check_expire() to scan part of the hash table + */ +static void rt_worker_func(struct work_struct *work) +{ +	rt_check_expire(); +	schedule_delayed_work(&expires_work, ip_rt_gc_interval); +} +  /*   * Perturbation of rt_genid by a small quantity [1..256]   * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() @@ -837,6 +938,7 @@ static void rt_cache_invalidate(struct net *net)  	get_random_bytes(&shuffle, sizeof(shuffle));  	atomic_add(shuffle + 1U, &net->ipv4.rt_genid); +	redirect_genid++;  }  /* @@ -1265,7 +1367,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)  {  	struct rtable *rt = (struct rtable *) dst; -	if (rt) { +	if (rt && !(rt->dst.flags & DST_NOPEER)) {  		if (rt->peer == NULL)  			rt_bind_peer(rt, rt->rt_dst, 1); @@ -1276,7 +1378,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)  			iph->id = htons(inet_getid(rt->peer, more));  			return;  		} -	} else +	} else if (!rt)  		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",  		       __builtin_return_address(0)); @@ -1304,7 +1406,7 @@ static void rt_del(unsigned hash, struct rtable *rt)  	spin_unlock_bh(rt_hash_lock_addr(hash));  } -static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)  {  	struct rtable *rt = (struct rtable *) dst;  	__be32 orig_gw = rt->rt_gateway; @@ -1315,21 +1417,19 @@ static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)  	rt->rt_gateway = peer->redirect_learned.a4;  	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); -	if (IS_ERR(n)) -		return PTR_ERR(n); +	if (IS_ERR(n)) { +		rt->rt_gateway = orig_gw; +		return; +	}  	old_n = xchg(&rt->dst._neighbour, n);  	if (old_n)  		neigh_release(old_n); -	if (!n || !(n->nud_state & NUD_VALID)) { -		if (n) -			neigh_event_send(n, NULL); -		rt->rt_gateway = orig_gw; -		return -EAGAIN; +	if (!(n->nud_state & NUD_VALID)) { +		neigh_event_send(n, NULL);  	} else {  		rt->rt_flags |= RTCF_REDIRECTED;  		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);  	} -	return 0;  }  /* called in rcu_read_lock() section */ @@ -1391,8 +1491,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,  				peer = rt->peer;  				if (peer) { -					if (peer->redirect_learned.a4 != new_gw) { +					if (peer->redirect_learned.a4 != new_gw || +					    peer->redirect_genid != redirect_genid) {  						peer->redirect_learned.a4 = new_gw; +						peer->redirect_genid = redirect_genid;  						atomic_inc(&__rt_peer_genid);  					}  					check_peer_redir(&rt->dst, peer); @@ -1685,12 +1787,8 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)  } -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +static void ipv4_validate_peer(struct rtable *rt)  { -	struct rtable *rt = (struct rtable *) dst; - -	if (rt_is_expired(rt)) -		return NULL;  	if (rt->rt_peer_genid != rt_peer_genid()) {  		struct inet_peer *peer; @@ -1699,17 +1797,26 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)  		peer = rt->peer;  		if (peer) { -			check_peer_pmtu(dst, peer); +			check_peer_pmtu(&rt->dst, peer); +			if (peer->redirect_genid != redirect_genid) +				peer->redirect_learned.a4 = 0;  			if (peer->redirect_learned.a4 && -			    peer->redirect_learned.a4 != rt->rt_gateway) { -				if (check_peer_redir(dst, peer)) -					return NULL; -			} +			    peer->redirect_learned.a4 != rt->rt_gateway) +				check_peer_redir(&rt->dst, peer);  		}  		rt->rt_peer_genid = rt_peer_genid();  	} +} + +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +{ +	struct rtable *rt = (struct rtable *) dst; + +	if (rt_is_expired(rt)) +		return NULL; +	ipv4_validate_peer(rt);  	return dst;  } @@ -1814,12 +1921,17 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)  	return advmss;  } -static unsigned int ipv4_default_mtu(const struct dst_entry *dst) +static unsigned int ipv4_mtu(const struct dst_entry *dst)  { -	unsigned int mtu = dst->dev->mtu; +	const struct rtable *rt = (const struct rtable *) dst; +	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + +	if (mtu && rt_is_output_route(rt)) +		return mtu; + +	mtu = dst->dev->mtu;  	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { -		const struct rtable *rt = (const struct rtable *) dst;  		if (rt->rt_gateway != rt->rt_dst && mtu > 576)  			mtu = 576; @@ -1852,6 +1964,8 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,  		dst_init_metrics(&rt->dst, peer->metrics, false);  		check_peer_pmtu(&rt->dst, peer); +		if (peer->redirect_genid != redirect_genid) +			peer->redirect_learned.a4 = 0;  		if (peer->redirect_learned.a4 &&  		    peer->redirect_learned.a4 != rt->rt_gateway) {  			rt->rt_gateway = peer->redirect_learned.a4; @@ -2357,6 +2471,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,  		    rth->rt_mark == skb->mark &&  		    net_eq(dev_net(rth->dst.dev), net) &&  		    !rt_is_expired(rth)) { +			ipv4_validate_peer(rth);  			if (noref) {  				dst_use_noref(&rth->dst, jiffies);  				skb_dst_set_noref(skb, &rth->dst); @@ -2415,11 +2530,11 @@ EXPORT_SYMBOL(ip_route_input_common);  static struct rtable *__mkroute_output(const struct fib_result *res,  				       const struct flowi4 *fl4,  				       __be32 orig_daddr, __be32 orig_saddr, -				       int orig_oif, struct net_device *dev_out, +				       int orig_oif, __u8 orig_rtos, +				       struct net_device *dev_out,  				       unsigned int flags)  {  	struct fib_info *fi = res->fi; -	u32 tos = RT_FL_TOS(fl4);  	struct in_device *in_dev;  	u16 type = res->type;  	struct rtable *rth; @@ -2470,7 +2585,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,  	rth->rt_genid = rt_genid(dev_net(dev_out));  	rth->rt_flags	= flags;  	rth->rt_type	= type; -	rth->rt_key_tos	= tos; +	rth->rt_key_tos	= orig_rtos;  	rth->rt_dst	= fl4->daddr;  	rth->rt_src	= fl4->saddr;  	rth->rt_route_iif = 0; @@ -2520,7 +2635,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,  static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)  {  	struct net_device *dev_out = NULL; -	u32 tos	= RT_FL_TOS(fl4); +	__u8 tos = RT_FL_TOS(fl4);  	unsigned int flags = 0;  	struct fib_result res;  	struct rtable *rth; @@ -2696,7 +2811,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)  make_route:  	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, -			       dev_out, flags); +			       tos, dev_out, flags);  	if (!IS_ERR(rth)) {  		unsigned int hash; @@ -2732,6 +2847,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)  			    (IPTOS_RT_MASK | RTO_ONLINK)) &&  		    net_eq(dev_net(rth->dst.dev), net) &&  		    !rt_is_expired(rth)) { +			ipv4_validate_peer(rth);  			dst_use(&rth->dst, jiffies);  			RT_CACHE_STAT_INC(out_hit);  			rcu_read_unlock_bh(); @@ -2755,9 +2871,11 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo  	return NULL;  } -static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst) +static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)  { -	return 0; +	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + +	return mtu ? : dst->dev->mtu;  }  static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) @@ -2775,7 +2893,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {  	.protocol		=	cpu_to_be16(ETH_P_IP),  	.destroy		=	ipv4_dst_destroy,  	.check			=	ipv4_blackhole_dst_check, -	.default_mtu		=	ipv4_blackhole_default_mtu, +	.mtu			=	ipv4_blackhole_mtu,  	.default_advmss		=	ipv4_default_advmss,  	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,  	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics, @@ -3157,6 +3275,13 @@ static ctl_table ipv4_route_table[] = {  		.proc_handler	= proc_dointvec_jiffies,  	},  	{ +		.procname	= "gc_interval", +		.data		= &ip_rt_gc_interval, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_jiffies, +	}, +	{  		.procname	= "redirect_load",  		.data		= &ip_rt_redirect_load,  		.maxlen		= sizeof(int), @@ -3366,6 +3491,11 @@ int __init ip_rt_init(void)  	devinet_init();  	ip_fib_init(); +	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); +	expires_ljiffies = jiffies; +	schedule_delayed_work(&expires_work, +		net_random() % ip_rt_gc_interval + ip_rt_gc_interval); +  	if (ip_rt_proc_init())  		printk(KERN_ERR "Unable to create route proc files\n");  #ifdef CONFIG_XFRM  |