Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r--	net/ipv4/route.c	2138
1 files changed, 599 insertions, 1539 deletions
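For context on the new exported helpers this change adds (ipv4_update_pmtu(), ipv4_redirect() and their socket variants), here is a minimal caller sketch. It is not part of the patch: the handler name and the way it would be wired into a transport protocol are hypothetical; only the helper signatures are taken from the diff below, which records PMTU and redirect information as per-nexthop exceptions (fib_nh_exception) instead of inet_peer/rtable state.

#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/route.h>

/* Hypothetical ICMP error handler for an IPv4 transport protocol. */
static void example_v4_err(struct sk_buff *skb, u32 info)
{
	/* In an err handler, skb->data points at the inner IP header. */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct net *net = dev_net(skb->dev);

	switch (icmp_hdr(skb)->type) {
	case ICMP_DEST_UNREACH:
		if (icmp_hdr(skb)->code == ICMP_FRAG_NEEDED) {
			/* info carries the next-hop MTU; this records a
			 * per-nexthop exception rather than updating a
			 * cached rtable/inet_peer as the old code did. */
			ipv4_update_pmtu(skb, net, info, 0, 0,
					 iph->protocol, 0);
		}
		break;
	case ICMP_REDIRECT:
		/* Learns the new gateway via __ip_do_redirect(). */
		ipv4_redirect(skb, net, 0, 0, iph->protocol, 0);
		break;
	}
}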
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 98b30d08efe..6bcb8fc71cb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly	= 8;  static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;  static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;  static int ip_rt_min_advmss __read_mostly	= 256; -static int rt_chain_length_max __read_mostly	= 20; - -static struct delayed_work expires_work; -static unsigned long expires_ljiffies;  /*   *	Interface to generic destination cache. @@ -145,11 +141,12 @@ static unsigned long expires_ljiffies;  static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);  static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);  static unsigned int	 ipv4_mtu(const struct dst_entry *dst); -static void		 ipv4_dst_destroy(struct dst_entry *dst);  static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);  static void		 ipv4_link_failure(struct sk_buff *skb); -static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); -static int rt_garbage_collect(struct dst_ops *ops); +static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, +					   struct sk_buff *skb, u32 mtu); +static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk, +					struct sk_buff *skb);  static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,  			    int how) @@ -158,54 +155,26 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,  static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)  { -	struct rtable *rt = (struct rtable *) dst; -	struct inet_peer *peer; -	u32 *p = NULL; - -	if (!rt->peer) -		rt_bind_peer(rt, rt->rt_dst, 1); - -	peer = rt->peer; -	if (peer) { -		u32 *old_p = __DST_METRICS_PTR(old); -		unsigned long prev, new; - -		p = peer->metrics; -		if (inet_metrics_new(peer)) -			memcpy(p, old_p, sizeof(u32) * RTAX_MAX); - -		new = (unsigned long) p; -		prev = cmpxchg(&dst->_metrics, old, new); - -		if (prev != old) { -			p = __DST_METRICS_PTR(prev); -			if (prev & DST_METRICS_READ_ONLY) -				p = NULL; -		} else { -			if (rt->fi) { -				fib_info_put(rt->fi); -				rt->fi = NULL; -			} -		} -	} -	return p; +	WARN_ON(1); +	return NULL;  } -static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, +					   struct sk_buff *skb, +					   const void *daddr);  static struct dst_ops ipv4_dst_ops = {  	.family =		AF_INET,  	.protocol =		cpu_to_be16(ETH_P_IP), -	.gc =			rt_garbage_collect,  	.check =		ipv4_dst_check,  	.default_advmss =	ipv4_default_advmss,  	.mtu =			ipv4_mtu,  	.cow_metrics =		ipv4_cow_metrics, -	.destroy =		ipv4_dst_destroy,  	.ifdown =		ipv4_dst_ifdown,  	.negative_advice =	ipv4_negative_advice,  	.link_failure =		ipv4_link_failure,  	.update_pmtu =		ip_rt_update_pmtu, +	.redirect =		ip_do_redirect,  	.local_out =		__ip_local_out,  	.neigh_lookup =		ipv4_neigh_lookup,  }; @@ -232,184 +201,30 @@ const __u8 ip_tos2prio[16] = {  };  EXPORT_SYMBOL(ip_tos2prio); -/* - * Route cache. - */ - -/* The locking scheme is rather straight forward: - * - * 1) Read-Copy Update protects the buckets of the central route hash. - * 2) Only writers remove entries, and they hold the lock - *    as they look at rtable reference counts. - * 3) Only readers acquire references to rtable entries, - *    they do so with atomic increments and with the - *    lock held. 
- */ - -struct rt_hash_bucket { -	struct rtable __rcu	*chain; -}; - -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ -	defined(CONFIG_PROVE_LOCKING) -/* - * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks - * The size of this table is a power of two and depends on the number of CPUS. - * (on lockdep we have a quite big spinlock_t, so keep the size down there) - */ -#ifdef CONFIG_LOCKDEP -# define RT_HASH_LOCK_SZ	256 -#else -# if NR_CPUS >= 32 -#  define RT_HASH_LOCK_SZ	4096 -# elif NR_CPUS >= 16 -#  define RT_HASH_LOCK_SZ	2048 -# elif NR_CPUS >= 8 -#  define RT_HASH_LOCK_SZ	1024 -# elif NR_CPUS >= 4 -#  define RT_HASH_LOCK_SZ	512 -# else -#  define RT_HASH_LOCK_SZ	256 -# endif -#endif - -static spinlock_t	*rt_hash_locks; -# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] - -static __init void rt_hash_lock_init(void) -{ -	int i; - -	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, -			GFP_KERNEL); -	if (!rt_hash_locks) -		panic("IP: failed to allocate rt_hash_locks\n"); - -	for (i = 0; i < RT_HASH_LOCK_SZ; i++) -		spin_lock_init(&rt_hash_locks[i]); -} -#else -# define rt_hash_lock_addr(slot) NULL - -static inline void rt_hash_lock_init(void) -{ -} -#endif - -static struct rt_hash_bucket 	*rt_hash_table __read_mostly; -static unsigned int		rt_hash_mask __read_mostly; -static unsigned int		rt_hash_log  __read_mostly; -  static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);  #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) -static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, -				   int genid) -{ -	return jhash_3words((__force u32)daddr, (__force u32)saddr, -			    idx, genid) -		& rt_hash_mask; -} -  static inline int rt_genid(struct net *net)  {  	return atomic_read(&net->ipv4.rt_genid);  }  #ifdef CONFIG_PROC_FS -struct rt_cache_iter_state { -	struct seq_net_private p; -	int bucket; -	int genid; -}; - -static struct rtable *rt_cache_get_first(struct seq_file *seq) -{ -	struct rt_cache_iter_state *st = seq->private; -	struct rtable *r = NULL; - -	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { -		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain)) -			continue; -		rcu_read_lock_bh(); -		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); -		while (r) { -			if (dev_net(r->dst.dev) == seq_file_net(seq) && -			    r->rt_genid == st->genid) -				return r; -			r = rcu_dereference_bh(r->dst.rt_next); -		} -		rcu_read_unlock_bh(); -	} -	return r; -} - -static struct rtable *__rt_cache_get_next(struct seq_file *seq, -					  struct rtable *r) -{ -	struct rt_cache_iter_state *st = seq->private; - -	r = rcu_dereference_bh(r->dst.rt_next); -	while (!r) { -		rcu_read_unlock_bh(); -		do { -			if (--st->bucket < 0) -				return NULL; -		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain)); -		rcu_read_lock_bh(); -		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); -	} -	return r; -} - -static struct rtable *rt_cache_get_next(struct seq_file *seq, -					struct rtable *r) -{ -	struct rt_cache_iter_state *st = seq->private; -	while ((r = __rt_cache_get_next(seq, r)) != NULL) { -		if (dev_net(r->dst.dev) != seq_file_net(seq)) -			continue; -		if (r->rt_genid == st->genid) -			break; -	} -	return r; -} - -static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) -{ -	struct rtable *r = rt_cache_get_first(seq); - -	if (r) -		while (pos && (r = rt_cache_get_next(seq, r))) -			--pos; -	return pos ? 
NULL : r; -} -  static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)  { -	struct rt_cache_iter_state *st = seq->private;  	if (*pos) -		return rt_cache_get_idx(seq, *pos - 1); -	st->genid = rt_genid(seq_file_net(seq)); +		return NULL;  	return SEQ_START_TOKEN;  }  static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)  { -	struct rtable *r; - -	if (v == SEQ_START_TOKEN) -		r = rt_cache_get_first(seq); -	else -		r = rt_cache_get_next(seq, v);  	++*pos; -	return r; +	return NULL;  }  static void rt_cache_seq_stop(struct seq_file *seq, void *v)  { -	if (v && v != SEQ_START_TOKEN) -		rcu_read_unlock_bh();  }  static int rt_cache_seq_show(struct seq_file *seq, void *v) @@ -419,34 +234,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)  			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"  			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"  			   "HHUptod\tSpecDst"); -	else { -		struct rtable *r = v; -		struct neighbour *n; -		int len, HHUptod; - -		rcu_read_lock(); -		n = dst_get_neighbour_noref(&r->dst); -		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; -		rcu_read_unlock(); - -		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" -			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", -			r->dst.dev ? r->dst.dev->name : "*", -			(__force u32)r->rt_dst, -			(__force u32)r->rt_gateway, -			r->rt_flags, atomic_read(&r->dst.__refcnt), -			r->dst.__use, 0, (__force u32)r->rt_src, -			dst_metric_advmss(&r->dst) + 40, -			dst_metric(&r->dst, RTAX_WINDOW), -			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + -			      dst_metric(&r->dst, RTAX_RTTVAR)), -			r->rt_key_tos, -			-1, -			HHUptod, -			r->rt_spec_dst, &len); - -		seq_printf(seq, "%*s\n", 127 - len, ""); -	}  	return 0;  } @@ -459,8 +246,7 @@ static const struct seq_operations rt_cache_seq_ops = {  static int rt_cache_seq_open(struct inode *inode, struct file *file)  { -	return seq_open_net(inode, file, &rt_cache_seq_ops, -			sizeof(struct rt_cache_iter_state)); +	return seq_open(file, &rt_cache_seq_ops);  }  static const struct file_operations rt_cache_seq_fops = { @@ -468,7 +254,7 @@ static const struct file_operations rt_cache_seq_fops = {  	.open	 = rt_cache_seq_open,  	.read	 = seq_read,  	.llseek	 = seq_lseek, -	.release = seq_release_net, +	.release = seq_release,  }; @@ -658,275 +444,12 @@ static inline int ip_rt_proc_init(void)  }  #endif /* CONFIG_PROC_FS */ -static inline void rt_free(struct rtable *rt) -{ -	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - -static inline void rt_drop(struct rtable *rt) -{ -	ip_rt_put(rt); -	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - -static inline int rt_fast_clean(struct rtable *rth) -{ -	/* Kill broadcast/multicast entries very aggresively, if they -	   collide in hash table with more useful entries */ -	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && -		rt_is_input_route(rth) && rth->dst.rt_next; -} - -static inline int rt_valuable(struct rtable *rth) -{ -	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || -		(rth->peer && rth->peer->pmtu_expires); -} - -static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) -{ -	unsigned long age; -	int ret = 0; - -	if (atomic_read(&rth->dst.__refcnt)) -		goto out; - -	age = jiffies - rth->dst.lastuse; -	if ((age <= tmo1 && !rt_fast_clean(rth)) || -	    (age <= tmo2 && rt_valuable(rth))) -		goto out; -	ret = 1; -out:	return ret; -} - -/* Bits of score are: - * 31: very valuable - * 30: not quite useless - * 29..0: usage counter - */ 
-static inline u32 rt_score(struct rtable *rt) -{ -	u32 score = jiffies - rt->dst.lastuse; - -	score = ~score & ~(3<<30); - -	if (rt_valuable(rt)) -		score |= (1<<31); - -	if (rt_is_output_route(rt) || -	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) -		score |= (1<<30); - -	return score; -} - -static inline bool rt_caching(const struct net *net) -{ -	return net->ipv4.current_rt_cache_rebuild_count <= -		net->ipv4.sysctl_rt_cache_rebuild_count; -} - -static inline bool compare_hash_inputs(const struct rtable *rt1, -				       const struct rtable *rt2) -{ -	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | -		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | -		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0); -} - -static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) -{ -	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | -		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | -		(rt1->rt_mark ^ rt2->rt_mark) | -		(rt1->rt_key_tos ^ rt2->rt_key_tos) | -		(rt1->rt_route_iif ^ rt2->rt_route_iif) | -		(rt1->rt_oif ^ rt2->rt_oif)) == 0; -} - -static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) -{ -	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); -} -  static inline int rt_is_expired(struct rtable *rth)  {  	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));  }  /* - * Perform a full scan of hash table and free all entries. - * Can be called by a softirq or a process. - * In the later case, we want to be reschedule if necessary - */ -static void rt_do_flush(struct net *net, int process_context) -{ -	unsigned int i; -	struct rtable *rth, *next; - -	for (i = 0; i <= rt_hash_mask; i++) { -		struct rtable __rcu **pprev; -		struct rtable *list; - -		if (process_context && need_resched()) -			cond_resched(); -		rth = rcu_access_pointer(rt_hash_table[i].chain); -		if (!rth) -			continue; - -		spin_lock_bh(rt_hash_lock_addr(i)); - -		list = NULL; -		pprev = &rt_hash_table[i].chain; -		rth = rcu_dereference_protected(*pprev, -			lockdep_is_held(rt_hash_lock_addr(i))); - -		while (rth) { -			next = rcu_dereference_protected(rth->dst.rt_next, -				lockdep_is_held(rt_hash_lock_addr(i))); - -			if (!net || -			    net_eq(dev_net(rth->dst.dev), net)) { -				rcu_assign_pointer(*pprev, next); -				rcu_assign_pointer(rth->dst.rt_next, list); -				list = rth; -			} else { -				pprev = &rth->dst.rt_next; -			} -			rth = next; -		} - -		spin_unlock_bh(rt_hash_lock_addr(i)); - -		for (; list; list = next) { -			next = rcu_dereference_protected(list->dst.rt_next, 1); -			rt_free(list); -		} -	} -} - -/* - * While freeing expired entries, we compute average chain length - * and standard deviation, using fixed-point arithmetic. - * This to have an estimation of rt_chain_length_max - *  rt_chain_length_max = max(elasticity, AVG + 4*SD) - * We use 3 bits for frational part, and 29 (or 61) for magnitude. - */ - -#define FRACT_BITS 3 -#define ONE (1UL << FRACT_BITS) - -/* - * Given a hash chain and an item in this hash chain, - * find if a previous entry has the same hash_inputs - * (but differs on tos, mark or oif) - * Returns 0 if an alias is found. - * Returns ONE if rth has no alias before itself. 
- */ -static int has_noalias(const struct rtable *head, const struct rtable *rth) -{ -	const struct rtable *aux = head; - -	while (aux != rth) { -		if (compare_hash_inputs(aux, rth)) -			return 0; -		aux = rcu_dereference_protected(aux->dst.rt_next, 1); -	} -	return ONE; -} - -static void rt_check_expire(void) -{ -	static unsigned int rover; -	unsigned int i = rover, goal; -	struct rtable *rth; -	struct rtable __rcu **rthp; -	unsigned long samples = 0; -	unsigned long sum = 0, sum2 = 0; -	unsigned long delta; -	u64 mult; - -	delta = jiffies - expires_ljiffies; -	expires_ljiffies = jiffies; -	mult = ((u64)delta) << rt_hash_log; -	if (ip_rt_gc_timeout > 1) -		do_div(mult, ip_rt_gc_timeout); -	goal = (unsigned int)mult; -	if (goal > rt_hash_mask) -		goal = rt_hash_mask + 1; -	for (; goal > 0; goal--) { -		unsigned long tmo = ip_rt_gc_timeout; -		unsigned long length; - -		i = (i + 1) & rt_hash_mask; -		rthp = &rt_hash_table[i].chain; - -		if (need_resched()) -			cond_resched(); - -		samples++; - -		if (rcu_dereference_raw(*rthp) == NULL) -			continue; -		length = 0; -		spin_lock_bh(rt_hash_lock_addr(i)); -		while ((rth = rcu_dereference_protected(*rthp, -					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { -			prefetch(rth->dst.rt_next); -			if (rt_is_expired(rth)) { -				*rthp = rth->dst.rt_next; -				rt_free(rth); -				continue; -			} -			if (rth->dst.expires) { -				/* Entry is expired even if it is in use */ -				if (time_before_eq(jiffies, rth->dst.expires)) { -nofree: -					tmo >>= 1; -					rthp = &rth->dst.rt_next; -					/* -					 * We only count entries on -					 * a chain with equal hash inputs once -					 * so that entries for different QOS -					 * levels, and other non-hash input -					 * attributes don't unfairly skew -					 * the length computation -					 */ -					length += has_noalias(rt_hash_table[i].chain, rth); -					continue; -				} -			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) -				goto nofree; - -			/* Cleanup aged off entries. */ -			*rthp = rth->dst.rt_next; -			rt_free(rth); -		} -		spin_unlock_bh(rt_hash_lock_addr(i)); -		sum += length; -		sum2 += length*length; -	} -	if (samples) { -		unsigned long avg = sum / samples; -		unsigned long sd = int_sqrt(sum2 / samples - avg*avg); -		rt_chain_length_max = max_t(unsigned long, -					ip_rt_gc_elasticity, -					(avg + 4*sd) >> FRACT_BITS); -	} -	rover = i; -} - -/* - * rt_worker_func() is run in process context. - * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) -{ -	rt_check_expire(); -	schedule_delayed_work(&expires_work, ip_rt_gc_interval); -} - -/*   * Perturbation of rt_genid by a small quantity [1..256]   * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()   * many times (2^24) without giving recent rt_genid. 
@@ -938,7 +461,6 @@ static void rt_cache_invalidate(struct net *net)  	get_random_bytes(&shuffle, sizeof(shuffle));  	atomic_add(shuffle + 1U, &net->ipv4.rt_genid); -	inetpeer_invalidate_tree(AF_INET);  }  /* @@ -948,183 +470,22 @@ static void rt_cache_invalidate(struct net *net)  void rt_cache_flush(struct net *net, int delay)  {  	rt_cache_invalidate(net); -	if (delay >= 0) -		rt_do_flush(net, !in_softirq()); -} - -/* Flush previous cache invalidated entries from the cache */ -void rt_cache_flush_batch(struct net *net) -{ -	rt_do_flush(net, !in_softirq()); -} - -static void rt_emergency_hash_rebuild(struct net *net) -{ -	net_warn_ratelimited("Route hash chain too long!\n"); -	rt_cache_invalidate(net); -} - -/* -   Short description of GC goals. - -   We want to build algorithm, which will keep routing cache -   at some equilibrium point, when number of aged off entries -   is kept approximately equal to newly generated ones. - -   Current expiration strength is variable "expire". -   We try to adjust it dynamically, so that if networking -   is idle expires is large enough to keep enough of warm entries, -   and when load increases it reduces to limit cache size. - */ - -static int rt_garbage_collect(struct dst_ops *ops) -{ -	static unsigned long expire = RT_GC_TIMEOUT; -	static unsigned long last_gc; -	static int rover; -	static int equilibrium; -	struct rtable *rth; -	struct rtable __rcu **rthp; -	unsigned long now = jiffies; -	int goal; -	int entries = dst_entries_get_fast(&ipv4_dst_ops); - -	/* -	 * Garbage collection is pretty expensive, -	 * do not make it too frequently. -	 */ - -	RT_CACHE_STAT_INC(gc_total); - -	if (now - last_gc < ip_rt_gc_min_interval && -	    entries < ip_rt_max_size) { -		RT_CACHE_STAT_INC(gc_ignored); -		goto out; -	} - -	entries = dst_entries_get_slow(&ipv4_dst_ops); -	/* Calculate number of entries, which we want to expire now. */ -	goal = entries - (ip_rt_gc_elasticity << rt_hash_log); -	if (goal <= 0) { -		if (equilibrium < ipv4_dst_ops.gc_thresh) -			equilibrium = ipv4_dst_ops.gc_thresh; -		goal = entries - equilibrium; -		if (goal > 0) { -			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); -			goal = entries - equilibrium; -		} -	} else { -		/* We are in dangerous area. Try to reduce cache really -		 * aggressively. -		 */ -		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); -		equilibrium = entries - goal; -	} - -	if (now - last_gc >= ip_rt_gc_min_interval) -		last_gc = now; - -	if (goal <= 0) { -		equilibrium += goal; -		goto work_done; -	} - -	do { -		int i, k; - -		for (i = rt_hash_mask, k = rover; i >= 0; i--) { -			unsigned long tmo = expire; - -			k = (k + 1) & rt_hash_mask; -			rthp = &rt_hash_table[k].chain; -			spin_lock_bh(rt_hash_lock_addr(k)); -			while ((rth = rcu_dereference_protected(*rthp, -					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { -				if (!rt_is_expired(rth) && -					!rt_may_expire(rth, tmo, expire)) { -					tmo >>= 1; -					rthp = &rth->dst.rt_next; -					continue; -				} -				*rthp = rth->dst.rt_next; -				rt_free(rth); -				goal--; -			} -			spin_unlock_bh(rt_hash_lock_addr(k)); -			if (goal <= 0) -				break; -		} -		rover = k; - -		if (goal <= 0) -			goto work_done; - -		/* Goal is not achieved. We stop process if: - -		   - if expire reduced to zero. Otherwise, expire is halfed. -		   - if table is not full. -		   - if we are called from interrupt. -		   - jiffies check is just fallback/debug loop breaker. -		     We will not spin here for long time in any case. 
-		 */ - -		RT_CACHE_STAT_INC(gc_goal_miss); - -		if (expire == 0) -			break; - -		expire >>= 1; - -		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) -			goto out; -	} while (!in_softirq() && time_before_eq(jiffies, now)); - -	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) -		goto out; -	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) -		goto out; -	net_warn_ratelimited("dst cache overflow\n"); -	RT_CACHE_STAT_INC(gc_dst_overflow); -	return 1; - -work_done: -	expire += ip_rt_gc_min_interval; -	if (expire > ip_rt_gc_timeout || -	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || -	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) -		expire = ip_rt_gc_timeout; -out:	return 0; -} - -/* - * Returns number of entries in a hash chain that have different hash_inputs - */ -static int slow_chain_length(const struct rtable *head) -{ -	int length = 0; -	const struct rtable *rth = head; - -	while (rth) { -		length += has_noalias(head, rth); -		rth = rcu_dereference_protected(rth->dst.rt_next, 1); -	} -	return length >> FRACT_BITS;  } -static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, +					   struct sk_buff *skb, +					   const void *daddr)  { -	static const __be32 inaddr_any = 0;  	struct net_device *dev = dst->dev;  	const __be32 *pkey = daddr;  	const struct rtable *rt;  	struct neighbour *n;  	rt = (const struct rtable *) dst; - -	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) -		pkey = &inaddr_any; -	else if (rt->rt_gateway) +	if (rt->rt_gateway)  		pkey = (const __be32 *) &rt->rt_gateway; +	else if (skb) +		pkey = &ip_hdr(skb)->daddr;  	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);  	if (n) @@ -1132,311 +493,210 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo  	return neigh_create(&arp_tbl, pkey, dev);  } -static int rt_bind_neighbour(struct rtable *rt) +/* + * Peer allocation may fail only in serious out-of-memory conditions.  However + * we still can generate some output. + * Random ID selection looks a bit dangerous because we have no chances to + * select ID being unique in a reasonable period of time. + * But broken packet identifier may be better than no packet at all. + */ +static void ip_select_fb_ident(struct iphdr *iph)  { -	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); -	if (IS_ERR(n)) -		return PTR_ERR(n); -	dst_set_neighbour(&rt->dst, n); +	static DEFINE_SPINLOCK(ip_fb_id_lock); +	static u32 ip_fallback_id; +	u32 salt; -	return 0; +	spin_lock_bh(&ip_fb_id_lock); +	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); +	iph->id = htons(salt & 0xFFFF); +	ip_fallback_id = salt; +	spin_unlock_bh(&ip_fb_id_lock);  } -static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt, -				     struct sk_buff *skb, int ifindex) +void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)  { -	struct rtable	*rth, *cand; -	struct rtable __rcu **rthp, **candp; -	unsigned long	now; -	u32 		min_score; -	int		chain_length; -	int attempts = !in_softirq(); - -restart: -	chain_length = 0; -	min_score = ~(u32)0; -	cand = NULL; -	candp = NULL; -	now = jiffies; - -	if (!rt_caching(dev_net(rt->dst.dev))) { -		/* -		 * If we're not caching, just tell the caller we -		 * were successful and don't touch the route.  
The -		 * caller hold the sole reference to the cache entry, and -		 * it will be released when the caller is done with it. -		 * If we drop it here, the callers have no way to resolve routes -		 * when we're not caching.  Instead, just point *rp at rt, so -		 * the caller gets a single use out of the route -		 * Note that we do rt_free on this new route entry, so that -		 * once its refcount hits zero, we are still able to reap it -		 * (Thanks Alexey) -		 * Note: To avoid expensive rcu stuff for this uncached dst, -		 * we set DST_NOCACHE so that dst_release() can free dst without -		 * waiting a grace period. -		 */ - -		rt->dst.flags |= DST_NOCACHE; -		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { -			int err = rt_bind_neighbour(rt); -			if (err) { -				net_warn_ratelimited("Neighbour table failure & not caching routes\n"); -				ip_rt_put(rt); -				return ERR_PTR(err); -			} -		} +	struct net *net = dev_net(dst->dev); +	struct inet_peer *peer; -		goto skip_hashing; +	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); +	if (peer) { +		iph->id = htons(inet_getid(peer, more)); +		inet_putpeer(peer); +		return;  	} -	rthp = &rt_hash_table[hash].chain; - -	spin_lock_bh(rt_hash_lock_addr(hash)); -	while ((rth = rcu_dereference_protected(*rthp, -			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { -		if (rt_is_expired(rth)) { -			*rthp = rth->dst.rt_next; -			rt_free(rth); -			continue; -		} -		if (compare_keys(rth, rt) && compare_netns(rth, rt)) { -			/* Put it first */ -			*rthp = rth->dst.rt_next; -			/* -			 * Since lookup is lockfree, the deletion -			 * must be visible to another weakly ordered CPU before -			 * the insertion at the start of the hash chain. -			 */ -			rcu_assign_pointer(rth->dst.rt_next, -					   rt_hash_table[hash].chain); -			/* -			 * Since lookup is lockfree, the update writes -			 * must be ordered for consistency on SMP. -			 */ -			rcu_assign_pointer(rt_hash_table[hash].chain, rth); - -			dst_use(&rth->dst, now); -			spin_unlock_bh(rt_hash_lock_addr(hash)); - -			rt_drop(rt); -			if (skb) -				skb_dst_set(skb, &rth->dst); -			return rth; -		} - -		if (!atomic_read(&rth->dst.__refcnt)) { -			u32 score = rt_score(rth); - -			if (score <= min_score) { -				cand = rth; -				candp = rthp; -				min_score = score; -			} -		} - -		chain_length++; - -		rthp = &rth->dst.rt_next; -	} +	ip_select_fb_ident(iph); +} +EXPORT_SYMBOL(__ip_select_ident); -	if (cand) { -		/* ip_rt_gc_elasticity used to be average length of chain -		 * length, when exceeded gc becomes really aggressive. -		 * -		 * The second limit is less certain. At the moment it allows -		 * only 2 entries per bucket. We will see. 
-		 */ -		if (chain_length > ip_rt_gc_elasticity) { -			*candp = cand->dst.rt_next; -			rt_free(cand); -		} -	} else { -		if (chain_length > rt_chain_length_max && -		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { -			struct net *net = dev_net(rt->dst.dev); -			int num = ++net->ipv4.current_rt_cache_rebuild_count; -			if (!rt_caching(net)) { -				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n", -					rt->dst.dev->name, num); -			} -			rt_emergency_hash_rebuild(net); -			spin_unlock_bh(rt_hash_lock_addr(hash)); +static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, +			     const struct iphdr *iph, +			     int oif, u8 tos, +			     u8 prot, u32 mark, int flow_flags) +{ +	if (sk) { +		const struct inet_sock *inet = inet_sk(sk); -			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, -					ifindex, rt_genid(net)); -			goto restart; -		} +		oif = sk->sk_bound_dev_if; +		mark = sk->sk_mark; +		tos = RT_CONN_FLAGS(sk); +		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;  	} +	flowi4_init_output(fl4, oif, mark, tos, +			   RT_SCOPE_UNIVERSE, prot, +			   flow_flags, +			   iph->daddr, iph->saddr, 0, 0); +} -	/* Try to bind route to arp only if it is output -	   route or unicast forwarding path. -	 */ -	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { -		int err = rt_bind_neighbour(rt); -		if (err) { -			spin_unlock_bh(rt_hash_lock_addr(hash)); - -			if (err != -ENOBUFS) { -				rt_drop(rt); -				return ERR_PTR(err); -			} - -			/* Neighbour tables are full and nothing -			   can be released. Try to shrink route cache, -			   it is most likely it holds some neighbour records. -			 */ -			if (attempts-- > 0) { -				int saved_elasticity = ip_rt_gc_elasticity; -				int saved_int = ip_rt_gc_min_interval; -				ip_rt_gc_elasticity	= 1; -				ip_rt_gc_min_interval	= 0; -				rt_garbage_collect(&ipv4_dst_ops); -				ip_rt_gc_min_interval	= saved_int; -				ip_rt_gc_elasticity	= saved_elasticity; -				goto restart; -			} - -			net_warn_ratelimited("Neighbour table overflow\n"); -			rt_drop(rt); -			return ERR_PTR(-ENOBUFS); -		} -	} +static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, +			       const struct sock *sk) +{ +	const struct iphdr *iph = ip_hdr(skb); +	int oif = skb->dev->ifindex; +	u8 tos = RT_TOS(iph->tos); +	u8 prot = iph->protocol; +	u32 mark = skb->mark; -	rt->dst.rt_next = rt_hash_table[hash].chain; +	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); +} -	/* -	 * Since lookup is lockfree, we must make sure -	 * previous writes to rt are committed to memory -	 * before making rt visible to other CPUS. -	 */ -	rcu_assign_pointer(rt_hash_table[hash].chain, rt); +static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) +{ +	const struct inet_sock *inet = inet_sk(sk); +	const struct ip_options_rcu *inet_opt; +	__be32 daddr = inet->inet_daddr; -	spin_unlock_bh(rt_hash_lock_addr(hash)); +	rcu_read_lock(); +	inet_opt = rcu_dereference(inet->inet_opt); +	if (inet_opt && inet_opt->opt.srr) +		daddr = inet_opt->opt.faddr; +	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, +			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, +			   inet->hdrincl ? 
IPPROTO_RAW : sk->sk_protocol, +			   inet_sk_flowi_flags(sk), +			   daddr, inet->inet_saddr, 0, 0); +	rcu_read_unlock(); +} -skip_hashing: +static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, +				 const struct sk_buff *skb) +{  	if (skb) -		skb_dst_set(skb, &rt->dst); -	return rt; +		build_skb_flow_key(fl4, skb, sk); +	else +		build_sk_flow_key(fl4, sk);  } -static atomic_t __rt_peer_genid = ATOMIC_INIT(0); +static DEFINE_SEQLOCK(fnhe_seqlock); -static u32 rt_peer_genid(void) +static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)  { -	return atomic_read(&__rt_peer_genid); +	struct fib_nh_exception *fnhe, *oldest; + +	oldest = rcu_dereference(hash->chain); +	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; +	     fnhe = rcu_dereference(fnhe->fnhe_next)) { +		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) +			oldest = fnhe; +	} +	return oldest;  } -void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) +static inline u32 fnhe_hashfun(__be32 daddr)  { -	struct inet_peer *peer; +	u32 hval; -	peer = inet_getpeer_v4(daddr, create); +	hval = (__force u32) daddr; +	hval ^= (hval >> 11) ^ (hval >> 22); -	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) -		inet_putpeer(peer); -	else -		rt->rt_peer_genid = rt_peer_genid(); +	return hval & (FNHE_HASH_SIZE - 1);  } -/* - * Peer allocation may fail only in serious out-of-memory conditions.  However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. - */ -static void ip_select_fb_ident(struct iphdr *iph) +static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, +				  u32 pmtu, unsigned long expires)  { -	static DEFINE_SPINLOCK(ip_fb_id_lock); -	static u32 ip_fallback_id; -	u32 salt; - -	spin_lock_bh(&ip_fb_id_lock); -	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); -	iph->id = htons(salt & 0xFFFF); -	ip_fallback_id = salt; -	spin_unlock_bh(&ip_fb_id_lock); -} +	struct fnhe_hash_bucket *hash; +	struct fib_nh_exception *fnhe; +	int depth; +	u32 hval = fnhe_hashfun(daddr); -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) -{ -	struct rtable *rt = (struct rtable *) dst; +	write_seqlock_bh(&fnhe_seqlock); -	if (rt && !(rt->dst.flags & DST_NOPEER)) { -		if (rt->peer == NULL) -			rt_bind_peer(rt, rt->rt_dst, 1); +	hash = nh->nh_exceptions; +	if (!hash) { +		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); +		if (!hash) +			goto out_unlock; +		nh->nh_exceptions = hash; +	} -		/* If peer is attached to destination, it is never detached, -		   so that we need not to grab a lock to dereference it. 
-		 */ -		if (rt->peer) { -			iph->id = htons(inet_getid(rt->peer, more)); -			return; -		} -	} else if (!rt) -		pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0)); +	hash += hval; -	ip_select_fb_ident(iph); -} -EXPORT_SYMBOL(__ip_select_ident); +	depth = 0; +	for (fnhe = rcu_dereference(hash->chain); fnhe; +	     fnhe = rcu_dereference(fnhe->fnhe_next)) { +		if (fnhe->fnhe_daddr == daddr) +			break; +		depth++; +	} -static void rt_del(unsigned int hash, struct rtable *rt) -{ -	struct rtable __rcu **rthp; -	struct rtable *aux; +	if (fnhe) { +		if (gw) +			fnhe->fnhe_gw = gw; +		if (pmtu) { +			fnhe->fnhe_pmtu = pmtu; +			fnhe->fnhe_expires = expires; +		} +	} else { +		if (depth > FNHE_RECLAIM_DEPTH) +			fnhe = fnhe_oldest(hash); +		else { +			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); +			if (!fnhe) +				goto out_unlock; -	rthp = &rt_hash_table[hash].chain; -	spin_lock_bh(rt_hash_lock_addr(hash)); -	ip_rt_put(rt); -	while ((aux = rcu_dereference_protected(*rthp, -			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { -		if (aux == rt || rt_is_expired(aux)) { -			*rthp = aux->dst.rt_next; -			rt_free(aux); -			continue; +			fnhe->fnhe_next = hash->chain; +			rcu_assign_pointer(hash->chain, fnhe);  		} -		rthp = &aux->dst.rt_next; +		fnhe->fnhe_daddr = daddr; +		fnhe->fnhe_gw = gw; +		fnhe->fnhe_pmtu = pmtu; +		fnhe->fnhe_expires = expires;  	} -	spin_unlock_bh(rt_hash_lock_addr(hash)); + +	fnhe->fnhe_stamp = jiffies; + +out_unlock: +	write_sequnlock_bh(&fnhe_seqlock); +	return;  } -static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, +			     bool kill_route)  { -	struct rtable *rt = (struct rtable *) dst; -	__be32 orig_gw = rt->rt_gateway; -	struct neighbour *n, *old_n; - -	dst_confirm(&rt->dst); +	__be32 new_gw = icmp_hdr(skb)->un.gateway; +	__be32 old_gw = ip_hdr(skb)->saddr; +	struct net_device *dev = skb->dev; +	struct in_device *in_dev; +	struct fib_result res; +	struct neighbour *n; +	struct net *net; -	rt->rt_gateway = peer->redirect_learned.a4; +	switch (icmp_hdr(skb)->code & 7) { +	case ICMP_REDIR_NET: +	case ICMP_REDIR_NETTOS: +	case ICMP_REDIR_HOST: +	case ICMP_REDIR_HOSTTOS: +		break; -	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); -	if (IS_ERR(n)) { -		rt->rt_gateway = orig_gw; +	default:  		return;  	} -	old_n = xchg(&rt->dst._neighbour, n); -	if (old_n) -		neigh_release(old_n); -	if (!(n->nud_state & NUD_VALID)) { -		neigh_event_send(n, NULL); -	} else { -		rt->rt_flags |= RTCF_REDIRECTED; -		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); -	} -} -/* called in rcu_read_lock() section */ -void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, -		    __be32 saddr, struct net_device *dev) -{ -	int s, i; -	struct in_device *in_dev = __in_dev_get_rcu(dev); -	__be32 skeys[2] = { saddr, 0 }; -	int    ikeys[2] = { dev->ifindex, 0 }; -	struct inet_peer *peer; -	struct net *net; +	if (rt->rt_gateway != old_gw) +		return; +	in_dev = __in_dev_get_rcu(dev);  	if (!in_dev)  		return; @@ -1456,72 +716,50 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,  			goto reject_redirect;  	} -	for (s = 0; s < 2; s++) { -		for (i = 0; i < 2; i++) { -			unsigned int hash; -			struct rtable __rcu **rthp; -			struct rtable *rt; - -			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); - -			rthp = &rt_hash_table[hash].chain; - -			while ((rt = rcu_dereference(*rthp)) != NULL) { -				rthp = &rt->dst.rt_next; - -				if (rt->rt_key_dst != 
daddr || -				    rt->rt_key_src != skeys[s] || -				    rt->rt_oif != ikeys[i] || -				    rt_is_input_route(rt) || -				    rt_is_expired(rt) || -				    !net_eq(dev_net(rt->dst.dev), net) || -				    rt->dst.error || -				    rt->dst.dev != dev || -				    rt->rt_gateway != old_gw) -					continue; - -				if (!rt->peer) -					rt_bind_peer(rt, rt->rt_dst, 1); +	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); +	if (n) { +		if (!(n->nud_state & NUD_VALID)) { +			neigh_event_send(n, NULL); +		} else { +			if (fib_lookup(net, fl4, &res) == 0) { +				struct fib_nh *nh = &FIB_RES_NH(res); -				peer = rt->peer; -				if (peer) { -					if (peer->redirect_learned.a4 != new_gw) { -						peer->redirect_learned.a4 = new_gw; -						atomic_inc(&__rt_peer_genid); -					} -					check_peer_redir(&rt->dst, peer); -				} +				update_or_create_fnhe(nh, fl4->daddr, new_gw, +						      0, 0);  			} +			if (kill_route) +				rt->dst.obsolete = DST_OBSOLETE_KILL; +			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);  		} +		neigh_release(n);  	}  	return;  reject_redirect:  #ifdef CONFIG_IP_ROUTE_VERBOSE -	if (IN_DEV_LOG_MARTIANS(in_dev)) +	if (IN_DEV_LOG_MARTIANS(in_dev)) { +		const struct iphdr *iph = (const struct iphdr *) skb->data; +		__be32 daddr = iph->daddr; +		__be32 saddr = iph->saddr; +  		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"  				     "  Advised path = %pI4 -> %pI4\n",  				     &old_gw, dev->name, &new_gw,  				     &saddr, &daddr); +	}  #endif  	;  } -static bool peer_pmtu_expired(struct inet_peer *peer) +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)  { -	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); - -	return orig && -	       time_after_eq(jiffies, orig) && -	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig; -} +	struct rtable *rt; +	struct flowi4 fl4; -static bool peer_pmtu_cleaned(struct inet_peer *peer) -{ -	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); +	rt = (struct rtable *) dst; -	return orig && -	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig; +	ip_rt_build_flow_key(&fl4, sk, skb); +	__ip_do_redirect(rt, skb, &fl4, true);  }  static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) @@ -1533,14 +771,10 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)  		if (dst->obsolete > 0) {  			ip_rt_put(rt);  			ret = NULL; -		} else if (rt->rt_flags & RTCF_REDIRECTED) { -			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, -						rt->rt_oif, -						rt_genid(dev_net(dst->dev))); -			rt_del(hash, rt); +		} else if ((rt->rt_flags & RTCF_REDIRECTED) || +			   rt->dst.expires) { +			ip_rt_put(rt);  			ret = NULL; -		} else if (rt->peer && peer_pmtu_expired(rt->peer)) { -			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);  		}  	}  	return ret; @@ -1567,6 +801,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)  	struct rtable *rt = skb_rtable(skb);  	struct in_device *in_dev;  	struct inet_peer *peer; +	struct net *net;  	int log_martians;  	rcu_read_lock(); @@ -1578,9 +813,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)  	log_martians = IN_DEV_LOG_MARTIANS(in_dev);  	rcu_read_unlock(); -	if (!rt->peer) -		rt_bind_peer(rt, rt->rt_dst, 1); -	peer = rt->peer; +	net = dev_net(rt->dst.dev); +	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);  	if (!peer) {  		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);  		return; @@ -1597,7 +831,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)  	 */  	if (peer->rate_tokens >= 
ip_rt_redirect_number) {  		peer->rate_last = jiffies; -		return; +		goto out_put_peer;  	}  	/* Check for load limit; set rate_last to the latest sent @@ -1614,20 +848,38 @@ void ip_rt_send_redirect(struct sk_buff *skb)  		if (log_martians &&  		    peer->rate_tokens == ip_rt_redirect_number)  			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", -					     &ip_hdr(skb)->saddr, rt->rt_iif, -					     &rt->rt_dst, &rt->rt_gateway); +					     &ip_hdr(skb)->saddr, inet_iif(skb), +					     &ip_hdr(skb)->daddr, &rt->rt_gateway);  #endif  	} +out_put_peer: +	inet_putpeer(peer);  }  static int ip_error(struct sk_buff *skb)  { +	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);  	struct rtable *rt = skb_rtable(skb);  	struct inet_peer *peer;  	unsigned long now; +	struct net *net;  	bool send;  	int code; +	net = dev_net(rt->dst.dev); +	if (!IN_DEV_FORWARD(in_dev)) { +		switch (rt->dst.error) { +		case EHOSTUNREACH: +			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS); +			break; + +		case ENETUNREACH: +			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); +			break; +		} +		goto out; +	} +  	switch (rt->dst.error) {  	case EINVAL:  	default: @@ -1637,17 +889,14 @@ static int ip_error(struct sk_buff *skb)  		break;  	case ENETUNREACH:  		code = ICMP_NET_UNREACH; -		IP_INC_STATS_BH(dev_net(rt->dst.dev), -				IPSTATS_MIB_INNOROUTES); +		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);  		break;  	case EACCES:  		code = ICMP_PKT_FILTERED;  		break;  	} -	if (!rt->peer) -		rt_bind_peer(rt, rt->rt_dst, 1); -	peer = rt->peer; +	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);  	send = true;  	if (peer) { @@ -1660,6 +909,7 @@ static int ip_error(struct sk_buff *skb)  			peer->rate_tokens -= ip_rt_error_cost;  		else  			send = false; +		inet_putpeer(peer);  	}  	if (send)  		icmp_send(skb, ICMP_DEST_UNREACH, code, 0); @@ -1668,163 +918,120 @@ out:	kfree_skb(skb);  	return 0;  } -/* - *	The last two values are not from the RFC but - *	are needed for AMPRnet AX.25 paths. - */ +static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) +{ +	struct fib_result res; -static const unsigned short mtu_plateau[] = -{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; +	if (mtu < ip_rt_min_pmtu) +		mtu = ip_rt_min_pmtu; -static inline unsigned short guess_mtu(unsigned short old_mtu) -{ -	int i; +	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { +		struct fib_nh *nh = &FIB_RES_NH(res); -	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) -		if (old_mtu > mtu_plateau[i]) -			return mtu_plateau[i]; -	return 68; +		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, +				      jiffies + ip_rt_mtu_expires); +	} +	return mtu;  } -unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, -				 unsigned short new_mtu, -				 struct net_device *dev) +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, +			      struct sk_buff *skb, u32 mtu)  { -	unsigned short old_mtu = ntohs(iph->tot_len); -	unsigned short est_mtu = 0; -	struct inet_peer *peer; - -	peer = inet_getpeer_v4(iph->daddr, 1); -	if (peer) { -		unsigned short mtu = new_mtu; - -		if (new_mtu < 68 || new_mtu >= old_mtu) { -			/* BSD 4.2 derived systems incorrectly adjust -			 * tot_len by the IP header length, and report -			 * a zero MTU in the ICMP message. 
-			 */ -			if (mtu == 0 && -			    old_mtu >= 68 + (iph->ihl << 2)) -				old_mtu -= iph->ihl << 2; -			mtu = guess_mtu(old_mtu); -		} - -		if (mtu < ip_rt_min_pmtu) -			mtu = ip_rt_min_pmtu; -		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { -			unsigned long pmtu_expires; - -			pmtu_expires = jiffies + ip_rt_mtu_expires; -			if (!pmtu_expires) -				pmtu_expires = 1UL; +	struct rtable *rt = (struct rtable *) dst; +	struct flowi4 fl4; -			est_mtu = mtu; -			peer->pmtu_learned = mtu; -			peer->pmtu_expires = pmtu_expires; -			atomic_inc(&__rt_peer_genid); -		} +	ip_rt_build_flow_key(&fl4, sk, skb); +	mtu = __ip_rt_update_pmtu(rt, &fl4, mtu); -		inet_putpeer(peer); +	if (!rt->rt_pmtu) { +		dst->obsolete = DST_OBSOLETE_KILL; +	} else { +		rt->rt_pmtu = mtu; +		dst_set_expires(&rt->dst, ip_rt_mtu_expires);  	} -	return est_mtu ? : new_mtu;  } -static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) +void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, +		      int oif, u32 mark, u8 protocol, int flow_flags)  { -	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); +	const struct iphdr *iph = (const struct iphdr *) skb->data; +	struct flowi4 fl4; +	struct rtable *rt; -	if (!expires) -		return; -	if (time_before(jiffies, expires)) { -		u32 orig_dst_mtu = dst_mtu(dst); -		if (peer->pmtu_learned < orig_dst_mtu) { -			if (!peer->pmtu_orig) -				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); -			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); -		} -	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) -		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); +	__build_flow_key(&fl4, NULL, iph, oif, +			 RT_TOS(iph->tos), protocol, mark, flow_flags); +	rt = __ip_route_output_key(net, &fl4); +	if (!IS_ERR(rt)) { +		__ip_rt_update_pmtu(rt, &fl4, mtu); +		ip_rt_put(rt); +	}  } +EXPORT_SYMBOL_GPL(ipv4_update_pmtu); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)  { -	struct rtable *rt = (struct rtable *) dst; -	struct inet_peer *peer; - -	dst_confirm(dst); - -	if (!rt->peer) -		rt_bind_peer(rt, rt->rt_dst, 1); -	peer = rt->peer; -	if (peer) { -		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); - -		if (mtu < ip_rt_min_pmtu) -			mtu = ip_rt_min_pmtu; -		if (!pmtu_expires || mtu < peer->pmtu_learned) { - -			pmtu_expires = jiffies + ip_rt_mtu_expires; -			if (!pmtu_expires) -				pmtu_expires = 1UL; - -			peer->pmtu_learned = mtu; -			peer->pmtu_expires = pmtu_expires; +	const struct iphdr *iph = (const struct iphdr *) skb->data; +	struct flowi4 fl4; +	struct rtable *rt; -			atomic_inc(&__rt_peer_genid); -			rt->rt_peer_genid = rt_peer_genid(); -		} -		check_peer_pmtu(dst, peer); +	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); +	rt = __ip_route_output_key(sock_net(sk), &fl4); +	if (!IS_ERR(rt)) { +		__ip_rt_update_pmtu(rt, &fl4, mtu); +		ip_rt_put(rt);  	}  } +EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); - -static void ipv4_validate_peer(struct rtable *rt) +void ipv4_redirect(struct sk_buff *skb, struct net *net, +		   int oif, u32 mark, u8 protocol, int flow_flags)  { -	if (rt->rt_peer_genid != rt_peer_genid()) { -		struct inet_peer *peer; - -		if (!rt->peer) -			rt_bind_peer(rt, rt->rt_dst, 0); +	const struct iphdr *iph = (const struct iphdr *) skb->data; +	struct flowi4 fl4; +	struct rtable *rt; -		peer = rt->peer; -		if (peer) { -			check_peer_pmtu(&rt->dst, peer); +	__build_flow_key(&fl4, NULL, iph, oif, +			 RT_TOS(iph->tos), protocol, mark, 
flow_flags); +	rt = __ip_route_output_key(net, &fl4); +	if (!IS_ERR(rt)) { +		__ip_do_redirect(rt, skb, &fl4, false); +		ip_rt_put(rt); +	} +} +EXPORT_SYMBOL_GPL(ipv4_redirect); -			if (peer->redirect_learned.a4 && -			    peer->redirect_learned.a4 != rt->rt_gateway) -				check_peer_redir(&rt->dst, peer); -		} +void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) +{ +	const struct iphdr *iph = (const struct iphdr *) skb->data; +	struct flowi4 fl4; +	struct rtable *rt; -		rt->rt_peer_genid = rt_peer_genid(); +	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); +	rt = __ip_route_output_key(sock_net(sk), &fl4); +	if (!IS_ERR(rt)) { +		__ip_do_redirect(rt, skb, &fl4, false); +		ip_rt_put(rt);  	}  } +EXPORT_SYMBOL_GPL(ipv4_sk_redirect);  static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)  {  	struct rtable *rt = (struct rtable *) dst; -	if (rt_is_expired(rt)) +	/* All IPV4 dsts are created with ->obsolete set to the value +	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down +	 * into this function always. +	 * +	 * When a PMTU/redirect information update invalidates a +	 * route, this is indicated by setting obsolete to +	 * DST_OBSOLETE_KILL. +	 */ +	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))  		return NULL; -	ipv4_validate_peer(rt);  	return dst;  } -static void ipv4_dst_destroy(struct dst_entry *dst) -{ -	struct rtable *rt = (struct rtable *) dst; -	struct inet_peer *peer = rt->peer; - -	if (rt->fi) { -		fib_info_put(rt->fi); -		rt->fi = NULL; -	} -	if (peer) { -		rt->peer = NULL; -		inet_putpeer(peer); -	} -} - -  static void ipv4_link_failure(struct sk_buff *skb)  {  	struct rtable *rt; @@ -1832,8 +1039,8 @@ static void ipv4_link_failure(struct sk_buff *skb)  	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);  	rt = skb_rtable(skb); -	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) -		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); +	if (rt) +		dst_set_expires(&rt->dst, 0);  }  static int ip_rt_bug(struct sk_buff *skb) @@ -1880,8 +1087,9 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)  		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)  			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);  		else -			src = inet_select_addr(rt->dst.dev, rt->rt_gateway, -					RT_SCOPE_UNIVERSE); +			src = inet_select_addr(rt->dst.dev, +					       rt_nexthop(rt, iph->daddr), +					       RT_SCOPE_UNIVERSE);  		rcu_read_unlock();  	}  	memcpy(addr, &src, 4); @@ -1913,7 +1121,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)  static unsigned int ipv4_mtu(const struct dst_entry *dst)  {  	const struct rtable *rt = (const struct rtable *) dst; -	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); +	unsigned int mtu = rt->rt_pmtu; + +	if (mtu && time_after_eq(jiffies, rt->dst.expires)) +		mtu = 0; + +	if (!mtu) +		mtu = dst_metric_raw(dst, RTAX_MTU);  	if (mtu && rt_is_output_route(rt))  		return mtu; @@ -1921,8 +1135,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)  	mtu = dst->dev->mtu;  	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { - -		if (rt->rt_gateway != rt->rt_dst && mtu > 576) +		if (rt->rt_gateway && mtu > 576)  			mtu = 576;  	} @@ -1932,76 +1145,121 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)  	return mtu;  } -static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, -			    struct fib_info *fi) +static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)  { -	struct inet_peer *peer; -	int create = 0; 
+	struct fnhe_hash_bucket *hash = nh->nh_exceptions; +	struct fib_nh_exception *fnhe; +	u32 hval; -	/* If a peer entry exists for this destination, we must hook -	 * it up in order to get at cached metrics. -	 */ -	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) -		create = 1; +	if (!hash) +		return NULL; -	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); -	if (peer) { -		rt->rt_peer_genid = rt_peer_genid(); -		if (inet_metrics_new(peer)) -			memcpy(peer->metrics, fi->fib_metrics, -			       sizeof(u32) * RTAX_MAX); -		dst_init_metrics(&rt->dst, peer->metrics, false); +	hval = fnhe_hashfun(daddr); -		check_peer_pmtu(&rt->dst, peer); +	for (fnhe = rcu_dereference(hash[hval].chain); fnhe; +	     fnhe = rcu_dereference(fnhe->fnhe_next)) { +		if (fnhe->fnhe_daddr == daddr) +			return fnhe; +	} +	return NULL; +} -		if (peer->redirect_learned.a4 && -		    peer->redirect_learned.a4 != rt->rt_gateway) { -			rt->rt_gateway = peer->redirect_learned.a4; -			rt->rt_flags |= RTCF_REDIRECTED; -		} -	} else { -		if (fi->fib_metrics != (u32 *) dst_default_metrics) { -			rt->fi = fi; -			atomic_inc(&fi->fib_clntref); +static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, +			      __be32 daddr) +{ +	__be32 fnhe_daddr, gw; +	unsigned long expires; +	unsigned int seq; +	u32 pmtu; + +restart: +	seq = read_seqbegin(&fnhe_seqlock); +	fnhe_daddr = fnhe->fnhe_daddr; +	gw = fnhe->fnhe_gw; +	pmtu = fnhe->fnhe_pmtu; +	expires = fnhe->fnhe_expires; +	if (read_seqretry(&fnhe_seqlock, seq)) +		goto restart; + +	if (daddr != fnhe_daddr) +		return; + +	if (pmtu) { +		unsigned long diff = expires - jiffies; + +		if (time_before(jiffies, expires)) { +			rt->rt_pmtu = pmtu; +			dst_set_expires(&rt->dst, diff);  		} -		dst_init_metrics(&rt->dst, fi->fib_metrics, true); +	} +	if (gw) { +		rt->rt_flags |= RTCF_REDIRECTED; +		rt->rt_gateway = gw; +	} +	fnhe->fnhe_stamp = jiffies; +} + +static inline void rt_release_rcu(struct rcu_head *head) +{ +	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); +	dst_release(dst); +} + +static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) +{ +	struct rtable *orig, *prev, **p = &nh->nh_rth_output; + +	if (rt_is_input_route(rt)) +		p = &nh->nh_rth_input; + +	orig = *p; + +	prev = cmpxchg(p, orig, rt); +	if (prev == orig) { +		dst_clone(&rt->dst); +		if (orig) +			call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu);  	}  } -static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, +static bool rt_cache_valid(struct rtable *rt) +{ +	return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK); +} + +static void rt_set_nexthop(struct rtable *rt, __be32 daddr,  			   const struct fib_result *res, +			   struct fib_nh_exception *fnhe,  			   struct fib_info *fi, u16 type, u32 itag)  { -	struct dst_entry *dst = &rt->dst; -  	if (fi) { -		if (FIB_RES_GW(*res) && -		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) -			rt->rt_gateway = FIB_RES_GW(*res); -		rt_init_metrics(rt, fl4, fi); +		struct fib_nh *nh = &FIB_RES_NH(*res); + +		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) +			rt->rt_gateway = nh->nh_gw; +		if (unlikely(fnhe)) +			rt_bind_exception(rt, fnhe, daddr); +		dst_init_metrics(&rt->dst, fi->fib_metrics, true);  #ifdef CONFIG_IP_ROUTE_CLASSID -		dst->tclassid = FIB_RES_NH(*res).nh_tclassid; +		rt->dst.tclassid = nh->nh_tclassid;  #endif +		if (!(rt->dst.flags & DST_HOST)) +			rt_cache_route(nh, rt);  	} -	if (dst_mtu(dst) > IP_MAX_MTU) -		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); -	if (dst_metric_raw(dst, 
RTAX_ADVMSS) > 65535 - 40) -		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); -  #ifdef CONFIG_IP_ROUTE_CLASSID  #ifdef CONFIG_IP_MULTIPLE_TABLES -	set_class_tag(rt, fib_rules_tclass(res)); +	set_class_tag(rt, res->tclassid);  #endif  	set_class_tag(rt, itag);  #endif  }  static struct rtable *rt_dst_alloc(struct net_device *dev, -				   bool nopolicy, bool noxfrm) +				   bool nopolicy, bool noxfrm, bool will_cache)  { -	return dst_alloc(&ipv4_dst_ops, dev, 1, -1, -			 DST_HOST | +	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, +			 (will_cache ? 0 : DST_HOST) | DST_NOCACHE |  			 (nopolicy ? DST_NOPOLICY : 0) |  			 (noxfrm ? DST_NOXFRM : 0));  } @@ -2010,9 +1268,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,  static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,  				u8 tos, struct net_device *dev, int our)  { -	unsigned int hash;  	struct rtable *rth; -	__be32 spec_dst;  	struct in_device *in_dev = __in_dev_get_rcu(dev);  	u32 itag = 0;  	int err; @@ -2023,21 +1279,24 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,  		return -EINVAL;  	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || -	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) +	    skb->protocol != htons(ETH_P_IP))  		goto e_inval; +	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) +		if (ipv4_is_loopback(saddr)) +			goto e_inval; +  	if (ipv4_is_zeronet(saddr)) {  		if (!ipv4_is_local_multicast(daddr))  			goto e_inval; -		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);  	} else { -		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, -					  &itag); +		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, +					  in_dev, &itag);  		if (err < 0)  			goto e_err;  	}  	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, -			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false); +			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);  	if (!rth)  		goto e_nobufs; @@ -2046,23 +1305,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,  #endif  	rth->dst.output = ip_rt_bug; -	rth->rt_key_dst	= daddr; -	rth->rt_key_src	= saddr;  	rth->rt_genid	= rt_genid(dev_net(dev));  	rth->rt_flags	= RTCF_MULTICAST;  	rth->rt_type	= RTN_MULTICAST; -	rth->rt_key_tos	= tos; -	rth->rt_dst	= daddr; -	rth->rt_src	= saddr; -	rth->rt_route_iif = dev->ifindex; -	rth->rt_iif	= dev->ifindex; -	rth->rt_oif	= 0; -	rth->rt_mark    = skb->mark; -	rth->rt_gateway	= daddr; -	rth->rt_spec_dst= spec_dst; -	rth->rt_peer_genid = 0; -	rth->peer = NULL; -	rth->fi = NULL; +	rth->rt_is_input= 1; +	rth->rt_iif	= 0; +	rth->rt_pmtu	= 0; +	rth->rt_gateway	= 0;  	if (our) {  		rth->dst.input= ip_local_deliver;  		rth->rt_flags |= RTCF_LOCAL; @@ -2074,9 +1323,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,  #endif  	RT_CACHE_STAT_INC(in_slow_mc); -	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); -	rth = rt_intern_hash(hash, rth, skb, dev->ifindex); -	return IS_ERR(rth) ? 
PTR_ERR(rth) : 0; +	skb_dst_set(skb, &rth->dst); +	return 0;  e_nobufs:  	return -ENOBUFS; @@ -2123,7 +1371,7 @@ static int __mkroute_input(struct sk_buff *skb,  	int err;  	struct in_device *out_dev;  	unsigned int flags = 0; -	__be32 spec_dst; +	bool do_cache;  	u32 itag;  	/* get a working reference to the output device */ @@ -2135,7 +1383,7 @@ static int __mkroute_input(struct sk_buff *skb,  	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), -				  in_dev->dev, &spec_dst, &itag); +				  in_dev->dev, in_dev, &itag);  	if (err < 0) {  		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,  					 saddr); @@ -2143,9 +1391,6 @@ static int __mkroute_input(struct sk_buff *skb,  		goto cleanup;  	} -	if (err) -		flags |= RTCF_DIRECTSRC; -  	if (out_dev == in_dev && err &&  	    (IN_DEV_SHARED_MEDIA(out_dev) ||  	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) @@ -2166,37 +1411,39 @@ static int __mkroute_input(struct sk_buff *skb,  		}  	} +	do_cache = false; +	if (res->fi) { +		if (!itag) { +			rth = FIB_RES_NH(*res).nh_rth_input; +			if (rt_cache_valid(rth)) { +				dst_hold(&rth->dst); +				goto out; +			} +			do_cache = true; +		} +	} +  	rth = rt_dst_alloc(out_dev->dev,  			   IN_DEV_CONF_GET(in_dev, NOPOLICY), -			   IN_DEV_CONF_GET(out_dev, NOXFRM)); +			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);  	if (!rth) {  		err = -ENOBUFS;  		goto cleanup;  	} -	rth->rt_key_dst	= daddr; -	rth->rt_key_src	= saddr;  	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));  	rth->rt_flags = flags;  	rth->rt_type = res->type; -	rth->rt_key_tos	= tos; -	rth->rt_dst	= daddr; -	rth->rt_src	= saddr; -	rth->rt_route_iif = in_dev->dev->ifindex; -	rth->rt_iif 	= in_dev->dev->ifindex; -	rth->rt_oif 	= 0; -	rth->rt_mark    = skb->mark; -	rth->rt_gateway	= daddr; -	rth->rt_spec_dst= spec_dst; -	rth->rt_peer_genid = 0; -	rth->peer = NULL; -	rth->fi = NULL; +	rth->rt_is_input = 1; +	rth->rt_iif 	= 0; +	rth->rt_pmtu	= 0; +	rth->rt_gateway	= 0;  	rth->dst.input = ip_forward;  	rth->dst.output = ip_output; -	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); - +	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); +out:  	*result = rth;  	err = 0;   cleanup: @@ -2211,7 +1458,6 @@ static int ip_mkroute_input(struct sk_buff *skb,  {  	struct rtable *rth = NULL;  	int err; -	unsigned int hash;  #ifdef CONFIG_IP_ROUTE_MULTIPATH  	if (res->fi && res->fi->fib_nhs > 1) @@ -2223,12 +1469,7 @@ static int ip_mkroute_input(struct sk_buff *skb,  	if (err)  		return err; -	/* put it into the cache */ -	hash = rt_hash(daddr, saddr, fl4->flowi4_iif, -		       rt_genid(dev_net(rth->dst.dev))); -	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); -	if (IS_ERR(rth)) -		return PTR_ERR(rth); +	skb_dst_set(skb, &rth->dst);  	return 0;  } @@ -2252,10 +1493,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,  	unsigned int	flags = 0;  	u32		itag = 0;  	struct rtable	*rth; -	unsigned int	hash; -	__be32		spec_dst;  	int		err = -EINVAL;  	struct net    *net = dev_net(dev); +	bool do_cache;  	/* IP on this device is disabled. */ @@ -2266,10 +1506,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,  	   by fib_lookup.  	 
*/ -	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || -	    ipv4_is_loopback(saddr)) +	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))  		goto martian_source; +	res.fi = NULL;  	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))  		goto brd_input; @@ -2279,9 +1519,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,  	if (ipv4_is_zeronet(saddr))  		goto martian_source; -	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) +	if (ipv4_is_zeronet(daddr))  		goto martian_destination; +	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) { +		if (ipv4_is_loopback(daddr)) +			goto martian_destination; + +		if (ipv4_is_loopback(saddr)) +			goto martian_source; +	} +  	/*  	 *	Now we are ready to route packet.  	 */ @@ -2293,11 +1541,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,  	fl4.daddr = daddr;  	fl4.saddr = saddr;  	err = fib_lookup(net, &fl4, &res); -	if (err != 0) { -		if (!IN_DEV_FORWARD(in_dev)) -			goto e_hostunreach; +	if (err != 0)  		goto no_route; -	}  	RT_CACHE_STAT_INC(in_slow_tot); @@ -2307,17 +1552,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,  	if (res.type == RTN_LOCAL) {  		err = fib_validate_source(skb, saddr, daddr, tos,  					  net->loopback_dev->ifindex, -					  dev, &spec_dst, &itag); +					  dev, in_dev, &itag);  		if (err < 0)  			goto martian_source_keep_err; -		if (err) -			flags |= RTCF_DIRECTSRC; -		spec_dst = daddr;  		goto local_input;  	}  	if (!IN_DEV_FORWARD(in_dev)) -		goto e_hostunreach; +		goto no_route;  	if (res.type != RTN_UNICAST)  		goto martian_destination; @@ -2328,23 +1570,31 @@ brd_input:  	if (skb->protocol != htons(ETH_P_IP))  		goto e_inval; -	if (ipv4_is_zeronet(saddr)) -		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); -	else { -		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, -					  &itag); +	if (!ipv4_is_zeronet(saddr)) { +		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, +					  in_dev, &itag);  		if (err < 0)  			goto martian_source_keep_err; -		if (err) -			flags |= RTCF_DIRECTSRC;  	}  	flags |= RTCF_BROADCAST;  	res.type = RTN_BROADCAST;  	RT_CACHE_STAT_INC(in_brd);  local_input: +	do_cache = false; +	if (res.fi) { +		if (!itag) { +			rth = FIB_RES_NH(res).nh_rth_input; +			if (rt_cache_valid(rth)) { +				dst_hold(&rth->dst); +				goto set_and_out; +			} +			do_cache = true; +		} +	} +  	rth = rt_dst_alloc(net->loopback_dev, -			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false); +			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);  	if (!rth)  		goto e_nobufs; @@ -2354,41 +1604,27 @@ local_input:  	rth->dst.tclassid = itag;  #endif -	rth->rt_key_dst	= daddr; -	rth->rt_key_src	= saddr;  	rth->rt_genid = rt_genid(net);  	rth->rt_flags 	= flags|RTCF_LOCAL;  	rth->rt_type	= res.type; -	rth->rt_key_tos	= tos; -	rth->rt_dst	= daddr; -	rth->rt_src	= saddr; -#ifdef CONFIG_IP_ROUTE_CLASSID -	rth->dst.tclassid = itag; -#endif -	rth->rt_route_iif = dev->ifindex; -	rth->rt_iif	= dev->ifindex; -	rth->rt_oif	= 0; -	rth->rt_mark    = skb->mark; -	rth->rt_gateway	= daddr; -	rth->rt_spec_dst= spec_dst; -	rth->rt_peer_genid = 0; -	rth->peer = NULL; -	rth->fi = NULL; +	rth->rt_is_input = 1; +	rth->rt_iif	= 0; +	rth->rt_pmtu	= 0; +	rth->rt_gateway	= 0;  	if (res.type == RTN_UNREACHABLE) {  		rth->dst.input= ip_error;  		rth->dst.error= -err;  		rth->rt_flags 	&= ~RTCF_LOCAL;  	} -	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); -	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); +	if 
(do_cache) +		rt_cache_route(&FIB_RES_NH(res), rth); +set_and_out: +	skb_dst_set(skb, &rth->dst);  	err = 0; -	if (IS_ERR(rth)) -		err = PTR_ERR(rth);  	goto out;  no_route:  	RT_CACHE_STAT_INC(in_no_route); -	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);  	res.type = RTN_UNREACHABLE;  	if (err == -ESRCH)  		err = -ENETUNREACH; @@ -2405,10 +1641,6 @@ martian_destination:  				     &daddr, &saddr, dev->name);  #endif -e_hostunreach: -	err = -EHOSTUNREACH; -	goto out; -  e_inval:  	err = -EINVAL;  	goto out; @@ -2424,50 +1656,13 @@ martian_source_keep_err:  	goto out;  } -int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, -			   u8 tos, struct net_device *dev, bool noref) +int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, +		   u8 tos, struct net_device *dev)  { -	struct rtable	*rth; -	unsigned int	hash; -	int iif = dev->ifindex; -	struct net *net;  	int res; -	net = dev_net(dev); -  	rcu_read_lock(); -	if (!rt_caching(net)) -		goto skip_cache; - -	tos &= IPTOS_RT_MASK; -	hash = rt_hash(daddr, saddr, iif, rt_genid(net)); - -	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; -	     rth = rcu_dereference(rth->dst.rt_next)) { -		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | -		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | -		     (rth->rt_route_iif ^ iif) | -		     (rth->rt_key_tos ^ tos)) == 0 && -		    rth->rt_mark == skb->mark && -		    net_eq(dev_net(rth->dst.dev), net) && -		    !rt_is_expired(rth)) { -			ipv4_validate_peer(rth); -			if (noref) { -				dst_use_noref(&rth->dst, jiffies); -				skb_dst_set_noref(skb, &rth->dst); -			} else { -				dst_use(&rth->dst, jiffies); -				skb_dst_set(skb, &rth->dst); -			} -			RT_CACHE_STAT_INC(in_hit); -			rcu_read_unlock(); -			return 0; -		} -		RT_CACHE_STAT_INC(in_hlist_search); -	} - -skip_cache:  	/* Multicast recognition logic is moved from route cache to here.  	   
The problem was that too many Ethernet cards have broken/missing  	   hardware multicast filters :-( As result the host on multicasting @@ -2505,24 +1700,28 @@ skip_cache:  	rcu_read_unlock();  	return res;  } -EXPORT_SYMBOL(ip_route_input_common); +EXPORT_SYMBOL(ip_route_input);  /* called with rcu_read_lock() */  static struct rtable *__mkroute_output(const struct fib_result *res, -				       const struct flowi4 *fl4, -				       __be32 orig_daddr, __be32 orig_saddr, -				       int orig_oif, __u8 orig_rtos, +				       const struct flowi4 *fl4, int orig_oif,  				       struct net_device *dev_out,  				       unsigned int flags)  {  	struct fib_info *fi = res->fi; +	struct fib_nh_exception *fnhe;  	struct in_device *in_dev;  	u16 type = res->type;  	struct rtable *rth; -	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) +	in_dev = __in_dev_get_rcu(dev_out); +	if (!in_dev)  		return ERR_PTR(-EINVAL); +	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) +		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) +			return ERR_PTR(-EINVAL); +  	if (ipv4_is_lbcast(fl4->daddr))  		type = RTN_BROADCAST;  	else if (ipv4_is_multicast(fl4->daddr)) @@ -2533,10 +1732,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,  	if (dev_out->flags & IFF_LOOPBACK)  		flags |= RTCF_LOCAL; -	in_dev = __in_dev_get_rcu(dev_out); -	if (!in_dev) -		return ERR_PTR(-EINVAL); -  	if (type == RTN_BROADCAST) {  		flags |= RTCF_BROADCAST | RTCF_LOCAL;  		fi = NULL; @@ -2553,40 +1748,39 @@ static struct rtable *__mkroute_output(const struct fib_result *res,  			fi = NULL;  	} +	fnhe = NULL; +	if (fi) { +		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); +		if (!fnhe) { +			rth = FIB_RES_NH(*res).nh_rth_output; +			if (rt_cache_valid(rth)) { +				dst_hold(&rth->dst); +				return rth; +			} +		} +	}  	rth = rt_dst_alloc(dev_out,  			   IN_DEV_CONF_GET(in_dev, NOPOLICY), -			   IN_DEV_CONF_GET(in_dev, NOXFRM)); +			   IN_DEV_CONF_GET(in_dev, NOXFRM), +			   fi && !fnhe);  	if (!rth)  		return ERR_PTR(-ENOBUFS);  	rth->dst.output = ip_output; -	rth->rt_key_dst	= orig_daddr; -	rth->rt_key_src	= orig_saddr;  	rth->rt_genid = rt_genid(dev_net(dev_out));  	rth->rt_flags	= flags;  	rth->rt_type	= type; -	rth->rt_key_tos	= orig_rtos; -	rth->rt_dst	= fl4->daddr; -	rth->rt_src	= fl4->saddr; -	rth->rt_route_iif = 0; -	rth->rt_iif	= orig_oif ? : dev_out->ifindex; -	rth->rt_oif	= orig_oif; -	rth->rt_mark    = fl4->flowi4_mark; -	rth->rt_gateway = fl4->daddr; -	rth->rt_spec_dst= fl4->saddr; -	rth->rt_peer_genid = 0; -	rth->peer = NULL; -	rth->fi = NULL; +	rth->rt_is_input = 0; +	rth->rt_iif	= orig_oif ? : 0; +	rth->rt_pmtu	= 0; +	rth->rt_gateway = 0;  	RT_CACHE_STAT_INC(out_slow_tot); -	if (flags & RTCF_LOCAL) { +	if (flags & RTCF_LOCAL)  		rth->dst.input = ip_local_deliver; -		rth->rt_spec_dst = fl4->daddr; -	}  	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { -		rth->rt_spec_dst = fl4->saddr;  		if (flags & RTCF_LOCAL &&  		    !(dev_out->flags & IFF_LOOPBACK)) {  			rth->dst.output = ip_mc_output; @@ -2603,34 +1797,28 @@ static struct rtable *__mkroute_output(const struct fib_result *res,  #endif  	} -	rt_set_nexthop(rth, fl4, res, fi, type, 0); +	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);  	return rth;  }  /*   * Major route resolver routine. 
- * called with rcu_read_lock();   */ -static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)  {  	struct net_device *dev_out = NULL;  	__u8 tos = RT_FL_TOS(fl4);  	unsigned int flags = 0;  	struct fib_result res;  	struct rtable *rth; -	__be32 orig_daddr; -	__be32 orig_saddr;  	int orig_oif; +	res.tclassid	= 0;  	res.fi		= NULL; -#ifdef CONFIG_IP_MULTIPLE_TABLES -	res.r		= NULL; -#endif +	res.table	= NULL; -	orig_daddr = fl4->daddr; -	orig_saddr = fl4->saddr;  	orig_oif = fl4->flowi4_oif;  	fl4->flowi4_iif = net->loopback_dev->ifindex; @@ -2730,6 +1918,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)  	if (fib_lookup(net, fl4, &res)) {  		res.fi = NULL; +		res.table = NULL;  		if (fl4->flowi4_oif) {  			/* Apparently, routing tables are wrong. Assume,  			   that the destination is on link. @@ -2791,60 +1980,12 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)  make_route: -	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, -			       tos, dev_out, flags); -	if (!IS_ERR(rth)) { -		unsigned int hash; - -		hash = rt_hash(orig_daddr, orig_saddr, orig_oif, -			       rt_genid(dev_net(dev_out))); -		rth = rt_intern_hash(hash, rth, NULL, orig_oif); -	} +	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);  out:  	rcu_read_unlock();  	return rth;  } - -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) -{ -	struct rtable *rth; -	unsigned int hash; - -	if (!rt_caching(net)) -		goto slow_output; - -	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); - -	rcu_read_lock_bh(); -	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; -		rth = rcu_dereference_bh(rth->dst.rt_next)) { -		if (rth->rt_key_dst == flp4->daddr && -		    rth->rt_key_src == flp4->saddr && -		    rt_is_output_route(rth) && -		    rth->rt_oif == flp4->flowi4_oif && -		    rth->rt_mark == flp4->flowi4_mark && -		    !((rth->rt_key_tos ^ flp4->flowi4_tos) & -			    (IPTOS_RT_MASK | RTO_ONLINK)) && -		    net_eq(dev_net(rth->dst.dev), net) && -		    !rt_is_expired(rth)) { -			ipv4_validate_peer(rth); -			dst_use(&rth->dst, jiffies); -			RT_CACHE_STAT_INC(out_hit); -			rcu_read_unlock_bh(); -			if (!flp4->saddr) -				flp4->saddr = rth->rt_src; -			if (!flp4->daddr) -				flp4->daddr = rth->rt_dst; -			return rth; -		} -		RT_CACHE_STAT_INC(out_hlist_search); -	} -	rcu_read_unlock_bh(); - -slow_output: -	return ip_route_output_slow(net, flp4); -}  EXPORT_SYMBOL_GPL(__ip_route_output_key);  static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) @@ -2859,7 +2000,13 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)  	return mtu ? 
: dst->dev->mtu;  } -static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, +					  struct sk_buff *skb, u32 mtu) +{ +} + +static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, +				       struct sk_buff *skb)  {  } @@ -2872,53 +2019,40 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,  static struct dst_ops ipv4_dst_blackhole_ops = {  	.family			=	AF_INET,  	.protocol		=	cpu_to_be16(ETH_P_IP), -	.destroy		=	ipv4_dst_destroy,  	.check			=	ipv4_blackhole_dst_check,  	.mtu			=	ipv4_blackhole_mtu,  	.default_advmss		=	ipv4_default_advmss,  	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu, +	.redirect		=	ipv4_rt_blackhole_redirect,  	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,  	.neigh_lookup		=	ipv4_neigh_lookup,  };  struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)  { -	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);  	struct rtable *ort = (struct rtable *) dst_orig; +	struct rtable *rt; +	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);  	if (rt) {  		struct dst_entry *new = &rt->dst;  		new->__use = 1;  		new->input = dst_discard;  		new->output = dst_discard; -		dst_copy_metrics(new, &ort->dst);  		new->dev = ort->dst.dev;  		if (new->dev)  			dev_hold(new->dev); -		rt->rt_key_dst = ort->rt_key_dst; -		rt->rt_key_src = ort->rt_key_src; -		rt->rt_key_tos = ort->rt_key_tos; -		rt->rt_route_iif = ort->rt_route_iif; +		rt->rt_is_input = ort->rt_is_input;  		rt->rt_iif = ort->rt_iif; -		rt->rt_oif = ort->rt_oif; -		rt->rt_mark = ort->rt_mark; +		rt->rt_pmtu = ort->rt_pmtu;  		rt->rt_genid = rt_genid(net);  		rt->rt_flags = ort->rt_flags;  		rt->rt_type = ort->rt_type; -		rt->rt_dst = ort->rt_dst; -		rt->rt_src = ort->rt_src;  		rt->rt_gateway = ort->rt_gateway; -		rt->rt_spec_dst = ort->rt_spec_dst; -		rt->peer = ort->peer; -		if (rt->peer) -			atomic_inc(&rt->peer->refcnt); -		rt->fi = ort->fi; -		if (rt->fi) -			atomic_inc(&rt->fi->fib_clntref);  		dst_free(new);  	} @@ -2945,16 +2079,16 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,  }  EXPORT_SYMBOL_GPL(ip_route_output_flow); -static int rt_fill_info(struct net *net, -			struct sk_buff *skb, u32 pid, u32 seq, int event, -			int nowait, unsigned int flags) +static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, +			struct flowi4 *fl4, struct sk_buff *skb, u32 pid, +			u32 seq, int event, int nowait, unsigned int flags)  {  	struct rtable *rt = skb_rtable(skb);  	struct rtmsg *r;  	struct nlmsghdr *nlh;  	unsigned long expires = 0; -	const struct inet_peer *peer = rt->peer; -	u32 id = 0, ts = 0, tsage = 0, error; +	u32 error; +	u32 metrics[RTAX_MAX];  	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);  	if (nlh == NULL) @@ -2964,7 +2098,7 @@ static int rt_fill_info(struct net *net,  	r->rtm_family	 = AF_INET;  	r->rtm_dst_len	= 32;  	r->rtm_src_len	= 0; -	r->rtm_tos	= rt->rt_key_tos; +	r->rtm_tos	= fl4->flowi4_tos;  	r->rtm_table	= RT_TABLE_MAIN;  	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))  		goto nla_put_failure; @@ -2975,11 +2109,11 @@ static int rt_fill_info(struct net *net,  	if (rt->rt_flags & RTCF_NOTIFY)  		r->rtm_flags |= RTM_F_NOTIFY; -	if (nla_put_be32(skb, RTA_DST, rt->rt_dst)) +	if (nla_put_be32(skb, RTA_DST, dst))  		goto nla_put_failure; -	if (rt->rt_key_src) { +	if (src) {  		r->rtm_src_len = 32; -		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src)) +		
if (nla_put_be32(skb, RTA_SRC, src))  			goto nla_put_failure;  	}  	if (rt->dst.dev && @@ -2990,69 +2124,40 @@ static int rt_fill_info(struct net *net,  	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))  		goto nla_put_failure;  #endif -	if (rt_is_input_route(rt)) { -		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst)) -			goto nla_put_failure; -	} else if (rt->rt_src != rt->rt_key_src) { -		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src)) +	if (!rt_is_input_route(rt) && +	    fl4->saddr != src) { +		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))  			goto nla_put_failure;  	} -	if (rt->rt_dst != rt->rt_gateway && +	if (rt->rt_gateway &&  	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))  		goto nla_put_failure; -	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) +	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); +	if (rt->rt_pmtu) +		metrics[RTAX_MTU - 1] = rt->rt_pmtu; +	if (rtnetlink_put_metrics(skb, metrics) < 0)  		goto nla_put_failure; -	if (rt->rt_mark && -	    nla_put_be32(skb, RTA_MARK, rt->rt_mark)) +	if (fl4->flowi4_mark && +	    nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))  		goto nla_put_failure;  	error = rt->dst.error; -	if (peer) { -		inet_peer_refcheck(rt->peer); -		id = atomic_read(&peer->ip_id_count) & 0xffff; -		if (peer->tcp_ts_stamp) { -			ts = peer->tcp_ts; -			tsage = get_seconds() - peer->tcp_ts_stamp; -		} -		expires = ACCESS_ONCE(peer->pmtu_expires); -		if (expires) { -			if (time_before(jiffies, expires)) -				expires -= jiffies; -			else -				expires = 0; -		} +	expires = rt->dst.expires; +	if (expires) { +		if (time_before(jiffies, expires)) +			expires -= jiffies; +		else +			expires = 0;  	}  	if (rt_is_input_route(rt)) { -#ifdef CONFIG_IP_MROUTE -		__be32 dst = rt->rt_dst; - -		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && -		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { -			int err = ipmr_get_route(net, skb, -						 rt->rt_src, rt->rt_dst, -						 r, nowait); -			if (err <= 0) { -				if (!nowait) { -					if (err == 0) -						return 0; -					goto nla_put_failure; -				} else { -					if (err == -EMSGSIZE) -						goto nla_put_failure; -					error = err; -				} -			} -		} else -#endif -			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) -				goto nla_put_failure; +		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) +			goto nla_put_failure;  	} -	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, -			       expires, error) < 0) +	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)  		goto nla_put_failure;  	return nlmsg_end(skb, nlh); @@ -3068,6 +2173,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void  	struct rtmsg *rtm;  	struct nlattr *tb[RTA_MAX+1];  	struct rtable *rt = NULL; +	struct flowi4 fl4;  	__be32 dst = 0;  	__be32 src = 0;  	u32 iif; @@ -3102,6 +2208,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void  	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;  	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; +	memset(&fl4, 0, sizeof(fl4)); +	fl4.daddr = dst; +	fl4.saddr = src; +	fl4.flowi4_tos = rtm->rtm_tos; +	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; +	fl4.flowi4_mark = mark; +  	if (iif) {  		struct net_device *dev; @@ -3122,13 +2235,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void  		if (err == 0 && rt->dst.error)  			err = -rt->dst.error;  	} else { -		struct flowi4 fl4 = { -			.daddr = dst, -			.saddr = src, -			.flowi4_tos = rtm->rtm_tos, -			.flowi4_oif = tb[RTA_OIF] ? 
nla_get_u32(tb[RTA_OIF]) : 0, -			.flowi4_mark = mark, -		};  		rt = ip_route_output_key(net, &fl4);  		err = 0; @@ -3143,7 +2249,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void  	if (rtm->rtm_flags & RTM_F_NOTIFY)  		rt->rt_flags |= RTCF_NOTIFY; -	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, +	err = rt_fill_info(net, dst, src, &fl4, skb, +			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,  			   RTM_NEWROUTE, 0, 0);  	if (err <= 0)  		goto errout_free; @@ -3159,43 +2266,6 @@ errout_free:  int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)  { -	struct rtable *rt; -	int h, s_h; -	int idx, s_idx; -	struct net *net; - -	net = sock_net(skb->sk); - -	s_h = cb->args[0]; -	if (s_h < 0) -		s_h = 0; -	s_idx = idx = cb->args[1]; -	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { -		if (!rt_hash_table[h].chain) -			continue; -		rcu_read_lock_bh(); -		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; -		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { -			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) -				continue; -			if (rt_is_expired(rt)) -				continue; -			skb_dst_set_noref(skb, &rt->dst); -			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, -					 cb->nlh->nlmsg_seq, RTM_NEWROUTE, -					 1, NLM_F_MULTI) <= 0) { -				skb_dst_drop(skb); -				rcu_read_unlock_bh(); -				goto done; -			} -			skb_dst_drop(skb); -		} -		rcu_read_unlock_bh(); -	} - -done: -	cb->args[0] = h; -	cb->args[1] = idx;  	return skb->len;  } @@ -3400,26 +2470,34 @@ static __net_initdata struct pernet_operations rt_genid_ops = {  	.init = rt_genid_init,  }; +static int __net_init ipv4_inetpeer_init(struct net *net) +{ +	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); -#ifdef CONFIG_IP_ROUTE_CLASSID -struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; -#endif /* CONFIG_IP_ROUTE_CLASSID */ +	if (!bp) +		return -ENOMEM; +	inet_peer_base_init(bp); +	net->ipv4.peers = bp; +	return 0; +} -static __initdata unsigned long rhash_entries; -static int __init set_rhash_entries(char *str) +static void __net_exit ipv4_inetpeer_exit(struct net *net)  { -	ssize_t ret; +	struct inet_peer_base *bp = net->ipv4.peers; -	if (!str) -		return 0; +	net->ipv4.peers = NULL; +	inetpeer_invalidate_tree(bp); +	kfree(bp); +} -	ret = kstrtoul(str, 0, &rhash_entries); -	if (ret) -		return 0; +static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { +	.init	=	ipv4_inetpeer_init, +	.exit	=	ipv4_inetpeer_exit, +}; -	return 1; -} -__setup("rhash_entries=", set_rhash_entries); +#ifdef CONFIG_IP_ROUTE_CLASSID +struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; +#endif /* CONFIG_IP_ROUTE_CLASSID */  int __init ip_rt_init(void)  { @@ -3443,31 +2521,12 @@ int __init ip_rt_init(void)  	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)  		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); -	rt_hash_table = (struct rt_hash_bucket *) -		alloc_large_system_hash("IP route cache", -					sizeof(struct rt_hash_bucket), -					rhash_entries, -					(totalram_pages >= 128 * 1024) ? -					15 : 17, -					0, -					&rt_hash_log, -					&rt_hash_mask, -					0, -					rhash_entries ? 
0 : 512 * 1024);
-	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
-	rt_hash_lock_init();
-
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	ipv4_dst_ops.gc_thresh = ~0;
+	ip_rt_max_size = INT_MAX;
 
 	devinet_init();
 	ip_fib_init();
 
-	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
-	expires_ljiffies = jiffies;
-	schedule_delayed_work(&expires_work,
-		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
 	if (ip_rt_proc_init())
 		pr_err("Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
@@ -3480,6 +2539,7 @@ int __init ip_rt_init(void)
 	register_pernet_subsys(&sysctl_route_ops);
 #endif
 	register_pernet_subsys(&rt_genid_ops);
+	register_pernet_subsys(&ipv4_inetpeer_ops);
 
 	return rc;
 }
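
With the hash-based route cache gone, every output lookup goes through __ip_route_output_key(), which walks the FIB and either reuses the route cached on the nexthop or builds a fresh one. A minimal sketch of a caller follows, modeled on the flowi4 setup this patch adds to inet_rtm_getroute(); the function name and error handling are illustrative, not part of the patch.

/* Sketch only: resolve and release an output route via the flowi4 API.
 * example_output_lookup() is a hypothetical caller.
 */
static int example_output_lookup(struct net *net, __be32 daddr,
				 __be32 saddr, u32 mark)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_mark = mark;

	rt = ip_route_output_key(net, &fl4);	/* wraps __ip_route_output_key() */
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* ... transmit via &rt->dst ... */

	ip_rt_put(rt);				/* drop the lookup's reference */
	return 0;
}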
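
On the input side, ip_route_input_common() and its cache-probing loop are replaced by ip_route_input(), which does the FIB work and attaches the result to the skb itself via skb_dst_set(). A sketch of how a receive path might call it under that assumption; the surrounding function is illustrative only.

/* Sketch only: hypothetical receive-path caller of the renamed helper. */
static int example_rcv_route(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err = 0;

	if (!skb_dst(skb))
		err = ip_route_input(skb, iph->daddr, iph->saddr,
				     iph->tos, dev);
	if (err) {
		kfree_skb(skb);		/* illustrative error handling */
		return err;
	}
	/* skb_dst(skb) now points at the rtable built (or reused) above. */
	return 0;
}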
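
Both __mkroute_input() and __mkroute_output() now follow the same pattern: reuse the rtable cached on the FIB nexthop while it is still valid, and only allocate (and optionally re-cache) a new one otherwise; on output, a matching fib_nh_exception forces a private, uncached route. The condensed restatement below uses only helpers that appear in the patch (find_exception(), rt_cache_valid(), rt_dst_alloc()); the surrounding control flow is abbreviated.

/* Condensed from __mkroute_output() above; not a drop-in function.
 * 'res', 'fl4', 'fi', 'in_dev' and 'dev_out' are assumed to be set up
 * by the caller exactly as in the patch.
 */
	struct fib_nh_exception *fnhe = NULL;
	struct rtable *rth;

	if (fi) {
		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
		if (!fnhe) {
			/* Fast path: reuse the route cached on the nexthop. */
			rth = FIB_RES_NH(*res).nh_rth_output;
			if (rt_cache_valid(rth)) {
				dst_hold(&rth->dst);
				return rth;
			}
		}
	}

	/* Slow path: allocate a new route; mark it cacheable only when the
	 * nexthop can hold it (fi set and no exception entry matched).
	 */
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   fi && !fnhe);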
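
The dst_ops callbacks also change shape: .update_pmtu now receives the socket and skb that triggered the event, and a new .redirect hook handles ICMP redirects that the old cache rewriting used to absorb. A minimal sketch of what an implementation must provide, modeled on the blackhole stubs above; example_dst_ops and the empty bodies are illustrative only.

/* Sketch only: callback signatures after this change. */
static void example_update_pmtu(struct dst_entry *dst, struct sock *sk,
				struct sk_buff *skb, u32 mtu)
{
	/* Learned PMTU arrives together with the triggering socket/skb. */
}

static void example_redirect(struct dst_entry *dst, struct sock *sk,
			     struct sk_buff *skb)
{
	/* Redirects are now delivered through dst_ops, not the route cache. */
}

static struct dst_ops example_dst_ops = {
	.family		= AF_INET,
	.protocol	= cpu_to_be16(ETH_P_IP),
	.update_pmtu	= example_update_pmtu,
	.redirect	= example_redirect,
};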