Diffstat (limited to 'net/core')
| -rw-r--r-- | net/core/Makefile | 17 |
| -rw-r--r-- | net/core/datagram.c | 482 |
| -rw-r--r-- | net/core/dev.c | 3359 |
| -rw-r--r-- | net/core/dev_mcast.c | 299 |
| -rw-r--r-- | net/core/dst.c | 276 |
| -rw-r--r-- | net/core/dv.c | 548 |
| -rw-r--r-- | net/core/ethtool.c | 819 |
| -rw-r--r-- | net/core/filter.c | 432 |
| -rw-r--r-- | net/core/flow.c | 371 |
| -rw-r--r-- | net/core/gen_estimator.c | 250 |
| -rw-r--r-- | net/core/gen_stats.c | 239 |
| -rw-r--r-- | net/core/iovec.c | 239 |
| -rw-r--r-- | net/core/link_watch.c | 137 |
| -rw-r--r-- | net/core/neighbour.c | 2362 |
| -rw-r--r-- | net/core/net-sysfs.c | 461 |
| -rw-r--r-- | net/core/netfilter.c | 799 |
| -rw-r--r-- | net/core/netpoll.c | 735 |
| -rw-r--r-- | net/core/pktgen.c | 3132 |
| -rw-r--r-- | net/core/rtnetlink.c | 711 |
| -rw-r--r-- | net/core/scm.c | 291 |
| -rw-r--r-- | net/core/skbuff.c | 1460 |
| -rw-r--r-- | net/core/sock.c | 1565 |
| -rw-r--r-- | net/core/stream.c | 287 |
| -rw-r--r-- | net/core/sysctl_net_core.c | 182 |
| -rw-r--r-- | net/core/utils.c | 155 |
| -rw-r--r-- | net/core/wireless.c | 1459 |
26 files changed, 21067 insertions, 0 deletions
diff --git a/net/core/Makefile b/net/core/Makefile new file mode 100644 index 00000000000..81f03243fe2 --- /dev/null +++ b/net/core/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for the Linux networking core. +# + +obj-y := sock.o skbuff.o iovec.o datagram.o stream.o scm.o gen_stats.o gen_estimator.o + +obj-$(CONFIG_SYSCTL) += sysctl_net_core.o + +obj-y		     += flow.o dev.o ethtool.o dev_mcast.o dst.o \ +			neighbour.o rtnetlink.o utils.o link_watch.o filter.o + +obj-$(CONFIG_SYSFS) += net-sysfs.o +obj-$(CONFIG_NETFILTER) += netfilter.o +obj-$(CONFIG_NET_DIVERT) += dv.o +obj-$(CONFIG_NET_PKTGEN) += pktgen.o +obj-$(CONFIG_NET_RADIO) += wireless.o +obj-$(CONFIG_NETPOLL) += netpoll.o diff --git a/net/core/datagram.c b/net/core/datagram.c new file mode 100644 index 00000000000..d1bfd279cc1 --- /dev/null +++ b/net/core/datagram.c @@ -0,0 +1,482 @@ +/* + *	SUCS NET3: + * + *	Generic datagram handling routines. These are generic for all + *	protocols. Possibly a generic IP version on top of these would + *	make sense. Not tonight however 8-). + *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and + *	NetROM layer all have identical poll code and mostly + *	identical recvmsg() code. So we share it here. The poll was + *	shared before but buried in udp.c so I moved it. + * + *	Authors:	Alan Cox <alan@redhat.com>. (datagram_poll() from old + *						     udp.c code) + * + *	Fixes: + *		Alan Cox	:	NULL return from skb_peek_copy() + *					understood + *		Alan Cox	:	Rewrote skb_read_datagram to avoid the + *					skb_peek_copy stuff. + *		Alan Cox	:	Added support for SOCK_SEQPACKET. + *					IPX can no longer use the SO_TYPE hack + *					but AX.25 now works right, and SPX is + *					feasible. + *		Alan Cox	:	Fixed write poll of non IP protocol + *					crash. + *		Florian  La Roche:	Changed for my new skbuff handling. + *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET. + *		Linus Torvalds	:	BSD semantic fixes. + *		Alan Cox	:	Datagram iovec handling + *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM. + *		Alan Cox	:	POSIXisms + *		Pete Wyckoff    :       Unconnected accept() fix. + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/inet.h> +#include <linux/tcp.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/poll.h> +#include <linux/highmem.h> + +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/checksum.h> + + +/* + *	Is a socket 'connection oriented' ? + */ +static inline int connection_based(struct sock *sk) +{ +	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; +} + +/* + * Wait for a packet.. + */ +static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) +{ +	int error; +	DEFINE_WAIT(wait); + +	prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + +	/* Socket errors? */ +	error = sock_error(sk); +	if (error) +		goto out_err; + +	if (!skb_queue_empty(&sk->sk_receive_queue)) +		goto out; + +	/* Socket shut down? */ +	if (sk->sk_shutdown & RCV_SHUTDOWN) +		goto out_noerr; + +	/* Sequenced packets can come disconnected. 
+	 * If so we report the problem +	 */ +	error = -ENOTCONN; +	if (connection_based(sk) && +	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN)) +		goto out_err; + +	/* handle signals */ +	if (signal_pending(current)) +		goto interrupted; + +	error = 0; +	*timeo_p = schedule_timeout(*timeo_p); +out: +	finish_wait(sk->sk_sleep, &wait); +	return error; +interrupted: +	error = sock_intr_errno(*timeo_p); +out_err: +	*err = error; +	goto out; +out_noerr: +	*err = 0; +	error = 1; +	goto out; +} + +/** + *	skb_recv_datagram - Receive a datagram skbuff + *	@sk - socket + *	@flags - MSG_ flags + *	@noblock - blocking operation? + *	@err - error code returned + * + *	Get a datagram skbuff, understands the peeking, nonblocking wakeups + *	and possible races. This replaces identical code in packet, raw and + *	udp, as well as the IPX AX.25 and Appletalk. It also finally fixes + *	the long standing peek and read race for datagram sockets. If you + *	alter this routine remember it must be re-entrant. + * + *	This function will lock the socket if a skb is returned, so the caller + *	needs to unlock the socket in that case (usually by calling + *	skb_free_datagram) + * + *	* It does not lock socket since today. This function is + *	* free of race conditions. This measure should/can improve + *	* significantly datagram socket latencies at high loads, + *	* when data copying to user space takes lots of time. + *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet + *	*  8) Great win.) + *	*			                    --ANK (980729) + * + *	The order of the tests when we find no data waiting are specified + *	quite explicitly by POSIX 1003.1g, don't change them without having + *	the standard around please. + */ +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, +				  int noblock, int *err) +{ +	struct sk_buff *skb; +	long timeo; +	/* +	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram() +	 */ +	int error = sock_error(sk); + +	if (error) +		goto no_packet; + +	timeo = sock_rcvtimeo(sk, noblock); + +	do { +		/* Again only user level code calls this function, so nothing +		 * interrupt level will suddenly eat the receive_queue. +		 * +		 * Look at current nfs client by the way... +		 * However, this function was corrent in any case. 8) +		 */ +		if (flags & MSG_PEEK) { +			unsigned long cpu_flags; + +			spin_lock_irqsave(&sk->sk_receive_queue.lock, +					  cpu_flags); +			skb = skb_peek(&sk->sk_receive_queue); +			if (skb) +				atomic_inc(&skb->users); +			spin_unlock_irqrestore(&sk->sk_receive_queue.lock, +					       cpu_flags); +		} else +			skb = skb_dequeue(&sk->sk_receive_queue); + +		if (skb) +			return skb; + +		/* User doesn't want to wait */ +		error = -EAGAIN; +		if (!timeo) +			goto no_packet; + +	} while (!wait_for_packet(sk, err, &timeo)); + +	return NULL; + +no_packet: +	*err = error; +	return NULL; +} + +void skb_free_datagram(struct sock *sk, struct sk_buff *skb) +{ +	kfree_skb(skb); +} + +/** + *	skb_copy_datagram_iovec - Copy a datagram to an iovec. + *	@skb - buffer to copy + *	@offset - offset in the buffer to start copying from + *	@iovec - io vector to copy to + *	@len - amount of data to copy from buffer to iovec + * + *	Note: the iovec is modified during the copy. + */ +int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, +			    struct iovec *to, int len) +{ +	int start = skb_headlen(skb); +	int i, copy = start - offset; + +	/* Copy header. 
*/ +	if (copy > 0) { +		if (copy > len) +			copy = len; +		if (memcpy_toiovec(to, skb->data + offset, copy)) +			goto fault; +		if ((len -= copy) == 0) +			return 0; +		offset += copy; +	} + +	/* Copy paged appendix. Hmm... why does this look so complicated? */ +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { +		int end; + +		BUG_TRAP(start <= offset + len); + +		end = start + skb_shinfo(skb)->frags[i].size; +		if ((copy = end - offset) > 0) { +			int err; +			u8  *vaddr; +			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; +			struct page *page = frag->page; + +			if (copy > len) +				copy = len; +			vaddr = kmap(page); +			err = memcpy_toiovec(to, vaddr + frag->page_offset + +					     offset - start, copy); +			kunmap(page); +			if (err) +				goto fault; +			if (!(len -= copy)) +				return 0; +			offset += copy; +		} +		start = end; +	} + +	if (skb_shinfo(skb)->frag_list) { +		struct sk_buff *list = skb_shinfo(skb)->frag_list; + +		for (; list; list = list->next) { +			int end; + +			BUG_TRAP(start <= offset + len); + +			end = start + list->len; +			if ((copy = end - offset) > 0) { +				if (copy > len) +					copy = len; +				if (skb_copy_datagram_iovec(list, +							    offset - start, +							    to, copy)) +					goto fault; +				if ((len -= copy) == 0) +					return 0; +				offset += copy; +			} +			start = end; +		} +	} +	if (!len) +		return 0; + +fault: +	return -EFAULT; +} + +static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, +				      u8 __user *to, int len, +				      unsigned int *csump) +{ +	int start = skb_headlen(skb); +	int pos = 0; +	int i, copy = start - offset; + +	/* Copy header. */ +	if (copy > 0) { +		int err = 0; +		if (copy > len) +			copy = len; +		*csump = csum_and_copy_to_user(skb->data + offset, to, copy, +					       *csump, &err); +		if (err) +			goto fault; +		if ((len -= copy) == 0) +			return 0; +		offset += copy; +		to += copy; +		pos = copy; +	} + +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { +		int end; + +		BUG_TRAP(start <= offset + len); + +		end = start + skb_shinfo(skb)->frags[i].size; +		if ((copy = end - offset) > 0) { +			unsigned int csum2; +			int err = 0; +			u8  *vaddr; +			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; +			struct page *page = frag->page; + +			if (copy > len) +				copy = len; +			vaddr = kmap(page); +			csum2 = csum_and_copy_to_user(vaddr + +							frag->page_offset + +							offset - start, +						      to, copy, 0, &err); +			kunmap(page); +			if (err) +				goto fault; +			*csump = csum_block_add(*csump, csum2, pos); +			if (!(len -= copy)) +				return 0; +			offset += copy; +			to += copy; +			pos += copy; +		} +		start = end; +	} + +	if (skb_shinfo(skb)->frag_list) { +		struct sk_buff *list = skb_shinfo(skb)->frag_list; + +		for (; list; list=list->next) { +			int end; + +			BUG_TRAP(start <= offset + len); + +			end = start + list->len; +			if ((copy = end - offset) > 0) { +				unsigned int csum2 = 0; +				if (copy > len) +					copy = len; +				if (skb_copy_and_csum_datagram(list, +							       offset - start, +							       to, copy, +							       &csum2)) +					goto fault; +				*csump = csum_block_add(*csump, csum2, pos); +				if ((len -= copy) == 0) +					return 0; +				offset += copy; +				to += copy; +				pos += copy; +			} +			start = end; +		} +	} +	if (!len) +		return 0; + +fault: +	return -EFAULT; +} + +/** + *	skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. 
+ *	@skb - skbuff + *	@hlen - hardware length + *	@iovec - io vector + *  + *	Caller _must_ check that skb will fit to this iovec. + * + *	Returns: 0       - success. + *		 -EINVAL - checksum failure. + *		 -EFAULT - fault during copy. Beware, in this case iovec + *			   can be modified! + */ +int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, +				     int hlen, struct iovec *iov) +{ +	unsigned int csum; +	int chunk = skb->len - hlen; + +	/* Skip filled elements. +	 * Pretty silly, look at memcpy_toiovec, though 8) +	 */ +	while (!iov->iov_len) +		iov++; + +	if (iov->iov_len < chunk) { +		if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen, +							   skb->csum))) +			goto csum_error; +		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) +			goto fault; +	} else { +		csum = csum_partial(skb->data, hlen, skb->csum); +		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, +					       chunk, &csum)) +			goto fault; +		if ((unsigned short)csum_fold(csum)) +			goto csum_error; +		iov->iov_len -= chunk; +		iov->iov_base += chunk; +	} +	return 0; +csum_error: +	return -EINVAL; +fault: +	return -EFAULT; +} + +/** + * 	datagram_poll - generic datagram poll + *	@file - file struct + *	@sock - socket + *	@wait - poll table + * + *	Datagram poll: Again totally generic. This also handles + *	sequenced packet sockets providing the socket receive queue + *	is only ever holding data ready to receive. + * + *	Note: when you _don't_ use this routine for this protocol, + *	and you use a different write policy from sock_writeable() + *	then please supply your own write_space callback. + */ +unsigned int datagram_poll(struct file *file, struct socket *sock, +			   poll_table *wait) +{ +	struct sock *sk = sock->sk; +	unsigned int mask; + +	poll_wait(file, sk->sk_sleep, wait); +	mask = 0; + +	/* exceptional events? */ +	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) +		mask |= POLLERR; +	if (sk->sk_shutdown == SHUTDOWN_MASK) +		mask |= POLLHUP; + +	/* readable? */ +	if (!skb_queue_empty(&sk->sk_receive_queue) || +	    (sk->sk_shutdown & RCV_SHUTDOWN)) +		mask |= POLLIN | POLLRDNORM; + +	/* Connection-based need to check for termination and startup */ +	if (connection_based(sk)) { +		if (sk->sk_state == TCP_CLOSE) +			mask |= POLLHUP; +		/* connection hasn't started yet? */ +		if (sk->sk_state == TCP_SYN_SENT) +			return mask; +	} + +	/* writable? */ +	if (sock_writeable(sk)) +		mask |= POLLOUT | POLLWRNORM | POLLWRBAND; +	else +		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + +	return mask; +} + +EXPORT_SYMBOL(datagram_poll); +EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); +EXPORT_SYMBOL(skb_copy_datagram_iovec); +EXPORT_SYMBOL(skb_free_datagram); +EXPORT_SYMBOL(skb_recv_datagram); diff --git a/net/core/dev.c b/net/core/dev.c new file mode 100644 index 00000000000..42344d90369 --- /dev/null +++ b/net/core/dev.c @@ -0,0 +1,3359 @@ +/* + * 	NET3	Protocol independent device support routines. + * + *		This program is free software; you can redistribute it and/or + *		modify it under the terms of the GNU General Public License + *		as published by the Free Software Foundation; either version + *		2 of the License, or (at your option) any later version. + * + *	Derived from the non IP parts of dev.c 1.0.19 + * 		Authors:	Ross Biro, <bir7@leland.Stanford.Edu> + *				Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + *				Mark Evans, <evansmp@uhura.aston.ac.uk> + * + *	Additional Authors: + *		Florian la Roche <rzsfl@rz.uni-sb.de> + *		Alan Cox <gw4pts@gw4pts.ampr.org> + *		David Hinds <dahinds@users.sourceforge.net> + *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + *		Adam Sulmicki <adam@cfar.umd.edu> + *              Pekka Riikonen <priikone@poesidon.pspt.fi> + * + *	Changes: + *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set + *              			to 2 if register_netdev gets called + *              			before net_dev_init & also removed a + *              			few lines of code in the process. + *		Alan Cox	:	device private ioctl copies fields back. + *		Alan Cox	:	Transmit queue code does relevant + *					stunts to keep the queue safe. + *		Alan Cox	:	Fixed double lock. + *		Alan Cox	:	Fixed promisc NULL pointer trap + *		????????	:	Support the full private ioctl range + *		Alan Cox	:	Moved ioctl permission check into + *					drivers + *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI + *		Alan Cox	:	100 backlog just doesn't cut it when + *					you start doing multicast video 8) + *		Alan Cox	:	Rewrote net_bh and list manager. + *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths. + *		Alan Cox	:	Took out transmit every packet pass + *					Saved a few bytes in the ioctl handler + *		Alan Cox	:	Network driver sets packet type before + *					calling netif_rx. Saves a function + *					call a packet. + *		Alan Cox	:	Hashed net_bh() + *		Richard Kooijman:	Timestamp fixes. + *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR + *		Alan Cox	:	Device lock protection. + *		Alan Cox	: 	Fixed nasty side effect of device close + *					changes. + *		Rudi Cilibrasi	:	Pass the right thing to + *					set_mac_address() + *		Dave Miller	:	32bit quantity for the device lock to + *					make it work out on a Sparc. + *		Bjorn Ekwall	:	Added KERNELD hack. + *		Alan Cox	:	Cleaned up the backlog initialise. + *		Craig Metz	:	SIOCGIFCONF fix if space for under + *					1 device. + *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there + *					is no device open function. + *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF + *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF + *		Cyrus Durgin	:	Cleaned for KMOD + *		Adam Sulmicki   :	Bug Fix : Network Device Unload + *					A network device unload needs to purge + *					the backlog queue. 
+ *	Paul Rusty Russell	:	SIOCSIFNAME + *              Pekka Riikonen  :	Netdev boot-time settings code + *              Andrew Morton   :       Make unregister_netdevice wait + *              			indefinitely on dev->refcnt + * 		J Hadi Salim	:	- Backlog queue sampling + *				        - netif_rx() feedback + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/config.h> +#include <linux/cpu.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/stat.h> +#include <linux/if_bridge.h> +#include <linux/divert.h> +#include <net/dst.h> +#include <net/pkt_sched.h> +#include <net/checksum.h> +#include <linux/highmem.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/netpoll.h> +#include <linux/rcupdate.h> +#include <linux/delay.h> +#ifdef CONFIG_NET_RADIO +#include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */ +#include <net/iw_handler.h> +#endif	/* CONFIG_NET_RADIO */ +#include <asm/current.h> + +/* This define, if set, will randomly drop a packet when congestion + * is more than moderate.  It helps fairness in the multi-interface + * case when one of them is a hog, but it kills performance for the + * single interface case so it is off now by default. + */ +#undef RAND_LIE + +/* Setting this will sample the queue lengths and thus congestion + * via a timer instead of as each packet is received. + */ +#undef OFFLINE_SAMPLE + +/* + *	The list of packet types we will receive (as opposed to discard) + *	and the routines to invoke. + * + *	Why 16. Because with 16 the only overlap we get on a hash of the + *	low nibble of the protocol value is RARP/SNAP/X.25. + * + *      NOTE:  That is no longer true with the addition of VLAN tags.  Not + *             sure which should go first, but I bet it won't make much + *             difference if we are running VLANs.  The good news is that + *             this protocol won't be in the list unless compiled in, so + *             the average user (w/out VLANs) will not be adversly affected. + *             --BLG + * + *		0800	IP + *		8100    802.1Q VLAN + *		0001	802.3 + *		0002	AX.25 + *		0004	802.2 + *		8035	RARP + *		0005	SNAP + *		0805	X.25 + *		0806	ARP + *		8137	IPX + *		0009	Localtalk + *		86DD	IPv6 + */ + +static DEFINE_SPINLOCK(ptype_lock); +static struct list_head ptype_base[16];	/* 16 way hashed list */ +static struct list_head ptype_all;		/* Taps */ + +#ifdef OFFLINE_SAMPLE +static void sample_queue(unsigned long dummy); +static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0); +#endif + +/* + * The @dev_base list is protected by @dev_base_lock and the rtln + * semaphore. + * + * Pure readers hold dev_base_lock for reading. + * + * Writers must hold the rtnl semaphore while they loop through the + * dev_base list, and hold dev_base_lock for writing when they do the + * actual updates.  This allows pure readers to access the list even + * while a writer is preparing to update it. 
+ * + * To put it another way, dev_base_lock is held for writing only to + * protect against pure readers; the rtnl semaphore provides the + * protection against other writers. + * + * See, for example usages, register_netdevice() and + * unregister_netdevice(), which must be called with the rtnl + * semaphore held. + */ +struct net_device *dev_base; +static struct net_device **dev_tail = &dev_base; +DEFINE_RWLOCK(dev_base_lock); + +EXPORT_SYMBOL(dev_base); +EXPORT_SYMBOL(dev_base_lock); + +#define NETDEV_HASHBITS	8 +static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS]; +static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS]; + +static inline struct hlist_head *dev_name_hash(const char *name) +{ +	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); +	return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)]; +} + +static inline struct hlist_head *dev_index_hash(int ifindex) +{ +	return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)]; +} + +/* + *	Our notifier list + */ + +static struct notifier_block *netdev_chain; + +/* + *	Device drivers call our routines to queue packets here. We empty the + *	queue in the local softnet handler. + */ +DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, }; + +#ifdef CONFIG_SYSFS +extern int netdev_sysfs_init(void); +extern int netdev_register_sysfs(struct net_device *); +extern void netdev_unregister_sysfs(struct net_device *); +#else +#define netdev_sysfs_init()	 	(0) +#define netdev_register_sysfs(dev)	(0) +#define	netdev_unregister_sysfs(dev)	do { } while(0) +#endif + + +/******************************************************************************* + +		Protocol management and registration routines + +*******************************************************************************/ + +/* + *	For efficiency + */ + +int netdev_nit; + +/* + *	Add a protocol ID to the list. Now that the input handler is + *	smarter we can dispense with all the messy stuff that used to be + *	here. + * + *	BEWARE!!! Protocol handlers, mangling input packets, + *	MUST BE last in hash buckets and checking protocol handlers + *	MUST start from promiscuous ptype_all chain in net_bh. + *	It is true now, do not change it. + *	Explanation follows: if protocol handler, mangling packet, will + *	be the first on list, it is not able to sense, that packet + *	is cloned and should be copied-on-write, so that it will + *	change it and subsequent readers will get broken packet. + *							--ANK (980803) + */ + +/** + *	dev_add_pack - add packet handler + *	@pt: packet type declaration + * + *	Add a protocol handler to the networking stack. The passed &packet_type + *	is linked into kernel lists and may not be freed until it has been + *	removed from the kernel lists. + * + *	This call does not sleep therefore it can not  + *	guarantee all CPU's that are in middle of receiving packets + *	will see the new packet type (until the next received packet). + */ + +void dev_add_pack(struct packet_type *pt) +{ +	int hash; + +	spin_lock_bh(&ptype_lock); +	if (pt->type == htons(ETH_P_ALL)) { +		netdev_nit++; +		list_add_rcu(&pt->list, &ptype_all); +	} else { +		hash = ntohs(pt->type) & 15; +		list_add_rcu(&pt->list, &ptype_base[hash]); +	} +	spin_unlock_bh(&ptype_lock); +} + +extern void linkwatch_run_queue(void); + + + +/** + *	__dev_remove_pack	 - remove packet handler + *	@pt: packet type declaration + * + *	Remove a protocol handler that was previously added to the kernel + *	protocol handlers by dev_add_pack(). 
The passed &packet_type is removed + *	from the kernel lists and can be freed or reused once this function + *	returns.  + * + *      The packet type might still be in use by receivers + *	and must not be freed until after all the CPU's have gone + *	through a quiescent state. + */ +void __dev_remove_pack(struct packet_type *pt) +{ +	struct list_head *head; +	struct packet_type *pt1; + +	spin_lock_bh(&ptype_lock); + +	if (pt->type == htons(ETH_P_ALL)) { +		netdev_nit--; +		head = &ptype_all; +	} else +		head = &ptype_base[ntohs(pt->type) & 15]; + +	list_for_each_entry(pt1, head, list) { +		if (pt == pt1) { +			list_del_rcu(&pt->list); +			goto out; +		} +	} + +	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); +out: +	spin_unlock_bh(&ptype_lock); +} +/** + *	dev_remove_pack	 - remove packet handler + *	@pt: packet type declaration + * + *	Remove a protocol handler that was previously added to the kernel + *	protocol handlers by dev_add_pack(). The passed &packet_type is removed + *	from the kernel lists and can be freed or reused once this function + *	returns. + * + *	This call sleeps to guarantee that no CPU is looking at the packet + *	type after return. + */ +void dev_remove_pack(struct packet_type *pt) +{ +	__dev_remove_pack(pt); +	 +	synchronize_net(); +} + +/****************************************************************************** + +		      Device Boot-time Settings Routines + +*******************************************************************************/ + +/* Boot time configuration table */ +static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; + +/** + *	netdev_boot_setup_add	- add new setup entry + *	@name: name of the device + *	@map: configured settings for the device + * + *	Adds new setup entry to the dev_boot_setup list.  The function + *	returns 0 on error and 1 on success.  This is a generic routine to + *	all netdevices. + */ +static int netdev_boot_setup_add(char *name, struct ifmap *map) +{ +	struct netdev_boot_setup *s; +	int i; + +	s = dev_boot_setup; +	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { +		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { +			memset(s[i].name, 0, sizeof(s[i].name)); +			strcpy(s[i].name, name); +			memcpy(&s[i].map, map, sizeof(s[i].map)); +			break; +		} +	} + +	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; +} + +/** + *	netdev_boot_setup_check	- check boot time settings + *	@dev: the netdevice + * + * 	Check boot time settings for the device. + *	The found settings are set for the device to be used + *	later in the device probing. + *	Returns 0 if no settings found, 1 if they are. + */ +int netdev_boot_setup_check(struct net_device *dev) +{ +	struct netdev_boot_setup *s = dev_boot_setup; +	int i; + +	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { +		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && +		    !strncmp(dev->name, s[i].name, strlen(s[i].name))) { +			dev->irq 	= s[i].map.irq; +			dev->base_addr 	= s[i].map.base_addr; +			dev->mem_start 	= s[i].map.mem_start; +			dev->mem_end 	= s[i].map.mem_end; +			return 1; +		} +	} +	return 0; +} + + +/** + *	netdev_boot_base	- get address from boot time settings + *	@prefix: prefix for network device + *	@unit: id for network device + * + * 	Check boot time settings for the base address of device. + *	The found settings are set for the device to be used + *	later in the device probing. + *	Returns 0 if no settings found. 
+ */ +unsigned long netdev_boot_base(const char *prefix, int unit) +{ +	const struct netdev_boot_setup *s = dev_boot_setup; +	char name[IFNAMSIZ]; +	int i; + +	sprintf(name, "%s%d", prefix, unit); + +	/* +	 * If device already registered then return base of 1 +	 * to indicate not to probe for this interface +	 */ +	if (__dev_get_by_name(name)) +		return 1; + +	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) +		if (!strcmp(name, s[i].name)) +			return s[i].map.base_addr; +	return 0; +} + +/* + * Saves at boot time configured settings for any netdevice. + */ +int __init netdev_boot_setup(char *str) +{ +	int ints[5]; +	struct ifmap map; + +	str = get_options(str, ARRAY_SIZE(ints), ints); +	if (!str || !*str) +		return 0; + +	/* Save settings */ +	memset(&map, 0, sizeof(map)); +	if (ints[0] > 0) +		map.irq = ints[1]; +	if (ints[0] > 1) +		map.base_addr = ints[2]; +	if (ints[0] > 2) +		map.mem_start = ints[3]; +	if (ints[0] > 3) +		map.mem_end = ints[4]; + +	/* Add new entry to the list */ +	return netdev_boot_setup_add(str, &map); +} + +__setup("netdev=", netdev_boot_setup); + +/******************************************************************************* + +			    Device Interface Subroutines + +*******************************************************************************/ + +/** + *	__dev_get_by_name	- find a device by its name + *	@name: name to find + * + *	Find an interface by name. Must be called under RTNL semaphore + *	or @dev_base_lock. If the name is found a pointer to the device + *	is returned. If the name is not found then %NULL is returned. The + *	reference counters are not incremented so the caller must be + *	careful with locks. + */ + +struct net_device *__dev_get_by_name(const char *name) +{ +	struct hlist_node *p; + +	hlist_for_each(p, dev_name_hash(name)) { +		struct net_device *dev +			= hlist_entry(p, struct net_device, name_hlist); +		if (!strncmp(dev->name, name, IFNAMSIZ)) +			return dev; +	} +	return NULL; +} + +/** + *	dev_get_by_name		- find a device by its name + *	@name: name to find + * + *	Find an interface by name. This can be called from any + *	context and does its own locking. The returned handle has + *	the usage count incremented and the caller must use dev_put() to + *	release it when it is no longer needed. %NULL is returned if no + *	matching device is found. + */ + +struct net_device *dev_get_by_name(const char *name) +{ +	struct net_device *dev; + +	read_lock(&dev_base_lock); +	dev = __dev_get_by_name(name); +	if (dev) +		dev_hold(dev); +	read_unlock(&dev_base_lock); +	return dev; +} + +/** + *	__dev_get_by_index - find a device by its ifindex + *	@ifindex: index of device + * + *	Search for an interface by index. Returns %NULL if the device + *	is not found or a pointer to the device. The device has not + *	had its reference counter increased so the caller must be careful + *	about locking. The caller must hold either the RTNL semaphore + *	or @dev_base_lock. + */ + +struct net_device *__dev_get_by_index(int ifindex) +{ +	struct hlist_node *p; + +	hlist_for_each(p, dev_index_hash(ifindex)) { +		struct net_device *dev +			= hlist_entry(p, struct net_device, index_hlist); +		if (dev->ifindex == ifindex) +			return dev; +	} +	return NULL; +} + + +/** + *	dev_get_by_index - find a device by its ifindex + *	@ifindex: index of device + * + *	Search for an interface by index. Returns NULL if the device + *	is not found or a pointer to the device. 
The device returned has + *	had a reference added and the pointer is safe until the user calls + *	dev_put to indicate they have finished with it. + */ + +struct net_device *dev_get_by_index(int ifindex) +{ +	struct net_device *dev; + +	read_lock(&dev_base_lock); +	dev = __dev_get_by_index(ifindex); +	if (dev) +		dev_hold(dev); +	read_unlock(&dev_base_lock); +	return dev; +} + +/** + *	dev_getbyhwaddr - find a device by its hardware address + *	@type: media type of device + *	@ha: hardware address + * + *	Search for an interface by MAC address. Returns NULL if the device + *	is not found or a pointer to the device. The caller must hold the + *	rtnl semaphore. The returned device has not had its ref count increased + *	and the caller must therefore be careful about locking + * + *	BUGS: + *	If the API was consistent this would be __dev_get_by_hwaddr + */ + +struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) +{ +	struct net_device *dev; + +	ASSERT_RTNL(); + +	for (dev = dev_base; dev; dev = dev->next) +		if (dev->type == type && +		    !memcmp(dev->dev_addr, ha, dev->addr_len)) +			break; +	return dev; +} + +struct net_device *dev_getfirstbyhwtype(unsigned short type) +{ +	struct net_device *dev; + +	rtnl_lock(); +	for (dev = dev_base; dev; dev = dev->next) { +		if (dev->type == type) { +			dev_hold(dev); +			break; +		} +	} +	rtnl_unlock(); +	return dev; +} + +EXPORT_SYMBOL(dev_getfirstbyhwtype); + +/** + *	dev_get_by_flags - find any device with given flags + *	@if_flags: IFF_* values + *	@mask: bitmask of bits in if_flags to check + * + *	Search for any interface with the given flags. Returns NULL if a device + *	is not found or a pointer to the device. The device returned has  + *	had a reference added and the pointer is safe until the user calls + *	dev_put to indicate they have finished with it. + */ + +struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) +{ +	struct net_device *dev; + +	read_lock(&dev_base_lock); +	for (dev = dev_base; dev != NULL; dev = dev->next) { +		if (((dev->flags ^ if_flags) & mask) == 0) { +			dev_hold(dev); +			break; +		} +	} +	read_unlock(&dev_base_lock); +	return dev; +} + +/** + *	dev_valid_name - check if name is okay for network device + *	@name: name string + * + *	Network device names need to be valid file names to + *	to allow sysfs to work + */ +static int dev_valid_name(const char *name) +{ +	return !(*name == '\0'  +		 || !strcmp(name, ".") +		 || !strcmp(name, "..") +		 || strchr(name, '/')); +} + +/** + *	dev_alloc_name - allocate a name for a device + *	@dev: device + *	@name: name format string + * + *	Passed a format string - eg "lt%d" it will try and find a suitable + *	id. Not efficient for many devices, not called a lot. The caller + *	must hold the dev_base or rtnl lock while allocating the name and + *	adding the device in order to avoid duplicates. Returns the number + *	of the unit assigned or a negative errno code. + */ + +int dev_alloc_name(struct net_device *dev, const char *name) +{ +	int i = 0; +	char buf[IFNAMSIZ]; +	const char *p; +	const int max_netdevices = 8*PAGE_SIZE; +	long *inuse; +	struct net_device *d; + +	p = strnchr(name, IFNAMSIZ-1, '%'); +	if (p) { +		/* +		 * Verify the string as this thing may have come from +		 * the user.  There must be either one "%d" and no other "%" +		 * characters. 
+		 */ +		if (p[1] != 'd' || strchr(p + 2, '%')) +			return -EINVAL; + +		/* Use one page as a bit array of possible slots */ +		inuse = (long *) get_zeroed_page(GFP_ATOMIC); +		if (!inuse) +			return -ENOMEM; + +		for (d = dev_base; d; d = d->next) { +			if (!sscanf(d->name, name, &i)) +				continue; +			if (i < 0 || i >= max_netdevices) +				continue; + +			/*  avoid cases where sscanf is not exact inverse of printf */ +			snprintf(buf, sizeof(buf), name, i); +			if (!strncmp(buf, d->name, IFNAMSIZ)) +				set_bit(i, inuse); +		} + +		i = find_first_zero_bit(inuse, max_netdevices); +		free_page((unsigned long) inuse); +	} + +	snprintf(buf, sizeof(buf), name, i); +	if (!__dev_get_by_name(buf)) { +		strlcpy(dev->name, buf, IFNAMSIZ); +		return i; +	} + +	/* It is possible to run out of possible slots +	 * when the name is long and there isn't enough space left +	 * for the digits, or if all bits are used. +	 */ +	return -ENFILE; +} + + +/** + *	dev_change_name - change name of a device + *	@dev: device + *	@newname: name (or format string) must be at least IFNAMSIZ + * + *	Change name of a device, can pass format strings "eth%d". + *	for wildcarding. + */ +int dev_change_name(struct net_device *dev, char *newname) +{ +	int err = 0; + +	ASSERT_RTNL(); + +	if (dev->flags & IFF_UP) +		return -EBUSY; + +	if (!dev_valid_name(newname)) +		return -EINVAL; + +	if (strchr(newname, '%')) { +		err = dev_alloc_name(dev, newname); +		if (err < 0) +			return err; +		strcpy(newname, dev->name); +	} +	else if (__dev_get_by_name(newname)) +		return -EEXIST; +	else +		strlcpy(dev->name, newname, IFNAMSIZ); + +	err = class_device_rename(&dev->class_dev, dev->name); +	if (!err) { +		hlist_del(&dev->name_hlist); +		hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); +		notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); +	} + +	return err; +} + +/** + *	netdev_state_change - device changes state + *	@dev: device to cause notification + * + *	Called to indicate a device has changed state. This function calls + *	the notifier chains for netdev_chain and sends a NEWLINK message + *	to the routing socket. + */ +void netdev_state_change(struct net_device *dev) +{ +	if (dev->flags & IFF_UP) { +		notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); +		rtmsg_ifinfo(RTM_NEWLINK, dev, 0); +	} +} + +/** + *	dev_load 	- load a network module + *	@name: name of interface + * + *	If a network interface is not present and the process has suitable + *	privileges this function loads the module. If module loading is not + *	available in this kernel then it becomes a nop. + */ + +void dev_load(const char *name) +{ +	struct net_device *dev;   + +	read_lock(&dev_base_lock); +	dev = __dev_get_by_name(name); +	read_unlock(&dev_base_lock); + +	if (!dev && capable(CAP_SYS_MODULE)) +		request_module("%s", name); +} + +static int default_rebuild_header(struct sk_buff *skb) +{ +	printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", +	       skb->dev ? skb->dev->name : "NULL!!!"); +	kfree_skb(skb); +	return 1; +} + + +/** + *	dev_open	- prepare an interface for use. + *	@dev:	device to open + * + *	Takes a device from down to up state. The device's private open + *	function is invoked and then the multicast lists are loaded. Finally + *	the device is moved into the up state and a %NETDEV_UP message is + *	sent to the netdev notifier chain. + * + *	Calling this function on an active interface is a nop. On a failure + *	a negative errno code is returned. 
+ */ +int dev_open(struct net_device *dev) +{ +	int ret = 0; + +	/* +	 *	Is it already up? +	 */ + +	if (dev->flags & IFF_UP) +		return 0; + +	/* +	 *	Is it even present? +	 */ +	if (!netif_device_present(dev)) +		return -ENODEV; + +	/* +	 *	Call device private open method +	 */ +	set_bit(__LINK_STATE_START, &dev->state); +	if (dev->open) { +		ret = dev->open(dev); +		if (ret) +			clear_bit(__LINK_STATE_START, &dev->state); +	} + + 	/* +	 *	If it went open OK then: +	 */ + +	if (!ret) { +		/* +		 *	Set the flags. +		 */ +		dev->flags |= IFF_UP; + +		/* +		 *	Initialize multicasting status +		 */ +		dev_mc_upload(dev); + +		/* +		 *	Wakeup transmit queue engine +		 */ +		dev_activate(dev); + +		/* +		 *	... and announce new interface. +		 */ +		notifier_call_chain(&netdev_chain, NETDEV_UP, dev); +	} +	return ret; +} + +/** + *	dev_close - shutdown an interface. + *	@dev: device to shutdown + * + *	This function moves an active device into down state. A + *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + *	chain. + */ +int dev_close(struct net_device *dev) +{ +	if (!(dev->flags & IFF_UP)) +		return 0; + +	/* +	 *	Tell people we are going down, so that they can +	 *	prepare to death, when device is still operating. +	 */ +	notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); + +	dev_deactivate(dev); + +	clear_bit(__LINK_STATE_START, &dev->state); + +	/* Synchronize to scheduled poll. We cannot touch poll list, +	 * it can be even on different cpu. So just clear netif_running(), +	 * and wait when poll really will happen. Actually, the best place +	 * for this is inside dev->stop() after device stopped its irq +	 * engine, but this requires more changes in devices. */ + +	smp_mb__after_clear_bit(); /* Commit netif_running(). */ +	while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { +		/* No hurry. */ +		current->state = TASK_INTERRUPTIBLE; +		schedule_timeout(1); +	} + +	/* +	 *	Call the device specific close. This cannot fail. +	 *	Only if device is UP +	 * +	 *	We allow it to be called even after a DETACH hot-plug +	 *	event. +	 */ +	if (dev->stop) +		dev->stop(dev); + +	/* +	 *	Device is now down. +	 */ + +	dev->flags &= ~IFF_UP; + +	/* +	 * Tell people we are down +	 */ +	notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + +	return 0; +} + + +/* + *	Device change register/unregister. These are not inline or static + *	as we export them to the world. + */ + +/** + *	register_netdevice_notifier - register a network notifier block + *	@nb: notifier + * + *	Register a notifier to be called when network device events occur. + *	The notifier passed is linked into the kernel structures and must + *	not be reused until it has been unregistered. A negative errno code + *	is returned on a failure. + * + * 	When registered all registration and up events are replayed + *	to the new notifier to allow device to have a race free  + *	view of the network device list. 
+ */ + +int register_netdevice_notifier(struct notifier_block *nb) +{ +	struct net_device *dev; +	int err; + +	rtnl_lock(); +	err = notifier_chain_register(&netdev_chain, nb); +	if (!err) { +		for (dev = dev_base; dev; dev = dev->next) { +			nb->notifier_call(nb, NETDEV_REGISTER, dev); + +			if (dev->flags & IFF_UP)  +				nb->notifier_call(nb, NETDEV_UP, dev); +		} +	} +	rtnl_unlock(); +	return err; +} + +/** + *	unregister_netdevice_notifier - unregister a network notifier block + *	@nb: notifier + * + *	Unregister a notifier previously registered by + *	register_netdevice_notifier(). The notifier is unlinked into the + *	kernel structures and may then be reused. A negative errno code + *	is returned on a failure. + */ + +int unregister_netdevice_notifier(struct notifier_block *nb) +{ +	return notifier_chain_unregister(&netdev_chain, nb); +} + +/** + *	call_netdevice_notifiers - call all network notifier blocks + *      @val: value passed unmodified to notifier function + *      @v:   pointer passed unmodified to notifier function + * + *	Call all network notifier blocks.  Parameters and return value + *	are as for notifier_call_chain(). + */ + +int call_netdevice_notifiers(unsigned long val, void *v) +{ +	return notifier_call_chain(&netdev_chain, val, v); +} + +/* When > 0 there are consumers of rx skb time stamps */ +static atomic_t netstamp_needed = ATOMIC_INIT(0); + +void net_enable_timestamp(void) +{ +	atomic_inc(&netstamp_needed); +} + +void net_disable_timestamp(void) +{ +	atomic_dec(&netstamp_needed); +} + +static inline void net_timestamp(struct timeval *stamp) +{ +	if (atomic_read(&netstamp_needed)) +		do_gettimeofday(stamp); +	else { +		stamp->tv_sec = 0; +		stamp->tv_usec = 0; +	} +} + +/* + *	Support routine. Sends outgoing frames to any network + *	taps currently in use. + */ + +void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +{ +	struct packet_type *ptype; +	net_timestamp(&skb->stamp); + +	rcu_read_lock(); +	list_for_each_entry_rcu(ptype, &ptype_all, list) { +		/* Never send packets back to the socket +		 * they originated from - MvS (miquels@drinkel.ow.org) +		 */ +		if ((ptype->dev == dev || !ptype->dev) && +		    (ptype->af_packet_priv == NULL || +		     (struct sock *)ptype->af_packet_priv != skb->sk)) { +			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); +			if (!skb2) +				break; + +			/* skb->nh should be correctly +			   set by sender, so that the second statement is +			   just protection against buggy protocols. +			 */ +			skb2->mac.raw = skb2->data; + +			if (skb2->nh.raw < skb2->data || +			    skb2->nh.raw > skb2->tail) { +				if (net_ratelimit()) +					printk(KERN_CRIT "protocol %04x is " +					       "buggy, dev %s\n", +					       skb2->protocol, dev->name); +				skb2->nh.raw = skb2->data; +			} + +			skb2->h.raw = skb2->nh.raw; +			skb2->pkt_type = PACKET_OUTGOING; +			ptype->func(skb2, skb->dev, ptype); +		} +	} +	rcu_read_unlock(); +} + +/* + * Invalidate hardware checksum when packet is to be mangled, and + * complete checksum manually on outgoing path. 
+ */ +int skb_checksum_help(struct sk_buff *skb, int inward) +{ +	unsigned int csum; +	int ret = 0, offset = skb->h.raw - skb->data; + +	if (inward) { +		skb->ip_summed = CHECKSUM_NONE; +		goto out; +	} + +	if (skb_cloned(skb)) { +		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +		if (ret) +			goto out; +	} + +	if (offset > (int)skb->len) +		BUG(); +	csum = skb_checksum(skb, offset, skb->len-offset, 0); + +	offset = skb->tail - skb->h.raw; +	if (offset <= 0) +		BUG(); +	if (skb->csum + 2 > offset) +		BUG(); + +	*(u16*)(skb->h.raw + skb->csum) = csum_fold(csum); +	skb->ip_summed = CHECKSUM_NONE; +out:	 +	return ret; +} + +#ifdef CONFIG_HIGHMEM +/* Actually, we should eliminate this check as soon as we know, that: + * 1. IOMMU is present and allows to map all the memory. + * 2. No high memory really exists on this machine. + */ + +static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +{ +	int i; + +	if (dev->features & NETIF_F_HIGHDMA) +		return 0; + +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +		if (PageHighMem(skb_shinfo(skb)->frags[i].page)) +			return 1; + +	return 0; +} +#else +#define illegal_highdma(dev, skb)	(0) +#endif + +extern void skb_release_data(struct sk_buff *); + +/* Keep head the same: replace data */ +int __skb_linearize(struct sk_buff *skb, int gfp_mask) +{ +	unsigned int size; +	u8 *data; +	long offset; +	struct skb_shared_info *ninfo; +	int headerlen = skb->data - skb->head; +	int expand = (skb->tail + skb->data_len) - skb->end; + +	if (skb_shared(skb)) +		BUG(); + +	if (expand <= 0) +		expand = 0; + +	size = skb->end - skb->head + expand; +	size = SKB_DATA_ALIGN(size); +	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); +	if (!data) +		return -ENOMEM; + +	/* Copy entire thing */ +	if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) +		BUG(); + +	/* Set up shinfo */ +	ninfo = (struct skb_shared_info*)(data + size); +	atomic_set(&ninfo->dataref, 1); +	ninfo->tso_size = skb_shinfo(skb)->tso_size; +	ninfo->tso_segs = skb_shinfo(skb)->tso_segs; +	ninfo->nr_frags = 0; +	ninfo->frag_list = NULL; + +	/* Offset between the two in bytes */ +	offset = data - skb->head; + +	/* Free old data. */ +	skb_release_data(skb); + +	skb->head = data; +	skb->end  = data + size; + +	/* Set up new pointers */ +	skb->h.raw   += offset; +	skb->nh.raw  += offset; +	skb->mac.raw += offset; +	skb->tail    += offset; +	skb->data    += offset; + +	/* We are no longer a clone, even if we were. */ +	skb->cloned    = 0; + +	skb->tail     += skb->data_len; +	skb->data_len  = 0; +	return 0; +} + +#define HARD_TX_LOCK(dev, cpu) {			\ +	if ((dev->features & NETIF_F_LLTX) == 0) {	\ +		spin_lock(&dev->xmit_lock);		\ +		dev->xmit_lock_owner = cpu;		\ +	}						\ +} + +#define HARD_TX_UNLOCK(dev) {				\ +	if ((dev->features & NETIF_F_LLTX) == 0) {	\ +		dev->xmit_lock_owner = -1;		\ +		spin_unlock(&dev->xmit_lock);		\ +	}						\ +} + +/** + *	dev_queue_xmit - transmit a buffer + *	@skb: buffer to transmit + * + *	Queue a buffer for transmission to a network device. The caller must + *	have set the device and priority and built the buffer before calling + *	this function. The function can be called from an interrupt. + * + *	A negative errno code is returned on a failure. A success does not + *	guarantee the frame will be transmitted as it may be dropped due + *	to congestion or traffic shaping. 
+ */ + +int dev_queue_xmit(struct sk_buff *skb) +{ +	struct net_device *dev = skb->dev; +	struct Qdisc *q; +	int rc = -ENOMEM; + +	if (skb_shinfo(skb)->frag_list && +	    !(dev->features & NETIF_F_FRAGLIST) && +	    __skb_linearize(skb, GFP_ATOMIC)) +		goto out_kfree_skb; + +	/* Fragmented skb is linearized if device does not support SG, +	 * or if at least one of fragments is in highmem and device +	 * does not support DMA from it. +	 */ +	if (skb_shinfo(skb)->nr_frags && +	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && +	    __skb_linearize(skb, GFP_ATOMIC)) +		goto out_kfree_skb; + +	/* If packet is not checksummed and device does not support +	 * checksumming for this protocol, complete checksumming here. +	 */ +	if (skb->ip_summed == CHECKSUM_HW && +	    (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && +	     (!(dev->features & NETIF_F_IP_CSUM) || +	      skb->protocol != htons(ETH_P_IP)))) +	      	if (skb_checksum_help(skb, 0)) +	      		goto out_kfree_skb; + +	/* Disable soft irqs for various locks below. Also  +	 * stops preemption for RCU.  +	 */ +	local_bh_disable();  + +	/* Updates of qdisc are serialized by queue_lock.  +	 * The struct Qdisc which is pointed to by qdisc is now a  +	 * rcu structure - it may be accessed without acquiring  +	 * a lock (but the structure may be stale.) The freeing of the +	 * qdisc will be deferred until it's known that there are no  +	 * more references to it. +	 *  +	 * If the qdisc has an enqueue function, we still need to  +	 * hold the queue_lock before calling it, since queue_lock +	 * also serializes access to the device queue. +	 */ + +	q = rcu_dereference(dev->qdisc); +#ifdef CONFIG_NET_CLS_ACT +	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); +#endif +	if (q->enqueue) { +		/* Grab device queue */ +		spin_lock(&dev->queue_lock); + +		rc = q->enqueue(skb, q); + +		qdisc_run(dev); + +		spin_unlock(&dev->queue_lock); +		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; +		goto out; +	} + +	/* The device has no queue. Common case for software devices: +	   loopback, all the sorts of tunnels... + +	   Really, it is unlikely that xmit_lock protection is necessary here. +	   (f.e. loopback and IP tunnels are clean ignoring statistics +	   counters.) +	   However, it is possible, that they rely on protection +	   made by us here. + +	   Check this and shot the lock. It is not prone from deadlocks. +	   Either shot noqueue qdisc, it is even simpler 8) +	 */ +	if (dev->flags & IFF_UP) { +		int cpu = smp_processor_id(); /* ok because BHs are off */ + +		if (dev->xmit_lock_owner != cpu) { + +			HARD_TX_LOCK(dev, cpu); + +			if (!netif_queue_stopped(dev)) { +				if (netdev_nit) +					dev_queue_xmit_nit(skb, dev); + +				rc = 0; +				if (!dev->hard_start_xmit(skb, dev)) { +					HARD_TX_UNLOCK(dev); +					goto out; +				} +			} +			HARD_TX_UNLOCK(dev); +			if (net_ratelimit()) +				printk(KERN_CRIT "Virtual device %s asks to " +				       "queue packet!\n", dev->name); +		} else { +			/* Recursion is detected! 
It is possible, +			 * unfortunately */ +			if (net_ratelimit()) +				printk(KERN_CRIT "Dead loop on virtual device " +				       "%s, fix it urgently!\n", dev->name); +		} +	} + +	rc = -ENETDOWN; +	local_bh_enable(); + +out_kfree_skb: +	kfree_skb(skb); +	return rc; +out: +	local_bh_enable(); +	return rc; +} + + +/*======================================================================= +			Receiver routines +  =======================================================================*/ + +int netdev_max_backlog = 300; +int weight_p = 64;            /* old backlog weight */ +/* These numbers are selected based on intuition and some + * experimentatiom, if you have more scientific way of doing this + * please go ahead and fix things. + */ +int no_cong_thresh = 10; +int no_cong = 20; +int lo_cong = 100; +int mod_cong = 290; + +DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; + + +static void get_sample_stats(int cpu) +{ +#ifdef RAND_LIE +	unsigned long rd; +	int rq; +#endif +	struct softnet_data *sd = &per_cpu(softnet_data, cpu); +	int blog = sd->input_pkt_queue.qlen; +	int avg_blog = sd->avg_blog; + +	avg_blog = (avg_blog >> 1) + (blog >> 1); + +	if (avg_blog > mod_cong) { +		/* Above moderate congestion levels. */ +		sd->cng_level = NET_RX_CN_HIGH; +#ifdef RAND_LIE +		rd = net_random(); +		rq = rd % netdev_max_backlog; +		if (rq < avg_blog) /* unlucky bastard */ +			sd->cng_level = NET_RX_DROP; +#endif +	} else if (avg_blog > lo_cong) { +		sd->cng_level = NET_RX_CN_MOD; +#ifdef RAND_LIE +		rd = net_random(); +		rq = rd % netdev_max_backlog; +			if (rq < avg_blog) /* unlucky bastard */ +				sd->cng_level = NET_RX_CN_HIGH; +#endif +	} else if (avg_blog > no_cong) +		sd->cng_level = NET_RX_CN_LOW; +	else  /* no congestion */ +		sd->cng_level = NET_RX_SUCCESS; + +	sd->avg_blog = avg_blog; +} + +#ifdef OFFLINE_SAMPLE +static void sample_queue(unsigned long dummy) +{ +/* 10 ms 0r 1ms -- i don't care -- JHS */ +	int next_tick = 1; +	int cpu = smp_processor_id(); + +	get_sample_stats(cpu); +	next_tick += jiffies; +	mod_timer(&samp_timer, next_tick); +} +#endif + + +/** + *	netif_rx	-	post buffer to the network code + *	@skb: buffer to post + * + *	This function receives a packet from a device driver and queues it for + *	the upper (protocol) levels to process.  It always succeeds. The buffer + *	may be dropped during processing for congestion control or by the + *	protocol layers. + * + *	return values: + *	NET_RX_SUCCESS	(no congestion) + *	NET_RX_CN_LOW   (low congestion) + *	NET_RX_CN_MOD   (moderate congestion) + *	NET_RX_CN_HIGH  (high congestion) + *	NET_RX_DROP     (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ +	int this_cpu; +	struct softnet_data *queue; +	unsigned long flags; + +	/* if netpoll wants it, pretend we never saw it */ +	if (netpoll_rx(skb)) +		return NET_RX_DROP; + +	if (!skb->stamp.tv_sec) +		net_timestamp(&skb->stamp); + +	/* +	 * The code is rearranged so that the path is the most +	 * short when CPU is congested, but is still operating. 
+	 */ +	local_irq_save(flags); +	this_cpu = smp_processor_id(); +	queue = &__get_cpu_var(softnet_data); + +	__get_cpu_var(netdev_rx_stat).total++; +	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { +		if (queue->input_pkt_queue.qlen) { +			if (queue->throttle) +				goto drop; + +enqueue: +			dev_hold(skb->dev); +			__skb_queue_tail(&queue->input_pkt_queue, skb); +#ifndef OFFLINE_SAMPLE +			get_sample_stats(this_cpu); +#endif +			local_irq_restore(flags); +			return queue->cng_level; +		} + +		if (queue->throttle) +			queue->throttle = 0; + +		netif_rx_schedule(&queue->backlog_dev); +		goto enqueue; +	} + +	if (!queue->throttle) { +		queue->throttle = 1; +		__get_cpu_var(netdev_rx_stat).throttled++; +	} + +drop: +	__get_cpu_var(netdev_rx_stat).dropped++; +	local_irq_restore(flags); + +	kfree_skb(skb); +	return NET_RX_DROP; +} + +int netif_rx_ni(struct sk_buff *skb) +{ +	int err; + +	preempt_disable(); +	err = netif_rx(skb); +	if (local_softirq_pending()) +		do_softirq(); +	preempt_enable(); + +	return err; +} + +EXPORT_SYMBOL(netif_rx_ni); + +static __inline__ void skb_bond(struct sk_buff *skb) +{ +	struct net_device *dev = skb->dev; + +	if (dev->master) { +		skb->real_dev = skb->dev; +		skb->dev = dev->master; +	} +} + +static void net_tx_action(struct softirq_action *h) +{ +	struct softnet_data *sd = &__get_cpu_var(softnet_data); + +	if (sd->completion_queue) { +		struct sk_buff *clist; + +		local_irq_disable(); +		clist = sd->completion_queue; +		sd->completion_queue = NULL; +		local_irq_enable(); + +		while (clist) { +			struct sk_buff *skb = clist; +			clist = clist->next; + +			BUG_TRAP(!atomic_read(&skb->users)); +			__kfree_skb(skb); +		} +	} + +	if (sd->output_queue) { +		struct net_device *head; + +		local_irq_disable(); +		head = sd->output_queue; +		sd->output_queue = NULL; +		local_irq_enable(); + +		while (head) { +			struct net_device *dev = head; +			head = head->next_sched; + +			smp_mb__before_clear_bit(); +			clear_bit(__LINK_STATE_SCHED, &dev->state); + +			if (spin_trylock(&dev->queue_lock)) { +				qdisc_run(dev); +				spin_unlock(&dev->queue_lock); +			} else { +				netif_schedule(dev); +			} +		} +	} +} + +static __inline__ int deliver_skb(struct sk_buff *skb, +				  struct packet_type *pt_prev) +{ +	atomic_inc(&skb->users); +	return pt_prev->func(skb, skb->dev, pt_prev); +} + +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) +int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb); +struct net_bridge; +struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, +						unsigned char *addr); +void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); + +static __inline__ int handle_bridge(struct sk_buff **pskb, +				    struct packet_type **pt_prev, int *ret) +{ +	struct net_bridge_port *port; + +	if ((*pskb)->pkt_type == PACKET_LOOPBACK || +	    (port = rcu_dereference((*pskb)->dev->br_port)) == NULL) +		return 0; + +	if (*pt_prev) { +		*ret = deliver_skb(*pskb, *pt_prev); +		*pt_prev = NULL; +	}  +	 +	return br_handle_frame_hook(port, pskb); +} +#else +#define handle_bridge(skb, pt_prev, ret)	(0) +#endif + +#ifdef CONFIG_NET_CLS_ACT +/* TODO: Maybe we should just force sch_ingress to be compiled in + * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions + * a compare and 2 stores extra right now if we dont have it on + * but have CONFIG_NET_CLS_ACT + * NOTE: This doesnt stop any functionality; if you dont have  + * the ingress scheduler, you just cant add policies on ingress. 
+ * + */ +static int ing_filter(struct sk_buff *skb)  +{ +	struct Qdisc *q; +	struct net_device *dev = skb->dev; +	int result = TC_ACT_OK; +	 +	if (dev->qdisc_ingress) { +		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); +		if (MAX_RED_LOOP < ttl++) { +			printk("Redir loop detected Dropping packet (%s->%s)\n", +				skb->input_dev?skb->input_dev->name:"??",skb->dev->name); +			return TC_ACT_SHOT; +		} + +		skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); + +		skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); +		if (NULL == skb->input_dev) { +			skb->input_dev = skb->dev; +			printk("ing_filter:  fixed  %s out %s\n",skb->input_dev->name,skb->dev->name); +		} +		spin_lock(&dev->ingress_lock); +		if ((q = dev->qdisc_ingress) != NULL) +			result = q->enqueue(skb, q); +		spin_unlock(&dev->ingress_lock); + +	} + +	return result; +} +#endif + +int netif_receive_skb(struct sk_buff *skb) +{ +	struct packet_type *ptype, *pt_prev; +	int ret = NET_RX_DROP; +	unsigned short type; + +	/* if we've gotten here through NAPI, check netpoll */ +	if (skb->dev->poll && netpoll_rx(skb)) +		return NET_RX_DROP; + +	if (!skb->stamp.tv_sec) +		net_timestamp(&skb->stamp); + +	skb_bond(skb); + +	__get_cpu_var(netdev_rx_stat).total++; + +	skb->h.raw = skb->nh.raw = skb->data; +	skb->mac_len = skb->nh.raw - skb->mac.raw; + +	pt_prev = NULL; + +	rcu_read_lock(); + +#ifdef CONFIG_NET_CLS_ACT +	if (skb->tc_verd & TC_NCLS) { +		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); +		goto ncls; +	} +#endif + +	list_for_each_entry_rcu(ptype, &ptype_all, list) { +		if (!ptype->dev || ptype->dev == skb->dev) { +			if (pt_prev)  +				ret = deliver_skb(skb, pt_prev); +			pt_prev = ptype; +		} +	} + +#ifdef CONFIG_NET_CLS_ACT +	if (pt_prev) { +		ret = deliver_skb(skb, pt_prev); +		pt_prev = NULL; /* noone else should process this after*/ +	} else { +		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); +	} + +	ret = ing_filter(skb); + +	if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) { +		kfree_skb(skb); +		goto out; +	} + +	skb->tc_verd = 0; +ncls: +#endif + +	handle_diverter(skb); + +	if (handle_bridge(&skb, &pt_prev, &ret)) +		goto out; + +	type = skb->protocol; +	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) { +		if (ptype->type == type && +		    (!ptype->dev || ptype->dev == skb->dev)) { +			if (pt_prev)  +				ret = deliver_skb(skb, pt_prev); +			pt_prev = ptype; +		} +	} + +	if (pt_prev) { +		ret = pt_prev->func(skb, skb->dev, pt_prev); +	} else { +		kfree_skb(skb); +		/* Jamal, now you will not able to escape explaining +		 * me how you were going to use this. 
:-) +		 */ +		ret = NET_RX_DROP; +	} + +out: +	rcu_read_unlock(); +	return ret; +} + +static int process_backlog(struct net_device *backlog_dev, int *budget) +{ +	int work = 0; +	int quota = min(backlog_dev->quota, *budget); +	struct softnet_data *queue = &__get_cpu_var(softnet_data); +	unsigned long start_time = jiffies; + +	for (;;) { +		struct sk_buff *skb; +		struct net_device *dev; + +		local_irq_disable(); +		skb = __skb_dequeue(&queue->input_pkt_queue); +		if (!skb) +			goto job_done; +		local_irq_enable(); + +		dev = skb->dev; + +		netif_receive_skb(skb); + +		dev_put(dev); + +		work++; + +		if (work >= quota || jiffies - start_time > 1) +			break; + +	} + +	backlog_dev->quota -= work; +	*budget -= work; +	return -1; + +job_done: +	backlog_dev->quota -= work; +	*budget -= work; + +	list_del(&backlog_dev->poll_list); +	smp_mb__before_clear_bit(); +	netif_poll_enable(backlog_dev); + +	if (queue->throttle) +		queue->throttle = 0; +	local_irq_enable(); +	return 0; +} + +static void net_rx_action(struct softirq_action *h) +{ +	struct softnet_data *queue = &__get_cpu_var(softnet_data); +	unsigned long start_time = jiffies; +	int budget = netdev_max_backlog; + +	 +	local_irq_disable(); + +	while (!list_empty(&queue->poll_list)) { +		struct net_device *dev; + +		if (budget <= 0 || jiffies - start_time > 1) +			goto softnet_break; + +		local_irq_enable(); + +		dev = list_entry(queue->poll_list.next, +				 struct net_device, poll_list); +		netpoll_poll_lock(dev); + +		if (dev->quota <= 0 || dev->poll(dev, &budget)) { +			netpoll_poll_unlock(dev); +			local_irq_disable(); +			list_del(&dev->poll_list); +			list_add_tail(&dev->poll_list, &queue->poll_list); +			if (dev->quota < 0) +				dev->quota += dev->weight; +			else +				dev->quota = dev->weight; +		} else { +			netpoll_poll_unlock(dev); +			dev_put(dev); +			local_irq_disable(); +		} +	} +out: +	local_irq_enable(); +	return; + +softnet_break: +	__get_cpu_var(netdev_rx_stat).time_squeeze++; +	__raise_softirq_irqoff(NET_RX_SOFTIRQ); +	goto out; +} + +static gifconf_func_t * gifconf_list [NPROTO]; + +/** + *	register_gifconf	-	register a SIOCGIF handler + *	@family: Address family + *	@gifconf: Function handler + * + *	Register protocol dependent address dumping routines. The handler + *	that is passed must not be freed or reused until it has been replaced + *	by another handler. + */ +int register_gifconf(unsigned int family, gifconf_func_t * gifconf) +{ +	if (family >= NPROTO) +		return -EINVAL; +	gifconf_list[family] = gifconf; +	return 0; +} + + +/* + *	Map an interface index to its name (SIOCGIFNAME) + */ + +/* + *	We need this ioctl for efficient implementation of the + *	if_indextoname() function required by the IPv6 API.  Without + *	it, we would have to search all the interfaces to find a + *	match.  --pb + */ + +static int dev_ifname(struct ifreq __user *arg) +{ +	struct net_device *dev; +	struct ifreq ifr; + +	/* +	 *	Fetch the caller's info block. +	 */ + +	if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) +		return -EFAULT; + +	read_lock(&dev_base_lock); +	dev = __dev_get_by_index(ifr.ifr_ifindex); +	if (!dev) { +		read_unlock(&dev_base_lock); +		return -ENODEV; +	} + +	strcpy(ifr.ifr_name, dev->name); +	read_unlock(&dev_base_lock); + +	if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) +		return -EFAULT; +	return 0; +} + +/* + *	Perform a SIOCGIFCONF call. This structure will change + *	size eventually, and there is nothing I can do about it. + *	Thus we will need a 'compatibility mode'. 
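+ *
+ *	As a hedged illustration (editor's sketch, not in the original
+ *	source), user space reaches this path with ioctl(SIOCGIFCONF) on
+ *	any socket; fd and reqs below are hypothetical:
+ *
+ *		struct ifconf ifc;
+ *		struct ifreq reqs[16];
+ *
+ *		ifc.ifc_len = sizeof(reqs);
+ *		ifc.ifc_req = reqs;
+ *		if (ioctl(fd, SIOCGIFCONF, &ifc) == 0) {
+ *			int n = ifc.ifc_len / sizeof(struct ifreq);
+ *			while (n--)
+ *				printf("%s\n", reqs[n].ifr_name);
+ *		}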
+ */ + +static int dev_ifconf(char __user *arg) +{ +	struct ifconf ifc; +	struct net_device *dev; +	char __user *pos; +	int len; +	int total; +	int i; + +	/* +	 *	Fetch the caller's info block. +	 */ + +	if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) +		return -EFAULT; + +	pos = ifc.ifc_buf; +	len = ifc.ifc_len; + +	/* +	 *	Loop over the interfaces, and write an info block for each. +	 */ + +	total = 0; +	for (dev = dev_base; dev; dev = dev->next) { +		for (i = 0; i < NPROTO; i++) { +			if (gifconf_list[i]) { +				int done; +				if (!pos) +					done = gifconf_list[i](dev, NULL, 0); +				else +					done = gifconf_list[i](dev, pos + total, +							       len - total); +				if (done < 0) +					return -EFAULT; +				total += done; +			} +		} +  	} + +	/* +	 *	All done.  Write the updated control block back to the caller. +	 */ +	ifc.ifc_len = total; + +	/* +	 * 	Both BSD and Solaris return 0 here, so we do too. +	 */ +	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; +} + +#ifdef CONFIG_PROC_FS +/* + *	This is invoked by the /proc filesystem handler to display a device + *	in detail. + */ +static __inline__ struct net_device *dev_get_idx(loff_t pos) +{ +	struct net_device *dev; +	loff_t i; + +	for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next); + +	return i == pos ? dev : NULL; +} + +void *dev_seq_start(struct seq_file *seq, loff_t *pos) +{ +	read_lock(&dev_base_lock); +	return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	++*pos; +	return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next; +} + +void dev_seq_stop(struct seq_file *seq, void *v) +{ +	read_unlock(&dev_base_lock); +} + +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +{ +	if (dev->get_stats) { +		struct net_device_stats *stats = dev->get_stats(dev); + +		seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " +				"%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", +			   dev->name, stats->rx_bytes, stats->rx_packets, +			   stats->rx_errors, +			   stats->rx_dropped + stats->rx_missed_errors, +			   stats->rx_fifo_errors, +			   stats->rx_length_errors + stats->rx_over_errors + +			     stats->rx_crc_errors + stats->rx_frame_errors, +			   stats->rx_compressed, stats->multicast, +			   stats->tx_bytes, stats->tx_packets, +			   stats->tx_errors, stats->tx_dropped, +			   stats->tx_fifo_errors, stats->collisions, +			   stats->tx_carrier_errors + +			     stats->tx_aborted_errors + +			     stats->tx_window_errors + +			     stats->tx_heartbeat_errors, +			   stats->tx_compressed); +	} else +		seq_printf(seq, "%6s: No statistics available.\n", dev->name); +} + +/* + *	Called from the PROCfs module. 
This now uses the new arbitrary sized + *	/proc/net interface to create /proc/net/dev + */ +static int dev_seq_show(struct seq_file *seq, void *v) +{ +	if (v == SEQ_START_TOKEN) +		seq_puts(seq, "Inter-|   Receive                            " +			      "                    |  Transmit\n" +			      " face |bytes    packets errs drop fifo frame " +			      "compressed multicast|bytes    packets errs " +			      "drop fifo colls carrier compressed\n"); +	else +		dev_seq_printf_stats(seq, v); +	return 0; +} + +static struct netif_rx_stats *softnet_get_online(loff_t *pos) +{ +	struct netif_rx_stats *rc = NULL; + +	while (*pos < NR_CPUS) +	       	if (cpu_online(*pos)) { +			rc = &per_cpu(netdev_rx_stat, *pos); +			break; +		} else +			++*pos; +	return rc; +} + +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +{ +	return softnet_get_online(pos); +} + +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	++*pos; +	return softnet_get_online(pos); +} + +static void softnet_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int softnet_seq_show(struct seq_file *seq, void *v) +{ +	struct netif_rx_stats *s = v; + +	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", +		   s->total, s->dropped, s->time_squeeze, s->throttled, +		   s->fastroute_hit, s->fastroute_success, s->fastroute_defer, +		   s->fastroute_deferred_out, +#if 0 +		   s->fastroute_latency_reduction +#else +		   s->cpu_collision +#endif +		  ); +	return 0; +} + +static struct seq_operations dev_seq_ops = { +	.start = dev_seq_start, +	.next  = dev_seq_next, +	.stop  = dev_seq_stop, +	.show  = dev_seq_show, +}; + +static int dev_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &dev_seq_ops); +} + +static struct file_operations dev_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = dev_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release, +}; + +static struct seq_operations softnet_seq_ops = { +	.start = softnet_seq_start, +	.next  = softnet_seq_next, +	.stop  = softnet_seq_stop, +	.show  = softnet_seq_show, +}; + +static int softnet_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &softnet_seq_ops); +} + +static struct file_operations softnet_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = softnet_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release, +}; + +#ifdef WIRELESS_EXT +extern int wireless_proc_init(void); +#else +#define wireless_proc_init() 0 +#endif + +static int __init dev_proc_init(void) +{ +	int rc = -ENOMEM; + +	if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) +		goto out; +	if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) +		goto out_dev; +	if (wireless_proc_init()) +		goto out_softnet; +	rc = 0; +out: +	return rc; +out_softnet: +	proc_net_remove("softnet_stat"); +out_dev: +	proc_net_remove("dev"); +	goto out; +} +#else +#define dev_proc_init() 0 +#endif	/* CONFIG_PROC_FS */ + + +/** + *	netdev_set_master	-	set up master/slave pair + *	@slave: slave device + *	@master: new master device + * + *	Changes the master device of the slave. Pass %NULL to break the + *	bonding. The caller must hold the RTNL semaphore. On a failure + *	a negative errno code is returned. On success the reference counts + *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the + *	function returns zero. 
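+ *
+ *	A hedged usage sketch (editor's illustration; bond_dev and
+ *	slave_dev are hypothetical): a bonding-style driver enslaves a
+ *	device and later breaks the bond, holding the RTNL semaphore both
+ *	times:
+ *
+ *		rtnl_lock();
+ *		err = netdev_set_master(slave_dev, bond_dev);
+ *		rtnl_unlock();
+ *
+ *		rtnl_lock();
+ *		err = netdev_set_master(slave_dev, NULL);
+ *		rtnl_unlock();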
+ */ +int netdev_set_master(struct net_device *slave, struct net_device *master) +{ +	struct net_device *old = slave->master; + +	ASSERT_RTNL(); + +	if (master) { +		if (old) +			return -EBUSY; +		dev_hold(master); +	} + +	slave->master = master; +	 +	synchronize_net(); + +	if (old) +		dev_put(old); + +	if (master) +		slave->flags |= IFF_SLAVE; +	else +		slave->flags &= ~IFF_SLAVE; + +	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); +	return 0; +} + +/** + *	dev_set_promiscuity	- update promiscuity count on a device + *	@dev: device + *	@inc: modifier + * + *	Add or remove promsicuity from a device. While the count in the device + *	remains above zero the interface remains promiscuous. Once it hits zero + *	the device reverts back to normal filtering operation. A negative inc + *	value is used to drop promiscuity on the device. + */ +void dev_set_promiscuity(struct net_device *dev, int inc) +{ +	unsigned short old_flags = dev->flags; + +	dev->flags |= IFF_PROMISC; +	if ((dev->promiscuity += inc) == 0) +		dev->flags &= ~IFF_PROMISC; +	if (dev->flags ^ old_flags) { +		dev_mc_upload(dev); +		printk(KERN_INFO "device %s %s promiscuous mode\n", +		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" : +		       					       "left"); +	} +} + +/** + *	dev_set_allmulti	- update allmulti count on a device + *	@dev: device + *	@inc: modifier + * + *	Add or remove reception of all multicast frames to a device. While the + *	count in the device remains above zero the interface remains listening + *	to all interfaces. Once it hits zero the device reverts back to normal + *	filtering operation. A negative @inc value is used to drop the counter + *	when releasing a resource needing all multicasts. + */ + +void dev_set_allmulti(struct net_device *dev, int inc) +{ +	unsigned short old_flags = dev->flags; + +	dev->flags |= IFF_ALLMULTI; +	if ((dev->allmulti += inc) == 0) +		dev->flags &= ~IFF_ALLMULTI; +	if (dev->flags ^ old_flags) +		dev_mc_upload(dev); +} + +unsigned dev_get_flags(const struct net_device *dev) +{ +	unsigned flags; + +	flags = (dev->flags & ~(IFF_PROMISC | +				IFF_ALLMULTI | +				IFF_RUNNING)) |  +		(dev->gflags & (IFF_PROMISC | +				IFF_ALLMULTI)); + +	if (netif_running(dev) && netif_carrier_ok(dev)) +		flags |= IFF_RUNNING; + +	return flags; +} + +int dev_change_flags(struct net_device *dev, unsigned flags) +{ +	int ret; +	int old_flags = dev->flags; + +	/* +	 *	Set the flags on our device. +	 */ + +	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | +			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | +			       IFF_AUTOMEDIA)) | +		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | +				    IFF_ALLMULTI)); + +	/* +	 *	Load in the correct multicast list now the flags have changed. +	 */ + +	dev_mc_upload(dev); + +	/* +	 *	Have we downed the interface. We handle IFF_UP ourselves +	 *	according to user attempts to set it, rather than blindly +	 *	setting it. +	 */ + +	ret = 0; +	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */ +		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); + +		if (!ret) +			dev_mc_upload(dev); +	} + +	if (dev->flags & IFF_UP && +	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | +					  IFF_VOLATILE))) +		notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); + +	if ((flags ^ dev->gflags) & IFF_PROMISC) { +		int inc = (flags & IFF_PROMISC) ? 
+1 : -1; +		dev->gflags ^= IFF_PROMISC; +		dev_set_promiscuity(dev, inc); +	} + +	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI +	   is important. Some (broken) drivers set IFF_PROMISC, when +	   IFF_ALLMULTI is requested not asking us and not reporting. +	 */ +	if ((flags ^ dev->gflags) & IFF_ALLMULTI) { +		int inc = (flags & IFF_ALLMULTI) ? +1 : -1; +		dev->gflags ^= IFF_ALLMULTI; +		dev_set_allmulti(dev, inc); +	} + +	if (old_flags ^ dev->flags) +		rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags); + +	return ret; +} + +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ +	int err; + +	if (new_mtu == dev->mtu) +		return 0; + +	/*	MTU must be positive.	 */ +	if (new_mtu < 0) +		return -EINVAL; + +	if (!netif_device_present(dev)) +		return -ENODEV; + +	err = 0; +	if (dev->change_mtu) +		err = dev->change_mtu(dev, new_mtu); +	else +		dev->mtu = new_mtu; +	if (!err && dev->flags & IFF_UP) +		notifier_call_chain(&netdev_chain, +				    NETDEV_CHANGEMTU, dev); +	return err; +} + +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +{ +	int err; + +	if (!dev->set_mac_address) +		return -EOPNOTSUPP; +	if (sa->sa_family != dev->type) +		return -EINVAL; +	if (!netif_device_present(dev)) +		return -ENODEV; +	err = dev->set_mac_address(dev, sa); +	if (!err) +		notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); +	return err; +} + +/* + *	Perform the SIOCxIFxxx calls. + */ +static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) +{ +	int err; +	struct net_device *dev = __dev_get_by_name(ifr->ifr_name); + +	if (!dev) +		return -ENODEV; + +	switch (cmd) { +		case SIOCGIFFLAGS:	/* Get interface flags */ +			ifr->ifr_flags = dev_get_flags(dev); +			return 0; + +		case SIOCSIFFLAGS:	/* Set interface flags */ +			return dev_change_flags(dev, ifr->ifr_flags); + +		case SIOCGIFMETRIC:	/* Get the metric on the interface +					   (currently unused) */ +			ifr->ifr_metric = 0; +			return 0; + +		case SIOCSIFMETRIC:	/* Set the metric on the interface +					   (currently unused) */ +			return -EOPNOTSUPP; + +		case SIOCGIFMTU:	/* Get the MTU of a device */ +			ifr->ifr_mtu = dev->mtu; +			return 0; + +		case SIOCSIFMTU:	/* Set the MTU of a device */ +			return dev_set_mtu(dev, ifr->ifr_mtu); + +		case SIOCGIFHWADDR: +			if (!dev->addr_len) +				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); +			else +				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, +				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); +			ifr->ifr_hwaddr.sa_family = dev->type; +			return 0; + +		case SIOCSIFHWADDR: +			return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + +		case SIOCSIFHWBROADCAST: +			if (ifr->ifr_hwaddr.sa_family != dev->type) +				return -EINVAL; +			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, +			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); +			notifier_call_chain(&netdev_chain, +					    NETDEV_CHANGEADDR, dev); +			return 0; + +		case SIOCGIFMAP: +			ifr->ifr_map.mem_start = dev->mem_start; +			ifr->ifr_map.mem_end   = dev->mem_end; +			ifr->ifr_map.base_addr = dev->base_addr; +			ifr->ifr_map.irq       = dev->irq; +			ifr->ifr_map.dma       = dev->dma; +			ifr->ifr_map.port      = dev->if_port; +			return 0; + +		case SIOCSIFMAP: +			if (dev->set_config) { +				if (!netif_device_present(dev)) +					return -ENODEV; +				return dev->set_config(dev, &ifr->ifr_map); +			} +			return -EOPNOTSUPP; + +		case SIOCADDMULTI: +			if (!dev->set_multicast_list || +			    ifr->ifr_hwaddr.sa_family != 
AF_UNSPEC) +				return -EINVAL; +			if (!netif_device_present(dev)) +				return -ENODEV; +			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, +					  dev->addr_len, 1); + +		case SIOCDELMULTI: +			if (!dev->set_multicast_list || +			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) +				return -EINVAL; +			if (!netif_device_present(dev)) +				return -ENODEV; +			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, +					     dev->addr_len, 1); + +		case SIOCGIFINDEX: +			ifr->ifr_ifindex = dev->ifindex; +			return 0; + +		case SIOCGIFTXQLEN: +			ifr->ifr_qlen = dev->tx_queue_len; +			return 0; + +		case SIOCSIFTXQLEN: +			if (ifr->ifr_qlen < 0) +				return -EINVAL; +			dev->tx_queue_len = ifr->ifr_qlen; +			return 0; + +		case SIOCSIFNAME: +			ifr->ifr_newname[IFNAMSIZ-1] = '\0'; +			return dev_change_name(dev, ifr->ifr_newname); + +		/* +		 *	Unknown or private ioctl +		 */ + +		default: +			if ((cmd >= SIOCDEVPRIVATE && +			    cmd <= SIOCDEVPRIVATE + 15) || +			    cmd == SIOCBONDENSLAVE || +			    cmd == SIOCBONDRELEASE || +			    cmd == SIOCBONDSETHWADDR || +			    cmd == SIOCBONDSLAVEINFOQUERY || +			    cmd == SIOCBONDINFOQUERY || +			    cmd == SIOCBONDCHANGEACTIVE || +			    cmd == SIOCGMIIPHY || +			    cmd == SIOCGMIIREG || +			    cmd == SIOCSMIIREG || +			    cmd == SIOCBRADDIF || +			    cmd == SIOCBRDELIF || +			    cmd == SIOCWANDEV) { +				err = -EOPNOTSUPP; +				if (dev->do_ioctl) { +					if (netif_device_present(dev)) +						err = dev->do_ioctl(dev, ifr, +								    cmd); +					else +						err = -ENODEV; +				} +			} else +				err = -EINVAL; + +	} +	return err; +} + +/* + *	This function handles all "interface"-type I/O control requests. The actual + *	'doing' part of this is dev_ifsioc above. + */ + +/** + *	dev_ioctl	-	network device ioctl + *	@cmd: command to issue + *	@arg: pointer to a struct ifreq in user space + * + *	Issue ioctl functions to devices. This is normally called by the + *	user space syscall interfaces but can sometimes be useful for + *	other purposes. The return value is the return from the syscall if + *	positive or a negative errno code on error. + */ + +int dev_ioctl(unsigned int cmd, void __user *arg) +{ +	struct ifreq ifr; +	int ret; +	char *colon; + +	/* One special case: SIOCGIFCONF takes ifconf argument +	   and requires shared lock, because it sleeps writing +	   to user space. +	 */ + +	if (cmd == SIOCGIFCONF) { +		rtnl_shlock(); +		ret = dev_ifconf((char __user *) arg); +		rtnl_shunlock(); +		return ret; +	} +	if (cmd == SIOCGIFNAME) +		return dev_ifname((struct ifreq __user *)arg); + +	if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) +		return -EFAULT; + +	ifr.ifr_name[IFNAMSIZ-1] = 0; + +	colon = strchr(ifr.ifr_name, ':'); +	if (colon) +		*colon = 0; + +	/* +	 *	See which interface the caller is talking about. +	 */ + +	switch (cmd) { +		/* +		 *	These ioctl calls: +		 *	- can be done by all. +		 *	- atomic and do not require locking. 
+		 *	- return a value +		 */ +		case SIOCGIFFLAGS: +		case SIOCGIFMETRIC: +		case SIOCGIFMTU: +		case SIOCGIFHWADDR: +		case SIOCGIFSLAVE: +		case SIOCGIFMAP: +		case SIOCGIFINDEX: +		case SIOCGIFTXQLEN: +			dev_load(ifr.ifr_name); +			read_lock(&dev_base_lock); +			ret = dev_ifsioc(&ifr, cmd); +			read_unlock(&dev_base_lock); +			if (!ret) { +				if (colon) +					*colon = ':'; +				if (copy_to_user(arg, &ifr, +						 sizeof(struct ifreq))) +					ret = -EFAULT; +			} +			return ret; + +		case SIOCETHTOOL: +			dev_load(ifr.ifr_name); +			rtnl_lock(); +			ret = dev_ethtool(&ifr); +			rtnl_unlock(); +			if (!ret) { +				if (colon) +					*colon = ':'; +				if (copy_to_user(arg, &ifr, +						 sizeof(struct ifreq))) +					ret = -EFAULT; +			} +			return ret; + +		/* +		 *	These ioctl calls: +		 *	- require superuser power. +		 *	- require strict serialization. +		 *	- return a value +		 */ +		case SIOCGMIIPHY: +		case SIOCGMIIREG: +		case SIOCSIFNAME: +			if (!capable(CAP_NET_ADMIN)) +				return -EPERM; +			dev_load(ifr.ifr_name); +			rtnl_lock(); +			ret = dev_ifsioc(&ifr, cmd); +			rtnl_unlock(); +			if (!ret) { +				if (colon) +					*colon = ':'; +				if (copy_to_user(arg, &ifr, +						 sizeof(struct ifreq))) +					ret = -EFAULT; +			} +			return ret; + +		/* +		 *	These ioctl calls: +		 *	- require superuser power. +		 *	- require strict serialization. +		 *	- do not return a value +		 */ +		case SIOCSIFFLAGS: +		case SIOCSIFMETRIC: +		case SIOCSIFMTU: +		case SIOCSIFMAP: +		case SIOCSIFHWADDR: +		case SIOCSIFSLAVE: +		case SIOCADDMULTI: +		case SIOCDELMULTI: +		case SIOCSIFHWBROADCAST: +		case SIOCSIFTXQLEN: +		case SIOCSMIIREG: +		case SIOCBONDENSLAVE: +		case SIOCBONDRELEASE: +		case SIOCBONDSETHWADDR: +		case SIOCBONDSLAVEINFOQUERY: +		case SIOCBONDINFOQUERY: +		case SIOCBONDCHANGEACTIVE: +		case SIOCBRADDIF: +		case SIOCBRDELIF: +			if (!capable(CAP_NET_ADMIN)) +				return -EPERM; +			dev_load(ifr.ifr_name); +			rtnl_lock(); +			ret = dev_ifsioc(&ifr, cmd); +			rtnl_unlock(); +			return ret; + +		case SIOCGIFMEM: +			/* Get the per device memory space. We can add this but +			 * currently do not support it */ +		case SIOCSIFMEM: +			/* Set the per device memory buffer space. +			 * Not applicable in our case */ +		case SIOCSIFLINK: +			return -EINVAL; + +		/* +		 *	Unknown or private ioctl. +		 */ +		default: +			if (cmd == SIOCWANDEV || +			    (cmd >= SIOCDEVPRIVATE && +			     cmd <= SIOCDEVPRIVATE + 15)) { +				dev_load(ifr.ifr_name); +				rtnl_lock(); +				ret = dev_ifsioc(&ifr, cmd); +				rtnl_unlock(); +				if (!ret && copy_to_user(arg, &ifr, +							 sizeof(struct ifreq))) +					ret = -EFAULT; +				return ret; +			} +#ifdef WIRELESS_EXT +			/* Take care of Wireless Extensions */ +			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { +				/* If command is `set a parameter', or +				 * `get the encoding parameters', check if +				 * the user has the right to do it */ +				if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) { +					if (!capable(CAP_NET_ADMIN)) +						return -EPERM; +				} +				dev_load(ifr.ifr_name); +				rtnl_lock(); +				/* Follow me in net/core/wireless.c */ +				ret = wireless_process_ioctl(&ifr, cmd); +				rtnl_unlock(); +				if (IW_IS_GET(cmd) && +				    copy_to_user(arg, &ifr, +					    	 sizeof(struct ifreq))) +					ret = -EFAULT; +				return ret; +			} +#endif	/* WIRELESS_EXT */ +			return -EINVAL; +	} +} + + +/** + *	dev_new_index	-	allocate an ifindex + * + *	Returns a suitable unique value for a new device interface + *	number.  
The caller must hold the rtnl semaphore or the + *	dev_base_lock to be sure it remains unique. + */ +static int dev_new_index(void) +{ +	static int ifindex; +	for (;;) { +		if (++ifindex <= 0) +			ifindex = 1; +		if (!__dev_get_by_index(ifindex)) +			return ifindex; +	} +} + +static int dev_boot_phase = 1; + +/* Delayed registration/unregisteration */ +static DEFINE_SPINLOCK(net_todo_list_lock); +static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list); + +static inline void net_set_todo(struct net_device *dev) +{ +	spin_lock(&net_todo_list_lock); +	list_add_tail(&dev->todo_list, &net_todo_list); +	spin_unlock(&net_todo_list_lock); +} + +/** + *	register_netdevice	- register a network device + *	@dev: device to register + * + *	Take a completed network device structure and add it to the kernel + *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + *	chain. 0 is returned on success. A negative errno code is returned + *	on a failure to set up the device, or if the name is a duplicate. + * + *	Callers must hold the rtnl semaphore. You may want + *	register_netdev() instead of this. + * + *	BUGS: + *	The locking appears insufficient to guarantee two parallel registers + *	will not get the same name. + */ + +int register_netdevice(struct net_device *dev) +{ +	struct hlist_head *head; +	struct hlist_node *p; +	int ret; + +	BUG_ON(dev_boot_phase); +	ASSERT_RTNL(); + +	/* When net_device's are persistent, this will be fatal. */ +	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + +	spin_lock_init(&dev->queue_lock); +	spin_lock_init(&dev->xmit_lock); +	dev->xmit_lock_owner = -1; +#ifdef CONFIG_NET_CLS_ACT +	spin_lock_init(&dev->ingress_lock); +#endif + +	ret = alloc_divert_blk(dev); +	if (ret) +		goto out; + +	dev->iflink = -1; + +	/* Init, if this function is available */ +	if (dev->init) { +		ret = dev->init(dev); +		if (ret) { +			if (ret > 0) +				ret = -EIO; +			goto out_err; +		} +	} +  +	if (!dev_valid_name(dev->name)) { +		ret = -EINVAL; +		goto out_err; +	} + +	dev->ifindex = dev_new_index(); +	if (dev->iflink == -1) +		dev->iflink = dev->ifindex; + +	/* Check for existence of name */ +	head = dev_name_hash(dev->name); +	hlist_for_each(p, head) { +		struct net_device *d +			= hlist_entry(p, struct net_device, name_hlist); +		if (!strncmp(d->name, dev->name, IFNAMSIZ)) { +			ret = -EEXIST; + 			goto out_err; +		} + 	} + +	/* Fix illegal SG+CSUM combinations. */ +	if ((dev->features & NETIF_F_SG) && +	    !(dev->features & (NETIF_F_IP_CSUM | +			       NETIF_F_NO_CSUM | +			       NETIF_F_HW_CSUM))) { +		printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", +		       dev->name); +		dev->features &= ~NETIF_F_SG; +	} + +	/* TSO requires that SG is present as well. */ +	if ((dev->features & NETIF_F_TSO) && +	    !(dev->features & NETIF_F_SG)) { +		printk("%s: Dropping NETIF_F_TSO since no SG feature.\n", +		       dev->name); +		dev->features &= ~NETIF_F_TSO; +	} + +	/* +	 *	nil rebuild_header routine, +	 *	that should be never called and used as just bug trap. +	 */ + +	if (!dev->rebuild_header) +		dev->rebuild_header = default_rebuild_header; + +	/* +	 *	Default initial state at registry is that the +	 *	device is present. 
+	 */ + +	set_bit(__LINK_STATE_PRESENT, &dev->state); + +	dev->next = NULL; +	dev_init_scheduler(dev); +	write_lock_bh(&dev_base_lock); +	*dev_tail = dev; +	dev_tail = &dev->next; +	hlist_add_head(&dev->name_hlist, head); +	hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); +	dev_hold(dev); +	dev->reg_state = NETREG_REGISTERING; +	write_unlock_bh(&dev_base_lock); + +	/* Notify protocols, that a new device appeared. */ +	notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); + +	/* Finish registration after unlock */ +	net_set_todo(dev); +	ret = 0; + +out: +	return ret; +out_err: +	free_divert_blk(dev); +	goto out; +} + +/** + *	register_netdev	- register a network device + *	@dev: device to register + * + *	Take a completed network device structure and add it to the kernel + *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + *	chain. 0 is returned on success. A negative errno code is returned + *	on a failure to set up the device, or if the name is a duplicate. + * + *	This is a wrapper around register_netdev that takes the rtnl semaphore + *	and expands the device name if you passed a format string to + *	alloc_netdev. + */ +int register_netdev(struct net_device *dev) +{ +	int err; + +	rtnl_lock(); + +	/* +	 * If the name is a format string the caller wants us to do a +	 * name allocation. +	 */ +	if (strchr(dev->name, '%')) { +		err = dev_alloc_name(dev, dev->name); +		if (err < 0) +			goto out; +	} +	 +	/* +	 * Back compatibility hook. Kill this one in 2.5 +	 */ +	if (dev->name[0] == 0 || dev->name[0] == ' ') { +		err = dev_alloc_name(dev, "eth%d"); +		if (err < 0) +			goto out; +	} + +	err = register_netdevice(dev); +out: +	rtnl_unlock(); +	return err; +} +EXPORT_SYMBOL(register_netdev); + +/* + * netdev_wait_allrefs - wait until all references are gone. + * + * This is called when unregistering network devices. + * + * Any protocol or device that holds a reference should register + * for netdevice notification, and cleanup and put back the + * reference if they receive an UNREGISTER event. + * We can get stuck here if buggy protocols don't correctly + * call dev_put.  + */ +static void netdev_wait_allrefs(struct net_device *dev) +{ +	unsigned long rebroadcast_time, warning_time; + +	rebroadcast_time = warning_time = jiffies; +	while (atomic_read(&dev->refcnt) != 0) { +		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { +			rtnl_shlock(); + +			/* Rebroadcast unregister notification */ +			notifier_call_chain(&netdev_chain, +					    NETDEV_UNREGISTER, dev); + +			if (test_bit(__LINK_STATE_LINKWATCH_PENDING, +				     &dev->state)) { +				/* We must not have linkwatch events +				 * pending on unregister. If this +				 * happens, we simply run the queue +				 * unscheduled, resulting in a noop +				 * for this device. +				 */ +				linkwatch_run_queue(); +			} + +			rtnl_shunlock(); + +			rebroadcast_time = jiffies; +		} + +		msleep(250); + +		if (time_after(jiffies, warning_time + 10 * HZ)) { +			printk(KERN_EMERG "unregister_netdevice: " +			       "waiting for %s to become free. Usage " +			       "count = %d\n", +			       dev->name, atomic_read(&dev->refcnt)); +			warning_time = jiffies; +		} +	} +} + +/* The sequence is: + * + *	rtnl_lock(); + *	... + *	register_netdevice(x1); + *	register_netdevice(x2); + *	... + *	unregister_netdevice(y1); + *	unregister_netdevice(y2); + *      ... + *	rtnl_unlock(); + *	free_netdev(y1); + *	free_netdev(y2); + * + * We are invoked by rtnl_unlock() after it drops the semaphore. 
+ * This allows us to deal with problems: + * 1) We can create/delete sysfs objects which invoke hotplug + *    without deadlocking with linkwatch via keventd. + * 2) Since we run with the RTNL semaphore not held, we can sleep + *    safely in order to wait for the netdev refcnt to drop to zero. + */ +static DECLARE_MUTEX(net_todo_run_mutex); +void netdev_run_todo(void) +{ +	struct list_head list = LIST_HEAD_INIT(list); +	int err; + + +	/* Need to guard against multiple cpu's getting out of order. */ +	down(&net_todo_run_mutex); + +	/* Not safe to do outside the semaphore.  We must not return +	 * until all unregister events invoked by the local processor +	 * have been completed (either by this todo run, or one on +	 * another cpu). +	 */ +	if (list_empty(&net_todo_list)) +		goto out; + +	/* Snapshot list, allow later requests */ +	spin_lock(&net_todo_list_lock); +	list_splice_init(&net_todo_list, &list); +	spin_unlock(&net_todo_list_lock); +		 +	while (!list_empty(&list)) { +		struct net_device *dev +			= list_entry(list.next, struct net_device, todo_list); +		list_del(&dev->todo_list); + +		switch(dev->reg_state) { +		case NETREG_REGISTERING: +			err = netdev_register_sysfs(dev); +			if (err) +				printk(KERN_ERR "%s: failed sysfs registration (%d)\n", +				       dev->name, err); +			dev->reg_state = NETREG_REGISTERED; +			break; + +		case NETREG_UNREGISTERING: +			netdev_unregister_sysfs(dev); +			dev->reg_state = NETREG_UNREGISTERED; + +			netdev_wait_allrefs(dev); + +			/* paranoia */ +			BUG_ON(atomic_read(&dev->refcnt)); +			BUG_TRAP(!dev->ip_ptr); +			BUG_TRAP(!dev->ip6_ptr); +			BUG_TRAP(!dev->dn_ptr); + + +			/* It must be the very last action,  +			 * after this 'dev' may point to freed up memory. +			 */ +			if (dev->destructor) +				dev->destructor(dev); +			break; + +		default: +			printk(KERN_ERR "network todo '%s' but state %d\n", +			       dev->name, dev->reg_state); +			break; +		} +	} + +out: +	up(&net_todo_run_mutex); +} + +/** + *	alloc_netdev - allocate network device + *	@sizeof_priv:	size of private data to allocate space for + *	@name:		device name format string + *	@setup:		callback to initialize device + * + *	Allocates a struct net_device with private data area for driver use + *	and performs basic initialization. + */ +struct net_device *alloc_netdev(int sizeof_priv, const char *name, +		void (*setup)(struct net_device *)) +{ +	void *p; +	struct net_device *dev; +	int alloc_size; + +	/* ensure 32-byte alignment of both the device and private area */ +	alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; +	alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; + +	p = kmalloc(alloc_size, GFP_KERNEL); +	if (!p) { +		printk(KERN_ERR "alloc_dev: Unable to allocate device.\n"); +		return NULL; +	} +	memset(p, 0, alloc_size); + +	dev = (struct net_device *) +		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); +	dev->padded = (char *)dev - (char *)p; + +	if (sizeof_priv) +		dev->priv = netdev_priv(dev); + +	setup(dev); +	strcpy(dev->name, name); +	return dev; +} +EXPORT_SYMBOL(alloc_netdev); + +/** + *	free_netdev - free network device + *	@dev: device + * + *	This function does the last stage of destroying an allocated device  + * 	interface. The reference to the device object is released.   + *	If this is the last reference then it will be freed. 
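+ *
+ *	For orientation, a rough driver lifecycle sketch (editor's
+ *	illustration; struct my_priv, my_setup and "mydev%d" are
+ *	hypothetical):
+ *
+ *		dev = alloc_netdev(sizeof(struct my_priv), "mydev%d", my_setup);
+ *		if (!dev)
+ *			return -ENOMEM;
+ *		err = register_netdev(dev);
+ *		if (err) {
+ *			free_netdev(dev);
+ *			return err;
+ *		}
+ *		...
+ *		unregister_netdev(dev);
+ *		free_netdev(dev);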
+ */ +void free_netdev(struct net_device *dev) +{ +#ifdef CONFIG_SYSFS +	/*  Compatiablity with error handling in drivers */ +	if (dev->reg_state == NETREG_UNINITIALIZED) { +		kfree((char *)dev - dev->padded); +		return; +	} + +	BUG_ON(dev->reg_state != NETREG_UNREGISTERED); +	dev->reg_state = NETREG_RELEASED; + +	/* will free via class release */ +	class_device_put(&dev->class_dev); +#else +	kfree((char *)dev - dev->padded); +#endif +} +  +/* Synchronize with packet receive processing. */ +void synchronize_net(void)  +{ +	might_sleep(); +	synchronize_kernel(); +} + +/** + *	unregister_netdevice - remove device from the kernel + *	@dev: device + * + *	This function shuts down a device interface and removes it + *	from the kernel tables. On success 0 is returned, on a failure + *	a negative errno code is returned. + * + *	Callers must hold the rtnl semaphore.  You may want + *	unregister_netdev() instead of this. + */ + +int unregister_netdevice(struct net_device *dev) +{ +	struct net_device *d, **dp; + +	BUG_ON(dev_boot_phase); +	ASSERT_RTNL(); + +	/* Some devices call without registering for initialization unwind. */ +	if (dev->reg_state == NETREG_UNINITIALIZED) { +		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " +				  "was registered\n", dev->name, dev); +		return -ENODEV; +	} + +	BUG_ON(dev->reg_state != NETREG_REGISTERED); + +	/* If device is running, close it first. */ +	if (dev->flags & IFF_UP) +		dev_close(dev); + +	/* And unlink it from device chain. */ +	for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) { +		if (d == dev) { +			write_lock_bh(&dev_base_lock); +			hlist_del(&dev->name_hlist); +			hlist_del(&dev->index_hlist); +			if (dev_tail == &dev->next) +				dev_tail = dp; +			*dp = d->next; +			write_unlock_bh(&dev_base_lock); +			break; +		} +	} +	if (!d) { +		printk(KERN_ERR "unregister net_device: '%s' not found\n", +		       dev->name); +		return -ENODEV; +	} + +	dev->reg_state = NETREG_UNREGISTERING; + +	synchronize_net(); + +	/* Shutdown queueing discipline. */ +	dev_shutdown(dev); + +	 +	/* Notify protocols, that we are about to destroy +	   this device. They should clean all the things. +	*/ +	notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); +	 +	/* +	 *	Flush the multicast chain +	 */ +	dev_mc_discard(dev); + +	if (dev->uninit) +		dev->uninit(dev); + +	/* Notifier chain MUST detach us from master device. */ +	BUG_TRAP(!dev->master); + +	free_divert_blk(dev); + +	/* Finish processing unregister after unlock */ +	net_set_todo(dev); + +	synchronize_net(); + +	dev_put(dev); +	return 0; +} + +/** + *	unregister_netdev - remove device from the kernel + *	@dev: device + * + *	This function shuts down a device interface and removes it + *	from the kernel tables. On success 0 is returned, on a failure + *	a negative errno code is returned. + * + *	This is just a wrapper for unregister_netdevice that takes + *	the rtnl semaphore.  In general you want to use this and not + *	unregister_netdevice. 
+ */ +void unregister_netdev(struct net_device *dev) +{ +	rtnl_lock(); +	unregister_netdevice(dev); +	rtnl_unlock(); +} + +EXPORT_SYMBOL(unregister_netdev); + +#ifdef CONFIG_HOTPLUG_CPU +static int dev_cpu_callback(struct notifier_block *nfb, +			    unsigned long action, +			    void *ocpu) +{ +	struct sk_buff **list_skb; +	struct net_device **list_net; +	struct sk_buff *skb; +	unsigned int cpu, oldcpu = (unsigned long)ocpu; +	struct softnet_data *sd, *oldsd; + +	if (action != CPU_DEAD) +		return NOTIFY_OK; + +	local_irq_disable(); +	cpu = smp_processor_id(); +	sd = &per_cpu(softnet_data, cpu); +	oldsd = &per_cpu(softnet_data, oldcpu); + +	/* Find end of our completion_queue. */ +	list_skb = &sd->completion_queue; +	while (*list_skb) +		list_skb = &(*list_skb)->next; +	/* Append completion queue from offline CPU. */ +	*list_skb = oldsd->completion_queue; +	oldsd->completion_queue = NULL; + +	/* Find end of our output_queue. */ +	list_net = &sd->output_queue; +	while (*list_net) +		list_net = &(*list_net)->next_sched; +	/* Append output queue from offline CPU. */ +	*list_net = oldsd->output_queue; +	oldsd->output_queue = NULL; + +	raise_softirq_irqoff(NET_TX_SOFTIRQ); +	local_irq_enable(); + +	/* Process offline CPU's input_pkt_queue */ +	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) +		netif_rx(skb); + +	return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + + +/* + *	Initialize the DEV module. At boot time this walks the device list and + *	unhooks any devices that fail to initialise (normally hardware not + *	present) and leaves us with a valid list of present and active devices. + * + */ + +/* + *       This is called single threaded during boot, so no need + *       to take the rtnl semaphore. + */ +static int __init net_dev_init(void) +{ +	int i, rc = -ENOMEM; + +	BUG_ON(!dev_boot_phase); + +	net_random_init(); + +	if (dev_proc_init()) +		goto out; + +	if (netdev_sysfs_init()) +		goto out; + +	INIT_LIST_HEAD(&ptype_all); +	for (i = 0; i < 16; i++)  +		INIT_LIST_HEAD(&ptype_base[i]); + +	for (i = 0; i < ARRAY_SIZE(dev_name_head); i++) +		INIT_HLIST_HEAD(&dev_name_head[i]); + +	for (i = 0; i < ARRAY_SIZE(dev_index_head); i++) +		INIT_HLIST_HEAD(&dev_index_head[i]); + +	/* +	 *	Initialise the packet receive queues. 
+	 */ + +	for (i = 0; i < NR_CPUS; i++) { +		struct softnet_data *queue; + +		queue = &per_cpu(softnet_data, i); +		skb_queue_head_init(&queue->input_pkt_queue); +		queue->throttle = 0; +		queue->cng_level = 0; +		queue->avg_blog = 10; /* arbitrary non-zero */ +		queue->completion_queue = NULL; +		INIT_LIST_HEAD(&queue->poll_list); +		set_bit(__LINK_STATE_START, &queue->backlog_dev.state); +		queue->backlog_dev.weight = weight_p; +		queue->backlog_dev.poll = process_backlog; +		atomic_set(&queue->backlog_dev.refcnt, 1); +	} + +#ifdef OFFLINE_SAMPLE +	samp_timer.expires = jiffies + (10 * HZ); +	add_timer(&samp_timer); +#endif + +	dev_boot_phase = 0; + +	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); +	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); + +	hotcpu_notifier(dev_cpu_callback, 0); +	dst_init(); +	dev_mcast_init(); +	rc = 0; +out: +	return rc; +} + +subsys_initcall(net_dev_init); + +EXPORT_SYMBOL(__dev_get_by_index); +EXPORT_SYMBOL(__dev_get_by_name); +EXPORT_SYMBOL(__dev_remove_pack); +EXPORT_SYMBOL(__skb_linearize); +EXPORT_SYMBOL(dev_add_pack); +EXPORT_SYMBOL(dev_alloc_name); +EXPORT_SYMBOL(dev_close); +EXPORT_SYMBOL(dev_get_by_flags); +EXPORT_SYMBOL(dev_get_by_index); +EXPORT_SYMBOL(dev_get_by_name); +EXPORT_SYMBOL(dev_ioctl); +EXPORT_SYMBOL(dev_open); +EXPORT_SYMBOL(dev_queue_xmit); +EXPORT_SYMBOL(dev_remove_pack); +EXPORT_SYMBOL(dev_set_allmulti); +EXPORT_SYMBOL(dev_set_promiscuity); +EXPORT_SYMBOL(dev_change_flags); +EXPORT_SYMBOL(dev_set_mtu); +EXPORT_SYMBOL(dev_set_mac_address); +EXPORT_SYMBOL(free_netdev); +EXPORT_SYMBOL(netdev_boot_setup_check); +EXPORT_SYMBOL(netdev_set_master); +EXPORT_SYMBOL(netdev_state_change); +EXPORT_SYMBOL(netif_receive_skb); +EXPORT_SYMBOL(netif_rx); +EXPORT_SYMBOL(register_gifconf); +EXPORT_SYMBOL(register_netdevice); +EXPORT_SYMBOL(register_netdevice_notifier); +EXPORT_SYMBOL(skb_checksum_help); +EXPORT_SYMBOL(synchronize_net); +EXPORT_SYMBOL(unregister_netdevice); +EXPORT_SYMBOL(unregister_netdevice_notifier); +EXPORT_SYMBOL(net_enable_timestamp); +EXPORT_SYMBOL(net_disable_timestamp); +EXPORT_SYMBOL(dev_get_flags); + +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +EXPORT_SYMBOL(br_handle_frame_hook); +EXPORT_SYMBOL(br_fdb_get_hook); +EXPORT_SYMBOL(br_fdb_put_hook); +#endif + +#ifdef CONFIG_KMOD +EXPORT_SYMBOL(dev_load); +#endif + +EXPORT_PER_CPU_SYMBOL(softnet_data); diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c new file mode 100644 index 00000000000..db098ff3cd6 --- /dev/null +++ b/net/core/dev_mcast.c @@ -0,0 +1,299 @@ +/* + *	Linux NET3:	Multicast List maintenance.  + * + *	Authors: + *		Tim Kordas <tjk@nostromo.eeap.cwru.edu>  + *		Richard Underwood <richard@wuzz.demon.co.uk> + * + *	Stir fried together from the IP multicast and CAP patches above + *		Alan Cox <Alan.Cox@linux.org>	 + * + *	Fixes: + *		Alan Cox	:	Update the device on a real delete + *					rather than any time but... + *		Alan Cox	:	IFF_ALLMULTI support. + *		Alan Cox	: 	New format set_multicast_list() calls. + *		Gleb Natapov    :       Remove dev_mc_lock. + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. 
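+ *
+ *	(Editor's illustration, not part of the original header: a
+ *	protocol typically pins and later drops a link-layer group
+ *	address along these lines, where mc_addr is a hypothetical
+ *	ETH_ALEN-byte Ethernet multicast address.)
+ *
+ *		dev_mc_add(dev, mc_addr, ETH_ALEN, 0);
+ *		...
+ *		dev_mc_delete(dev, mc_addr, ETH_ALEN, 0);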
+ */ + +#include <linux/config.h>  +#include <linux/module.h>  +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/arp.h> + + +/* + *	Device multicast list maintenance.  + * + *	This is used both by IP and by the user level maintenance functions.  + *	Unlike BSD we maintain a usage count on a given multicast address so  + *	that a casual user application can add/delete multicasts used by  + *	protocols without doing damage to the protocols when it deletes the + *	entries. It also helps IP as it tracks overlapping maps. + * + *	Device mc lists are changed by bh at least if IPv6 is enabled, + *	so that it must be bh protected. + * + *	We block accesses to device mc filters with dev->xmit_lock. + */ + +/* + *	Update the multicast list into the physical NIC controller. + */ +  +static void __dev_mc_upload(struct net_device *dev) +{ +	/* Don't do anything till we up the interface +	 * [dev_open will call this function so the list will +	 * stay sane] +	 */ + +	if (!(dev->flags&IFF_UP)) +		return; + +	/* +	 *	Devices with no set multicast or which have been +	 *	detached don't get set. +	 */ + +	if (dev->set_multicast_list == NULL || +	    !netif_device_present(dev)) +		return; + +	dev->set_multicast_list(dev); +} + +void dev_mc_upload(struct net_device *dev) +{ +	spin_lock_bh(&dev->xmit_lock); +	__dev_mc_upload(dev); +	spin_unlock_bh(&dev->xmit_lock); +} + +/* + *	Delete a device level multicast + */ +  +int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) +{ +	int err = 0; +	struct dev_mc_list *dmi, **dmip; + +	spin_lock_bh(&dev->xmit_lock); + +	for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) { +		/* +		 *	Find the entry we want to delete. The device could +		 *	have variable length entries so check these too. +		 */ +		if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 && +		    alen == dmi->dmi_addrlen) { +			if (glbl) { +				int old_glbl = dmi->dmi_gusers; +				dmi->dmi_gusers = 0; +				if (old_glbl == 0) +					break; +			} +			if (--dmi->dmi_users) +				goto done; + +			/* +			 *	Last user. So delete the entry. +			 */ +			*dmip = dmi->next; +			dev->mc_count--; + +			kfree(dmi); + +			/* +			 *	We have altered the list, so the card +			 *	loaded filter is now wrong. 
Fix it +			 */ +			__dev_mc_upload(dev); +			 +			spin_unlock_bh(&dev->xmit_lock); +			return 0; +		} +	} +	err = -ENOENT; +done: +	spin_unlock_bh(&dev->xmit_lock); +	return err; +} + +/* + *	Add a device level multicast + */ +  +int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) +{ +	int err = 0; +	struct dev_mc_list *dmi, *dmi1; + +	dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), GFP_ATOMIC); + +	spin_lock_bh(&dev->xmit_lock); +	for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) { +		if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 && +		    dmi->dmi_addrlen == alen) { +			if (glbl) { +				int old_glbl = dmi->dmi_gusers; +				dmi->dmi_gusers = 1; +				if (old_glbl) +					goto done; +			} +			dmi->dmi_users++; +			goto done; +		} +	} + +	if ((dmi = dmi1) == NULL) { +		spin_unlock_bh(&dev->xmit_lock); +		return -ENOMEM; +	} +	memcpy(dmi->dmi_addr, addr, alen); +	dmi->dmi_addrlen = alen; +	dmi->next = dev->mc_list; +	dmi->dmi_users = 1; +	dmi->dmi_gusers = glbl ? 1 : 0; +	dev->mc_list = dmi; +	dev->mc_count++; + +	__dev_mc_upload(dev); +	 +	spin_unlock_bh(&dev->xmit_lock); +	return 0; + +done: +	spin_unlock_bh(&dev->xmit_lock); +	if (dmi1) +		kfree(dmi1); +	return err; +} + +/* + *	Discard multicast list when a device is downed + */ + +void dev_mc_discard(struct net_device *dev) +{ +	spin_lock_bh(&dev->xmit_lock); +	 +	while (dev->mc_list != NULL) { +		struct dev_mc_list *tmp = dev->mc_list; +		dev->mc_list = tmp->next; +		if (tmp->dmi_users > tmp->dmi_gusers) +			printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users); +		kfree(tmp); +	} +	dev->mc_count = 0; + +	spin_unlock_bh(&dev->xmit_lock); +} + +#ifdef CONFIG_PROC_FS +static void *dev_mc_seq_start(struct seq_file *seq, loff_t *pos) +{ +	struct net_device *dev; +	loff_t off = 0; + +	read_lock(&dev_base_lock); +	for (dev = dev_base; dev; dev = dev->next) { +		if (off++ == *pos)  +			return dev; +	} +	return NULL; +} + +static void *dev_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	struct net_device *dev = v; +	++*pos; +	return dev->next; +} + +static void dev_mc_seq_stop(struct seq_file *seq, void *v) +{ +	read_unlock(&dev_base_lock); +} + + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ +	struct dev_mc_list *m; +	struct net_device *dev = v; + +	spin_lock_bh(&dev->xmit_lock); +	for (m = dev->mc_list; m; m = m->next) { +		int i; + +		seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, +			   dev->name, m->dmi_users, m->dmi_gusers); + +		for (i = 0; i < m->dmi_addrlen; i++) +			seq_printf(seq, "%02x", m->dmi_addr[i]); + +		seq_putc(seq, '\n'); +	} +	spin_unlock_bh(&dev->xmit_lock); +	return 0; +} + +static struct seq_operations dev_mc_seq_ops = { +	.start = dev_mc_seq_start, +	.next  = dev_mc_seq_next, +	.stop  = dev_mc_seq_stop, +	.show  = dev_mc_seq_show, +}; + +static int dev_mc_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &dev_mc_seq_ops); +} + +static struct file_operations dev_mc_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = dev_mc_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release, +}; + +#endif + +void __init dev_mcast_init(void) +{ +	proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops); +} + +EXPORT_SYMBOL(dev_mc_add); +EXPORT_SYMBOL(dev_mc_delete); +EXPORT_SYMBOL(dev_mc_upload); diff --git a/net/core/dst.c b/net/core/dst.c new file mode 100644 index 00000000000..3bf6cc43481 --- /dev/null +++ b/net/core/dst.c @@ -0,0 +1,276 @@ +/* + * net/core/dst.c	Protocol independent 
destination cache. + * + * Authors:		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/string.h> +#include <linux/types.h> + +#include <net/dst.h> + +/* Locking strategy: + * 1) Garbage collection state of dead destination cache + *    entries is protected by dst_lock. + * 2) GC is run only from BH context, and is the only remover + *    of entries. + * 3) Entries are added to the garbage list from both BH + *    and non-BH context, so local BH disabling is needed. + * 4) All operations modify state, so a spinlock is used. + */ +static struct dst_entry 	*dst_garbage_list; +#if RT_CACHE_DEBUG >= 2  +static atomic_t			 dst_total = ATOMIC_INIT(0); +#endif +static DEFINE_SPINLOCK(dst_lock); + +static unsigned long dst_gc_timer_expires; +static unsigned long dst_gc_timer_inc = DST_GC_MAX; +static void dst_run_gc(unsigned long); +static void ___dst_free(struct dst_entry * dst); + +static struct timer_list dst_gc_timer = +	TIMER_INITIALIZER(dst_run_gc, DST_GC_MIN, 0); + +static void dst_run_gc(unsigned long dummy) +{ +	int    delayed = 0; +	struct dst_entry * dst, **dstp; + +	if (!spin_trylock(&dst_lock)) { +		mod_timer(&dst_gc_timer, jiffies + HZ/10); +		return; +	} + + +	del_timer(&dst_gc_timer); +	dstp = &dst_garbage_list; +	while ((dst = *dstp) != NULL) { +		if (atomic_read(&dst->__refcnt)) { +			dstp = &dst->next; +			delayed++; +			continue; +		} +		*dstp = dst->next; + +		dst = dst_destroy(dst); +		if (dst) { +			/* NOHASH and still referenced. Unless it is already +			 * on gc list, invalidate it and add to gc list. +			 * +			 * Note: this is temporary. Actually, NOHASH dst's +			 * must be obsoleted when parent is obsoleted. +			 * But we do not have state "obsoleted, but +			 * referenced by parent", so it is right. +			 */ +			if (dst->obsolete > 1) +				continue; + +			___dst_free(dst); +			dst->next = *dstp; +			*dstp = dst; +			dstp = &dst->next; +		} +	} +	if (!dst_garbage_list) { +		dst_gc_timer_inc = DST_GC_MAX; +		goto out; +	} +	if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) +		dst_gc_timer_expires = DST_GC_MAX; +	dst_gc_timer_inc += DST_GC_INC; +	dst_gc_timer.expires = jiffies + dst_gc_timer_expires; +#if RT_CACHE_DEBUG >= 2 +	printk("dst_total: %d/%d %ld\n", +	       atomic_read(&dst_total), delayed,  dst_gc_timer_expires); +#endif +	add_timer(&dst_gc_timer); + +out: +	spin_unlock(&dst_lock); +} + +static int dst_discard_in(struct sk_buff *skb) +{ +	kfree_skb(skb); +	return 0; +} + +static int dst_discard_out(struct sk_buff *skb) +{ +	kfree_skb(skb); +	return 0; +} + +void * dst_alloc(struct dst_ops * ops) +{ +	struct dst_entry * dst; + +	if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { +		if (ops->gc()) +			return NULL; +	} +	dst = kmem_cache_alloc(ops->kmem_cachep, SLAB_ATOMIC); +	if (!dst) +		return NULL; +	memset(dst, 0, ops->entry_size); +	atomic_set(&dst->__refcnt, 0); +	dst->ops = ops; +	dst->lastuse = jiffies; +	dst->path = dst; +	dst->input = dst_discard_in; +	dst->output = dst_discard_out; +#if RT_CACHE_DEBUG >= 2  +	atomic_inc(&dst_total); +#endif +	atomic_inc(&ops->entries); +	return dst; +} + +static void ___dst_free(struct dst_entry * dst) +{ +	/* The first case (dev==NULL) is required, when +	   protocol module is unloaded. 
+	 */ +	if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { +		dst->input = dst_discard_in; +		dst->output = dst_discard_out; +	} +	dst->obsolete = 2; +} + +void __dst_free(struct dst_entry * dst) +{ +	spin_lock_bh(&dst_lock); +	___dst_free(dst); +	dst->next = dst_garbage_list; +	dst_garbage_list = dst; +	if (dst_gc_timer_inc > DST_GC_INC) { +		dst_gc_timer_inc = DST_GC_INC; +		dst_gc_timer_expires = DST_GC_MIN; +		mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); +	} +	spin_unlock_bh(&dst_lock); +} + +struct dst_entry *dst_destroy(struct dst_entry * dst) +{ +	struct dst_entry *child; +	struct neighbour *neigh; +	struct hh_cache *hh; + +	smp_rmb(); + +again: +	neigh = dst->neighbour; +	hh = dst->hh; +	child = dst->child; + +	dst->hh = NULL; +	if (hh && atomic_dec_and_test(&hh->hh_refcnt)) +		kfree(hh); + +	if (neigh) { +		dst->neighbour = NULL; +		neigh_release(neigh); +	} + +	atomic_dec(&dst->ops->entries); + +	if (dst->ops->destroy) +		dst->ops->destroy(dst); +	if (dst->dev) +		dev_put(dst->dev); +#if RT_CACHE_DEBUG >= 2  +	atomic_dec(&dst_total); +#endif +	kmem_cache_free(dst->ops->kmem_cachep, dst); + +	dst = child; +	if (dst) { +		if (atomic_dec_and_test(&dst->__refcnt)) { +			/* We were real parent of this dst, so kill child. */ +			if (dst->flags&DST_NOHASH) +				goto again; +		} else { +			/* Child is still referenced, return it for freeing. */ +			if (dst->flags&DST_NOHASH) +				return dst; +			/* Child is still in his hash table */ +		} +	} +	return NULL; +} + +/* Dirty hack. We did it in 2.2 (in __dst_free), + * we have _very_ good reasons not to repeat + * this mistake in 2.3, but we have no choice + * now. _It_ _is_ _explicit_ _deliberate_ + * _race_ _condition_. + * + * Commented and originally written by Alexey. + */ +static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, +			      int unregister) +{ +	if (dst->ops->ifdown) +		dst->ops->ifdown(dst, dev, unregister); + +	if (dev != dst->dev) +		return; + +	if (!unregister) { +		dst->input = dst_discard_in; +		dst->output = dst_discard_out; +	} else { +		dst->dev = &loopback_dev; +		dev_hold(&loopback_dev); +		dev_put(dev); +		if (dst->neighbour && dst->neighbour->dev == dev) { +			dst->neighbour->dev = &loopback_dev; +			dev_put(dev); +			dev_hold(&loopback_dev); +		} +	} +} + +static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ +	struct net_device *dev = ptr; +	struct dst_entry *dst; + +	switch (event) { +	case NETDEV_UNREGISTER: +	case NETDEV_DOWN: +		spin_lock_bh(&dst_lock); +		for (dst = dst_garbage_list; dst; dst = dst->next) { +			dst_ifdown(dst, dev, event != NETDEV_DOWN); +		} +		spin_unlock_bh(&dst_lock); +		break; +	} +	return NOTIFY_DONE; +} + +static struct notifier_block dst_dev_notifier = { +	.notifier_call	= dst_dev_event, +}; + +void __init dst_init(void) +{ +	register_netdevice_notifier(&dst_dev_notifier); +} + +EXPORT_SYMBOL(__dst_free); +EXPORT_SYMBOL(dst_alloc); +EXPORT_SYMBOL(dst_destroy); diff --git a/net/core/dv.c b/net/core/dv.c new file mode 100644 index 00000000000..3f25f4aa4e6 --- /dev/null +++ b/net/core/dv.c @@ -0,0 +1,548 @@ +/* + * INET		An implementation of the TCP/IP protocol suite for the LINUX + *		operating system.  INET is implemented using the  BSD Socket + *		interface as the means of communication with the user level. 
+ * + *		Generic frame diversion + * + * Authors:	 + * 		Benoit LOCHER:	initial integration within the kernel with support for ethernet + * 		Dave Miller:	improvement on the code (correctness, performance and source files) + * + */ +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <net/dst.h> +#include <net/arp.h> +#include <net/sock.h> +#include <net/ipv6.h> +#include <net/ip.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/checksum.h> +#include <linux/divert.h> +#include <linux/sockios.h> + +const char sysctl_divert_version[32]="0.46";	/* Current version */ + +static int __init dv_init(void) +{ +	return 0; +} +module_init(dv_init); + +/* + * Allocate a divert_blk for a device. This must be an ethernet nic. + */ +int alloc_divert_blk(struct net_device *dev) +{ +	int alloc_size = (sizeof(struct divert_blk) + 3) & ~3; + +	dev->divert = NULL; +	if (dev->type == ARPHRD_ETHER) { +		dev->divert = (struct divert_blk *) +			kmalloc(alloc_size, GFP_KERNEL); +		if (dev->divert == NULL) { +			printk(KERN_INFO "divert: unable to allocate divert_blk for %s\n", +			       dev->name); +			return -ENOMEM; +		} + +		memset(dev->divert, 0, sizeof(struct divert_blk)); +		dev_hold(dev); +	} + +	return 0; +}  + +/* + * Free a divert_blk allocated by the above function, if it was  + * allocated on that device. + */ +void free_divert_blk(struct net_device *dev) +{ +	if (dev->divert) { +		kfree(dev->divert); +		dev->divert=NULL; +		dev_put(dev); +	} +} + +/* + * Adds a tcp/udp (source or dest) port to an array + */ +static int add_port(u16 ports[], u16 port) +{ +	int i; + +	if (port == 0) +		return -EINVAL; + +	/* Storing directly in network format for performance, +	 * thanks Dave :) +	 */ +	port = htons(port); + +	for (i = 0; i < MAX_DIVERT_PORTS; i++) { +		if (ports[i] == port) +			return -EALREADY; +	} +	 +	for (i = 0; i < MAX_DIVERT_PORTS; i++) { +		if (ports[i] == 0) { +			ports[i] = port; +			return 0; +		} +	} + +	return -ENOBUFS; +} + +/* + * Removes a port from an array tcp/udp (source or dest) + */ +static int remove_port(u16 ports[], u16 port) +{ +	int i; + +	if (port == 0) +		return -EINVAL; +	 +	/* Storing directly in network format for performance, +	 * thanks Dave ! 
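
Because add_port() stores ports already converted with htons(), the per-packet matching done later in this file by divert_frame() can compare the raw header fields directly, with no byte swap in the fast path. A standalone illustration of that design choice (not part of the patch):

	static int demo_tcp_port_match(const u16 ports[], const struct tcphdr *tcph)
	{
		int i;

		for (i = 0; i < MAX_DIVERT_PORTS; i++)
			if (ports[i] &&
			    (ports[i] == tcph->dest || ports[i] == tcph->source))
				return 1;	/* wire format on both sides, no ntohs() needed */
		return 0;
	}
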
+	 */ +	port = htons(port); + +	for (i = 0; i < MAX_DIVERT_PORTS; i++) { +		if (ports[i] == port) { +			ports[i] = 0; +			return 0; +		} +	} + +	return -EINVAL; +} + +/* Some basic sanity checks on the arguments passed to divert_ioctl() */ +static int check_args(struct divert_cf *div_cf, struct net_device **dev) +{ +	char devname[32]; +	int ret; + +	if (dev == NULL) +		return -EFAULT; +	 +	/* GETVERSION: all other args are unused */ +	if (div_cf->cmd == DIVCMD_GETVERSION) +		return 0; +	 +	/* Network device index should reasonably be between 0 and 1000 :) */ +	if (div_cf->dev_index < 0 || div_cf->dev_index > 1000)  +		return -EINVAL; +			 +	/* Let's try to find the ifname */ +	sprintf(devname, "eth%d", div_cf->dev_index); +	*dev = dev_get_by_name(devname); +	 +	/* dev should NOT be null */ +	if (*dev == NULL) +		return -EINVAL; + +	ret = 0; + +	/* user issuing the ioctl must be a super one :) */ +	if (!capable(CAP_SYS_ADMIN)) { +		ret = -EPERM; +		goto out; +	} + +	/* Device must have a divert_blk member NOT null */ +	if ((*dev)->divert == NULL) +		ret = -EINVAL; +out: +	dev_put(*dev); +	return ret; +} + +/* + * control function of the diverter + */ +#if 0 +#define	DVDBG(a)	\ +	printk(KERN_DEBUG "divert_ioctl() line %d %s\n", __LINE__, (a)) +#else +#define	DVDBG(a) +#endif + +int divert_ioctl(unsigned int cmd, struct divert_cf __user *arg) +{ +	struct divert_cf	div_cf; +	struct divert_blk	*div_blk; +	struct net_device	*dev; +	int			ret; + +	switch (cmd) { +	case SIOCGIFDIVERT: +		DVDBG("SIOCGIFDIVERT, copy_from_user"); +		if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf))) +			return -EFAULT; +		DVDBG("before check_args"); +		ret = check_args(&div_cf, &dev); +		if (ret) +			return ret; +		DVDBG("after checkargs"); +		div_blk = dev->divert; +			 +		DVDBG("befre switch()"); +		switch (div_cf.cmd) { +		case DIVCMD_GETSTATUS: +			/* Now, just give the user the raw divert block +			 * for him to play with :) +			 */ +			if (copy_to_user(div_cf.arg1.ptr, dev->divert, +					 sizeof(struct divert_blk))) +				return -EFAULT; +			break; + +		case DIVCMD_GETVERSION: +			DVDBG("GETVERSION: checking ptr"); +			if (div_cf.arg1.ptr == NULL) +				return -EINVAL; +			DVDBG("GETVERSION: copying data to userland"); +			if (copy_to_user(div_cf.arg1.ptr, +					 sysctl_divert_version, 32)) +				return -EFAULT; +			DVDBG("GETVERSION: data copied"); +			break; + +		default: +			return -EINVAL; +		} + +		break; + +	case SIOCSIFDIVERT: +		if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf))) +			return -EFAULT; + +		ret = check_args(&div_cf, &dev); +		if (ret) +			return ret; + +		div_blk = dev->divert; + +		switch(div_cf.cmd) { +		case DIVCMD_RESET: +			div_blk->divert = 0; +			div_blk->protos = DIVERT_PROTO_NONE; +			memset(div_blk->tcp_dst, 0, +			       MAX_DIVERT_PORTS * sizeof(u16)); +			memset(div_blk->tcp_src, 0, +			       MAX_DIVERT_PORTS * sizeof(u16)); +			memset(div_blk->udp_dst, 0, +			       MAX_DIVERT_PORTS * sizeof(u16)); +			memset(div_blk->udp_src, 0, +			       MAX_DIVERT_PORTS * sizeof(u16)); +			return 0; +				 +		case DIVCMD_DIVERT: +			switch(div_cf.arg1.int32) { +			case DIVARG1_ENABLE: +				if (div_blk->divert) +					return -EALREADY; +				div_blk->divert = 1; +				break; + +			case DIVARG1_DISABLE: +				if (!div_blk->divert) +					return -EALREADY; +				div_blk->divert = 0; +				break; + +			default: +				return -EINVAL; +			} + +			break; + +		case DIVCMD_IP: +			switch(div_cf.arg1.int32) { +			case DIVARG1_ENABLE: +				if (div_blk->protos & DIVERT_PROTO_IP) +					return 
-EALREADY; +				div_blk->protos |= DIVERT_PROTO_IP; +				break; + +			case DIVARG1_DISABLE: +				if (!(div_blk->protos & DIVERT_PROTO_IP)) +					return -EALREADY; +				div_blk->protos &= ~DIVERT_PROTO_IP; +				break; + +			default: +				return -EINVAL; +			} + +			break; + +		case DIVCMD_TCP: +			switch(div_cf.arg1.int32) { +			case DIVARG1_ENABLE: +				if (div_blk->protos & DIVERT_PROTO_TCP) +					return -EALREADY; +				div_blk->protos |= DIVERT_PROTO_TCP; +				break; + +			case DIVARG1_DISABLE: +				if (!(div_blk->protos & DIVERT_PROTO_TCP)) +					return -EALREADY; +				div_blk->protos &= ~DIVERT_PROTO_TCP; +				break; + +			default: +				return -EINVAL; +			} + +			break; + +		case DIVCMD_TCPDST: +			switch(div_cf.arg1.int32) { +			case DIVARG1_ADD: +				return add_port(div_blk->tcp_dst, +						div_cf.arg2.uint16); +				 +			case DIVARG1_REMOVE: +				return remove_port(div_blk->tcp_dst, +						   div_cf.arg2.uint16); + +			default: +				return -EINVAL; +			} + +			break; + +		case DIVCMD_TCPSRC: +			switch(div_cf.arg1.int32) { +			case DIVARG1_ADD: +				return add_port(div_blk->tcp_src, +						div_cf.arg2.uint16); + +			case DIVARG1_REMOVE: +				return remove_port(div_blk->tcp_src, +						   div_cf.arg2.uint16); + +			default: +				return -EINVAL; +			} + +			break; + +		case DIVCMD_UDP: +			switch(div_cf.arg1.int32) { +			case DIVARG1_ENABLE: +				if (div_blk->protos & DIVERT_PROTO_UDP) +					return -EALREADY; +				div_blk->protos |= DIVERT_PROTO_UDP; +				break; + +			case DIVARG1_DISABLE: +				if (!(div_blk->protos & DIVERT_PROTO_UDP)) +					return -EALREADY; +				div_blk->protos &= ~DIVERT_PROTO_UDP; +				break; + +			default: +				return -EINVAL; +			} + +			break; + +		case DIVCMD_UDPDST: +			switch(div_cf.arg1.int32) { +			case DIVARG1_ADD: +				return add_port(div_blk->udp_dst, +						div_cf.arg2.uint16); + +			case DIVARG1_REMOVE: +				return remove_port(div_blk->udp_dst, +						   div_cf.arg2.uint16); + +			default: +				return -EINVAL; +			} + +			break; + +		case DIVCMD_UDPSRC: +			switch(div_cf.arg1.int32) { +			case DIVARG1_ADD: +				return add_port(div_blk->udp_src, +						div_cf.arg2.uint16); + +			case DIVARG1_REMOVE: +				return remove_port(div_blk->udp_src, +						   div_cf.arg2.uint16); + +			default: +				return -EINVAL; +			} + +			break; + +		case DIVCMD_ICMP: +			switch(div_cf.arg1.int32) { +			case DIVARG1_ENABLE: +				if (div_blk->protos & DIVERT_PROTO_ICMP) +					return -EALREADY; +				div_blk->protos |= DIVERT_PROTO_ICMP; +				break; + +			case DIVARG1_DISABLE: +				if (!(div_blk->protos & DIVERT_PROTO_ICMP)) +					return -EALREADY; +				div_blk->protos &= ~DIVERT_PROTO_ICMP; +				break; + +			default: +				return -EINVAL; +			} + +			break; + +		default: +			return -EINVAL; +		} + +		break; + +	default: +		return -EINVAL; +	} + +	return 0; +} + + +/* + * Check if packet should have its dest mac address set to the box itself + * for diversion + */ + +#define	ETH_DIVERT_FRAME(skb) \ +	memcpy(eth_hdr(skb), skb->dev->dev_addr, ETH_ALEN); \ +	skb->pkt_type=PACKET_HOST +		 +void divert_frame(struct sk_buff *skb) +{ +	struct ethhdr			*eth = eth_hdr(skb); +	struct iphdr			*iph; +	struct tcphdr			*tcph; +	struct udphdr			*udph; +	struct divert_blk		*divert = skb->dev->divert; +	int				i, src, dst; +	unsigned char			*skb_data_end = skb->data + skb->len; + +	/* Packet is already aimed at us, return */ +	if (!memcmp(eth, skb->dev->dev_addr, ETH_ALEN)) +		return; +	 +	/* proto is not IP, do nothing */ +	if (eth->h_proto != htons(ETH_P_IP)) +		return; +	 +	/* Divert all 
IP frames ? */ +	if (divert->protos & DIVERT_PROTO_IP) { +		ETH_DIVERT_FRAME(skb); +		return; +	} +	 +	/* Check for possible (maliciously) malformed IP frame (thanks Dave) */ +	iph = (struct iphdr *) skb->data; +	if (((iph->ihl<<2)+(unsigned char*)(iph)) >= skb_data_end) { +		printk(KERN_INFO "divert: malformed IP packet !\n"); +		return; +	} + +	switch (iph->protocol) { +	/* Divert all ICMP frames ? */ +	case IPPROTO_ICMP: +		if (divert->protos & DIVERT_PROTO_ICMP) { +			ETH_DIVERT_FRAME(skb); +			return; +		} +		break; + +	/* Divert all TCP frames ? */ +	case IPPROTO_TCP: +		if (divert->protos & DIVERT_PROTO_TCP) { +			ETH_DIVERT_FRAME(skb); +			return; +		} + +		/* Check for possible (maliciously) malformed IP +		 * frame (thanx Dave) +		 */ +		tcph = (struct tcphdr *) +			(((unsigned char *)iph) + (iph->ihl<<2)); +		if (((unsigned char *)(tcph+1)) >= skb_data_end) { +			printk(KERN_INFO "divert: malformed TCP packet !\n"); +			return; +		} + +		/* Divert some tcp dst/src ports only ?*/ +		for (i = 0; i < MAX_DIVERT_PORTS; i++) { +			dst = divert->tcp_dst[i]; +			src = divert->tcp_src[i]; +			if ((dst && dst == tcph->dest) || +			    (src && src == tcph->source)) { +				ETH_DIVERT_FRAME(skb); +				return; +			} +		} +		break; + +	/* Divert all UDP frames ? */ +	case IPPROTO_UDP: +		if (divert->protos & DIVERT_PROTO_UDP) { +			ETH_DIVERT_FRAME(skb); +			return; +		} + +		/* Check for possible (maliciously) malformed IP +		 * packet (thanks Dave) +		 */ +		udph = (struct udphdr *) +			(((unsigned char *)iph) + (iph->ihl<<2)); +		if (((unsigned char *)(udph+1)) >= skb_data_end) { +			printk(KERN_INFO +			       "divert: malformed UDP packet !\n"); +			return; +		} + +		/* Divert some udp dst/src ports only ? */ +		for (i = 0; i < MAX_DIVERT_PORTS; i++) { +			dst = divert->udp_dst[i]; +			src = divert->udp_src[i]; +			if ((dst && dst == udph->dest) || +			    (src && src == udph->source)) { +				ETH_DIVERT_FRAME(skb); +				return; +			} +		} +		break; +	} +} diff --git a/net/core/ethtool.c b/net/core/ethtool.c new file mode 100644 index 00000000000..f05fde97c43 --- /dev/null +++ b/net/core/ethtool.c @@ -0,0 +1,819 @@ +/* + * net/core/ethtool.c - Ethtool ioctl handler + * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx> + * + * This file is where we call all the ethtool_ops commands to get + * the information ethtool needs.  We fall back to calling do_ioctl() + * for drivers which haven't been converted to ethtool_ops yet. + * + * It's GPL, stupid. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/ethtool.h> +#include <linux/netdevice.h> +#include <asm/uaccess.h> + +/*  + * Some useful ethtool_ops methods that're device independent. + * If we find that all drivers want to do the same thing here, + * we can turn these into dev_() function calls. + */ + +u32 ethtool_op_get_link(struct net_device *dev) +{ +	return netif_carrier_ok(dev) ? 
1 : 0; +} + +u32 ethtool_op_get_tx_csum(struct net_device *dev) +{ +	return (dev->features & NETIF_F_IP_CSUM) != 0; +} + +int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) +{ +	if (data) +		dev->features |= NETIF_F_IP_CSUM; +	else +		dev->features &= ~NETIF_F_IP_CSUM; + +	return 0; +} + +u32 ethtool_op_get_sg(struct net_device *dev) +{ +	return (dev->features & NETIF_F_SG) != 0; +} + +int ethtool_op_set_sg(struct net_device *dev, u32 data) +{ +	if (data) +		dev->features |= NETIF_F_SG; +	else +		dev->features &= ~NETIF_F_SG; + +	return 0; +} + +u32 ethtool_op_get_tso(struct net_device *dev) +{ +	return (dev->features & NETIF_F_TSO) != 0; +} + +int ethtool_op_set_tso(struct net_device *dev, u32 data) +{ +	if (data) +		dev->features |= NETIF_F_TSO; +	else +		dev->features &= ~NETIF_F_TSO; + +	return 0; +} + +/* Handlers for each ethtool command */ + +static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_cmd cmd = { ETHTOOL_GSET }; +	int err; + +	if (!dev->ethtool_ops->get_settings) +		return -EOPNOTSUPP; + +	err = dev->ethtool_ops->get_settings(dev, &cmd); +	if (err < 0) +		return err; + +	if (copy_to_user(useraddr, &cmd, sizeof(cmd))) +		return -EFAULT; +	return 0; +} + +static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_cmd cmd; + +	if (!dev->ethtool_ops->set_settings) +		return -EOPNOTSUPP; + +	if (copy_from_user(&cmd, useraddr, sizeof(cmd))) +		return -EFAULT; + +	return dev->ethtool_ops->set_settings(dev, &cmd); +} + +static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_drvinfo info; +	struct ethtool_ops *ops = dev->ethtool_ops; + +	if (!ops->get_drvinfo) +		return -EOPNOTSUPP; + +	memset(&info, 0, sizeof(info)); +	info.cmd = ETHTOOL_GDRVINFO; +	ops->get_drvinfo(dev, &info); + +	if (ops->self_test_count) +		info.testinfo_len = ops->self_test_count(dev); +	if (ops->get_stats_count) +		info.n_stats = ops->get_stats_count(dev); +	if (ops->get_regs_len) +		info.regdump_len = ops->get_regs_len(dev); +	if (ops->get_eeprom_len) +		info.eedump_len = ops->get_eeprom_len(dev); + +	if (copy_to_user(useraddr, &info, sizeof(info))) +		return -EFAULT; +	return 0; +} + +static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_regs regs; +	struct ethtool_ops *ops = dev->ethtool_ops; +	void *regbuf; +	int reglen, ret; + +	if (!ops->get_regs || !ops->get_regs_len) +		return -EOPNOTSUPP; + +	if (copy_from_user(®s, useraddr, sizeof(regs))) +		return -EFAULT; + +	reglen = ops->get_regs_len(dev); +	if (regs.len > reglen) +		regs.len = reglen; + +	regbuf = kmalloc(reglen, GFP_USER); +	if (!regbuf) +		return -ENOMEM; + +	ops->get_regs(dev, ®s, regbuf); + +	ret = -EFAULT; +	if (copy_to_user(useraddr, ®s, sizeof(regs))) +		goto out; +	useraddr += offsetof(struct ethtool_regs, data); +	if (copy_to_user(useraddr, regbuf, regs.len)) +		goto out; +	ret = 0; + + out: +	kfree(regbuf); +	return ret; +} + +static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_wolinfo wol = { ETHTOOL_GWOL }; + +	if (!dev->ethtool_ops->get_wol) +		return -EOPNOTSUPP; + +	dev->ethtool_ops->get_wol(dev, &wol); + +	if (copy_to_user(useraddr, &wol, sizeof(wol))) +		return -EFAULT; +	return 0; +} + +static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_wolinfo wol; + +	if (!dev->ethtool_ops->set_wol) +		return -EOPNOTSUPP; + +	if (copy_from_user(&wol, useraddr, sizeof(wol))) +		
return -EFAULT; + +	return dev->ethtool_ops->set_wol(dev, &wol); +} + +static int ethtool_get_msglevel(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_value edata = { ETHTOOL_GMSGLVL }; + +	if (!dev->ethtool_ops->get_msglevel) +		return -EOPNOTSUPP; + +	edata.data = dev->ethtool_ops->get_msglevel(dev); + +	if (copy_to_user(useraddr, &edata, sizeof(edata))) +		return -EFAULT; +	return 0; +} + +static int ethtool_set_msglevel(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_value edata; + +	if (!dev->ethtool_ops->set_msglevel) +		return -EOPNOTSUPP; + +	if (copy_from_user(&edata, useraddr, sizeof(edata))) +		return -EFAULT; + +	dev->ethtool_ops->set_msglevel(dev, edata.data); +	return 0; +} + +static int ethtool_nway_reset(struct net_device *dev) +{ +	if (!dev->ethtool_ops->nway_reset) +		return -EOPNOTSUPP; + +	return dev->ethtool_ops->nway_reset(dev); +} + +static int ethtool_get_link(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_value edata = { ETHTOOL_GLINK }; + +	if (!dev->ethtool_ops->get_link) +		return -EOPNOTSUPP; + +	edata.data = dev->ethtool_ops->get_link(dev); + +	if (copy_to_user(useraddr, &edata, sizeof(edata))) +		return -EFAULT; +	return 0; +} + +static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_eeprom eeprom; +	struct ethtool_ops *ops = dev->ethtool_ops; +	u8 *data; +	int ret; + +	if (!ops->get_eeprom || !ops->get_eeprom_len) +		return -EOPNOTSUPP; + +	if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) +		return -EFAULT; + +	/* Check for wrap and zero */ +	if (eeprom.offset + eeprom.len <= eeprom.offset) +		return -EINVAL; + +	/* Check for exceeding total eeprom len */ +	if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) +		return -EINVAL; + +	data = kmalloc(eeprom.len, GFP_USER); +	if (!data) +		return -ENOMEM; + +	ret = -EFAULT; +	if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len)) +		goto out; + +	ret = ops->get_eeprom(dev, &eeprom, data); +	if (ret) +		goto out; + +	ret = -EFAULT; +	if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) +		goto out; +	if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len)) +		goto out; +	ret = 0; + + out: +	kfree(data); +	return ret; +} + +static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_eeprom eeprom; +	struct ethtool_ops *ops = dev->ethtool_ops; +	u8 *data; +	int ret; + +	if (!ops->set_eeprom || !ops->get_eeprom_len) +		return -EOPNOTSUPP; + +	if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) +		return -EFAULT; + +	/* Check for wrap and zero */ +	if (eeprom.offset + eeprom.len <= eeprom.offset) +		return -EINVAL; + +	/* Check for exceeding total eeprom len */ +	if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) +		return -EINVAL; + +	data = kmalloc(eeprom.len, GFP_USER); +	if (!data) +		return -ENOMEM; + +	ret = -EFAULT; +	if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len)) +		goto out; + +	ret = ops->set_eeprom(dev, &eeprom, data); +	if (ret) +		goto out; + +	if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len)) +		ret = -EFAULT; + + out: +	kfree(data); +	return ret; +} + +static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE }; + +	if (!dev->ethtool_ops->get_coalesce) +		return -EOPNOTSUPP; + +	dev->ethtool_ops->get_coalesce(dev, &coalesce); + +	if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) +		return -EFAULT; +	return 0; +} + 
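
All of the handlers above are reached through the SIOCETHTOOL ioctl, dispatched later in this file by dev_ethtool(). A hedged user-space sketch of driving ETHTOOL_GLINK, the command served by ethtool_get_link(); sock_fd is any ordinary socket (for example socket(AF_INET, SOCK_DGRAM, 0)), error handling is trimmed, and the function name is illustrative:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <net/if.h>
	#include <linux/sockios.h>
	#include <linux/ethtool.h>

	static int demo_link_up(int sock_fd, const char *ifname)
	{
		struct ethtool_value edata = { .cmd = ETHTOOL_GLINK };
		struct ifreq ifr;

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
		ifr.ifr_data = (char *)&edata;

		if (ioctl(sock_fd, SIOCETHTOOL, &ifr) < 0)
			return -1;
		return edata.data != 0;	/* filled in by ethtool_get_link() */
	}
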
+static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_coalesce coalesce; + +	if (!dev->ethtool_ops->get_coalesce) +		return -EOPNOTSUPP; + +	if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) +		return -EFAULT; + +	return dev->ethtool_ops->set_coalesce(dev, &coalesce); +} + +static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM }; + +	if (!dev->ethtool_ops->get_ringparam) +		return -EOPNOTSUPP; + +	dev->ethtool_ops->get_ringparam(dev, &ringparam); + +	if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) +		return -EFAULT; +	return 0; +} + +static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_ringparam ringparam; + +	if (!dev->ethtool_ops->set_ringparam) +		return -EOPNOTSUPP; + +	if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) +		return -EFAULT; + +	return dev->ethtool_ops->set_ringparam(dev, &ringparam); +} + +static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; + +	if (!dev->ethtool_ops->get_pauseparam) +		return -EOPNOTSUPP; + +	dev->ethtool_ops->get_pauseparam(dev, &pauseparam); + +	if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) +		return -EFAULT; +	return 0; +} + +static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_pauseparam pauseparam; + +	if (!dev->ethtool_ops->get_pauseparam) +		return -EOPNOTSUPP; + +	if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) +		return -EFAULT; + +	return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); +} + +static int ethtool_get_rx_csum(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_value edata = { ETHTOOL_GRXCSUM }; + +	if (!dev->ethtool_ops->get_rx_csum) +		return -EOPNOTSUPP; + +	edata.data = dev->ethtool_ops->get_rx_csum(dev); + +	if (copy_to_user(useraddr, &edata, sizeof(edata))) +		return -EFAULT; +	return 0; +} + +static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_value edata; + +	if (!dev->ethtool_ops->set_rx_csum) +		return -EOPNOTSUPP; + +	if (copy_from_user(&edata, useraddr, sizeof(edata))) +		return -EFAULT; + +	dev->ethtool_ops->set_rx_csum(dev, edata.data); +	return 0; +} + +static int ethtool_get_tx_csum(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_value edata = { ETHTOOL_GTXCSUM }; + +	if (!dev->ethtool_ops->get_tx_csum) +		return -EOPNOTSUPP; + +	edata.data = dev->ethtool_ops->get_tx_csum(dev); + +	if (copy_to_user(useraddr, &edata, sizeof(edata))) +		return -EFAULT; +	return 0; +} + +static int __ethtool_set_sg(struct net_device *dev, u32 data) +{ +	int err; + +	if (!data && dev->ethtool_ops->set_tso) { +		err = dev->ethtool_ops->set_tso(dev, 0); +		if (err) +			return err; +	} + +	return dev->ethtool_ops->set_sg(dev, data); +} + +static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_value edata; +	int err; + +	if (!dev->ethtool_ops->set_tx_csum) +		return -EOPNOTSUPP; + +	if (copy_from_user(&edata, useraddr, sizeof(edata))) +		return -EFAULT; + +	if (!edata.data && dev->ethtool_ops->set_sg) { +		err = __ethtool_set_sg(dev, 0); +		if (err) +			return err; +	} + +	return dev->ethtool_ops->set_tx_csum(dev, edata.data); +} + +static int ethtool_get_sg(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_value edata 
= { ETHTOOL_GSG }; + +	if (!dev->ethtool_ops->get_sg) +		return -EOPNOTSUPP; + +	edata.data = dev->ethtool_ops->get_sg(dev); + +	if (copy_to_user(useraddr, &edata, sizeof(edata))) +		return -EFAULT; +	return 0; +} + +static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_value edata; + +	if (!dev->ethtool_ops->set_sg) +		return -EOPNOTSUPP; + +	if (copy_from_user(&edata, useraddr, sizeof(edata))) +		return -EFAULT; + +	if (edata.data &&  +	    !(dev->features & (NETIF_F_IP_CSUM | +			       NETIF_F_NO_CSUM | +			       NETIF_F_HW_CSUM))) +		return -EINVAL; + +	return __ethtool_set_sg(dev, edata.data); +} + +static int ethtool_get_tso(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_value edata = { ETHTOOL_GTSO }; + +	if (!dev->ethtool_ops->get_tso) +		return -EOPNOTSUPP; + +	edata.data = dev->ethtool_ops->get_tso(dev); + +	if (copy_to_user(useraddr, &edata, sizeof(edata))) +		return -EFAULT; +	return 0; +} + +static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_value edata; + +	if (!dev->ethtool_ops->set_tso) +		return -EOPNOTSUPP; + +	if (copy_from_user(&edata, useraddr, sizeof(edata))) +		return -EFAULT; + +	if (edata.data && !(dev->features & NETIF_F_SG)) +		return -EINVAL; + +	return dev->ethtool_ops->set_tso(dev, edata.data); +} + +static int ethtool_self_test(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_test test; +	struct ethtool_ops *ops = dev->ethtool_ops; +	u64 *data; +	int ret; + +	if (!ops->self_test || !ops->self_test_count) +		return -EOPNOTSUPP; + +	if (copy_from_user(&test, useraddr, sizeof(test))) +		return -EFAULT; + +	test.len = ops->self_test_count(dev); +	data = kmalloc(test.len * sizeof(u64), GFP_USER); +	if (!data) +		return -ENOMEM; + +	ops->self_test(dev, &test, data); + +	ret = -EFAULT; +	if (copy_to_user(useraddr, &test, sizeof(test))) +		goto out; +	useraddr += sizeof(test); +	if (copy_to_user(useraddr, data, test.len * sizeof(u64))) +		goto out; +	ret = 0; + + out: +	kfree(data); +	return ret; +} + +static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_gstrings gstrings; +	struct ethtool_ops *ops = dev->ethtool_ops; +	u8 *data; +	int ret; + +	if (!ops->get_strings) +		return -EOPNOTSUPP; + +	if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) +		return -EFAULT; + +	switch (gstrings.string_set) { +	case ETH_SS_TEST: +		if (!ops->self_test_count) +			return -EOPNOTSUPP; +		gstrings.len = ops->self_test_count(dev); +		break; +	case ETH_SS_STATS: +		if (!ops->get_stats_count) +			return -EOPNOTSUPP; +		gstrings.len = ops->get_stats_count(dev); +		break; +	default: +		return -EINVAL; +	} + +	data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); +	if (!data) +		return -ENOMEM; + +	ops->get_strings(dev, gstrings.string_set, data); + +	ret = -EFAULT; +	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) +		goto out; +	useraddr += sizeof(gstrings); +	if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) +		goto out; +	ret = 0; + + out: +	kfree(data); +	return ret; +} + +static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_value id; + +	if (!dev->ethtool_ops->phys_id) +		return -EOPNOTSUPP; + +	if (copy_from_user(&id, useraddr, sizeof(id))) +		return -EFAULT; + +	return dev->ethtool_ops->phys_id(dev, id.data); +} + +static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) +{ +	struct ethtool_stats stats; +	struct ethtool_ops 
*ops = dev->ethtool_ops; +	u64 *data; +	int ret; + +	if (!ops->get_ethtool_stats || !ops->get_stats_count) +		return -EOPNOTSUPP; + +	if (copy_from_user(&stats, useraddr, sizeof(stats))) +		return -EFAULT; + +	stats.n_stats = ops->get_stats_count(dev); +	data = kmalloc(stats.n_stats * sizeof(u64), GFP_USER); +	if (!data) +		return -ENOMEM; + +	ops->get_ethtool_stats(dev, &stats, data); + +	ret = -EFAULT; +	if (copy_to_user(useraddr, &stats, sizeof(stats))) +		goto out; +	useraddr += sizeof(stats); +	if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) +		goto out; +	ret = 0; + + out: +	kfree(data); +	return ret; +} + +/* The main entry point in this file.  Called from net/core/dev.c */ + +int dev_ethtool(struct ifreq *ifr) +{ +	struct net_device *dev = __dev_get_by_name(ifr->ifr_name); +	void __user *useraddr = ifr->ifr_data; +	u32 ethcmd; +	int rc; + +	/* +	 * XXX: This can be pushed down into the ethtool_* handlers that +	 * need it.  Keep existing behaviour for the moment. +	 */ +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; + +	if (!dev || !netif_device_present(dev)) +		return -ENODEV; + +	if (!dev->ethtool_ops) +		goto ioctl; + +	if (copy_from_user(ðcmd, useraddr, sizeof (ethcmd))) +		return -EFAULT; + +	if(dev->ethtool_ops->begin) +		if ((rc = dev->ethtool_ops->begin(dev)) < 0) +			return rc; + +	switch (ethcmd) { +	case ETHTOOL_GSET: +		rc = ethtool_get_settings(dev, useraddr); +		break; +	case ETHTOOL_SSET: +		rc = ethtool_set_settings(dev, useraddr); +		break; +	case ETHTOOL_GDRVINFO: +		rc = ethtool_get_drvinfo(dev, useraddr); + +		break; +	case ETHTOOL_GREGS: +		rc = ethtool_get_regs(dev, useraddr); +		break; +	case ETHTOOL_GWOL: +		rc = ethtool_get_wol(dev, useraddr); +		break; +	case ETHTOOL_SWOL: +		rc = ethtool_set_wol(dev, useraddr); +		break; +	case ETHTOOL_GMSGLVL: +		rc = ethtool_get_msglevel(dev, useraddr); +		break; +	case ETHTOOL_SMSGLVL: +		rc = ethtool_set_msglevel(dev, useraddr); +		break; +	case ETHTOOL_NWAY_RST: +		rc = ethtool_nway_reset(dev); +		break; +	case ETHTOOL_GLINK: +		rc = ethtool_get_link(dev, useraddr); +		break; +	case ETHTOOL_GEEPROM: +		rc = ethtool_get_eeprom(dev, useraddr); +		break; +	case ETHTOOL_SEEPROM: +		rc = ethtool_set_eeprom(dev, useraddr); +		break; +	case ETHTOOL_GCOALESCE: +		rc = ethtool_get_coalesce(dev, useraddr); +		break; +	case ETHTOOL_SCOALESCE: +		rc = ethtool_set_coalesce(dev, useraddr); +		break; +	case ETHTOOL_GRINGPARAM: +		rc = ethtool_get_ringparam(dev, useraddr); +		break; +	case ETHTOOL_SRINGPARAM: +		rc = ethtool_set_ringparam(dev, useraddr); +		break; +	case ETHTOOL_GPAUSEPARAM: +		rc = ethtool_get_pauseparam(dev, useraddr); +		break; +	case ETHTOOL_SPAUSEPARAM: +		rc = ethtool_set_pauseparam(dev, useraddr); +		break; +	case ETHTOOL_GRXCSUM: +		rc = ethtool_get_rx_csum(dev, useraddr); +		break; +	case ETHTOOL_SRXCSUM: +		rc = ethtool_set_rx_csum(dev, useraddr); +		break; +	case ETHTOOL_GTXCSUM: +		rc = ethtool_get_tx_csum(dev, useraddr); +		break; +	case ETHTOOL_STXCSUM: +		rc = ethtool_set_tx_csum(dev, useraddr); +		break; +	case ETHTOOL_GSG: +		rc = ethtool_get_sg(dev, useraddr); +		break; +	case ETHTOOL_SSG: +		rc = ethtool_set_sg(dev, useraddr); +		break; +	case ETHTOOL_GTSO: +		rc = ethtool_get_tso(dev, useraddr); +		break; +	case ETHTOOL_STSO: +		rc = ethtool_set_tso(dev, useraddr); +		break; +	case ETHTOOL_TEST: +		rc = ethtool_self_test(dev, useraddr); +		break; +	case ETHTOOL_GSTRINGS: +		rc = ethtool_get_strings(dev, useraddr); +		break; +	case ETHTOOL_PHYS_ID: +		rc = ethtool_phys_id(dev, 
useraddr); +		break; +	case ETHTOOL_GSTATS: +		rc = ethtool_get_stats(dev, useraddr); +		break; +	default: +		rc =  -EOPNOTSUPP; +	} +	 +	if(dev->ethtool_ops->complete) +		dev->ethtool_ops->complete(dev); +	return rc; + + ioctl: +	if (dev->do_ioctl) +		return dev->do_ioctl(dev, ifr, SIOCETHTOOL); +	return -EOPNOTSUPP; +} + +EXPORT_SYMBOL(dev_ethtool); +EXPORT_SYMBOL(ethtool_op_get_link); +EXPORT_SYMBOL(ethtool_op_get_sg); +EXPORT_SYMBOL(ethtool_op_get_tso); +EXPORT_SYMBOL(ethtool_op_get_tx_csum); +EXPORT_SYMBOL(ethtool_op_set_sg); +EXPORT_SYMBOL(ethtool_op_set_tso); +EXPORT_SYMBOL(ethtool_op_set_tx_csum); diff --git a/net/core/filter.c b/net/core/filter.c new file mode 100644 index 00000000000..f3b88205ace --- /dev/null +++ b/net/core/filter.c @@ -0,0 +1,432 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Author: + *     Jay Schulist <jschlst@samba.org> + * + * Based on the design of: + *     - The Berkeley Packet Filter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_packet.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/filter.h> + +/* No hurry in this branch */ +static u8 *load_pointer(struct sk_buff *skb, int k) +{ +	u8 *ptr = NULL; + +	if (k >= SKF_NET_OFF) +		ptr = skb->nh.raw + k - SKF_NET_OFF; +	else if (k >= SKF_LL_OFF) +		ptr = skb->mac.raw + k - SKF_LL_OFF; + +	if (ptr >= skb->head && ptr < skb->tail) +		return ptr; +	return NULL; +} + +/** + *	sk_run_filter	- 	run a filter on a socket + *	@skb: buffer to run the filter on + *	@filter: filter to apply + *	@flen: length of filter + * + * Decode and apply filter instructions to the skb->data. + * Return length to keep, 0 for none. skb is the data we are + * filtering, filter is the array of filter instructions, and + * len is the number of filter blocks in the array. + */ +  +int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) +{ +	unsigned char *data = skb->data; +	/* len is UNSIGNED. Byte wide insns relies only on implicit +	   type casts to prevent reading arbitrary memory locations. +	 */ +	unsigned int len = skb->len-skb->data_len; +	struct sock_filter *fentry;	/* We walk down these */ +	u32 A = 0;	   		/* Accumulator */ +	u32 X = 0;   			/* Index Register */ +	u32 mem[BPF_MEMWORDS];		/* Scratch Memory Store */ +	int k; +	int pc; + +	/* +	 * Process array of filter instructions. 
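
load_pointer() above gives negative BPF offsets a special meaning: offsets based at SKF_NET_OFF address the packet relative to skb->nh.raw, offsets based at SKF_LL_OFF relative to skb->mac.raw, while offsets at or above SKF_AD_OFF are handled separately as ancillary data. As a small illustration (an assumed filter fragment, not from the patch), this single instruction loads the IP protocol byte no matter how much link-level header precedes it:

	static struct sock_filter demo_load_ip_proto = {
		/* A = byte at (network header + 9), i.e. iphdr->protocol */
		BPF_LD | BPF_B | BPF_ABS, 0, 0, SKF_NET_OFF + 9
	};
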
+	 */ +	for (pc = 0; pc < flen; pc++) { +		fentry = &filter[pc]; +			 +		switch (fentry->code) { +		case BPF_ALU|BPF_ADD|BPF_X: +			A += X; +			continue; +		case BPF_ALU|BPF_ADD|BPF_K: +			A += fentry->k; +			continue; +		case BPF_ALU|BPF_SUB|BPF_X: +			A -= X; +			continue; +		case BPF_ALU|BPF_SUB|BPF_K: +			A -= fentry->k; +			continue; +		case BPF_ALU|BPF_MUL|BPF_X: +			A *= X; +			continue; +		case BPF_ALU|BPF_MUL|BPF_K: +			A *= fentry->k; +			continue; +		case BPF_ALU|BPF_DIV|BPF_X: +			if (X == 0) +				return 0; +			A /= X; +			continue; +		case BPF_ALU|BPF_DIV|BPF_K: +			if (fentry->k == 0) +				return 0; +			A /= fentry->k; +			continue; +		case BPF_ALU|BPF_AND|BPF_X: +			A &= X; +			continue; +		case BPF_ALU|BPF_AND|BPF_K: +			A &= fentry->k; +			continue; +		case BPF_ALU|BPF_OR|BPF_X: +			A |= X; +			continue; +		case BPF_ALU|BPF_OR|BPF_K: +			A |= fentry->k; +			continue; +		case BPF_ALU|BPF_LSH|BPF_X: +			A <<= X; +			continue; +		case BPF_ALU|BPF_LSH|BPF_K: +			A <<= fentry->k; +			continue; +		case BPF_ALU|BPF_RSH|BPF_X: +			A >>= X; +			continue; +		case BPF_ALU|BPF_RSH|BPF_K: +			A >>= fentry->k; +			continue; +		case BPF_ALU|BPF_NEG: +			A = -A; +			continue; +		case BPF_JMP|BPF_JA: +			pc += fentry->k; +			continue; +		case BPF_JMP|BPF_JGT|BPF_K: +			pc += (A > fentry->k) ? fentry->jt : fentry->jf; +			continue; +		case BPF_JMP|BPF_JGE|BPF_K: +			pc += (A >= fentry->k) ? fentry->jt : fentry->jf; +			continue; +		case BPF_JMP|BPF_JEQ|BPF_K: +			pc += (A == fentry->k) ? fentry->jt : fentry->jf; +			continue; +		case BPF_JMP|BPF_JSET|BPF_K: +			pc += (A & fentry->k) ? fentry->jt : fentry->jf; +			continue; +		case BPF_JMP|BPF_JGT|BPF_X: +			pc += (A > X) ? fentry->jt : fentry->jf; +			continue; +		case BPF_JMP|BPF_JGE|BPF_X: +			pc += (A >= X) ? fentry->jt : fentry->jf; +			continue; +		case BPF_JMP|BPF_JEQ|BPF_X: +			pc += (A == X) ? fentry->jt : fentry->jf; +			continue; +		case BPF_JMP|BPF_JSET|BPF_X: +			pc += (A & X) ? 
fentry->jt : fentry->jf; +			continue; +		case BPF_LD|BPF_W|BPF_ABS: +			k = fentry->k; + load_w: +			if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) { +				A = ntohl(*(u32*)&data[k]); +				continue; +			} +			if (k < 0) { +				u8 *ptr; + +				if (k >= SKF_AD_OFF) +					break; +				ptr = load_pointer(skb, k); +				if (ptr) { +					A = ntohl(*(u32*)ptr); +					continue; +				} +			} else { +				u32 _tmp, *p; +				p = skb_header_pointer(skb, k, 4, &_tmp); +				if (p != NULL) { +					A = ntohl(*p); +					continue; +				} +			} +			return 0; +		case BPF_LD|BPF_H|BPF_ABS: +			k = fentry->k; + load_h: +			if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) { +				A = ntohs(*(u16*)&data[k]); +				continue; +			} +			if (k < 0) { +				u8 *ptr; + +				if (k >= SKF_AD_OFF) +					break; +				ptr = load_pointer(skb, k); +				if (ptr) { +					A = ntohs(*(u16*)ptr); +					continue; +				} +			} else { +				u16 _tmp, *p; +				p = skb_header_pointer(skb, k, 2, &_tmp); +				if (p != NULL) { +					A = ntohs(*p); +					continue; +				} +			} +			return 0; +		case BPF_LD|BPF_B|BPF_ABS: +			k = fentry->k; +load_b: +			if (k >= 0 && (unsigned int)k < len) { +				A = data[k]; +				continue; +			} +			if (k < 0) { +				u8 *ptr; + +				if (k >= SKF_AD_OFF) +					break; +				ptr = load_pointer(skb, k); +				if (ptr) { +					A = *ptr; +					continue; +				} +			} else { +				u8 _tmp, *p; +				p = skb_header_pointer(skb, k, 1, &_tmp); +				if (p != NULL) { +					A = *p; +					continue; +				} +			} +			return 0; +		case BPF_LD|BPF_W|BPF_LEN: +			A = len; +			continue; +		case BPF_LDX|BPF_W|BPF_LEN: +			X = len; +			continue; +		case BPF_LD|BPF_W|BPF_IND: +			k = X + fentry->k; +			goto load_w; +		case BPF_LD|BPF_H|BPF_IND: +			k = X + fentry->k; +			goto load_h; +		case BPF_LD|BPF_B|BPF_IND: +			k = X + fentry->k; +			goto load_b; +		case BPF_LDX|BPF_B|BPF_MSH: +			if (fentry->k >= len) +				return 0; +			X = (data[fentry->k] & 0xf) << 2; +			continue; +		case BPF_LD|BPF_IMM: +			A = fentry->k; +			continue; +		case BPF_LDX|BPF_IMM: +			X = fentry->k; +			continue; +		case BPF_LD|BPF_MEM: +			A = mem[fentry->k]; +			continue; +		case BPF_LDX|BPF_MEM: +			X = mem[fentry->k]; +			continue; +		case BPF_MISC|BPF_TAX: +			X = A; +			continue; +		case BPF_MISC|BPF_TXA: +			A = X; +			continue; +		case BPF_RET|BPF_K: +			return ((unsigned int)fentry->k); +		case BPF_RET|BPF_A: +			return ((unsigned int)A); +		case BPF_ST: +			mem[fentry->k] = A; +			continue; +		case BPF_STX: +			mem[fentry->k] = X; +			continue; +		default: +			/* Invalid instruction counts as RET */ +			return 0; +		} + +		/* +		 * Handle ancillary data, which are impossible +		 * (or very difficult) to get parsing packet contents. +		 */ +		switch (k-SKF_AD_OFF) { +		case SKF_AD_PROTOCOL: +			A = htons(skb->protocol); +			continue; +		case SKF_AD_PKTTYPE: +			A = skb->pkt_type; +			continue; +		case SKF_AD_IFINDEX: +			A = skb->dev->ifindex; +			continue; +		default: +			return 0; +		} +	} + +	return 0; +} + +/** + *	sk_chk_filter - verify socket filter code + *	@filter: filter to verify + *	@flen: length of filter + * + * Check the user's filter code. If we let some ugly + * filter code slip through kaboom! The filter must contain + * no references or jumps that are out of range, no illegal instructions + * and no backward jumps. It must end with a RET instruction + * + * Returns 0 if the rule set is legal or a negative errno code if not. 
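
For reference, a complete filter of the kind sk_chk_filter() below validates and sk_run_filter() above executes: four instructions that accept IPv4 frames and drop everything else. The offsets assume a PF_PACKET socket, where skb->data starts at the Ethernet header; attaching it through sk_attach_filter() normally happens via setsockopt(SO_ATTACH_FILTER) with the sock_fprog wrapper shown. Illustrative only:

	static struct sock_filter demo_ip_only[] = {
		{ BPF_LD  | BPF_H   | BPF_ABS, 0, 0, 12         },	/* A = ethertype           */
		{ BPF_JMP | BPF_JEQ | BPF_K,   0, 1, ETH_P_IP   },	/* IPv4 (0x0800)? else +1  */
		{ BPF_RET | BPF_K,             0, 0, 0xffffffff },	/* accept whole packet     */
		{ BPF_RET | BPF_K,             0, 0, 0          },	/* drop                    */
	};

	static struct sock_fprog demo_prog = {
		.len	= 4,
		.filter	= demo_ip_only,
	};
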
+ */ +int sk_chk_filter(struct sock_filter *filter, int flen) +{ +	struct sock_filter *ftest; +	int pc; + +	if (((unsigned int)flen >= (~0U / sizeof(struct sock_filter))) || flen == 0) +		return -EINVAL; + +	/* check the filter code now */ +	for (pc = 0; pc < flen; pc++) { +		/* all jumps are forward as they are not signed */ +		ftest = &filter[pc]; +		if (BPF_CLASS(ftest->code) == BPF_JMP) { +			/* but they mustn't jump off the end */ +			if (BPF_OP(ftest->code) == BPF_JA) { +				/* +				 * Note, the large ftest->k might cause loops. +				 * Compare this with conditional jumps below, +				 * where offsets are limited. --ANK (981016) +				 */ +				if (ftest->k >= (unsigned)(flen-pc-1)) +					return -EINVAL; +			} else { +				/* for conditionals both must be safe */ + 				if (pc + ftest->jt +1 >= flen || +				    pc + ftest->jf +1 >= flen) +					return -EINVAL; +			} +		} + +		/* check that memory operations use valid addresses. */ +		if (ftest->k >= BPF_MEMWORDS) { +			/* but it might not be a memory operation... */ +			switch (ftest->code) { +			case BPF_ST:	 +			case BPF_STX:	 +			case BPF_LD|BPF_MEM:	 +			case BPF_LDX|BPF_MEM:	 +				return -EINVAL; +			} +		} +	} + +	/* +	 * The program must end with a return. We don't care where they +	 * jumped within the script (its always forwards) but in the end +	 * they _will_ hit this. +	 */ +        return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL; +} + +/** + *	sk_attach_filter - attach a socket filter + *	@fprog: the filter program + *	@sk: the socket to use + * + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. If an error + * occurs or there is insufficient memory for the filter a negative + * errno code is returned. On success the return is zero. + */ +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ +	struct sk_filter *fp;  +	unsigned int fsize = sizeof(struct sock_filter) * fprog->len; +	int err; + +	/* Make sure new filter is there and in the right amounts. */ +        if (fprog->filter == NULL || fprog->len > BPF_MAXINSNS) +                return -EINVAL; + +	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); +	if (!fp) +		return -ENOMEM; +	if (copy_from_user(fp->insns, fprog->filter, fsize)) { +		sock_kfree_s(sk, fp, fsize+sizeof(*fp));  +		return -EFAULT; +	} + +	atomic_set(&fp->refcnt, 1); +	fp->len = fprog->len; + +	err = sk_chk_filter(fp->insns, fp->len); +	if (!err) { +		struct sk_filter *old_fp; + +		spin_lock_bh(&sk->sk_lock.slock); +		old_fp = sk->sk_filter; +		sk->sk_filter = fp; +		spin_unlock_bh(&sk->sk_lock.slock); +		fp = old_fp; +	} + +	if (fp) +		sk_filter_release(sk, fp); +	return err; +} + +EXPORT_SYMBOL(sk_chk_filter); +EXPORT_SYMBOL(sk_run_filter); diff --git a/net/core/flow.c b/net/core/flow.c new file mode 100644 index 00000000000..f289570b15a --- /dev/null +++ b/net/core/flow.c @@ -0,0 +1,371 @@ +/* flow.c: Generic flow cache. + * + * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru) + * Copyright (C) 2003 David S. 
Miller (davem@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/interrupt.h> +#include <linux/mm.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/completion.h> +#include <linux/percpu.h> +#include <linux/bitops.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <net/flow.h> +#include <asm/atomic.h> +#include <asm/semaphore.h> + +struct flow_cache_entry { +	struct flow_cache_entry	*next; +	u16			family; +	u8			dir; +	struct flowi		key; +	u32			genid; +	void			*object; +	atomic_t		*object_ref; +}; + +atomic_t flow_cache_genid = ATOMIC_INIT(0); + +static u32 flow_hash_shift; +#define flow_hash_size	(1 << flow_hash_shift) +static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; + +#define flow_table(cpu) (per_cpu(flow_tables, cpu)) + +static kmem_cache_t *flow_cachep; + +static int flow_lwm, flow_hwm; + +struct flow_percpu_info { +	int hash_rnd_recalc; +	u32 hash_rnd; +	int count; +} ____cacheline_aligned; +static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 }; + +#define flow_hash_rnd_recalc(cpu) \ +	(per_cpu(flow_hash_info, cpu).hash_rnd_recalc) +#define flow_hash_rnd(cpu) \ +	(per_cpu(flow_hash_info, cpu).hash_rnd) +#define flow_count(cpu) \ +	(per_cpu(flow_hash_info, cpu).count) + +static struct timer_list flow_hash_rnd_timer; + +#define FLOW_HASH_RND_PERIOD	(10 * 60 * HZ) + +struct flow_flush_info { +	atomic_t cpuleft; +	struct completion completion; +}; +static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL }; + +#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu)) + +static void flow_cache_new_hashrnd(unsigned long arg) +{ +	int i; + +	for_each_cpu(i) +		flow_hash_rnd_recalc(i) = 1; + +	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; +	add_timer(&flow_hash_rnd_timer); +} + +static void __flow_cache_shrink(int cpu, int shrink_to) +{ +	struct flow_cache_entry *fle, **flp; +	int i; + +	for (i = 0; i < flow_hash_size; i++) { +		int k = 0; + +		flp = &flow_table(cpu)[i]; +		while ((fle = *flp) != NULL && k < shrink_to) { +			k++; +			flp = &fle->next; +		} +		while ((fle = *flp) != NULL) { +			*flp = fle->next; +			if (fle->object) +				atomic_dec(fle->object_ref); +			kmem_cache_free(flow_cachep, fle); +			flow_count(cpu)--; +		} +	} +} + +static void flow_cache_shrink(int cpu) +{ +	int shrink_to = flow_lwm / flow_hash_size; + +	__flow_cache_shrink(cpu, shrink_to); +} + +static void flow_new_hash_rnd(int cpu) +{ +	get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32)); +	flow_hash_rnd_recalc(cpu) = 0; + +	__flow_cache_shrink(cpu, 0); +} + +static u32 flow_hash_code(struct flowi *key, int cpu) +{ +	u32 *k = (u32 *) key; + +	return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) & +		(flow_hash_size - 1)); +} + +#if (BITS_PER_LONG == 64) +typedef u64 flow_compare_t; +#else +typedef u32 flow_compare_t; +#endif + +extern void flowi_is_missized(void); + +/* I hear what you're saying, use memcmp.  But memcmp cannot make + * important assumptions that we can here, such as alignment and + * constant size. 
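
The extern declaration of flowi_is_missized() above is a link-time assertion: flow_key_compare() just below calls it only when sizeof(struct flowi) is not a multiple of sizeof(flow_compare_t), so the call is either optimized away as dead code or, if the structure is ever mis-sized, leaves an unresolved symbol that breaks the final link. The same pattern in isolation, with illustrative names:

	extern void demo_bad_flowi_size(void);		/* deliberately never defined */

	static inline void demo_size_check(void)
	{
		/* constant condition: either compiled out, or the link fails */
		if (sizeof(struct flowi) % sizeof(flow_compare_t))
			demo_bad_flowi_size();
	}
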
+ */ +static int flow_key_compare(struct flowi *key1, struct flowi *key2) +{ +	flow_compare_t *k1, *k1_lim, *k2; +	const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t); + +	if (sizeof(struct flowi) % sizeof(flow_compare_t)) +		flowi_is_missized(); + +	k1 = (flow_compare_t *) key1; +	k1_lim = k1 + n_elem; + +	k2 = (flow_compare_t *) key2; + +	do { +		if (*k1++ != *k2++) +			return 1; +	} while (k1 < k1_lim); + +	return 0; +} + +void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, +			flow_resolve_t resolver) +{ +	struct flow_cache_entry *fle, **head; +	unsigned int hash; +	int cpu; + +	local_bh_disable(); +	cpu = smp_processor_id(); + +	fle = NULL; +	/* Packet really early in init?  Making flow_cache_init a +	 * pre-smp initcall would solve this.  --RR */ +	if (!flow_table(cpu)) +		goto nocache; + +	if (flow_hash_rnd_recalc(cpu)) +		flow_new_hash_rnd(cpu); +	hash = flow_hash_code(key, cpu); + +	head = &flow_table(cpu)[hash]; +	for (fle = *head; fle; fle = fle->next) { +		if (fle->family == family && +		    fle->dir == dir && +		    flow_key_compare(key, &fle->key) == 0) { +			if (fle->genid == atomic_read(&flow_cache_genid)) { +				void *ret = fle->object; + +				if (ret) +					atomic_inc(fle->object_ref); +				local_bh_enable(); + +				return ret; +			} +			break; +		} +	} + +	if (!fle) { +		if (flow_count(cpu) > flow_hwm) +			flow_cache_shrink(cpu); + +		fle = kmem_cache_alloc(flow_cachep, SLAB_ATOMIC); +		if (fle) { +			fle->next = *head; +			*head = fle; +			fle->family = family; +			fle->dir = dir; +			memcpy(&fle->key, key, sizeof(*key)); +			fle->object = NULL; +			flow_count(cpu)++; +		} +	} + +nocache: +	{ +		void *obj; +		atomic_t *obj_ref; + +		resolver(key, family, dir, &obj, &obj_ref); + +		if (fle) { +			fle->genid = atomic_read(&flow_cache_genid); + +			if (fle->object) +				atomic_dec(fle->object_ref); + +			fle->object = obj; +			fle->object_ref = obj_ref; +			if (obj) +				atomic_inc(fle->object_ref); +		} +		local_bh_enable(); + +		return obj; +	} +} + +static void flow_cache_flush_tasklet(unsigned long data) +{ +	struct flow_flush_info *info = (void *)data; +	int i; +	int cpu; + +	cpu = smp_processor_id(); +	for (i = 0; i < flow_hash_size; i++) { +		struct flow_cache_entry *fle; + +		fle = flow_table(cpu)[i]; +		for (; fle; fle = fle->next) { +			unsigned genid = atomic_read(&flow_cache_genid); + +			if (!fle->object || fle->genid == genid) +				continue; + +			fle->object = NULL; +			atomic_dec(fle->object_ref); +		} +	} + +	if (atomic_dec_and_test(&info->cpuleft)) +		complete(&info->completion); +} + +static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__)); +static void flow_cache_flush_per_cpu(void *data) +{ +	struct flow_flush_info *info = data; +	int cpu; +	struct tasklet_struct *tasklet; + +	cpu = smp_processor_id(); + +	tasklet = flow_flush_tasklet(cpu); +	tasklet->data = (unsigned long)info; +	tasklet_schedule(tasklet); +} + +void flow_cache_flush(void) +{ +	struct flow_flush_info info; +	static DECLARE_MUTEX(flow_flush_sem); + +	/* Don't want cpus going down or up during this. 
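
flow_cache_lookup() above expects a flow_resolve_t callback that turns a cache miss into an object plus a reference counter the cache can increment and decrement on its own. A hedged sketch of such a resolver; demo_policy and demo_policy_find stand in for whatever the caller actually caches and are assumptions, only the callback signature comes from this code:

	struct demo_policy {
		atomic_t	refcnt;
		/* ... resolved state the caller wants cached per flow ... */
	};

	/* assumed helper: returns a match with one reference held for the caller */
	struct demo_policy *demo_policy_find(struct flowi *key, u16 family, u8 dir);

	static void demo_resolver(struct flowi *key, u16 family, u8 dir,
				  void **objp, atomic_t **obj_refp)
	{
		struct demo_policy *pol = demo_policy_find(key, family, dir);

		*objp = pol;				/* may be NULL on no match     */
		*obj_refp = pol ? &pol->refcnt : NULL;	/* cache inc/decs this counter */
	}

A caller then resolves flows with flow_cache_lookup(&fl, family, dir, demo_resolver). When the underlying objects change, the owner is expected to bump flow_cache_genid so stale entries stop matching, and to call flow_cache_flush() below when the cached references must be dropped immediately.
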
*/ +	lock_cpu_hotplug(); +	down(&flow_flush_sem); +	atomic_set(&info.cpuleft, num_online_cpus()); +	init_completion(&info.completion); + +	local_bh_disable(); +	smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0); +	flow_cache_flush_tasklet((unsigned long)&info); +	local_bh_enable(); + +	wait_for_completion(&info.completion); +	up(&flow_flush_sem); +	unlock_cpu_hotplug(); +} + +static void __devinit flow_cache_cpu_prepare(int cpu) +{ +	struct tasklet_struct *tasklet; +	unsigned long order; + +	for (order = 0; +	     (PAGE_SIZE << order) < +		     (sizeof(struct flow_cache_entry *)*flow_hash_size); +	     order++) +		/* NOTHING */; + +	flow_table(cpu) = (struct flow_cache_entry **) +		__get_free_pages(GFP_KERNEL, order); +	if (!flow_table(cpu)) +		panic("NET: failed to allocate flow cache order %lu\n", order); + +	memset(flow_table(cpu), 0, PAGE_SIZE << order); + +	flow_hash_rnd_recalc(cpu) = 1; +	flow_count(cpu) = 0; + +	tasklet = flow_flush_tasklet(cpu); +	tasklet_init(tasklet, flow_cache_flush_tasklet, 0); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int flow_cache_cpu(struct notifier_block *nfb, +			  unsigned long action, +			  void *hcpu) +{ +	if (action == CPU_DEAD) +		__flow_cache_shrink((unsigned long)hcpu, 0); +	return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __init flow_cache_init(void) +{ +	int i; + +	flow_cachep = kmem_cache_create("flow_cache", +					sizeof(struct flow_cache_entry), +					0, SLAB_HWCACHE_ALIGN, +					NULL, NULL); + +	if (!flow_cachep) +		panic("NET: failed to allocate flow cache slab\n"); + +	flow_hash_shift = 10; +	flow_lwm = 2 * flow_hash_size; +	flow_hwm = 4 * flow_hash_size; + +	init_timer(&flow_hash_rnd_timer); +	flow_hash_rnd_timer.function = flow_cache_new_hashrnd; +	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; +	add_timer(&flow_hash_rnd_timer); + +	for_each_cpu(i) +		flow_cache_cpu_prepare(i); + +	hotcpu_notifier(flow_cache_cpu, 0); +	return 0; +} + +module_init(flow_cache_init); + +EXPORT_SYMBOL(flow_cache_genid); +EXPORT_SYMBOL(flow_cache_lookup); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c new file mode 100644 index 00000000000..b07c029e821 --- /dev/null +++ b/net/core/gen_estimator.c @@ -0,0 +1,250 @@ +/* + * net/sched/gen_estimator.c	Simple rate estimator. + * + *		This program is free software; you can redistribute it and/or + *		modify it under the terms of the GNU General Public License + *		as published by the Free Software Foundation; either version + *		2 of the License, or (at your option) any later version. + * + * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + *              Jamal Hadi Salim - moved it to net/core and reshulfed + *              names to make it usable in general net subsystem. + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <net/sock.h> +#include <net/gen_stats.h> + +/* +   This code is NOT intended to be used for statistics collection, +   its purpose is to provide a base for statistical multiplexing +   for controlled load service. 
+   If you need only statistics, run a user level daemon which +   periodically reads byte counters. + +   Unfortunately, rate estimation is not a very easy task. +   F.e. I did not find a simple way to estimate the current peak rate +   and even failed to formulate the problem 8)8) + +   So I preferred not to built an estimator into the scheduler, +   but run this task separately. +   Ideally, it should be kernel thread(s), but for now it runs +   from timers, which puts apparent top bounds on the number of rated +   flows, has minimal overhead on small, but is enough +   to handle controlled load service, sets of aggregates. + +   We measure rate over A=(1<<interval) seconds and evaluate EWMA: + +   avrate = avrate*(1-W) + rate*W + +   where W is chosen as negative power of 2: W = 2^(-ewma_log) + +   The resulting time constant is: + +   T = A/(-ln(1-W)) + + +   NOTES. + +   * The stored value for avbps is scaled by 2^5, so that maximal +     rate is ~1Gbit, avpps is scaled by 2^10. + +   * Minimal interval is HZ/4=250msec (it is the greatest common divisor +     for HZ=100 and HZ=1024 8)), maximal interval +     is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals +     are too expensive, longer ones can be implemented +     at user level painlessly. + */ + +#define EST_MAX_INTERVAL	5 + +struct gen_estimator +{ +	struct gen_estimator	*next; +	struct gnet_stats_basic	*bstats; +	struct gnet_stats_rate_est	*rate_est; +	spinlock_t		*stats_lock; +	unsigned		interval; +	int			ewma_log; +	u64			last_bytes; +	u32			last_packets; +	u32			avpps; +	u32			avbps; +}; + +struct gen_estimator_head +{ +	struct timer_list	timer; +	struct gen_estimator	*list; +}; + +static struct gen_estimator_head elist[EST_MAX_INTERVAL+1]; + +/* Estimator array lock */ +static DEFINE_RWLOCK(est_lock); + +static void est_timer(unsigned long arg) +{ +	int idx = (int)arg; +	struct gen_estimator *e; + +	read_lock(&est_lock); +	for (e = elist[idx].list; e; e = e->next) { +		u64 nbytes; +		u32 npackets; +		u32 rate; + +		spin_lock(e->stats_lock); +		nbytes = e->bstats->bytes; +		npackets = e->bstats->packets; +		rate = (nbytes - e->last_bytes)<<(7 - idx); +		e->last_bytes = nbytes; +		e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; +		e->rate_est->bps = (e->avbps+0xF)>>5; + +		rate = (npackets - e->last_packets)<<(12 - idx); +		e->last_packets = npackets; +		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; +		e->rate_est->pps = (e->avpps+0x1FF)>>10; +		spin_unlock(e->stats_lock); +	} + +	mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); +	read_unlock(&est_lock); +} + +/** + * gen_new_estimator - create a new rate estimator + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * @stats_lock: statistics lock + * @opt: rate estimator configuration TLV + * + * Creates a new rate estimator with &bstats as source and &rate_est + * as destination. A new timer with the interval specified in the + * configuration TLV is created. Upon each interval, the latest statistics + * will be read from &bstats and the estimated rate will be stored in + * &rate_est with the statistics lock grabed during this period. + *  + * Returns 0 on success or a negative error code. 
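
A standalone model of the arithmetic in est_timer() above, to make the fixed-point scaling explicit (illustration only, not part of the patch). With idx == 2 the timer period is (HZ<<2)/4 = HZ jiffies, i.e. one second, the internal average is kept scaled by 2^5 as the comment says, and each period closes 1/2^ewma_log of the gap between the measured and the averaged rate. Feeding a steady 125000 bytes per period therefore converges the reported value to 125000 bytes per second (about 1 Mbit/s):

	static u32 demo_est_step(u32 *avbps, u32 bytes_this_period, int idx, int ewma_log)
	{
		u32 rate = bytes_this_period << (7 - idx);	/* bytes/sec, scaled by 2^5 */

		*avbps += ((long)rate - (long)*avbps) >> ewma_log;
		return (*avbps + 0xF) >> 5;			/* what est_timer() stores in
								 * rate_est->bps */
	}
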
+ */ +int gen_new_estimator(struct gnet_stats_basic *bstats, +	struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt) +{ +	struct gen_estimator *est; +	struct gnet_estimator *parm = RTA_DATA(opt); + +	if (RTA_PAYLOAD(opt) < sizeof(*parm)) +		return -EINVAL; + +	if (parm->interval < -2 || parm->interval > 3) +		return -EINVAL; + +	est = kmalloc(sizeof(*est), GFP_KERNEL); +	if (est == NULL) +		return -ENOBUFS; + +	memset(est, 0, sizeof(*est)); +	est->interval = parm->interval + 2; +	est->bstats = bstats; +	est->rate_est = rate_est; +	est->stats_lock = stats_lock; +	est->ewma_log = parm->ewma_log; +	est->last_bytes = bstats->bytes; +	est->avbps = rate_est->bps<<5; +	est->last_packets = bstats->packets; +	est->avpps = rate_est->pps<<10; + +	est->next = elist[est->interval].list; +	if (est->next == NULL) { +		init_timer(&elist[est->interval].timer); +		elist[est->interval].timer.data = est->interval; +		elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4); +		elist[est->interval].timer.function = est_timer; +		add_timer(&elist[est->interval].timer); +	} +	write_lock_bh(&est_lock); +	elist[est->interval].list = est; +	write_unlock_bh(&est_lock); +	return 0; +} + +/** + * gen_kill_estimator - remove a rate estimator + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * + * Removes the rate estimator specified by &bstats and &rate_est + * and deletes the timer. + */ +void gen_kill_estimator(struct gnet_stats_basic *bstats, +	struct gnet_stats_rate_est *rate_est) +{ +	int idx; +	struct gen_estimator *est, **pest; + +	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { +		int killed = 0; +		pest = &elist[idx].list; +		while ((est=*pest) != NULL) { +			if (est->rate_est != rate_est || est->bstats != bstats) { +				pest = &est->next; +				continue; +			} + +			write_lock_bh(&est_lock); +			*pest = est->next; +			write_unlock_bh(&est_lock); + +			kfree(est); +			killed++; +		} +		if (killed && elist[idx].list == NULL) +			del_timer(&elist[idx].timer); +	} +} + +/** + * gen_replace_estimator - replace rate estimator configruation + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * @stats_lock: statistics lock + * @opt: rate estimator configuration TLV + * + * Replaces the configuration of a rate estimator by calling + * gen_kill_estimator() and gen_new_estimator(). + *  + * Returns 0 on success or a negative error code. + */ +int +gen_replace_estimator(struct gnet_stats_basic *bstats, +	struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, +	struct rtattr *opt) +{ +    gen_kill_estimator(bstats, rate_est); +    return gen_new_estimator(bstats, rate_est, stats_lock, opt); +} +     + +EXPORT_SYMBOL(gen_kill_estimator); +EXPORT_SYMBOL(gen_new_estimator); +EXPORT_SYMBOL(gen_replace_estimator); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c new file mode 100644 index 00000000000..8f21490355f --- /dev/null +++ b/net/core/gen_stats.c @@ -0,0 +1,239 @@ +/* + * net/core/gen_stats.c + * + *             This program is free software; you can redistribute it and/or + *             modify it under the terms of the GNU General Public License + *             as published by the Free Software Foundation; either version + *             2 of the License, or (at your option) any later version. 
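
Stepping back to the estimator API above: the expected caller is a traffic-control object that owns a gnet_stats_basic counter block and wants rate_est kept up to date from it. A hedged sketch of that wiring; the qdisc-side names are assumptions, only the gen_* calls come from this file, and the TLV is typically the TCA_RATE attribute supplied by user space:

	struct demo_qdisc {
		struct gnet_stats_basic		bstats;		/* bumped on every enqueue  */
		struct gnet_stats_rate_est	rate_est;	/* filled in by est_timer() */
	};

	static int demo_qdisc_init(struct demo_qdisc *q, spinlock_t *stats_lock,
				   struct rtattr *rate_tlv)
	{
		if (!rate_tlv)
			return 0;	/* no estimator requested */
		return gen_new_estimator(&q->bstats, &q->rate_est, stats_lock, rate_tlv);
	}

	static void demo_qdisc_destroy(struct demo_qdisc *q)
	{
		gen_kill_estimator(&q->bstats, &q->rate_est);
	}
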
+ * + * Authors:  Thomas Graf <tgraf@suug.ch> + *           Jamal Hadi Salim + *           Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * See Documentation/networking/gen_stats.txt + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/socket.h> +#include <linux/rtnetlink.h> +#include <linux/gen_stats.h> +#include <net/gen_stats.h> + + +static inline int +gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) +{ +	RTA_PUT(d->skb, type, size, buf); +	return 0; + +rtattr_failure: +	spin_unlock_bh(d->lock); +	return -1; +} + +/** + * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV + * @xstats_type: TLV type for backward compatibility xstats TLV + * @lock: statistics lock + * @d: dumping handle + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * The dumping handle is marked to be in backward compatibility mode telling + * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, +	int xstats_type, spinlock_t *lock, struct gnet_dump *d) +{ +	memset(d, 0, sizeof(*d)); +	 +	spin_lock_bh(lock); +	d->lock = lock; +	if (type) +		d->tail = (struct rtattr *) skb->tail; +	d->skb = skb; +	d->compat_tc_stats = tc_stats_type; +	d->compat_xstats = xstats_type; + +	if (d->tail) +		return gnet_stats_copy(d, type, NULL, 0); + +	return 0; +} + +/** + * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @lock: statistics lock + * @d: dumping handle + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, +	struct gnet_dump *d) +{ +	return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d); +} + +/** + * gnet_stats_copy_basic - copy basic statistics into statistic TLV + * @d: dumping handle + * @b: basic statistics + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b) +{ +	if (d->compat_tc_stats) { +		d->tc_stats.bytes = b->bytes; +		d->tc_stats.packets = b->packets; +	} + +	if (d->tail) +		return gnet_stats_copy(d, TCA_STATS_BASIC, b, sizeof(*b)); + +	return 0; +} + +/** + * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV + * @d: dumping handle + * @r: rate estimator statistics + * + * Appends the rate estimator statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. 
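+ *
+ * A typical dump sequence built from this helper and the others in this
+ * file looks roughly like the following sketch (the TLV type and the
+ * statistics variables are only placeholders, error paths abbreviated):
+ *
+ *	struct gnet_dump d;
+ *
+ *	if (gnet_stats_start_copy(skb, TCA_STATS2, &stats_lock, &d) < 0 ||
+ *	    gnet_stats_copy_basic(&d, &bstats) < 0 ||
+ *	    gnet_stats_copy_rate_est(&d, &rate_est) < 0 ||
+ *	    gnet_stats_copy_queue(&d, &qstats) < 0 ||
+ *	    gnet_stats_finish_copy(&d) < 0)
+ *		goto rtattr_failure;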
+ */ +int +gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r) +{ +	if (d->compat_tc_stats) { +		d->tc_stats.bps = r->bps; +		d->tc_stats.pps = r->pps; +	} + +	if (d->tail) +		return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r)); + +	return 0; +} + +/** + * gnet_stats_copy_queue - copy queue statistics into statistics TLV + * @d: dumping handle + * @q: queue statistics + * + * Appends the queue statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) +{ +	if (d->compat_tc_stats) { +		d->tc_stats.drops = q->drops; +		d->tc_stats.qlen = q->qlen; +		d->tc_stats.backlog = q->backlog; +		d->tc_stats.overlimits = q->overlimits; +	} + +	if (d->tail) +		return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q)); + +	return 0; +} + +/** + * gnet_stats_copy_app - copy application specific statistics into statistics TLV + * @d: dumping handle + * @st: application specific statistics data + * @len: length of data + * + * Appends the application sepecific statistics to the top level TLV created by + * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping + * handle is in backward compatibility mode. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) +{ +	if (d->compat_xstats) { +		d->xstats = st; +		d->xstats_len = len; +	} + +	if (d->tail) +		return gnet_stats_copy(d, TCA_STATS_APP, st, len); + +	return 0; +} + +/** + * gnet_stats_finish_copy - finish dumping procedure + * @d: dumping handle + * + * Corrects the length of the top level TLV to include all TLVs added + * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs + * if gnet_stats_start_copy_compat() was used and releases the statistics + * lock. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_finish_copy(struct gnet_dump *d) +{ +	if (d->tail) +		d->tail->rta_len = d->skb->tail - (u8 *) d->tail; + +	if (d->compat_tc_stats) +		if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, +			sizeof(d->tc_stats)) < 0) +			return -1; + +	if (d->compat_xstats && d->xstats) { +		if (gnet_stats_copy(d, d->compat_xstats, d->xstats, +			d->xstats_len) < 0) +			return -1; +	} + +	spin_unlock_bh(d->lock); +	return 0; +} + + +EXPORT_SYMBOL(gnet_stats_start_copy); +EXPORT_SYMBOL(gnet_stats_start_copy_compat); +EXPORT_SYMBOL(gnet_stats_copy_basic); +EXPORT_SYMBOL(gnet_stats_copy_rate_est); +EXPORT_SYMBOL(gnet_stats_copy_queue); +EXPORT_SYMBOL(gnet_stats_copy_app); +EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/core/iovec.c b/net/core/iovec.c new file mode 100644 index 00000000000..d57ace949ab --- /dev/null +++ b/net/core/iovec.c @@ -0,0 +1,239 @@ +/* + *	iovec manipulation routines. + * + * + *		This program is free software; you can redistribute it and/or + *		modify it under the terms of the GNU General Public License + *		as published by the Free Software Foundation; either version + *		2 of the License, or (at your option) any later version. + * + *	Fixes: + *		Andrew Lunn	:	Errors in iovec copying. + *		Pedro Roque	:	Added memcpy_fromiovecend and + *					csum_..._fromiovecend. 
+ *		Andi Kleen	:	fixed error handling for 2.1 + *		Alexey Kuznetsov:	2.1 optimisations + *		Andi Kleen	:	Fix csum*fromiovecend for IPv6. + */ + +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <asm/uaccess.h> +#include <asm/byteorder.h> +#include <net/checksum.h> +#include <net/sock.h> + +/* + *	Verify iovec. The caller must ensure that the iovec is big enough + *	to hold the message iovec. + * + *	Save time not doing verify_area. copy_*_user will make this work + *	in any case. + */ + +int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) +{ +	int size, err, ct; +	 +	if (m->msg_namelen) { +		if (mode == VERIFY_READ) { +			err = move_addr_to_kernel(m->msg_name, m->msg_namelen, +						  address); +			if (err < 0) +				return err; +		} +		m->msg_name = address; +	} else { +		m->msg_name = NULL; +	} + +	size = m->msg_iovlen * sizeof(struct iovec); +	if (copy_from_user(iov, m->msg_iov, size)) +		return -EFAULT; + +	m->msg_iov = iov; +	err = 0; + +	for (ct = 0; ct < m->msg_iovlen; ct++) { +		err += iov[ct].iov_len; +		/* +		 * Goal is not to verify user data, but to prevent returning +		 * negative value, which is interpreted as errno. +		 * Overflow is still possible, but it is harmless. +		 */ +		if (err < 0) +			return -EMSGSIZE; +	} + +	return err; +} + +/* + *	Copy kernel to iovec. Returns -EFAULT on error. + * + *	Note: this modifies the original iovec. + */ +  +int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) +{ +	while (len > 0) { +		if (iov->iov_len) { +			int copy = min_t(unsigned int, iov->iov_len, len); +			if (copy_to_user(iov->iov_base, kdata, copy)) +				return -EFAULT; +			kdata += copy; +			len -= copy; +			iov->iov_len -= copy; +			iov->iov_base += copy; +		} +		iov++; +	} + +	return 0; +} + +/* + *	Copy iovec to kernel. Returns -EFAULT on error. + * + *	Note: this modifies the original iovec. + */ +  +int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) +{ +	while (len > 0) { +		if (iov->iov_len) { +			int copy = min_t(unsigned int, len, iov->iov_len); +			if (copy_from_user(kdata, iov->iov_base, copy)) +				return -EFAULT; +			len -= copy; +			kdata += copy; +			iov->iov_base += copy; +			iov->iov_len -= copy; +		} +		iov++; +	} + +	return 0; +} + +/* + *	For use with ip_build_xmit + */ +int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, +			int len) +{ +	/* Skip over the finished iovecs */ +	while (offset >= iov->iov_len) { +		offset -= iov->iov_len; +		iov++; +	} + +	while (len > 0) { +		u8 __user *base = iov->iov_base + offset; +		int copy = min_t(unsigned int, len, iov->iov_len - offset); + +		offset = 0; +		if (copy_from_user(kdata, base, copy)) +			return -EFAULT; +		len -= copy; +		kdata += copy; +		iov++; +	} + +	return 0; +} + +/* + *	And now for the all-in-one: copy and checksum from a user iovec + *	directly to a datagram + *	Calls to csum_partial but the last must be in 32 bit chunks + * + *	ip_build_xmit must ensure that when fragmenting only the last + *	call to this function will be unaligned also. 
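+ *
+ *	Note on the implementation below: partial_cnt tracks how many
+ *	bytes of an incomplete 32 bit group have been copied but not yet
+ *	summed.  When an iovec segment ends in the middle of a group, the
+ *	leftover 1-3 bytes are copied and remembered; the next segment
+ *	first supplies the missing bytes so that csum_partial() always
+ *	sees whole 32 bit groups, and only the final call may be fed an
+ *	odd-sized tail.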
+ */ +int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, +				 int offset, unsigned int len, int *csump) +{ +	int csum = *csump; +	int partial_cnt = 0, err = 0; + +	/* Skip over the finished iovecs */ +	while (offset >= iov->iov_len) { +		offset -= iov->iov_len; +		iov++; +	} + +	while (len > 0) { +		u8 __user *base = iov->iov_base + offset; +		int copy = min_t(unsigned int, len, iov->iov_len - offset); + +		offset = 0; + +		/* There is a remnant from previous iov. */ +		if (partial_cnt) { +			int par_len = 4 - partial_cnt; + +			/* iov component is too short ... */ +			if (par_len > copy) { +				if (copy_from_user(kdata, base, copy)) +					goto out_fault; +				kdata += copy; +				base += copy; +				partial_cnt += copy; +				len -= copy; +				iov++; +				if (len) +					continue; +				*csump = csum_partial(kdata - partial_cnt, +							 partial_cnt, csum); +				goto out; +			} +			if (copy_from_user(kdata, base, par_len)) +				goto out_fault; +			csum = csum_partial(kdata - partial_cnt, 4, csum); +			kdata += par_len; +			base  += par_len; +			copy  -= par_len; +			len   -= par_len; +			partial_cnt = 0; +		} + +		if (len > copy) { +			partial_cnt = copy % 4; +			if (partial_cnt) { +				copy -= partial_cnt; +				if (copy_from_user(kdata + copy, base + copy, +				 		partial_cnt)) +					goto out_fault; +			} +		} + +		if (copy) { +			csum = csum_and_copy_from_user(base, kdata, copy, +							csum, &err); +			if (err) +				goto out; +		} +		len   -= copy + partial_cnt; +		kdata += copy + partial_cnt; +		iov++; +	} +        *csump = csum; +out: +	return err; + +out_fault: +	err = -EFAULT; +	goto out; +} + +EXPORT_SYMBOL(csum_partial_copy_fromiovecend); +EXPORT_SYMBOL(memcpy_fromiovec); +EXPORT_SYMBOL(memcpy_fromiovecend); +EXPORT_SYMBOL(memcpy_toiovec); diff --git a/net/core/link_watch.c b/net/core/link_watch.c new file mode 100644 index 00000000000..4859b7446c6 --- /dev/null +++ b/net/core/link_watch.c @@ -0,0 +1,137 @@ +/* + * Linux network device link state notification + * + * Author: + *     Stefan Rompf <sux@loplof.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> +#include <linux/jiffies.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/bitops.h> +#include <asm/types.h> + + +enum lw_bits { +	LW_RUNNING = 0, +	LW_SE_USED +}; + +static unsigned long linkwatch_flags; +static unsigned long linkwatch_nextevent; + +static void linkwatch_event(void *dummy); +static DECLARE_WORK(linkwatch_work, linkwatch_event, NULL); + +static LIST_HEAD(lweventlist); +static DEFINE_SPINLOCK(lweventlist_lock); + +struct lw_event { +	struct list_head list; +	struct net_device *dev; +}; + +/* Avoid kmalloc() for most systems */ +static struct lw_event singleevent; + +/* Must be called with the rtnl semaphore held */ +void linkwatch_run_queue(void) +{ +	LIST_HEAD(head); +	struct list_head *n, *next; + +	spin_lock_irq(&lweventlist_lock); +	list_splice_init(&lweventlist, &head); +	spin_unlock_irq(&lweventlist_lock); + +	list_for_each_safe(n, next, &head) { +		struct lw_event *event = list_entry(n, struct lw_event, list); +		struct net_device *dev = event->dev; + +		if (event == &singleevent) { +			clear_bit(LW_SE_USED, &linkwatch_flags); +		} else { +			kfree(event); +		} + +		/* We are about to handle this device, +		 * so new events can be accepted +		 */ +		clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + +		if (dev->flags & IFF_UP) { +			netdev_state_change(dev); +		} + +		dev_put(dev); +	} +}        + + +static void linkwatch_event(void *dummy) +{ +	/* Limit the number of linkwatch events to one +	 * per second so that a runaway driver does not +	 * cause a storm of messages on the netlink +	 * socket +	 */	 +	linkwatch_nextevent = jiffies + HZ; +	clear_bit(LW_RUNNING, &linkwatch_flags); + +	rtnl_shlock(); +	linkwatch_run_queue(); +	rtnl_shunlock(); +} + + +void linkwatch_fire_event(struct net_device *dev) +{ +	if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { +		unsigned long flags; +		struct lw_event *event; + +		if (test_and_set_bit(LW_SE_USED, &linkwatch_flags)) { +			event = kmalloc(sizeof(struct lw_event), GFP_ATOMIC); + +			if (unlikely(event == NULL)) { +				clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); +				return; +			} +		} else { +			event = &singleevent; +		} + +		dev_hold(dev); +		event->dev = dev; + +		spin_lock_irqsave(&lweventlist_lock, flags); +		list_add_tail(&event->list, &lweventlist); +		spin_unlock_irqrestore(&lweventlist_lock, flags); + +		if (!test_and_set_bit(LW_RUNNING, &linkwatch_flags)) { +			unsigned long thisevent = jiffies; + +			if (thisevent >= linkwatch_nextevent) { +				schedule_work(&linkwatch_work); +			} else { +				schedule_delayed_work(&linkwatch_work, linkwatch_nextevent - thisevent); +			} +		} +	} +} + +EXPORT_SYMBOL(linkwatch_fire_event); diff --git a/net/core/neighbour.c b/net/core/neighbour.c new file mode 100644 index 00000000000..0a2f67bbef2 --- /dev/null +++ b/net/core/neighbour.c @@ -0,0 +1,2362 @@ +/* + *	Generic address resolution entity + * + *	Authors: + *	Pedro Roque		<roque@di.fc.ul.pt> + *	Alexey Kuznetsov	<kuznet@ms2.inr.ac.ru> + * + *	This program is free software; you can redistribute it and/or + *      modify it under the terms of the GNU General Public License + *      as published by the Free Software Foundation; either version + *      2 of the License, or (at your option) any later version. + * + *	Fixes: + *	Vitaly E. 
Lavrov	releasing NULL neighbor in neigh_add. + *	Harald Welte		Add neighbour cache statistics like rtstat + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/sched.h> +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif +#include <linux/times.h> +#include <net/neighbour.h> +#include <net/dst.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> +#include <linux/random.h> + +#define NEIGH_DEBUG 1 + +#define NEIGH_PRINTK(x...) printk(x) +#define NEIGH_NOPRINTK(x...) do { ; } while(0) +#define NEIGH_PRINTK0 NEIGH_PRINTK +#define NEIGH_PRINTK1 NEIGH_NOPRINTK +#define NEIGH_PRINTK2 NEIGH_NOPRINTK + +#if NEIGH_DEBUG >= 1 +#undef NEIGH_PRINTK1 +#define NEIGH_PRINTK1 NEIGH_PRINTK +#endif +#if NEIGH_DEBUG >= 2 +#undef NEIGH_PRINTK2 +#define NEIGH_PRINTK2 NEIGH_PRINTK +#endif + +#define PNEIGH_HASHMASK		0xF + +static void neigh_timer_handler(unsigned long arg); +#ifdef CONFIG_ARPD +static void neigh_app_notify(struct neighbour *n); +#endif +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); + +static struct neigh_table *neigh_tables; +static struct file_operations neigh_stat_seq_fops; + +/* +   Neighbour hash table buckets are protected with rwlock tbl->lock. + +   - All the scans/updates to hash buckets MUST be made under this lock. +   - NOTHING clever should be made under this lock: no callbacks +     to protocol backends, no attempts to send something to network. +     It will result in deadlocks, if backend/driver wants to use neighbour +     cache. +   - If the entry requires some non-trivial actions, increase +     its reference count and release table lock. + +   Neighbour entries are protected: +   - with reference count. +   - with rwlock neigh->lock + +   Reference count prevents destruction. + +   neigh->lock mainly serializes ll address data and its validity state. +   However, the same lock is used to protect another entry fields: +    - timer +    - resolution queue + +   Again, nothing clever shall be made under neigh->lock, +   the most complicated procedure, which we allow is dev->hard_header. +   It is supposed, that dev->hard_header is simplistic and does +   not make callbacks to neighbour tables. + +   The last lock is neigh_tbl_lock. It is pure SMP lock, protecting +   list of neighbour tables. This list is used only in process context, + */ + +static DEFINE_RWLOCK(neigh_tbl_lock); + +static int neigh_blackhole(struct sk_buff *skb) +{ +	kfree_skb(skb); +	return -ENETDOWN; +} + +/* + * It is random distribution in the interval (1/2)*base...(3/2)*base. + * It corresponds to default IPv6 settings and is not overridable, + * because it is really reasonable choice. + */ + +unsigned long neigh_rand_reach_time(unsigned long base) +{ +	return (base ? (net_random() % base) + (base >> 1) : 0); +} + + +static int neigh_forced_gc(struct neigh_table *tbl) +{ +	int shrunk = 0; +	int i; + +	NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); + +	write_lock_bh(&tbl->lock); +	for (i = 0; i <= tbl->hash_mask; i++) { +		struct neighbour *n, **np; + +		np = &tbl->hash_buckets[i]; +		while ((n = *np) != NULL) { +			/* Neighbour record may be discarded if: +			 * - nobody refers to it. 
+			 * - it is not permanent +			 */ +			write_lock(&n->lock); +			if (atomic_read(&n->refcnt) == 1 && +			    !(n->nud_state & NUD_PERMANENT)) { +				*np	= n->next; +				n->dead = 1; +				shrunk	= 1; +				write_unlock(&n->lock); +				neigh_release(n); +				continue; +			} +			write_unlock(&n->lock); +			np = &n->next; +		} +	} + +	tbl->last_flush = jiffies; + +	write_unlock_bh(&tbl->lock); + +	return shrunk; +} + +static int neigh_del_timer(struct neighbour *n) +{ +	if ((n->nud_state & NUD_IN_TIMER) && +	    del_timer(&n->timer)) { +		neigh_release(n); +		return 1; +	} +	return 0; +} + +static void pneigh_queue_purge(struct sk_buff_head *list) +{ +	struct sk_buff *skb; + +	while ((skb = skb_dequeue(list)) != NULL) { +		dev_put(skb->dev); +		kfree_skb(skb); +	} +} + +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) +{ +	int i; + +	write_lock_bh(&tbl->lock); + +	for (i=0; i <= tbl->hash_mask; i++) { +		struct neighbour *n, **np; + +		np = &tbl->hash_buckets[i]; +		while ((n = *np) != NULL) { +			if (dev && n->dev != dev) { +				np = &n->next; +				continue; +			} +			*np = n->next; +			write_lock_bh(&n->lock); +			n->dead = 1; +			neigh_del_timer(n); +			write_unlock_bh(&n->lock); +			neigh_release(n); +		} +	} + +        write_unlock_bh(&tbl->lock); +} + +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ +	int i; + +	write_lock_bh(&tbl->lock); + +	for (i = 0; i <= tbl->hash_mask; i++) { +		struct neighbour *n, **np = &tbl->hash_buckets[i]; + +		while ((n = *np) != NULL) { +			if (dev && n->dev != dev) { +				np = &n->next; +				continue; +			} +			*np = n->next; +			write_lock(&n->lock); +			neigh_del_timer(n); +			n->dead = 1; + +			if (atomic_read(&n->refcnt) != 1) { +				/* The most unpleasant situation. +				   We must destroy neighbour entry, +				   but someone still uses it. + +				   The destroy will be delayed until +				   the last user releases us, but +				   we must kill timers etc. and move +				   it to safe state. 
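+
+				   Concretely, the code below purges the pending
+				   arp_queue, points n->output at neigh_blackhole()
+				   so late transmissions are dropped, and forces
+				   nud_state to NOARP or NONE so the entry stays
+				   inert until the last reference goes away.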
+				 */ +				skb_queue_purge(&n->arp_queue); +				n->output = neigh_blackhole; +				if (n->nud_state & NUD_VALID) +					n->nud_state = NUD_NOARP; +				else +					n->nud_state = NUD_NONE; +				NEIGH_PRINTK2("neigh %p is stray.\n", n); +			} +			write_unlock(&n->lock); +			neigh_release(n); +		} +	} + +	pneigh_ifdown(tbl, dev); +	write_unlock_bh(&tbl->lock); + +	del_timer_sync(&tbl->proxy_timer); +	pneigh_queue_purge(&tbl->proxy_queue); +	return 0; +} + +static struct neighbour *neigh_alloc(struct neigh_table *tbl) +{ +	struct neighbour *n = NULL; +	unsigned long now = jiffies; +	int entries; + +	entries = atomic_inc_return(&tbl->entries) - 1; +	if (entries >= tbl->gc_thresh3 || +	    (entries >= tbl->gc_thresh2 && +	     time_after(now, tbl->last_flush + 5 * HZ))) { +		if (!neigh_forced_gc(tbl) && +		    entries >= tbl->gc_thresh3) +			goto out_entries; +	} + +	n = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC); +	if (!n) +		goto out_entries; + +	memset(n, 0, tbl->entry_size); + +	skb_queue_head_init(&n->arp_queue); +	rwlock_init(&n->lock); +	n->updated	  = n->used = now; +	n->nud_state	  = NUD_NONE; +	n->output	  = neigh_blackhole; +	n->parms	  = neigh_parms_clone(&tbl->parms); +	init_timer(&n->timer); +	n->timer.function = neigh_timer_handler; +	n->timer.data	  = (unsigned long)n; + +	NEIGH_CACHE_STAT_INC(tbl, allocs); +	n->tbl		  = tbl; +	atomic_set(&n->refcnt, 1); +	n->dead		  = 1; +out: +	return n; + +out_entries: +	atomic_dec(&tbl->entries); +	goto out; +} + +static struct neighbour **neigh_hash_alloc(unsigned int entries) +{ +	unsigned long size = entries * sizeof(struct neighbour *); +	struct neighbour **ret; + +	if (size <= PAGE_SIZE) { +		ret = kmalloc(size, GFP_ATOMIC); +	} else { +		ret = (struct neighbour **) +			__get_free_pages(GFP_ATOMIC, get_order(size)); +	} +	if (ret) +		memset(ret, 0, size); + +	return ret; +} + +static void neigh_hash_free(struct neighbour **hash, unsigned int entries) +{ +	unsigned long size = entries * sizeof(struct neighbour *); + +	if (size <= PAGE_SIZE) +		kfree(hash); +	else +		free_pages((unsigned long)hash, get_order(size)); +} + +static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries) +{ +	struct neighbour **new_hash, **old_hash; +	unsigned int i, new_hash_mask, old_entries; + +	NEIGH_CACHE_STAT_INC(tbl, hash_grows); + +	BUG_ON(new_entries & (new_entries - 1)); +	new_hash = neigh_hash_alloc(new_entries); +	if (!new_hash) +		return; + +	old_entries = tbl->hash_mask + 1; +	new_hash_mask = new_entries - 1; +	old_hash = tbl->hash_buckets; + +	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); +	for (i = 0; i < old_entries; i++) { +		struct neighbour *n, *next; + +		for (n = old_hash[i]; n; n = next) { +			unsigned int hash_val = tbl->hash(n->primary_key, n->dev); + +			hash_val &= new_hash_mask; +			next = n->next; + +			n->next = new_hash[hash_val]; +			new_hash[hash_val] = n; +		} +	} +	tbl->hash_buckets = new_hash; +	tbl->hash_mask = new_hash_mask; + +	neigh_hash_free(old_hash, old_entries); +} + +struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, +			       struct net_device *dev) +{ +	struct neighbour *n; +	int key_len = tbl->key_len; +	u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; +	 +	NEIGH_CACHE_STAT_INC(tbl, lookups); + +	read_lock_bh(&tbl->lock); +	for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { +		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { +			neigh_hold(n); +			NEIGH_CACHE_STAT_INC(tbl, hits); +			break; +		} +	} +	read_unlock_bh(&tbl->lock); +	
return n; +} + +struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey) +{ +	struct neighbour *n; +	int key_len = tbl->key_len; +	u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask; + +	NEIGH_CACHE_STAT_INC(tbl, lookups); + +	read_lock_bh(&tbl->lock); +	for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { +		if (!memcmp(n->primary_key, pkey, key_len)) { +			neigh_hold(n); +			NEIGH_CACHE_STAT_INC(tbl, hits); +			break; +		} +	} +	read_unlock_bh(&tbl->lock); +	return n; +} + +struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, +			       struct net_device *dev) +{ +	u32 hash_val; +	int key_len = tbl->key_len; +	int error; +	struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + +	if (!n) { +		rc = ERR_PTR(-ENOBUFS); +		goto out; +	} + +	memcpy(n->primary_key, pkey, key_len); +	n->dev = dev; +	dev_hold(dev); + +	/* Protocol specific setup. */ +	if (tbl->constructor &&	(error = tbl->constructor(n)) < 0) { +		rc = ERR_PTR(error); +		goto out_neigh_release; +	} + +	/* Device specific setup. */ +	if (n->parms->neigh_setup && +	    (error = n->parms->neigh_setup(n)) < 0) { +		rc = ERR_PTR(error); +		goto out_neigh_release; +	} + +	n->confirmed = jiffies - (n->parms->base_reachable_time << 1); + +	write_lock_bh(&tbl->lock); + +	if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1)) +		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1); + +	hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + +	if (n->parms->dead) { +		rc = ERR_PTR(-EINVAL); +		goto out_tbl_unlock; +	} + +	for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { +		if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { +			neigh_hold(n1); +			rc = n1; +			goto out_tbl_unlock; +		} +	} + +	n->next = tbl->hash_buckets[hash_val]; +	tbl->hash_buckets[hash_val] = n; +	n->dead = 0; +	neigh_hold(n); +	write_unlock_bh(&tbl->lock); +	NEIGH_PRINTK2("neigh %p is created.\n", n); +	rc = n; +out: +	return rc; +out_tbl_unlock: +	write_unlock_bh(&tbl->lock); +out_neigh_release: +	neigh_release(n); +	goto out; +} + +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, +				    struct net_device *dev, int creat) +{ +	struct pneigh_entry *n; +	int key_len = tbl->key_len; +	u32 hash_val = *(u32 *)(pkey + key_len - 4); + +	hash_val ^= (hash_val >> 16); +	hash_val ^= hash_val >> 8; +	hash_val ^= hash_val >> 4; +	hash_val &= PNEIGH_HASHMASK; + +	read_lock_bh(&tbl->lock); + +	for (n = tbl->phash_buckets[hash_val]; n; n = n->next) { +		if (!memcmp(n->key, pkey, key_len) && +		    (n->dev == dev || !n->dev)) { +			read_unlock_bh(&tbl->lock); +			goto out; +		} +	} +	read_unlock_bh(&tbl->lock); +	n = NULL; +	if (!creat) +		goto out; + +	n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); +	if (!n) +		goto out; + +	memcpy(n->key, pkey, key_len); +	n->dev = dev; +	if (dev) +		dev_hold(dev); + +	if (tbl->pconstructor && tbl->pconstructor(n)) { +		if (dev) +			dev_put(dev); +		kfree(n); +		n = NULL; +		goto out; +	} + +	write_lock_bh(&tbl->lock); +	n->next = tbl->phash_buckets[hash_val]; +	tbl->phash_buckets[hash_val] = n; +	write_unlock_bh(&tbl->lock); +out: +	return n; +} + + +int pneigh_delete(struct neigh_table *tbl, const void *pkey, +		  struct net_device *dev) +{ +	struct pneigh_entry *n, **np; +	int key_len = tbl->key_len; +	u32 hash_val = *(u32 *)(pkey + key_len - 4); + +	hash_val ^= (hash_val >> 16); +	hash_val ^= hash_val >> 8; +	hash_val ^= hash_val >> 4; +	hash_val &= PNEIGH_HASHMASK; + +	write_lock_bh(&tbl->lock); +	for (np = &tbl->phash_buckets[hash_val]; (n = *np) != 
NULL; +	     np = &n->next) { +		if (!memcmp(n->key, pkey, key_len) && n->dev == dev) { +			*np = n->next; +			write_unlock_bh(&tbl->lock); +			if (tbl->pdestructor) +				tbl->pdestructor(n); +			if (n->dev) +				dev_put(n->dev); +			kfree(n); +			return 0; +		} +	} +	write_unlock_bh(&tbl->lock); +	return -ENOENT; +} + +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ +	struct pneigh_entry *n, **np; +	u32 h; + +	for (h = 0; h <= PNEIGH_HASHMASK; h++) { +		np = &tbl->phash_buckets[h]; +		while ((n = *np) != NULL) { +			if (!dev || n->dev == dev) { +				*np = n->next; +				if (tbl->pdestructor) +					tbl->pdestructor(n); +				if (n->dev) +					dev_put(n->dev); +				kfree(n); +				continue; +			} +			np = &n->next; +		} +	} +	return -ENOENT; +} + + +/* + *	neighbour must already be out of the table; + * + */ +void neigh_destroy(struct neighbour *neigh) +{ +	struct hh_cache *hh; + +	NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); + +	if (!neigh->dead) { +		printk(KERN_WARNING +		       "Destroying alive neighbour %p\n", neigh); +		dump_stack(); +		return; +	} + +	if (neigh_del_timer(neigh)) +		printk(KERN_WARNING "Impossible event.\n"); + +	while ((hh = neigh->hh) != NULL) { +		neigh->hh = hh->hh_next; +		hh->hh_next = NULL; +		write_lock_bh(&hh->hh_lock); +		hh->hh_output = neigh_blackhole; +		write_unlock_bh(&hh->hh_lock); +		if (atomic_dec_and_test(&hh->hh_refcnt)) +			kfree(hh); +	} + +	if (neigh->ops && neigh->ops->destructor) +		(neigh->ops->destructor)(neigh); + +	skb_queue_purge(&neigh->arp_queue); + +	dev_put(neigh->dev); +	neigh_parms_put(neigh->parms); + +	NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + +	atomic_dec(&neigh->tbl->entries); +	kmem_cache_free(neigh->tbl->kmem_cachep, neigh); +} + +/* Neighbour state is suspicious; +   disable fast path. + +   Called with write_locked neigh. + */ +static void neigh_suspect(struct neighbour *neigh) +{ +	struct hh_cache *hh; + +	NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + +	neigh->output = neigh->ops->output; + +	for (hh = neigh->hh; hh; hh = hh->hh_next) +		hh->hh_output = neigh->ops->output; +} + +/* Neighbour state is OK; +   enable fast path. + +   Called with write_locked neigh. 
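+
+   Here "fast path" means ops->connected_output and the cached
+   hh->hh_output header paths, which prepend the cached link layer
+   header without re-checking neighbour state per packet; the slow
+   path installed by neigh_suspect() (ops->output, typically
+   neigh_resolve_output() below) re-validates the entry through
+   neigh_event_send() before each transmit.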
+ */ +static void neigh_connect(struct neighbour *neigh) +{ +	struct hh_cache *hh; + +	NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + +	neigh->output = neigh->ops->connected_output; + +	for (hh = neigh->hh; hh; hh = hh->hh_next) +		hh->hh_output = neigh->ops->hh_output; +} + +static void neigh_periodic_timer(unsigned long arg) +{ +	struct neigh_table *tbl = (struct neigh_table *)arg; +	struct neighbour *n, **np; +	unsigned long expire, now = jiffies; + +	NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); + +	write_lock(&tbl->lock); + +	/* +	 *	periodically recompute ReachableTime from random function +	 */ + +	if (time_after(now, tbl->last_rand + 300 * HZ)) { +		struct neigh_parms *p; +		tbl->last_rand = now; +		for (p = &tbl->parms; p; p = p->next) +			p->reachable_time = +				neigh_rand_reach_time(p->base_reachable_time); +	} + +	np = &tbl->hash_buckets[tbl->hash_chain_gc]; +	tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask); + +	while ((n = *np) != NULL) { +		unsigned int state; + +		write_lock(&n->lock); + +		state = n->nud_state; +		if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { +			write_unlock(&n->lock); +			goto next_elt; +		} + +		if (time_before(n->used, n->confirmed)) +			n->used = n->confirmed; + +		if (atomic_read(&n->refcnt) == 1 && +		    (state == NUD_FAILED || +		     time_after(now, n->used + n->parms->gc_staletime))) { +			*np = n->next; +			n->dead = 1; +			write_unlock(&n->lock); +			neigh_release(n); +			continue; +		} +		write_unlock(&n->lock); + +next_elt: +		np = &n->next; +	} + + 	/* Cycle through all hash buckets every base_reachable_time/2 ticks. + 	 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 + 	 * base_reachable_time. +	 */ +	expire = tbl->parms.base_reachable_time >> 1; +	expire /= (tbl->hash_mask + 1); +	if (!expire) +		expire = 1; + + 	mod_timer(&tbl->gc_timer, now + expire); + +	write_unlock(&tbl->lock); +} + +static __inline__ int neigh_max_probes(struct neighbour *n) +{ +	struct neigh_parms *p = n->parms; +	return (n->nud_state & NUD_PROBE ? +		p->ucast_probes : +		p->ucast_probes + p->app_probes + p->mcast_probes); +} + + +/* Called when a timer expires for a neighbour entry. 
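+
+   In summary, the handler below moves entries through the NUD states:
+   - REACHABLE past reachable_time becomes DELAY if the entry was used
+     within delay_probe_time, otherwise STALE;
+   - DELAY becomes REACHABLE again if it was confirmed in time,
+     otherwise PROBE;
+   - INCOMPLETE/PROBE entries are re-solicited every retrans_time and
+     become FAILED once neigh_max_probes() probes go unanswered, at
+     which point queued packets are handed to ops->error_report().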
*/ + +static void neigh_timer_handler(unsigned long arg) +{ +	unsigned long now, next; +	struct neighbour *neigh = (struct neighbour *)arg; +	unsigned state; +	int notify = 0; + +	write_lock(&neigh->lock); + +	state = neigh->nud_state; +	now = jiffies; +	next = now + HZ; + +	if (!(state & NUD_IN_TIMER)) { +#ifndef CONFIG_SMP +		printk(KERN_WARNING "neigh: timer & !nud_in_timer\n"); +#endif +		goto out; +	} + +	if (state & NUD_REACHABLE) { +		if (time_before_eq(now,  +				   neigh->confirmed + neigh->parms->reachable_time)) { +			NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); +			next = neigh->confirmed + neigh->parms->reachable_time; +		} else if (time_before_eq(now, +					  neigh->used + neigh->parms->delay_probe_time)) { +			NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); +			neigh->nud_state = NUD_DELAY; +			neigh_suspect(neigh); +			next = now + neigh->parms->delay_probe_time; +		} else { +			NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); +			neigh->nud_state = NUD_STALE; +			neigh_suspect(neigh); +		} +	} else if (state & NUD_DELAY) { +		if (time_before_eq(now,  +				   neigh->confirmed + neigh->parms->delay_probe_time)) { +			NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh); +			neigh->nud_state = NUD_REACHABLE; +			neigh_connect(neigh); +			next = neigh->confirmed + neigh->parms->reachable_time; +		} else { +			NEIGH_PRINTK2("neigh %p is probed.\n", neigh); +			neigh->nud_state = NUD_PROBE; +			atomic_set(&neigh->probes, 0); +			next = now + neigh->parms->retrans_time; +		} +	} else { +		/* NUD_PROBE|NUD_INCOMPLETE */ +		next = now + neigh->parms->retrans_time; +	} + +	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && +	    atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { +		struct sk_buff *skb; + +		neigh->nud_state = NUD_FAILED; +		notify = 1; +		NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); +		NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + +		/* It is very thin place. report_unreachable is very complicated +		   routine. Particularly, it can hit the same neighbour entry! + +		   So that, we try to be accurate and avoid dead loop. 
--ANK +		 */ +		while (neigh->nud_state == NUD_FAILED && +		       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { +			write_unlock(&neigh->lock); +			neigh->ops->error_report(neigh, skb); +			write_lock(&neigh->lock); +		} +		skb_queue_purge(&neigh->arp_queue); +	} + +	if (neigh->nud_state & NUD_IN_TIMER) { +		neigh_hold(neigh); +		if (time_before(next, jiffies + HZ/2)) +			next = jiffies + HZ/2; +		neigh->timer.expires = next; +		add_timer(&neigh->timer); +	} +	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { +		struct sk_buff *skb = skb_peek(&neigh->arp_queue); +		/* keep skb alive even if arp_queue overflows */ +		if (skb) +			skb_get(skb); +		write_unlock(&neigh->lock); +		neigh->ops->solicit(neigh, skb); +		atomic_inc(&neigh->probes); +		if (skb) +			kfree_skb(skb); +	} else { +out: +		write_unlock(&neigh->lock); +	} + +#ifdef CONFIG_ARPD +	if (notify && neigh->parms->app_probes) +		neigh_app_notify(neigh); +#endif +	neigh_release(neigh); +} + +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ +	int rc; +	unsigned long now; + +	write_lock_bh(&neigh->lock); + +	rc = 0; +	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) +		goto out_unlock_bh; + +	now = jiffies; +	 +	if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { +		if (neigh->parms->mcast_probes + neigh->parms->app_probes) { +			atomic_set(&neigh->probes, neigh->parms->ucast_probes); +			neigh->nud_state     = NUD_INCOMPLETE; +			neigh_hold(neigh); +			neigh->timer.expires = now + 1; +			add_timer(&neigh->timer); +		} else { +			neigh->nud_state = NUD_FAILED; +			write_unlock_bh(&neigh->lock); + +			if (skb) +				kfree_skb(skb); +			return 1; +		} +	} else if (neigh->nud_state & NUD_STALE) { +		NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); +		neigh_hold(neigh); +		neigh->nud_state = NUD_DELAY; +		neigh->timer.expires = jiffies + neigh->parms->delay_probe_time; +		add_timer(&neigh->timer); +	} + +	if (neigh->nud_state == NUD_INCOMPLETE) { +		if (skb) { +			if (skb_queue_len(&neigh->arp_queue) >= +			    neigh->parms->queue_len) { +				struct sk_buff *buff; +				buff = neigh->arp_queue.next; +				__skb_unlink(buff, &neigh->arp_queue); +				kfree_skb(buff); +			} +			__skb_queue_tail(&neigh->arp_queue, skb); +		} +		rc = 1; +	} +out_unlock_bh: +	write_unlock_bh(&neigh->lock); +	return rc; +} + +static __inline__ void neigh_update_hhs(struct neighbour *neigh) +{ +	struct hh_cache *hh; +	void (*update)(struct hh_cache*, struct net_device*, unsigned char *) = +		neigh->dev->header_cache_update; + +	if (update) { +		for (hh = neigh->hh; hh; hh = hh->hh_next) { +			write_lock_bh(&hh->hh_lock); +			update(hh, neigh->dev, neigh->ha); +			write_unlock_bh(&hh->hh_lock); +		} +	} +} + + + +/* Generic update routine. +   -- lladdr is new lladdr or NULL, if it is not supplied. +   -- new    is new state. +   -- flags +	NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, +				if it is different. +	NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" +				lladdr instead of overriding it  +				if it is different. +				It also allows to retain current state +				if lladdr is unchanged. +	NEIGH_UPDATE_F_ADMIN	means that the change is administrative. + +	NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing  +				NTF_ROUTER flag. +	NEIGH_UPDATE_F_ISROUTER	indicates if the neighbour is known as +				a router. + +   Caller MUST hold reference count on the entry. 
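+
+   For illustration (the first call is only a typical pattern, not a
+   quote from a particular protocol): a resolver that has just learned
+   a confirmed link-layer address would do
+
+	neigh_update(neigh, lladdr, NUD_REACHABLE, NEIGH_UPDATE_F_OVERRIDE);
+
+   while the administrative delete path (see neigh_delete() below) uses
+
+	neigh_update(n, NULL, NUD_FAILED,
+		     NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN);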
+ */ + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, +		 u32 flags) +{ +	u8 old; +	int err; +#ifdef CONFIG_ARPD +	int notify = 0; +#endif +	struct net_device *dev; +	int update_isrouter = 0; + +	write_lock_bh(&neigh->lock); + +	dev    = neigh->dev; +	old    = neigh->nud_state; +	err    = -EPERM; + +	if (!(flags & NEIGH_UPDATE_F_ADMIN) &&  +	    (old & (NUD_NOARP | NUD_PERMANENT))) +		goto out; + +	if (!(new & NUD_VALID)) { +		neigh_del_timer(neigh); +		if (old & NUD_CONNECTED) +			neigh_suspect(neigh); +		neigh->nud_state = new; +		err = 0; +#ifdef CONFIG_ARPD +		notify = old & NUD_VALID; +#endif +		goto out; +	} + +	/* Compare new lladdr with cached one */ +	if (!dev->addr_len) { +		/* First case: device needs no address. */ +		lladdr = neigh->ha; +	} else if (lladdr) { +		/* The second case: if something is already cached +		   and a new address is proposed: +		   - compare new & old +		   - if they are different, check override flag +		 */ +		if ((old & NUD_VALID) &&  +		    !memcmp(lladdr, neigh->ha, dev->addr_len)) +			lladdr = neigh->ha; +	} else { +		/* No address is supplied; if we know something, +		   use it, otherwise discard the request. +		 */ +		err = -EINVAL; +		if (!(old & NUD_VALID)) +			goto out; +		lladdr = neigh->ha; +	} + +	if (new & NUD_CONNECTED) +		neigh->confirmed = jiffies; +	neigh->updated = jiffies; + +	/* If entry was valid and address is not changed, +	   do not change entry state, if new one is STALE. +	 */ +	err = 0; +	update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; +	if (old & NUD_VALID) { +		if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { +			update_isrouter = 0; +			if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && +			    (old & NUD_CONNECTED)) { +				lladdr = neigh->ha; +				new = NUD_STALE; +			} else +				goto out; +		} else { +			if (lladdr == neigh->ha && new == NUD_STALE && +			    ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) || +			     (old & NUD_CONNECTED)) +			    ) +				new = old; +		} +	} + +	if (new != old) { +		neigh_del_timer(neigh); +		if (new & NUD_IN_TIMER) { +			neigh_hold(neigh); +			neigh->timer.expires = jiffies +  +						((new & NUD_REACHABLE) ?  +						 neigh->parms->reachable_time : 0); +			add_timer(&neigh->timer); +		} +		neigh->nud_state = new; +	} + +	if (lladdr != neigh->ha) { +		memcpy(&neigh->ha, lladdr, dev->addr_len); +		neigh_update_hhs(neigh); +		if (!(new & NUD_CONNECTED)) +			neigh->confirmed = jiffies - +				      (neigh->parms->base_reachable_time << 1); +#ifdef CONFIG_ARPD +		notify = 1; +#endif +	} +	if (new == old) +		goto out; +	if (new & NUD_CONNECTED) +		neigh_connect(neigh); +	else +		neigh_suspect(neigh); +	if (!(old & NUD_VALID)) { +		struct sk_buff *skb; + +		/* Again: avoid dead loop if something went wrong */ + +		while (neigh->nud_state & NUD_VALID && +		       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { +			struct neighbour *n1 = neigh; +			write_unlock_bh(&neigh->lock); +			/* On shaper/eql skb->dst->neighbour != neigh :( */ +			if (skb->dst && skb->dst->neighbour) +				n1 = skb->dst->neighbour; +			n1->output(skb); +			write_lock_bh(&neigh->lock); +		} +		skb_queue_purge(&neigh->arp_queue); +	} +out: +	if (update_isrouter) { +		neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? 
+			(neigh->flags | NTF_ROUTER) : +			(neigh->flags & ~NTF_ROUTER); +	} +	write_unlock_bh(&neigh->lock); +#ifdef CONFIG_ARPD +	if (notify && neigh->parms->app_probes) +		neigh_app_notify(neigh); +#endif +	return err; +} + +struct neighbour *neigh_event_ns(struct neigh_table *tbl, +				 u8 *lladdr, void *saddr, +				 struct net_device *dev) +{ +	struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, +						 lladdr || !dev->addr_len); +	if (neigh) +		neigh_update(neigh, lladdr, NUD_STALE,  +			     NEIGH_UPDATE_F_OVERRIDE); +	return neigh; +} + +static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, +			  u16 protocol) +{ +	struct hh_cache	*hh; +	struct net_device *dev = dst->dev; + +	for (hh = n->hh; hh; hh = hh->hh_next) +		if (hh->hh_type == protocol) +			break; + +	if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { +		memset(hh, 0, sizeof(struct hh_cache)); +		rwlock_init(&hh->hh_lock); +		hh->hh_type = protocol; +		atomic_set(&hh->hh_refcnt, 0); +		hh->hh_next = NULL; +		if (dev->hard_header_cache(n, hh)) { +			kfree(hh); +			hh = NULL; +		} else { +			atomic_inc(&hh->hh_refcnt); +			hh->hh_next = n->hh; +			n->hh	    = hh; +			if (n->nud_state & NUD_CONNECTED) +				hh->hh_output = n->ops->hh_output; +			else +				hh->hh_output = n->ops->output; +		} +	} +	if (hh)	{ +		atomic_inc(&hh->hh_refcnt); +		dst->hh = hh; +	} +} + +/* This function can be used in contexts, where only old dev_queue_xmit +   worked, f.e. if you want to override normal output path (eql, shaper), +   but resolution is not made yet. + */ + +int neigh_compat_output(struct sk_buff *skb) +{ +	struct net_device *dev = skb->dev; + +	__skb_pull(skb, skb->nh.raw - skb->data); + +	if (dev->hard_header && +	    dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, +		    	     skb->len) < 0 && +	    dev->rebuild_header(skb)) +		return 0; + +	return dev_queue_xmit(skb); +} + +/* Slow and careful. */ + +int neigh_resolve_output(struct sk_buff *skb) +{ +	struct dst_entry *dst = skb->dst; +	struct neighbour *neigh; +	int rc = 0; + +	if (!dst || !(neigh = dst->neighbour)) +		goto discard; + +	__skb_pull(skb, skb->nh.raw - skb->data); + +	if (!neigh_event_send(neigh, skb)) { +		int err; +		struct net_device *dev = neigh->dev; +		if (dev->hard_header_cache && !dst->hh) { +			write_lock_bh(&neigh->lock); +			if (!dst->hh) +				neigh_hh_init(neigh, dst, dst->ops->protocol); +			err = dev->hard_header(skb, dev, ntohs(skb->protocol), +					       neigh->ha, NULL, skb->len); +			write_unlock_bh(&neigh->lock); +		} else { +			read_lock_bh(&neigh->lock); +			err = dev->hard_header(skb, dev, ntohs(skb->protocol), +					       neigh->ha, NULL, skb->len); +			read_unlock_bh(&neigh->lock); +		} +		if (err >= 0) +			rc = neigh->ops->queue_xmit(skb); +		else +			goto out_kfree_skb; +	} +out: +	return rc; +discard: +	NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", +		      dst, dst ? 
dst->neighbour : NULL); +out_kfree_skb: +	rc = -EINVAL; +	kfree_skb(skb); +	goto out; +} + +/* As fast as possible without hh cache */ + +int neigh_connected_output(struct sk_buff *skb) +{ +	int err; +	struct dst_entry *dst = skb->dst; +	struct neighbour *neigh = dst->neighbour; +	struct net_device *dev = neigh->dev; + +	__skb_pull(skb, skb->nh.raw - skb->data); + +	read_lock_bh(&neigh->lock); +	err = dev->hard_header(skb, dev, ntohs(skb->protocol), +			       neigh->ha, NULL, skb->len); +	read_unlock_bh(&neigh->lock); +	if (err >= 0) +		err = neigh->ops->queue_xmit(skb); +	else { +		err = -EINVAL; +		kfree_skb(skb); +	} +	return err; +} + +static void neigh_proxy_process(unsigned long arg) +{ +	struct neigh_table *tbl = (struct neigh_table *)arg; +	long sched_next = 0; +	unsigned long now = jiffies; +	struct sk_buff *skb; + +	spin_lock(&tbl->proxy_queue.lock); + +	skb = tbl->proxy_queue.next; + +	while (skb != (struct sk_buff *)&tbl->proxy_queue) { +		struct sk_buff *back = skb; +		long tdif = back->stamp.tv_usec - now; + +		skb = skb->next; +		if (tdif <= 0) { +			struct net_device *dev = back->dev; +			__skb_unlink(back, &tbl->proxy_queue); +			if (tbl->proxy_redo && netif_running(dev)) +				tbl->proxy_redo(back); +			else +				kfree_skb(back); + +			dev_put(dev); +		} else if (!sched_next || tdif < sched_next) +			sched_next = tdif; +	} +	del_timer(&tbl->proxy_timer); +	if (sched_next) +		mod_timer(&tbl->proxy_timer, jiffies + sched_next); +	spin_unlock(&tbl->proxy_queue.lock); +} + +void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, +		    struct sk_buff *skb) +{ +	unsigned long now = jiffies; +	unsigned long sched_next = now + (net_random() % p->proxy_delay); + +	if (tbl->proxy_queue.qlen > p->proxy_qlen) { +		kfree_skb(skb); +		return; +	} +	skb->stamp.tv_sec  = LOCALLY_ENQUEUED; +	skb->stamp.tv_usec = sched_next; + +	spin_lock(&tbl->proxy_queue.lock); +	if (del_timer(&tbl->proxy_timer)) { +		if (time_before(tbl->proxy_timer.expires, sched_next)) +			sched_next = tbl->proxy_timer.expires; +	} +	dst_release(skb->dst); +	skb->dst = NULL; +	dev_hold(skb->dev); +	__skb_queue_tail(&tbl->proxy_queue, skb); +	mod_timer(&tbl->proxy_timer, sched_next); +	spin_unlock(&tbl->proxy_queue.lock); +} + + +struct neigh_parms *neigh_parms_alloc(struct net_device *dev, +				      struct neigh_table *tbl) +{ +	struct neigh_parms *p = kmalloc(sizeof(*p), GFP_KERNEL); + +	if (p) { +		memcpy(p, &tbl->parms, sizeof(*p)); +		p->tbl		  = tbl; +		atomic_set(&p->refcnt, 1); +		INIT_RCU_HEAD(&p->rcu_head); +		p->reachable_time = +				neigh_rand_reach_time(p->base_reachable_time); +		if (dev && dev->neigh_setup && dev->neigh_setup(dev, p)) { +			kfree(p); +			return NULL; +		} +		p->sysctl_table = NULL; +		write_lock_bh(&tbl->lock); +		p->next		= tbl->parms.next; +		tbl->parms.next = p; +		write_unlock_bh(&tbl->lock); +	} +	return p; +} + +static void neigh_rcu_free_parms(struct rcu_head *head) +{ +	struct neigh_parms *parms = +		container_of(head, struct neigh_parms, rcu_head); + +	neigh_parms_put(parms); +} + +void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) +{ +	struct neigh_parms **p; + +	if (!parms || parms == &tbl->parms) +		return; +	write_lock_bh(&tbl->lock); +	for (p = &tbl->parms.next; *p; p = &(*p)->next) { +		if (*p == parms) { +			*p = parms->next; +			parms->dead = 1; +			write_unlock_bh(&tbl->lock); +			call_rcu(&parms->rcu_head, neigh_rcu_free_parms); +			return; +		} +	} +	write_unlock_bh(&tbl->lock); +	NEIGH_PRINTK1("neigh_parms_release: not 
found\n"); +} + +void neigh_parms_destroy(struct neigh_parms *parms) +{ +	kfree(parms); +} + + +void neigh_table_init(struct neigh_table *tbl) +{ +	unsigned long now = jiffies; +	unsigned long phsize; + +	atomic_set(&tbl->parms.refcnt, 1); +	INIT_RCU_HEAD(&tbl->parms.rcu_head); +	tbl->parms.reachable_time = +			  neigh_rand_reach_time(tbl->parms.base_reachable_time); + +	if (!tbl->kmem_cachep) +		tbl->kmem_cachep = kmem_cache_create(tbl->id, +						     tbl->entry_size, +						     0, SLAB_HWCACHE_ALIGN, +						     NULL, NULL); + +	if (!tbl->kmem_cachep) +		panic("cannot create neighbour cache"); + +	tbl->stats = alloc_percpu(struct neigh_statistics); +	if (!tbl->stats) +		panic("cannot create neighbour cache statistics"); +	 +#ifdef CONFIG_PROC_FS +	tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat); +	if (!tbl->pde)  +		panic("cannot create neighbour proc dir entry"); +	tbl->pde->proc_fops = &neigh_stat_seq_fops; +	tbl->pde->data = tbl; +#endif + +	tbl->hash_mask = 1; +	tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1); + +	phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); +	tbl->phash_buckets = kmalloc(phsize, GFP_KERNEL); + +	if (!tbl->hash_buckets || !tbl->phash_buckets) +		panic("cannot allocate neighbour cache hashes"); + +	memset(tbl->phash_buckets, 0, phsize); + +	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + +	rwlock_init(&tbl->lock); +	init_timer(&tbl->gc_timer); +	tbl->gc_timer.data     = (unsigned long)tbl; +	tbl->gc_timer.function = neigh_periodic_timer; +	tbl->gc_timer.expires  = now + 1; +	add_timer(&tbl->gc_timer); + +	init_timer(&tbl->proxy_timer); +	tbl->proxy_timer.data	  = (unsigned long)tbl; +	tbl->proxy_timer.function = neigh_proxy_process; +	skb_queue_head_init(&tbl->proxy_queue); + +	tbl->last_flush = now; +	tbl->last_rand	= now + tbl->parms.reachable_time * 20; +	write_lock(&neigh_tbl_lock); +	tbl->next	= neigh_tables; +	neigh_tables	= tbl; +	write_unlock(&neigh_tbl_lock); +} + +int neigh_table_clear(struct neigh_table *tbl) +{ +	struct neigh_table **tp; + +	/* It is not clean... 
Fix it to unload IPv6 module safely */ +	del_timer_sync(&tbl->gc_timer); +	del_timer_sync(&tbl->proxy_timer); +	pneigh_queue_purge(&tbl->proxy_queue); +	neigh_ifdown(tbl, NULL); +	if (atomic_read(&tbl->entries)) +		printk(KERN_CRIT "neighbour leakage\n"); +	write_lock(&neigh_tbl_lock); +	for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { +		if (*tp == tbl) { +			*tp = tbl->next; +			break; +		} +	} +	write_unlock(&neigh_tbl_lock); + +	neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); +	tbl->hash_buckets = NULL; + +	kfree(tbl->phash_buckets); +	tbl->phash_buckets = NULL; + +	return 0; +} + +int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ +	struct ndmsg *ndm = NLMSG_DATA(nlh); +	struct rtattr **nda = arg; +	struct neigh_table *tbl; +	struct net_device *dev = NULL; +	int err = -ENODEV; + +	if (ndm->ndm_ifindex && +	    (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) +		goto out; + +	read_lock(&neigh_tbl_lock); +	for (tbl = neigh_tables; tbl; tbl = tbl->next) { +		struct rtattr *dst_attr = nda[NDA_DST - 1]; +		struct neighbour *n; + +		if (tbl->family != ndm->ndm_family) +			continue; +		read_unlock(&neigh_tbl_lock); + +		err = -EINVAL; +		if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len) +			goto out_dev_put; + +		if (ndm->ndm_flags & NTF_PROXY) { +			err = pneigh_delete(tbl, RTA_DATA(dst_attr), dev); +			goto out_dev_put; +		} + +		if (!dev) +			goto out; + +		n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev); +		if (n) { +			err = neigh_update(n, NULL, NUD_FAILED,  +					   NEIGH_UPDATE_F_OVERRIDE| +					   NEIGH_UPDATE_F_ADMIN); +			neigh_release(n); +		} +		goto out_dev_put; +	} +	read_unlock(&neigh_tbl_lock); +	err = -EADDRNOTAVAIL; +out_dev_put: +	if (dev) +		dev_put(dev); +out: +	return err; +} + +int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ +	struct ndmsg *ndm = NLMSG_DATA(nlh); +	struct rtattr **nda = arg; +	struct neigh_table *tbl; +	struct net_device *dev = NULL; +	int err = -ENODEV; + +	if (ndm->ndm_ifindex && +	    (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) +		goto out; + +	read_lock(&neigh_tbl_lock); +	for (tbl = neigh_tables; tbl; tbl = tbl->next) { +		struct rtattr *lladdr_attr = nda[NDA_LLADDR - 1]; +		struct rtattr *dst_attr = nda[NDA_DST - 1]; +		int override = 1; +		struct neighbour *n; + +		if (tbl->family != ndm->ndm_family) +			continue; +		read_unlock(&neigh_tbl_lock); + +		err = -EINVAL; +		if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len) +			goto out_dev_put; + +		if (ndm->ndm_flags & NTF_PROXY) { +			err = -ENOBUFS; +			if (pneigh_lookup(tbl, RTA_DATA(dst_attr), dev, 1)) +				err = 0; +			goto out_dev_put; +		} + +		err = -EINVAL; +		if (!dev) +			goto out; +		if (lladdr_attr && RTA_PAYLOAD(lladdr_attr) < dev->addr_len) +			goto out_dev_put; +	 +		n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev); +		if (n) { +			if (nlh->nlmsg_flags & NLM_F_EXCL) { +				err = -EEXIST; +				neigh_release(n); +				goto out_dev_put; +			} +			 +			override = nlh->nlmsg_flags & NLM_F_REPLACE; +		} else if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { +			err = -ENOENT; +			goto out_dev_put; +		} else { +			n = __neigh_lookup_errno(tbl, RTA_DATA(dst_attr), dev); +			if (IS_ERR(n)) { +				err = PTR_ERR(n); +				goto out_dev_put; +			} +		} + +		err = neigh_update(n, +				   lladdr_attr ? RTA_DATA(lladdr_attr) : NULL, +				   ndm->ndm_state, +				   (override ? 
NEIGH_UPDATE_F_OVERRIDE : 0) | +				   NEIGH_UPDATE_F_ADMIN); + +		neigh_release(n); +		goto out_dev_put; +	} + +	read_unlock(&neigh_tbl_lock); +	err = -EADDRNOTAVAIL; +out_dev_put: +	if (dev) +		dev_put(dev); +out: +	return err; +} + + +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n, +			   u32 pid, u32 seq, int event) +{ +	unsigned long now = jiffies; +	unsigned char *b = skb->tail; +	struct nda_cacheinfo ci; +	int locked = 0; +	u32 probes; +	struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, event, +					 sizeof(struct ndmsg)); +	struct ndmsg *ndm = NLMSG_DATA(nlh); + +	nlh->nlmsg_flags = pid ? NLM_F_MULTI : 0; +	ndm->ndm_family	 = n->ops->family; +	ndm->ndm_flags	 = n->flags; +	ndm->ndm_type	 = n->type; +	ndm->ndm_ifindex = n->dev->ifindex; +	RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key); +	read_lock_bh(&n->lock); +	locked		 = 1; +	ndm->ndm_state	 = n->nud_state; +	if (n->nud_state & NUD_VALID) +		RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha); +	ci.ndm_used	 = now - n->used; +	ci.ndm_confirmed = now - n->confirmed; +	ci.ndm_updated	 = now - n->updated; +	ci.ndm_refcnt	 = atomic_read(&n->refcnt) - 1; +	probes = atomic_read(&n->probes); +	read_unlock_bh(&n->lock); +	locked		 = 0; +	RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); +	RTA_PUT(skb, NDA_PROBES, sizeof(probes), &probes); +	nlh->nlmsg_len	 = skb->tail - b; +	return skb->len; + +nlmsg_failure: +rtattr_failure: +	if (locked) +		read_unlock_bh(&n->lock); +	skb_trim(skb, b - skb->data); +	return -1; +} + + +static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, +			    struct netlink_callback *cb) +{ +	struct neighbour *n; +	int rc, h, s_h = cb->args[1]; +	int idx, s_idx = idx = cb->args[2]; + +	for (h = 0; h <= tbl->hash_mask; h++) { +		if (h < s_h) +			continue; +		if (h > s_h) +			s_idx = 0; +		read_lock_bh(&tbl->lock); +		for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) { +			if (idx < s_idx) +				continue; +			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, +					    cb->nlh->nlmsg_seq, +					    RTM_NEWNEIGH) <= 0) { +				read_unlock_bh(&tbl->lock); +				rc = -1; +				goto out; +			} +		} +		read_unlock_bh(&tbl->lock); +	} +	rc = skb->len; +out: +	cb->args[1] = h; +	cb->args[2] = idx; +	return rc; +} + +int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct neigh_table *tbl; +	int t, family, s_t; + +	read_lock(&neigh_tbl_lock); +	family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family; +	s_t = cb->args[0]; + +	for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) { +		if (t < s_t || (family && tbl->family != family)) +			continue; +		if (t > s_t) +			memset(&cb->args[1], 0, sizeof(cb->args) - +						sizeof(cb->args[0])); +		if (neigh_dump_table(tbl, skb, cb) < 0) +			break; +	} +	read_unlock(&neigh_tbl_lock); + +	cb->args[0] = t; +	return skb->len; +} + +void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) +{ +	int chain; + +	read_lock_bh(&tbl->lock); +	for (chain = 0; chain <= tbl->hash_mask; chain++) { +		struct neighbour *n; + +		for (n = tbl->hash_buckets[chain]; n; n = n->next) +			cb(n, cookie); +	} +	read_unlock_bh(&tbl->lock); +} +EXPORT_SYMBOL(neigh_for_each); + +/* The tbl->lock must be held as a writer and BH disabled. 
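+
+   A minimal sketch of a caller (names made up): the callback runs with
+   n->lock write-held and a non-zero return unlinks and releases the
+   entry:
+
+	static int drop_unused(struct neighbour *n)
+	{
+		return atomic_read(&n->refcnt) == 1;
+	}
+
+	write_lock_bh(&tbl->lock);
+	__neigh_for_each_release(tbl, drop_unused);
+	write_unlock_bh(&tbl->lock);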
*/ +void __neigh_for_each_release(struct neigh_table *tbl, +			      int (*cb)(struct neighbour *)) +{ +	int chain; + +	for (chain = 0; chain <= tbl->hash_mask; chain++) { +		struct neighbour *n, **np; + +		np = &tbl->hash_buckets[chain]; +		while ((n = *np) != NULL) { +			int release; + +			write_lock(&n->lock); +			release = cb(n); +			if (release) { +				*np = n->next; +				n->dead = 1; +			} else +				np = &n->next; +			write_unlock(&n->lock); +			if (release) +				neigh_release(n); +		} +	} +} +EXPORT_SYMBOL(__neigh_for_each_release); + +#ifdef CONFIG_PROC_FS + +static struct neighbour *neigh_get_first(struct seq_file *seq) +{ +	struct neigh_seq_state *state = seq->private; +	struct neigh_table *tbl = state->tbl; +	struct neighbour *n = NULL; +	int bucket = state->bucket; + +	state->flags &= ~NEIGH_SEQ_IS_PNEIGH; +	for (bucket = 0; bucket <= tbl->hash_mask; bucket++) { +		n = tbl->hash_buckets[bucket]; + +		while (n) { +			if (state->neigh_sub_iter) { +				loff_t fakep = 0; +				void *v; + +				v = state->neigh_sub_iter(state, n, &fakep); +				if (!v) +					goto next; +			} +			if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) +				break; +			if (n->nud_state & ~NUD_NOARP) +				break; +		next: +			n = n->next; +		} + +		if (n) +			break; +	} +	state->bucket = bucket; + +	return n; +} + +static struct neighbour *neigh_get_next(struct seq_file *seq, +					struct neighbour *n, +					loff_t *pos) +{ +	struct neigh_seq_state *state = seq->private; +	struct neigh_table *tbl = state->tbl; + +	if (state->neigh_sub_iter) { +		void *v = state->neigh_sub_iter(state, n, pos); +		if (v) +			return n; +	} +	n = n->next; + +	while (1) { +		while (n) { +			if (state->neigh_sub_iter) { +				void *v = state->neigh_sub_iter(state, n, pos); +				if (v) +					return n; +				goto next; +			} +			if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) +				break; + +			if (n->nud_state & ~NUD_NOARP) +				break; +		next: +			n = n->next; +		} + +		if (n) +			break; + +		if (++state->bucket > tbl->hash_mask) +			break; + +		n = tbl->hash_buckets[state->bucket]; +	} + +	if (n && pos) +		--(*pos); +	return n; +} + +static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) +{ +	struct neighbour *n = neigh_get_first(seq); + +	if (n) { +		while (*pos) { +			n = neigh_get_next(seq, n, pos); +			if (!n) +				break; +		} +	} +	return *pos ? NULL : n; +} + +static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) +{ +	struct neigh_seq_state *state = seq->private; +	struct neigh_table *tbl = state->tbl; +	struct pneigh_entry *pn = NULL; +	int bucket = state->bucket; + +	state->flags |= NEIGH_SEQ_IS_PNEIGH; +	for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { +		pn = tbl->phash_buckets[bucket]; +		if (pn) +			break; +	} +	state->bucket = bucket; + +	return pn; +} + +static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, +					    struct pneigh_entry *pn, +					    loff_t *pos) +{ +	struct neigh_seq_state *state = seq->private; +	struct neigh_table *tbl = state->tbl; + +	pn = pn->next; +	while (!pn) { +		if (++state->bucket > PNEIGH_HASHMASK) +			break; +		pn = tbl->phash_buckets[state->bucket]; +		if (pn) +			break; +	} + +	if (pn && pos) +		--(*pos); + +	return pn; +} + +static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) +{ +	struct pneigh_entry *pn = pneigh_get_first(seq); + +	if (pn) { +		while (*pos) { +			pn = pneigh_get_next(seq, pn, pos); +			if (!pn) +				break; +		} +	} +	return *pos ? 
NULL : pn; +} + +static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) +{ +	struct neigh_seq_state *state = seq->private; +	void *rc; + +	rc = neigh_get_idx(seq, pos); +	if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) +		rc = pneigh_get_idx(seq, pos); + +	return rc; +} + +void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) +{ +	struct neigh_seq_state *state = seq->private; +	loff_t pos_minus_one; + +	state->tbl = tbl; +	state->bucket = 0; +	state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); + +	read_lock_bh(&tbl->lock); + +	pos_minus_one = *pos - 1; +	return *pos ? neigh_get_idx_any(seq, &pos_minus_one) : SEQ_START_TOKEN; +} +EXPORT_SYMBOL(neigh_seq_start); + +void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	struct neigh_seq_state *state; +	void *rc; + +	if (v == SEQ_START_TOKEN) { +		rc = neigh_get_idx(seq, pos); +		goto out; +	} + +	state = seq->private; +	if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { +		rc = neigh_get_next(seq, v, NULL); +		if (rc) +			goto out; +		if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) +			rc = pneigh_get_first(seq); +	} else { +		BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); +		rc = pneigh_get_next(seq, v, NULL); +	} +out: +	++(*pos); +	return rc; +} +EXPORT_SYMBOL(neigh_seq_next); + +void neigh_seq_stop(struct seq_file *seq, void *v) +{ +	struct neigh_seq_state *state = seq->private; +	struct neigh_table *tbl = state->tbl; + +	read_unlock_bh(&tbl->lock); +} +EXPORT_SYMBOL(neigh_seq_stop); + +/* statistics via seq_file */ + +static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) +{ +	struct proc_dir_entry *pde = seq->private; +	struct neigh_table *tbl = pde->data; +	int cpu; + +	if (*pos == 0) +		return SEQ_START_TOKEN; +	 +	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { +		if (!cpu_possible(cpu)) +			continue; +		*pos = cpu+1; +		return per_cpu_ptr(tbl->stats, cpu); +	} +	return NULL; +} + +static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	struct proc_dir_entry *pde = seq->private; +	struct neigh_table *tbl = pde->data; +	int cpu; + +	for (cpu = *pos; cpu < NR_CPUS; ++cpu) { +		if (!cpu_possible(cpu)) +			continue; +		*pos = cpu+1; +		return per_cpu_ptr(tbl->stats, cpu); +	} +	return NULL; +} + +static void neigh_stat_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int neigh_stat_seq_show(struct seq_file *seq, void *v) +{ +	struct proc_dir_entry *pde = seq->private; +	struct neigh_table *tbl = pde->data; +	struct neigh_statistics *st = v; + +	if (v == SEQ_START_TOKEN) { +		seq_printf(seq, "entries  allocs destroys hash_grows  lookups hits  res_failed  rcv_probes_mcast rcv_probes_ucast  periodic_gc_runs forced_gc_runs forced_gc_goal_miss\n"); +		return 0; +	} + +	seq_printf(seq, "%08x  %08lx %08lx %08lx  %08lx %08lx  %08lx  " +			"%08lx %08lx  %08lx %08lx\n", +		   atomic_read(&tbl->entries), + +		   st->allocs, +		   st->destroys, +		   st->hash_grows, + +		   st->lookups, +		   st->hits, + +		   st->res_failed, + +		   st->rcv_probes_mcast, +		   st->rcv_probes_ucast, + +		   st->periodic_gc_runs, +		   st->forced_gc_runs +		   ); + +	return 0; +} + +static struct seq_operations neigh_stat_seq_ops = { +	.start	= neigh_stat_seq_start, +	.next	= neigh_stat_seq_next, +	.stop	= neigh_stat_seq_stop, +	.show	= neigh_stat_seq_show, +}; + +static int neigh_stat_seq_open(struct inode *inode, struct file *file) +{ +	int ret = seq_open(file, &neigh_stat_seq_ops); + +	if (!ret) { +		struct seq_file *sf = file->private_data; 
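+		/* Stash the /proc entry: the seq start/next/show handlers
+		 * recover the neigh_table from it via PDE(inode)->data. */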
+		sf->private = PDE(inode); +	} +	return ret; +}; + +static struct file_operations neigh_stat_seq_fops = { +	.owner	 = THIS_MODULE, +	.open 	 = neigh_stat_seq_open, +	.read	 = seq_read, +	.llseek	 = seq_lseek, +	.release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_ARPD +void neigh_app_ns(struct neighbour *n) +{ +	struct nlmsghdr  *nlh; +	int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256); +	struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC); + +	if (!skb) +		return; + +	if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) { +		kfree_skb(skb); +		return; +	} +	nlh			   = (struct nlmsghdr *)skb->data; +	nlh->nlmsg_flags	   = NLM_F_REQUEST; +	NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; +	netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + +static void neigh_app_notify(struct neighbour *n) +{ +	struct nlmsghdr *nlh; +	int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256); +	struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC); + +	if (!skb) +		return; + +	if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) { +		kfree_skb(skb); +		return; +	} +	nlh			   = (struct nlmsghdr *)skb->data; +	NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; +	netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + +#endif /* CONFIG_ARPD */ + +#ifdef CONFIG_SYSCTL + +static struct neigh_sysctl_table { +	struct ctl_table_header *sysctl_header; +	ctl_table		neigh_vars[__NET_NEIGH_MAX]; +	ctl_table		neigh_dev[2]; +	ctl_table		neigh_neigh_dir[2]; +	ctl_table		neigh_proto_dir[2]; +	ctl_table		neigh_root_dir[2]; +} neigh_sysctl_template = { +	.neigh_vars = { +		{ +			.ctl_name	= NET_NEIGH_MCAST_SOLICIT, +			.procname	= "mcast_solicit", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec, +		}, +		{ +			.ctl_name	= NET_NEIGH_UCAST_SOLICIT, +			.procname	= "ucast_solicit", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec, +		}, +		{ +			.ctl_name	= NET_NEIGH_APP_SOLICIT, +			.procname	= "app_solicit", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec, +		}, +		{ +			.ctl_name	= NET_NEIGH_RETRANS_TIME, +			.procname	= "retrans_time", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec_userhz_jiffies, +		}, +		{ +			.ctl_name	= NET_NEIGH_REACHABLE_TIME, +			.procname	= "base_reachable_time", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec_jiffies, +			.strategy	= &sysctl_jiffies, +		}, +		{ +			.ctl_name	= NET_NEIGH_DELAY_PROBE_TIME, +			.procname	= "delay_first_probe_time", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec_jiffies, +			.strategy	= &sysctl_jiffies, +		}, +		{ +			.ctl_name	= NET_NEIGH_GC_STALE_TIME, +			.procname	= "gc_stale_time", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec_jiffies, +			.strategy	= &sysctl_jiffies, +		}, +		{ +			.ctl_name	= NET_NEIGH_UNRES_QLEN, +			.procname	= "unres_qlen", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec, +		}, +		{ +			.ctl_name	= NET_NEIGH_PROXY_QLEN, +			.procname	= "proxy_qlen", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec, +		}, +		{ +			.ctl_name	= NET_NEIGH_ANYCAST_DELAY, +			.procname	= "anycast_delay", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec_userhz_jiffies, +		}, +		{ +			.ctl_name	= NET_NEIGH_PROXY_DELAY, +			.procname	= "proxy_delay", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	
= &proc_dointvec_userhz_jiffies, +		}, +		{ +			.ctl_name	= NET_NEIGH_LOCKTIME, +			.procname	= "locktime", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec_userhz_jiffies, +		}, +		{ +			.ctl_name	= NET_NEIGH_GC_INTERVAL, +			.procname	= "gc_interval", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec_jiffies, +			.strategy	= &sysctl_jiffies, +		}, +		{ +			.ctl_name	= NET_NEIGH_GC_THRESH1, +			.procname	= "gc_thresh1", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec, +		}, +		{ +			.ctl_name	= NET_NEIGH_GC_THRESH2, +			.procname	= "gc_thresh2", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec, +		}, +		{ +			.ctl_name	= NET_NEIGH_GC_THRESH3, +			.procname	= "gc_thresh3", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec, +		}, +		{ +			.ctl_name	= NET_NEIGH_RETRANS_TIME_MS, +			.procname	= "retrans_time_ms", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec_ms_jiffies, +			.strategy	= &sysctl_ms_jiffies, +		}, +		{ +			.ctl_name	= NET_NEIGH_REACHABLE_TIME_MS, +			.procname	= "base_reachable_time_ms", +			.maxlen		= sizeof(int), +			.mode		= 0644, +			.proc_handler	= &proc_dointvec_ms_jiffies, +			.strategy	= &sysctl_ms_jiffies, +		}, +	}, +	.neigh_dev = { +		{ +			.ctl_name	= NET_PROTO_CONF_DEFAULT, +			.procname	= "default", +			.mode		= 0555, +		}, +	}, +	.neigh_neigh_dir = { +		{ +			.procname	= "neigh", +			.mode		= 0555, +		}, +	}, +	.neigh_proto_dir = { +		{ +			.mode		= 0555, +		}, +	}, +	.neigh_root_dir = { +		{ +			.ctl_name	= CTL_NET, +			.procname	= "net", +			.mode		= 0555, +		}, +	}, +}; + +int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, +			  int p_id, int pdev_id, char *p_name,  +			  proc_handler *handler, ctl_handler *strategy) +{ +	struct neigh_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); +	const char *dev_name_source = NULL; +	char *dev_name = NULL; +	int err = 0; + +	if (!t) +		return -ENOBUFS; +	memcpy(t, &neigh_sysctl_template, sizeof(*t)); +	t->neigh_vars[0].data  = &p->mcast_probes; +	t->neigh_vars[1].data  = &p->ucast_probes; +	t->neigh_vars[2].data  = &p->app_probes; +	t->neigh_vars[3].data  = &p->retrans_time; +	t->neigh_vars[4].data  = &p->base_reachable_time; +	t->neigh_vars[5].data  = &p->delay_probe_time; +	t->neigh_vars[6].data  = &p->gc_staletime; +	t->neigh_vars[7].data  = &p->queue_len; +	t->neigh_vars[8].data  = &p->proxy_qlen; +	t->neigh_vars[9].data  = &p->anycast_delay; +	t->neigh_vars[10].data = &p->proxy_delay; +	t->neigh_vars[11].data = &p->locktime; + +	if (dev) { +		dev_name_source = dev->name; +		t->neigh_dev[0].ctl_name = dev->ifindex; +		t->neigh_vars[12].procname = NULL; +		t->neigh_vars[13].procname = NULL; +		t->neigh_vars[14].procname = NULL; +		t->neigh_vars[15].procname = NULL; +	} else { + 		dev_name_source = t->neigh_dev[0].procname; +		t->neigh_vars[12].data = (int *)(p + 1); +		t->neigh_vars[13].data = (int *)(p + 1) + 1; +		t->neigh_vars[14].data = (int *)(p + 1) + 2; +		t->neigh_vars[15].data = (int *)(p + 1) + 3; +	} + +	t->neigh_vars[16].data  = &p->retrans_time; +	t->neigh_vars[17].data  = &p->base_reachable_time; + +	if (handler || strategy) { +		/* RetransTime */ +		t->neigh_vars[3].proc_handler = handler; +		t->neigh_vars[3].strategy = strategy; +		t->neigh_vars[3].extra1 = dev; +		/* ReachableTime */ +		t->neigh_vars[4].proc_handler = handler; +		t->neigh_vars[4].strategy = strategy; +		
t->neigh_vars[4].extra1 = dev; +		/* RetransTime (in milliseconds)*/ +		t->neigh_vars[16].proc_handler = handler; +		t->neigh_vars[16].strategy = strategy; +		t->neigh_vars[16].extra1 = dev; +		/* ReachableTime (in milliseconds) */ +		t->neigh_vars[17].proc_handler = handler; +		t->neigh_vars[17].strategy = strategy; +		t->neigh_vars[17].extra1 = dev; +	} + +	dev_name = net_sysctl_strdup(dev_name_source); +	if (!dev_name) { +		err = -ENOBUFS; +		goto free; +	} + + 	t->neigh_dev[0].procname = dev_name; + +	t->neigh_neigh_dir[0].ctl_name = pdev_id; + +	t->neigh_proto_dir[0].procname = p_name; +	t->neigh_proto_dir[0].ctl_name = p_id; + +	t->neigh_dev[0].child	       = t->neigh_vars; +	t->neigh_neigh_dir[0].child    = t->neigh_dev; +	t->neigh_proto_dir[0].child    = t->neigh_neigh_dir; +	t->neigh_root_dir[0].child     = t->neigh_proto_dir; + +	t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0); +	if (!t->sysctl_header) { +		err = -ENOBUFS; +		goto free_procname; +	} +	p->sysctl_table = t; +	return 0; + +	/* error path */ + free_procname: +	kfree(dev_name); + free: +	kfree(t); + +	return err; +} + +void neigh_sysctl_unregister(struct neigh_parms *p) +{ +	if (p->sysctl_table) { +		struct neigh_sysctl_table *t = p->sysctl_table; +		p->sysctl_table = NULL; +		unregister_sysctl_table(t->sysctl_header); +		kfree(t->neigh_dev[0].procname); +		kfree(t); +	} +} + +#endif	/* CONFIG_SYSCTL */ + +EXPORT_SYMBOL(__neigh_event_send); +EXPORT_SYMBOL(neigh_add); +EXPORT_SYMBOL(neigh_changeaddr); +EXPORT_SYMBOL(neigh_compat_output); +EXPORT_SYMBOL(neigh_connected_output); +EXPORT_SYMBOL(neigh_create); +EXPORT_SYMBOL(neigh_delete); +EXPORT_SYMBOL(neigh_destroy); +EXPORT_SYMBOL(neigh_dump_info); +EXPORT_SYMBOL(neigh_event_ns); +EXPORT_SYMBOL(neigh_ifdown); +EXPORT_SYMBOL(neigh_lookup); +EXPORT_SYMBOL(neigh_lookup_nodev); +EXPORT_SYMBOL(neigh_parms_alloc); +EXPORT_SYMBOL(neigh_parms_release); +EXPORT_SYMBOL(neigh_rand_reach_time); +EXPORT_SYMBOL(neigh_resolve_output); +EXPORT_SYMBOL(neigh_table_clear); +EXPORT_SYMBOL(neigh_table_init); +EXPORT_SYMBOL(neigh_update); +EXPORT_SYMBOL(neigh_update_hhs); +EXPORT_SYMBOL(pneigh_enqueue); +EXPORT_SYMBOL(pneigh_lookup); + +#ifdef CONFIG_ARPD +EXPORT_SYMBOL(neigh_app_ns); +#endif +#ifdef CONFIG_SYSCTL +EXPORT_SYMBOL(neigh_sysctl_register); +EXPORT_SYMBOL(neigh_sysctl_unregister); +#endif diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c new file mode 100644 index 00000000000..060f703659e --- /dev/null +++ b/net/core/net-sysfs.c @@ -0,0 +1,461 @@ +/* + * net-sysfs.c - network device class and attributes + * + * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org> + *  + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> +#include <linux/wireless.h> + +#define to_class_dev(obj) container_of(obj,struct class_device,kobj) +#define to_net_dev(class) container_of(class, struct net_device, class_dev) + +static const char fmt_hex[] = "%#x\n"; +static const char fmt_dec[] = "%d\n"; +static const char fmt_ulong[] = "%lu\n"; + +static inline int dev_isalive(const struct net_device *dev)  +{ +	return dev->reg_state == NETREG_REGISTERED; +} + +/* use same locking rules as GIF* ioctl's */ +static ssize_t netdev_show(const struct class_device *cd, char *buf, +			   ssize_t (*format)(const struct net_device *, char *)) +{ +	struct net_device *net = to_net_dev(cd); +	ssize_t ret = -EINVAL; + +	read_lock(&dev_base_lock); +	if (dev_isalive(net)) +		ret = (*format)(net, buf); +	read_unlock(&dev_base_lock); + +	return ret; +} + +/* generate a show function for simple field */ +#define NETDEVICE_SHOW(field, format_string)				\ +static ssize_t format_##field(const struct net_device *net, char *buf)	\ +{									\ +	return sprintf(buf, format_string, net->field);			\ +}									\ +static ssize_t show_##field(struct class_device *cd, char *buf)		\ +{									\ +	return netdev_show(cd, buf, format_##field);			\ +} + + +/* use same locking and permission rules as SIF* ioctl's */ +static ssize_t netdev_store(struct class_device *dev, +			    const char *buf, size_t len, +			    int (*set)(struct net_device *, unsigned long)) +{ +	struct net_device *net = to_net_dev(dev); +	char *endp; +	unsigned long new; +	int ret = -EINVAL; + +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; + +	new = simple_strtoul(buf, &endp, 0); +	if (endp == buf) +		goto err; + +	rtnl_lock(); +	if (dev_isalive(net)) { +		if ((ret = (*set)(net, new)) == 0) +			ret = len; +	} +	rtnl_unlock(); + err: +	return ret; +} + +/* generate a read-only network device class attribute */ +#define NETDEVICE_ATTR(field, format_string)				\ +NETDEVICE_SHOW(field, format_string)					\ +static CLASS_DEVICE_ATTR(field, S_IRUGO, show_##field, NULL)		\ + +NETDEVICE_ATTR(addr_len, fmt_dec); +NETDEVICE_ATTR(iflink, fmt_dec); +NETDEVICE_ATTR(ifindex, fmt_dec); +NETDEVICE_ATTR(features, fmt_hex); +NETDEVICE_ATTR(type, fmt_dec); + +/* use same locking rules as GIFHWADDR ioctl's */ +static ssize_t format_addr(char *buf, const unsigned char *addr, int len) +{ +	int i; +	char *cp = buf; + +	for (i = 0; i < len; i++) +		cp += sprintf(cp, "%02x%c", addr[i], +			      i == (len - 1) ? 
'\n' : ':'); +	return cp - buf; +} + +static ssize_t show_address(struct class_device *dev, char *buf) +{ +	struct net_device *net = to_net_dev(dev); +	ssize_t ret = -EINVAL; + +	read_lock(&dev_base_lock); +	if (dev_isalive(net)) +	    ret = format_addr(buf, net->dev_addr, net->addr_len); +	read_unlock(&dev_base_lock); +	return ret; +} + +static ssize_t show_broadcast(struct class_device *dev, char *buf) +{ +	struct net_device *net = to_net_dev(dev); +	if (dev_isalive(net)) +		return format_addr(buf, net->broadcast, net->addr_len); +	return -EINVAL; +} + +static ssize_t show_carrier(struct class_device *dev, char *buf) +{ +	struct net_device *netdev = to_net_dev(dev); +	if (netif_running(netdev)) { +		return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); +	} +	return -EINVAL; +} + +static CLASS_DEVICE_ATTR(address, S_IRUGO, show_address, NULL); +static CLASS_DEVICE_ATTR(broadcast, S_IRUGO, show_broadcast, NULL); +static CLASS_DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL); + +/* read-write attributes */ +NETDEVICE_SHOW(mtu, fmt_dec); + +static int change_mtu(struct net_device *net, unsigned long new_mtu) +{ +	return dev_set_mtu(net, (int) new_mtu); +} + +static ssize_t store_mtu(struct class_device *dev, const char *buf, size_t len) +{ +	return netdev_store(dev, buf, len, change_mtu); +} + +static CLASS_DEVICE_ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu); + +NETDEVICE_SHOW(flags, fmt_hex); + +static int change_flags(struct net_device *net, unsigned long new_flags) +{ +	return dev_change_flags(net, (unsigned) new_flags); +} + +static ssize_t store_flags(struct class_device *dev, const char *buf, size_t len) +{ +	return netdev_store(dev, buf, len, change_flags); +} + +static CLASS_DEVICE_ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags); + +NETDEVICE_SHOW(tx_queue_len, fmt_ulong); + +static int change_tx_queue_len(struct net_device *net, unsigned long new_len) +{ +	net->tx_queue_len = new_len; +	return 0; +} + +static ssize_t store_tx_queue_len(struct class_device *dev, const char *buf, size_t len) +{ +	return netdev_store(dev, buf, len, change_tx_queue_len); +} + +static CLASS_DEVICE_ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,  +			 store_tx_queue_len); + + +static struct class_device_attribute *net_class_attributes[] = { +	&class_device_attr_ifindex, +	&class_device_attr_iflink, +	&class_device_attr_addr_len, +	&class_device_attr_tx_queue_len, +	&class_device_attr_features, +	&class_device_attr_mtu, +	&class_device_attr_flags, +	&class_device_attr_type, +	&class_device_attr_address, +	&class_device_attr_broadcast, +	&class_device_attr_carrier, +	NULL +}; + +/* Show a given an attribute in the statistics group */ +static ssize_t netstat_show(const struct class_device *cd, char *buf,  +			    unsigned long offset) +{ +	struct net_device *dev = to_net_dev(cd); +	struct net_device_stats *stats; +	ssize_t ret = -EINVAL; + +	if (offset > sizeof(struct net_device_stats) || +	    offset % sizeof(unsigned long) != 0) +		WARN_ON(1); + +	read_lock(&dev_base_lock); +	if (dev_isalive(dev) && dev->get_stats && +	    (stats = (*dev->get_stats)(dev)))  +		ret = sprintf(buf, fmt_ulong, +			      *(unsigned long *)(((u8 *) stats) + offset)); + +	read_unlock(&dev_base_lock); +	return ret; +} + +/* generate a read-only statistics attribute */ +#define NETSTAT_ENTRY(name)						\ +static ssize_t show_##name(struct class_device *cd, char *buf) 		\ +{									\ +	return netstat_show(cd, buf, 					\ +			    offsetof(struct net_device_stats, name));	\ +}									\ +static 
CLASS_DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +NETSTAT_ENTRY(rx_packets); +NETSTAT_ENTRY(tx_packets); +NETSTAT_ENTRY(rx_bytes); +NETSTAT_ENTRY(tx_bytes); +NETSTAT_ENTRY(rx_errors); +NETSTAT_ENTRY(tx_errors); +NETSTAT_ENTRY(rx_dropped); +NETSTAT_ENTRY(tx_dropped); +NETSTAT_ENTRY(multicast); +NETSTAT_ENTRY(collisions); +NETSTAT_ENTRY(rx_length_errors); +NETSTAT_ENTRY(rx_over_errors); +NETSTAT_ENTRY(rx_crc_errors); +NETSTAT_ENTRY(rx_frame_errors); +NETSTAT_ENTRY(rx_fifo_errors); +NETSTAT_ENTRY(rx_missed_errors); +NETSTAT_ENTRY(tx_aborted_errors); +NETSTAT_ENTRY(tx_carrier_errors); +NETSTAT_ENTRY(tx_fifo_errors); +NETSTAT_ENTRY(tx_heartbeat_errors); +NETSTAT_ENTRY(tx_window_errors); +NETSTAT_ENTRY(rx_compressed); +NETSTAT_ENTRY(tx_compressed); + +static struct attribute *netstat_attrs[] = { +	&class_device_attr_rx_packets.attr, +	&class_device_attr_tx_packets.attr, +	&class_device_attr_rx_bytes.attr, +	&class_device_attr_tx_bytes.attr, +	&class_device_attr_rx_errors.attr, +	&class_device_attr_tx_errors.attr, +	&class_device_attr_rx_dropped.attr, +	&class_device_attr_tx_dropped.attr, +	&class_device_attr_multicast.attr, +	&class_device_attr_collisions.attr, +	&class_device_attr_rx_length_errors.attr, +	&class_device_attr_rx_over_errors.attr, +	&class_device_attr_rx_crc_errors.attr, +	&class_device_attr_rx_frame_errors.attr, +	&class_device_attr_rx_fifo_errors.attr, +	&class_device_attr_rx_missed_errors.attr, +	&class_device_attr_tx_aborted_errors.attr, +	&class_device_attr_tx_carrier_errors.attr, +	&class_device_attr_tx_fifo_errors.attr, +	&class_device_attr_tx_heartbeat_errors.attr, +	&class_device_attr_tx_window_errors.attr, +	&class_device_attr_rx_compressed.attr, +	&class_device_attr_tx_compressed.attr, +	NULL +}; + + +static struct attribute_group netstat_group = { +	.name  = "statistics", +	.attrs  = netstat_attrs, +}; + +#ifdef WIRELESS_EXT +/* helper function that does all the locking etc for wireless stats */ +static ssize_t wireless_show(struct class_device *cd, char *buf, +			     ssize_t (*format)(const struct iw_statistics *, +					       char *)) +{ +	struct net_device *dev = to_net_dev(cd); +	const struct iw_statistics *iw; +	ssize_t ret = -EINVAL; +	 +	read_lock(&dev_base_lock); +	if (dev_isalive(dev) && dev->get_wireless_stats  +	    && (iw = dev->get_wireless_stats(dev)) != NULL)  +		ret = (*format)(iw, buf); +	read_unlock(&dev_base_lock); + +	return ret; +} + +/* show function template for wireless fields */ +#define WIRELESS_SHOW(name, field, format_string)			\ +static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \ +{									\ +	return sprintf(buf, format_string, iw->field);			\ +}									\ +static ssize_t show_iw_##name(struct class_device *cd, char *buf)	\ +{									\ +	return wireless_show(cd, buf, format_iw_##name);		\ +}									\ +static CLASS_DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL) + +WIRELESS_SHOW(status, status, fmt_hex); +WIRELESS_SHOW(link, qual.qual, fmt_dec); +WIRELESS_SHOW(level, qual.level, fmt_dec); +WIRELESS_SHOW(noise, qual.noise, fmt_dec); +WIRELESS_SHOW(nwid, discard.nwid, fmt_dec); +WIRELESS_SHOW(crypt, discard.code, fmt_dec); +WIRELESS_SHOW(fragment, discard.fragment, fmt_dec); +WIRELESS_SHOW(misc, discard.misc, fmt_dec); +WIRELESS_SHOW(retries, discard.retries, fmt_dec); +WIRELESS_SHOW(beacon, miss.beacon, fmt_dec); + +static struct attribute *wireless_attrs[] = { +	&class_device_attr_status.attr, +	&class_device_attr_link.attr, +	&class_device_attr_level.attr, +	&class_device_attr_noise.attr, +	
&class_device_attr_nwid.attr, +	&class_device_attr_crypt.attr, +	&class_device_attr_fragment.attr, +	&class_device_attr_retries.attr, +	&class_device_attr_misc.attr, +	&class_device_attr_beacon.attr, +	NULL +}; + +static struct attribute_group wireless_group = { +	.name = "wireless", +	.attrs = wireless_attrs, +}; +#endif + +#ifdef CONFIG_HOTPLUG +static int netdev_hotplug(struct class_device *cd, char **envp, +			  int num_envp, char *buf, int size) +{ +	struct net_device *dev = to_net_dev(cd); +	int i = 0; +	int n; + +	/* pass interface in env to hotplug. */ +	envp[i++] = buf; +	n = snprintf(buf, size, "INTERFACE=%s", dev->name) + 1; +	buf += n; +	size -= n; + +	if ((size <= 0) || (i >= num_envp)) +		return -ENOMEM; + +	envp[i] = NULL; +	return 0; +} +#endif + +/* + *	netdev_release -- destroy and free a dead device.  + *	Called when last reference to class_device kobject is gone. + */ +static void netdev_release(struct class_device *cd) +{ +	struct net_device *dev  +		= container_of(cd, struct net_device, class_dev); + +	BUG_ON(dev->reg_state != NETREG_RELEASED); + +	kfree((char *)dev - dev->padded); +} + +static struct class net_class = { +	.name = "net", +	.release = netdev_release, +#ifdef CONFIG_HOTPLUG +	.hotplug = netdev_hotplug, +#endif +}; + +void netdev_unregister_sysfs(struct net_device * net) +{ +	struct class_device * class_dev = &(net->class_dev); + +	if (net->get_stats) +		sysfs_remove_group(&class_dev->kobj, &netstat_group); + +#ifdef WIRELESS_EXT +	if (net->get_wireless_stats) +		sysfs_remove_group(&class_dev->kobj, &wireless_group); +#endif +	class_device_del(class_dev); + +} + +/* Create sysfs entries for network device. */ +int netdev_register_sysfs(struct net_device *net) +{ +	struct class_device *class_dev = &(net->class_dev); +	int i; +	struct class_device_attribute *attr; +	int ret; + +	class_dev->class = &net_class; +	class_dev->class_data = net; + +	strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE); +	if ((ret = class_device_register(class_dev))) +		goto out; + +	for (i = 0; (attr = net_class_attributes[i]) != NULL; i++) { +		if ((ret = class_device_create_file(class_dev, attr))) +		    goto out_unreg; +	} + + +	if (net->get_stats && +	    (ret = sysfs_create_group(&class_dev->kobj, &netstat_group))) +		goto out_unreg;  + +#ifdef WIRELESS_EXT +	if (net->get_wireless_stats && +	    (ret = sysfs_create_group(&class_dev->kobj, &wireless_group))) +		goto out_cleanup;  + +	return 0; +out_cleanup: +	if (net->get_stats) +		sysfs_remove_group(&class_dev->kobj, &netstat_group); +#else +	return 0; +#endif + +out_unreg: +	printk(KERN_WARNING "%s: sysfs attribute registration failed %d\n", +	       net->name, ret); +	class_device_unregister(class_dev); +out: +	return ret; +} + +int netdev_sysfs_init(void) +{ +	return class_register(&net_class); +} diff --git a/net/core/netfilter.c b/net/core/netfilter.c new file mode 100644 index 00000000000..e51cfa46950 --- /dev/null +++ b/net/core/netfilter.c @@ -0,0 +1,799 @@ +/* netfilter.c: look after the filters for various protocols.  + * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. + * + * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any + * way. + * + * Rusty Russell (C)2000 -- This code is GPL. + * + * February 2000: Modified by James Morris to have 1 queue per protocol. + * 15-Mar-2000:   Added NF_REPEAT --RR. + * 08-May-2003:	  Internal logging interface added by Jozsef Kadlecsik. 
+ */ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <net/protocol.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/wait.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <net/sock.h> +#include <net/route.h> +#include <linux/ip.h> + +/* In this code, we can be waiting indefinitely for userspace to + * service a packet if a hook returns NF_QUEUE.  We could keep a count + * of skbuffs queued for userspace, and not deregister a hook unless + * this is zero, but that sucks.  Now, we simply check when the + * packets come back: if the hook is gone, the packet is discarded. */ +#ifdef CONFIG_NETFILTER_DEBUG +#define NFDEBUG(format, args...)  printk(format , ## args) +#else +#define NFDEBUG(format, args...) +#endif + +/* Sockopts only registered and called from user context, so +   net locking would be overkill.  Also, [gs]etsockopt calls may +   sleep. */ +static DECLARE_MUTEX(nf_sockopt_mutex); + +struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; +static LIST_HEAD(nf_sockopts); +static DEFINE_SPINLOCK(nf_hook_lock); + +/*  + * A queue handler may be registered for each protocol.  Each is protected by + * long term mutex.  The handler must provide an an outfn() to accept packets + * for queueing and must reinject all packets it receives, no matter what. + */ +static struct nf_queue_handler_t { +	nf_queue_outfn_t outfn; +	void *data; +} queue_handler[NPROTO]; +static DEFINE_RWLOCK(queue_handler_lock); + +int nf_register_hook(struct nf_hook_ops *reg) +{ +	struct list_head *i; + +	spin_lock_bh(&nf_hook_lock); +	list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { +		if (reg->priority < ((struct nf_hook_ops *)i)->priority) +			break; +	} +	list_add_rcu(®->list, i->prev); +	spin_unlock_bh(&nf_hook_lock); + +	synchronize_net(); +	return 0; +} + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ +	spin_lock_bh(&nf_hook_lock); +	list_del_rcu(®->list); +	spin_unlock_bh(&nf_hook_lock); + +	synchronize_net(); +} + +/* Do exclusive ranges overlap? */ +static inline int overlap(int min1, int max1, int min2, int max2) +{ +	return max1 > min2 && min1 < max2; +} + +/* Functions to register sockopt ranges (exclusive). */ +int nf_register_sockopt(struct nf_sockopt_ops *reg) +{ +	struct list_head *i; +	int ret = 0; + +	if (down_interruptible(&nf_sockopt_mutex) != 0) +		return -EINTR; + +	list_for_each(i, &nf_sockopts) { +		struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; +		if (ops->pf == reg->pf +		    && (overlap(ops->set_optmin, ops->set_optmax,  +				reg->set_optmin, reg->set_optmax) +			|| overlap(ops->get_optmin, ops->get_optmax,  +				   reg->get_optmin, reg->get_optmax))) { +			NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", +				ops->set_optmin, ops->set_optmax,  +				ops->get_optmin, ops->get_optmax,  +				reg->set_optmin, reg->set_optmax, +				reg->get_optmin, reg->get_optmax); +			ret = -EBUSY; +			goto out; +		} +	} + +	list_add(®->list, &nf_sockopts); +out: +	up(&nf_sockopt_mutex); +	return ret; +} + +void nf_unregister_sockopt(struct nf_sockopt_ops *reg) +{ +	/* No point being interruptible: we're probably in cleanup_module() */ + restart: +	down(&nf_sockopt_mutex); +	if (reg->use != 0) { +		/* To be woken by nf_sockopt call... */ +		/* FIXME: Stuart Young's name appears gratuitously. 
*/ +		set_current_state(TASK_UNINTERRUPTIBLE); +		reg->cleanup_task = current; +		up(&nf_sockopt_mutex); +		schedule(); +		goto restart; +	} +	list_del(®->list); +	up(&nf_sockopt_mutex); +} + +#ifdef CONFIG_NETFILTER_DEBUG +#include <net/ip.h> +#include <net/tcp.h> +#include <linux/netfilter_ipv4.h> + +static void debug_print_hooks_ip(unsigned int nf_debug) +{ +	if (nf_debug & (1 << NF_IP_PRE_ROUTING)) { +		printk("PRE_ROUTING "); +		nf_debug ^= (1 << NF_IP_PRE_ROUTING); +	} +	if (nf_debug & (1 << NF_IP_LOCAL_IN)) { +		printk("LOCAL_IN "); +		nf_debug ^= (1 << NF_IP_LOCAL_IN); +	} +	if (nf_debug & (1 << NF_IP_FORWARD)) { +		printk("FORWARD "); +		nf_debug ^= (1 << NF_IP_FORWARD); +	} +	if (nf_debug & (1 << NF_IP_LOCAL_OUT)) { +		printk("LOCAL_OUT "); +		nf_debug ^= (1 << NF_IP_LOCAL_OUT); +	} +	if (nf_debug & (1 << NF_IP_POST_ROUTING)) { +		printk("POST_ROUTING "); +		nf_debug ^= (1 << NF_IP_POST_ROUTING); +	} +	if (nf_debug) +		printk("Crap bits: 0x%04X", nf_debug); +	printk("\n"); +} + +static void nf_dump_skb(int pf, struct sk_buff *skb) +{ +	printk("skb: pf=%i %s dev=%s len=%u\n",  +	       pf, +	       skb->sk ? "(owned)" : "(unowned)", +	       skb->dev ? skb->dev->name : "(no dev)", +	       skb->len); +	switch (pf) { +	case PF_INET: { +		const struct iphdr *ip = skb->nh.iph; +		__u32 *opt = (__u32 *) (ip + 1); +		int opti; +		__u16 src_port = 0, dst_port = 0; + +		if (ip->protocol == IPPROTO_TCP +		    || ip->protocol == IPPROTO_UDP) { +			struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl); +			src_port = ntohs(tcp->source); +			dst_port = ntohs(tcp->dest); +		} +	 +		printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu" +		       " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu", +		       ip->protocol, NIPQUAD(ip->saddr), +		       src_port, NIPQUAD(ip->daddr), +		       dst_port, +		       ntohs(ip->tot_len), ip->tos, ntohs(ip->id), +		       ntohs(ip->frag_off), ip->ttl); + +		for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++) +			printk(" O=0x%8.8X", *opt++); +		printk("\n"); +	} +	} +} + +void nf_debug_ip_local_deliver(struct sk_buff *skb) +{ +	/* If it's a loopback packet, it must have come through +	 * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and +	 * NF_IP_LOCAL_IN.  Otherwise, must have gone through +	 * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING.  */ +	if (!skb->dev) { +		printk("ip_local_deliver: skb->dev is NULL.\n"); +	} +	else if (strcmp(skb->dev->name, "lo") == 0) { +		if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) +				      | (1 << NF_IP_POST_ROUTING) +				      | (1 << NF_IP_PRE_ROUTING) +				      | (1 << NF_IP_LOCAL_IN))) { +			printk("ip_local_deliver: bad loopback skb: "); +			debug_print_hooks_ip(skb->nf_debug); +			nf_dump_skb(PF_INET, skb); +		} +	} +	else { +		if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING) +				      | (1<<NF_IP_LOCAL_IN))) { +			printk("ip_local_deliver: bad non-lo skb: "); +			debug_print_hooks_ip(skb->nf_debug); +			nf_dump_skb(PF_INET, skb); +		} +	} +} + +void nf_debug_ip_loopback_xmit(struct sk_buff *newskb) +{ +	if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT) +				 | (1 << NF_IP_POST_ROUTING))) { +		printk("ip_dev_loopback_xmit: bad owned skb = %p: ",  +		       newskb); +		debug_print_hooks_ip(newskb->nf_debug); +		nf_dump_skb(PF_INET, newskb); +	} +	/* Clear to avoid confusing input check */ +	newskb->nf_debug = 0; +} + +void nf_debug_ip_finish_output2(struct sk_buff *skb) +{ +	/* If it's owned, it must have gone through the +	 * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING. 
+	 * Otherwise, must have gone through +	 * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING. +	 */ +	if (skb->sk) { +		if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) +				      | (1 << NF_IP_POST_ROUTING))) { +			printk("ip_finish_output: bad owned skb = %p: ", skb); +			debug_print_hooks_ip(skb->nf_debug); +			nf_dump_skb(PF_INET, skb); +		} +	} else { +		if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING) +				      | (1 << NF_IP_FORWARD) +				      | (1 << NF_IP_POST_ROUTING))) { +			/* Fragments, entunnelled packets, TCP RSTs +                           generated by ipt_REJECT will have no +                           owners, but still may be local */ +			if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) +					      | (1 << NF_IP_POST_ROUTING))){ +				printk("ip_finish_output:" +				       " bad unowned skb = %p: ",skb); +				debug_print_hooks_ip(skb->nf_debug); +				nf_dump_skb(PF_INET, skb); +			} +		} +	} +} +#endif /*CONFIG_NETFILTER_DEBUG*/ + +/* Call get/setsockopt() */ +static int nf_sockopt(struct sock *sk, int pf, int val,  +		      char __user *opt, int *len, int get) +{ +	struct list_head *i; +	struct nf_sockopt_ops *ops; +	int ret; + +	if (down_interruptible(&nf_sockopt_mutex) != 0) +		return -EINTR; + +	list_for_each(i, &nf_sockopts) { +		ops = (struct nf_sockopt_ops *)i; +		if (ops->pf == pf) { +			if (get) { +				if (val >= ops->get_optmin +				    && val < ops->get_optmax) { +					ops->use++; +					up(&nf_sockopt_mutex); +					ret = ops->get(sk, val, opt, len); +					goto out; +				} +			} else { +				if (val >= ops->set_optmin +				    && val < ops->set_optmax) { +					ops->use++; +					up(&nf_sockopt_mutex); +					ret = ops->set(sk, val, opt, *len); +					goto out; +				} +			} +		} +	} +	up(&nf_sockopt_mutex); +	return -ENOPROTOOPT; +	 + out: +	down(&nf_sockopt_mutex); +	ops->use--; +	if (ops->cleanup_task) +		wake_up_process(ops->cleanup_task); +	up(&nf_sockopt_mutex); +	return ret; +} + +int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt, +		  int len) +{ +	return nf_sockopt(sk, pf, val, opt, &len, 0); +} + +int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len) +{ +	return nf_sockopt(sk, pf, val, opt, len, 1); +} + +static unsigned int nf_iterate(struct list_head *head, +			       struct sk_buff **skb, +			       int hook, +			       const struct net_device *indev, +			       const struct net_device *outdev, +			       struct list_head **i, +			       int (*okfn)(struct sk_buff *), +			       int hook_thresh) +{ +	unsigned int verdict; + +	/* +	 * The caller must not block between calls to this +	 * function because of risk of continuing from deleted element. +	 */ +	list_for_each_continue_rcu(*i, head) { +		struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; + +		if (hook_thresh > elem->priority) +			continue; + +		/* Optimization: we don't need to hold module +                   reference here, since function can't sleep. 
--RR */ +		verdict = elem->hook(hook, skb, indev, outdev, okfn); +		if (verdict != NF_ACCEPT) { +#ifdef CONFIG_NETFILTER_DEBUG +			if (unlikely(verdict > NF_MAX_VERDICT)) { +				NFDEBUG("Evil return from %p(%u).\n", +				        elem->hook, hook); +				continue; +			} +#endif +			if (verdict != NF_REPEAT) +				return verdict; +			*i = (*i)->prev; +		} +	} +	return NF_ACCEPT; +} + +int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) +{       +	int ret; + +	write_lock_bh(&queue_handler_lock); +	if (queue_handler[pf].outfn) +		ret = -EBUSY; +	else { +		queue_handler[pf].outfn = outfn; +		queue_handler[pf].data = data; +		ret = 0; +	} +	write_unlock_bh(&queue_handler_lock); + +	return ret; +} + +/* The caller must flush their queue before this */ +int nf_unregister_queue_handler(int pf) +{ +	write_lock_bh(&queue_handler_lock); +	queue_handler[pf].outfn = NULL; +	queue_handler[pf].data = NULL; +	write_unlock_bh(&queue_handler_lock); +	 +	return 0; +} + +/*  + * Any packet that leaves via this function must come back  + * through nf_reinject(). + */ +static int nf_queue(struct sk_buff *skb,  +		    struct list_head *elem,  +		    int pf, unsigned int hook, +		    struct net_device *indev, +		    struct net_device *outdev, +		    int (*okfn)(struct sk_buff *)) +{ +	int status; +	struct nf_info *info; +#ifdef CONFIG_BRIDGE_NETFILTER +	struct net_device *physindev = NULL; +	struct net_device *physoutdev = NULL; +#endif + +	/* QUEUE == DROP if noone is waiting, to be safe. */ +	read_lock(&queue_handler_lock); +	if (!queue_handler[pf].outfn) { +		read_unlock(&queue_handler_lock); +		kfree_skb(skb); +		return 1; +	} + +	info = kmalloc(sizeof(*info), GFP_ATOMIC); +	if (!info) { +		if (net_ratelimit()) +			printk(KERN_ERR "OOM queueing packet %p\n", +			       skb); +		read_unlock(&queue_handler_lock); +		kfree_skb(skb); +		return 1; +	} + +	*info = (struct nf_info) {  +		(struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn }; + +	/* If it's going away, ignore hook. */ +	if (!try_module_get(info->elem->owner)) { +		read_unlock(&queue_handler_lock); +		kfree(info); +		return 0; +	} + +	/* Bump dev refs so they don't vanish while packet is out */ +	if (indev) dev_hold(indev); +	if (outdev) dev_hold(outdev); + +#ifdef CONFIG_BRIDGE_NETFILTER +	if (skb->nf_bridge) { +		physindev = skb->nf_bridge->physindev; +		if (physindev) dev_hold(physindev); +		physoutdev = skb->nf_bridge->physoutdev; +		if (physoutdev) dev_hold(physoutdev); +	} +#endif + +	status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data); +	read_unlock(&queue_handler_lock); + +	if (status < 0) { +		/* James M doesn't say fuck enough. */ +		if (indev) dev_put(indev); +		if (outdev) dev_put(outdev); +#ifdef CONFIG_BRIDGE_NETFILTER +		if (physindev) dev_put(physindev); +		if (physoutdev) dev_put(physoutdev); +#endif +		module_put(info->elem->owner); +		kfree(info); +		kfree_skb(skb); +		return 1; +	} +	return 1; +} + +/* Returns 1 if okfn() needs to be executed by the caller, + * -EPERM for NF_DROP, 0 otherwise. 
*/ +int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, +		 struct net_device *indev, +		 struct net_device *outdev, +		 int (*okfn)(struct sk_buff *), +		 int hook_thresh) +{ +	struct list_head *elem; +	unsigned int verdict; +	int ret = 0; + +	/* We may already have this, but read-locks nest anyway */ +	rcu_read_lock(); + +#ifdef CONFIG_NETFILTER_DEBUG +	if (unlikely((*pskb)->nf_debug & (1 << hook))) { +		printk("nf_hook: hook %i already set.\n", hook); +		nf_dump_skb(pf, *pskb); +	} +	(*pskb)->nf_debug |= (1 << hook); +#endif + +	elem = &nf_hooks[pf][hook]; +next_hook: +	verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, +			     outdev, &elem, okfn, hook_thresh); +	if (verdict == NF_ACCEPT || verdict == NF_STOP) { +		ret = 1; +		goto unlock; +	} else if (verdict == NF_DROP) { +		kfree_skb(*pskb); +		ret = -EPERM; +	} else if (verdict == NF_QUEUE) { +		NFDEBUG("nf_hook: Verdict = QUEUE.\n"); +		if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn)) +			goto next_hook; +	} +unlock: +	rcu_read_unlock(); +	return ret; +} + +void nf_reinject(struct sk_buff *skb, struct nf_info *info, +		 unsigned int verdict) +{ +	struct list_head *elem = &info->elem->list; +	struct list_head *i; + +	rcu_read_lock(); + +	/* Release those devices we held, or Alexey will kill me. */ +	if (info->indev) dev_put(info->indev); +	if (info->outdev) dev_put(info->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER +	if (skb->nf_bridge) { +		if (skb->nf_bridge->physindev) +			dev_put(skb->nf_bridge->physindev); +		if (skb->nf_bridge->physoutdev) +			dev_put(skb->nf_bridge->physoutdev); +	} +#endif + +	/* Drop reference to owner of hook which queued us. */ +	module_put(info->elem->owner); + +	list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { +		if (i == elem)  +  			break; +  	} +   +	if (elem == &nf_hooks[info->pf][info->hook]) { +		/* The module which sent it to userspace is gone. */ +		NFDEBUG("%s: module disappeared, dropping packet.\n", +			__FUNCTION__); +		verdict = NF_DROP; +	} + +	/* Continue traversal iff userspace said ok... */ +	if (verdict == NF_REPEAT) { +		elem = elem->prev; +		verdict = NF_ACCEPT; +	} + +	if (verdict == NF_ACCEPT) { +	next_hook: +		verdict = nf_iterate(&nf_hooks[info->pf][info->hook], +				     &skb, info->hook,  +				     info->indev, info->outdev, &elem, +				     info->okfn, INT_MIN); +	} + +	switch (verdict) { +	case NF_ACCEPT: +		info->okfn(skb); +		break; + +	case NF_QUEUE: +		if (!nf_queue(skb, elem, info->pf, info->hook,  +			      info->indev, info->outdev, info->okfn)) +			goto next_hook; +		break; +	} +	rcu_read_unlock(); + +	if (verdict == NF_DROP) +		kfree_skb(skb); + +	kfree(info); +	return; +} + +#ifdef CONFIG_INET +/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ +int ip_route_me_harder(struct sk_buff **pskb) +{ +	struct iphdr *iph = (*pskb)->nh.iph; +	struct rtable *rt; +	struct flowi fl = {}; +	struct dst_entry *odst; +	unsigned int hh_len; + +	/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause +	 * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. +	 */ +	if (inet_addr_type(iph->saddr) == RTN_LOCAL) { +		fl.nl_u.ip4_u.daddr = iph->daddr; +		fl.nl_u.ip4_u.saddr = iph->saddr; +		fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); +		fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; +#ifdef CONFIG_IP_ROUTE_FWMARK +		fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; +#endif +		fl.proto = iph->protocol; +		if (ip_route_output_key(&rt, &fl) != 0) +			return -1; + +		/* Drop old route. 
*/ +		dst_release((*pskb)->dst); +		(*pskb)->dst = &rt->u.dst; +	} else { +		/* non-local src, find valid iif to satisfy +		 * rp-filter when calling ip_route_input. */ +		fl.nl_u.ip4_u.daddr = iph->saddr; +		if (ip_route_output_key(&rt, &fl) != 0) +			return -1; + +		odst = (*pskb)->dst; +		if (ip_route_input(*pskb, iph->daddr, iph->saddr, +				   RT_TOS(iph->tos), rt->u.dst.dev) != 0) { +			dst_release(&rt->u.dst); +			return -1; +		} +		dst_release(&rt->u.dst); +		dst_release(odst); +	} +	 +	if ((*pskb)->dst->error) +		return -1; + +	/* Change in oif may mean change in hh_len. */ +	hh_len = (*pskb)->dst->dev->hard_header_len; +	if (skb_headroom(*pskb) < hh_len) { +		struct sk_buff *nskb; + +		nskb = skb_realloc_headroom(*pskb, hh_len); +		if (!nskb)  +			return -1; +		if ((*pskb)->sk) +			skb_set_owner_w(nskb, (*pskb)->sk); +		kfree_skb(*pskb); +		*pskb = nskb; +	} + +	return 0; +} +EXPORT_SYMBOL(ip_route_me_harder); + +int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len) +{ +	struct sk_buff *nskb; + +	if (writable_len > (*pskb)->len) +		return 0; + +	/* Not exclusive use of packet?  Must copy. */ +	if (skb_shared(*pskb) || skb_cloned(*pskb)) +		goto copy_skb; + +	return pskb_may_pull(*pskb, writable_len); + +copy_skb: +	nskb = skb_copy(*pskb, GFP_ATOMIC); +	if (!nskb) +		return 0; +	BUG_ON(skb_is_nonlinear(nskb)); + +	/* Rest of kernel will get very unhappy if we pass it a +	   suddenly-orphaned skbuff */ +	if ((*pskb)->sk) +		skb_set_owner_w(nskb, (*pskb)->sk); +	kfree_skb(*pskb); +	*pskb = nskb; +	return 1; +} +EXPORT_SYMBOL(skb_ip_make_writable); +#endif /*CONFIG_INET*/ + +/* Internal logging interface, which relies on the real  +   LOG target modules */ + +#define NF_LOG_PREFIXLEN		128 + +static nf_logfn *nf_logging[NPROTO]; /* = NULL */ +static int reported = 0; +static DEFINE_SPINLOCK(nf_log_lock); + +int nf_log_register(int pf, nf_logfn *logfn) +{ +	int ret = -EBUSY; + +	/* Any setup of logging members must be done before +	 * substituting pointer. */ +	spin_lock(&nf_log_lock); +	if (!nf_logging[pf]) { +		rcu_assign_pointer(nf_logging[pf], logfn); +		ret = 0; +	} +	spin_unlock(&nf_log_lock); +	return ret; +}		 + +void nf_log_unregister(int pf, nf_logfn *logfn) +{ +	spin_lock(&nf_log_lock); +	if (nf_logging[pf] == logfn) +		nf_logging[pf] = NULL; +	spin_unlock(&nf_log_lock); + +	/* Give time to concurrent readers. */ +	synchronize_net(); +}		 + +void nf_log_packet(int pf, +		   unsigned int hooknum, +		   const struct sk_buff *skb, +		   const struct net_device *in, +		   const struct net_device *out, +		   const char *fmt, ...) +{ +	va_list args; +	char prefix[NF_LOG_PREFIXLEN]; +	nf_logfn *logfn; +	 +	rcu_read_lock(); +	logfn = rcu_dereference(nf_logging[pf]); +	if (logfn) { +		va_start(args, fmt); +		vsnprintf(prefix, sizeof(prefix), fmt, args); +		va_end(args); +		/* We must read logging before nf_logfn[pf] */ +		logfn(hooknum, skb, in, out, prefix); +	} else if (!reported) { +		printk(KERN_WARNING "nf_log_packet: can\'t log yet, " +		       "no backend logging module loaded in!\n"); +		reported++; +	} +	rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_log_register); +EXPORT_SYMBOL(nf_log_unregister); +EXPORT_SYMBOL(nf_log_packet); + +/* This does not belong here, but locally generated errors need it if connection +   tracking in use: without this, connection may not be in hash table, and hence +   manufactured ICMP or RST packets will not be associated with it. 
*/ +void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); + +void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +{ +	void (*attach)(struct sk_buff *, struct sk_buff *); + +	if (skb->nfct && (attach = ip_ct_attach) != NULL) { +		mb(); /* Just to be sure: must be read before executing this */ +		attach(new, skb); +	} +} + +void __init netfilter_init(void) +{ +	int i, h; + +	for (i = 0; i < NPROTO; i++) { +		for (h = 0; h < NF_MAX_HOOKS; h++) +			INIT_LIST_HEAD(&nf_hooks[i][h]); +	} +} + +EXPORT_SYMBOL(ip_ct_attach); +EXPORT_SYMBOL(nf_ct_attach); +EXPORT_SYMBOL(nf_getsockopt); +EXPORT_SYMBOL(nf_hook_slow); +EXPORT_SYMBOL(nf_hooks); +EXPORT_SYMBOL(nf_register_hook); +EXPORT_SYMBOL(nf_register_queue_handler); +EXPORT_SYMBOL(nf_register_sockopt); +EXPORT_SYMBOL(nf_reinject); +EXPORT_SYMBOL(nf_setsockopt); +EXPORT_SYMBOL(nf_unregister_hook); +EXPORT_SYMBOL(nf_unregister_queue_handler); +EXPORT_SYMBOL(nf_unregister_sockopt); diff --git a/net/core/netpoll.c b/net/core/netpoll.c new file mode 100644 index 00000000000..a119696d552 --- /dev/null +++ b/net/core/netpoll.c @@ -0,0 +1,735 @@ +/* + * Common framework for low-level network console, dump, and debugger code + * + * Sep 8 2003  Matt Mackall <mpm@selenic.com> + * + * based on the netconsole code from: + * + * Copyright (C) 2001  Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2002  Red Hat, Inc. + */ + +#include <linux/smp_lock.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/rcupdate.h> +#include <linux/workqueue.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <asm/unaligned.h> + +/* + * We maintain a small pool of fully-sized skbs, to make sure the + * message gets out even in extreme OOM situations. 
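+ * The pool is topped up opportunistically from find_skb() below.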
+ */ + +#define MAX_UDP_CHUNK 1460 +#define MAX_SKBS 32 +#define MAX_QUEUE_DEPTH (MAX_SKBS / 2) + +static DEFINE_SPINLOCK(skb_list_lock); +static int nr_skbs; +static struct sk_buff *skbs; + +static DEFINE_SPINLOCK(queue_lock); +static int queue_depth; +static struct sk_buff *queue_head, *queue_tail; + +static atomic_t trapped; + +#define NETPOLL_RX_ENABLED  1 +#define NETPOLL_RX_DROP     2 + +#define MAX_SKB_SIZE \ +		(MAX_UDP_CHUNK + sizeof(struct udphdr) + \ +				sizeof(struct iphdr) + sizeof(struct ethhdr)) + +static void zap_completion_queue(void); + +static void queue_process(void *p) +{ +	unsigned long flags; +	struct sk_buff *skb; + +	while (queue_head) { +		spin_lock_irqsave(&queue_lock, flags); + +		skb = queue_head; +		queue_head = skb->next; +		if (skb == queue_tail) +			queue_head = NULL; + +		queue_depth--; + +		spin_unlock_irqrestore(&queue_lock, flags); + +		dev_queue_xmit(skb); +	} +} + +static DECLARE_WORK(send_queue, queue_process, NULL); + +void netpoll_queue(struct sk_buff *skb) +{ +	unsigned long flags; + +	if (queue_depth == MAX_QUEUE_DEPTH) { +		__kfree_skb(skb); +		return; +	} + +	spin_lock_irqsave(&queue_lock, flags); +	if (!queue_head) +		queue_head = skb; +	else +		queue_tail->next = skb; +	queue_tail = skb; +	queue_depth++; +	spin_unlock_irqrestore(&queue_lock, flags); + +	schedule_work(&send_queue); +} + +static int checksum_udp(struct sk_buff *skb, struct udphdr *uh, +			     unsigned short ulen, u32 saddr, u32 daddr) +{ +	if (uh->check == 0) +		return 0; + +	if (skb->ip_summed == CHECKSUM_HW) +		return csum_tcpudp_magic( +			saddr, daddr, ulen, IPPROTO_UDP, skb->csum); + +	skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + +	return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); +} + +/* + * Check whether delayed processing was scheduled for our NIC. If so, + * we attempt to grab the poll lock and use ->poll() to pump the card. + * If this fails, either we've recursed in ->poll() or it's already + * running on another CPU. + * + * Note: we don't mask interrupts with this lock because we're using + * trylock here and interrupts are already disabled in the softirq + * case. Further, we test the poll_owner to avoid recursion on UP + * systems where the lock doesn't exist. + * + * In cases where there is bi-directional communications, reading only + * one message at a time can lead to packets being dropped by the + * network adapter, forcing superfluous retries and possibly timeouts. + * Thus, we set our budget to greater than 1. 
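+ * (Here the per-call budget handed to ->poll() is 16 packets.)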
+ */ +static void poll_napi(struct netpoll *np) +{ +	int budget = 16; + +	if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) && +	    np->poll_owner != smp_processor_id() && +	    spin_trylock(&np->poll_lock)) { +		np->rx_flags |= NETPOLL_RX_DROP; +		atomic_inc(&trapped); + +		np->dev->poll(np->dev, &budget); + +		atomic_dec(&trapped); +		np->rx_flags &= ~NETPOLL_RX_DROP; +		spin_unlock(&np->poll_lock); +	} +} + +void netpoll_poll(struct netpoll *np) +{ +	if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller) +		return; + +	/* Process pending work on NIC */ +	np->dev->poll_controller(np->dev); +	if (np->dev->poll) +		poll_napi(np); + +	zap_completion_queue(); +} + +static void refill_skbs(void) +{ +	struct sk_buff *skb; +	unsigned long flags; + +	spin_lock_irqsave(&skb_list_lock, flags); +	while (nr_skbs < MAX_SKBS) { +		skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); +		if (!skb) +			break; + +		skb->next = skbs; +		skbs = skb; +		nr_skbs++; +	} +	spin_unlock_irqrestore(&skb_list_lock, flags); +} + +static void zap_completion_queue(void) +{ +	unsigned long flags; +	struct softnet_data *sd = &get_cpu_var(softnet_data); + +	if (sd->completion_queue) { +		struct sk_buff *clist; + +		local_irq_save(flags); +		clist = sd->completion_queue; +		sd->completion_queue = NULL; +		local_irq_restore(flags); + +		while (clist != NULL) { +			struct sk_buff *skb = clist; +			clist = clist->next; +			if(skb->destructor) +				dev_kfree_skb_any(skb); /* put this one back */ +			else +				__kfree_skb(skb); +		} +	} + +	put_cpu_var(softnet_data); +} + +static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve) +{ +	int once = 1, count = 0; +	unsigned long flags; +	struct sk_buff *skb = NULL; + +	zap_completion_queue(); +repeat: +	if (nr_skbs < MAX_SKBS) +		refill_skbs(); + +	skb = alloc_skb(len, GFP_ATOMIC); + +	if (!skb) { +		spin_lock_irqsave(&skb_list_lock, flags); +		skb = skbs; +		if (skb) { +			skbs = skb->next; +			skb->next = NULL; +			nr_skbs--; +		} +		spin_unlock_irqrestore(&skb_list_lock, flags); +	} + +	if(!skb) { +		count++; +		if (once && (count == 1000000)) { +			printk("out of netpoll skbs!\n"); +			once = 0; +		} +		netpoll_poll(np); +		goto repeat; +	} + +	atomic_set(&skb->users, 1); +	skb_reserve(skb, reserve); +	return skb; +} + +static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +{ +	int status; + +repeat: +	if(!np || !np->dev || !netif_running(np->dev)) { +		__kfree_skb(skb); +		return; +	} + +	/* avoid recursion */ +	if(np->poll_owner == smp_processor_id() || +	   np->dev->xmit_lock_owner == smp_processor_id()) { +		if (np->drop) +			np->drop(skb); +		else +			__kfree_skb(skb); +		return; +	} + +	spin_lock(&np->dev->xmit_lock); +	np->dev->xmit_lock_owner = smp_processor_id(); + +	/* +	 * network drivers do not expect to be called if the queue is +	 * stopped. 
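+	 * In that case we poll the device to free ring space and retry.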
+	 */ +	if (netif_queue_stopped(np->dev)) { +		np->dev->xmit_lock_owner = -1; +		spin_unlock(&np->dev->xmit_lock); + +		netpoll_poll(np); +		goto repeat; +	} + +	status = np->dev->hard_start_xmit(skb, np->dev); +	np->dev->xmit_lock_owner = -1; +	spin_unlock(&np->dev->xmit_lock); + +	/* transmit busy */ +	if(status) { +		netpoll_poll(np); +		goto repeat; +	} +} + +void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +{ +	int total_len, eth_len, ip_len, udp_len; +	struct sk_buff *skb; +	struct udphdr *udph; +	struct iphdr *iph; +	struct ethhdr *eth; + +	udp_len = len + sizeof(*udph); +	ip_len = eth_len = udp_len + sizeof(*iph); +	total_len = eth_len + ETH_HLEN + NET_IP_ALIGN; + +	skb = find_skb(np, total_len, total_len - len); +	if (!skb) +		return; + +	memcpy(skb->data, msg, len); +	skb->len += len; + +	udph = (struct udphdr *) skb_push(skb, sizeof(*udph)); +	udph->source = htons(np->local_port); +	udph->dest = htons(np->remote_port); +	udph->len = htons(udp_len); +	udph->check = 0; + +	iph = (struct iphdr *)skb_push(skb, sizeof(*iph)); + +	/* iph->version = 4; iph->ihl = 5; */ +	put_unaligned(0x45, (unsigned char *)iph); +	iph->tos      = 0; +	put_unaligned(htons(ip_len), &(iph->tot_len)); +	iph->id       = 0; +	iph->frag_off = 0; +	iph->ttl      = 64; +	iph->protocol = IPPROTO_UDP; +	iph->check    = 0; +	put_unaligned(htonl(np->local_ip), &(iph->saddr)); +	put_unaligned(htonl(np->remote_ip), &(iph->daddr)); +	iph->check    = ip_fast_csum((unsigned char *)iph, iph->ihl); + +	eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + +	eth->h_proto = htons(ETH_P_IP); +	memcpy(eth->h_source, np->local_mac, 6); +	memcpy(eth->h_dest, np->remote_mac, 6); + +	skb->dev = np->dev; + +	netpoll_send_skb(np, skb); +} + +static void arp_reply(struct sk_buff *skb) +{ +	struct arphdr *arp; +	unsigned char *arp_ptr; +	int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; +	u32 sip, tip; +	struct sk_buff *send_skb; +	struct netpoll *np = skb->dev->np; + +	if (!np) return; + +	/* No arp on this interface */ +	if (skb->dev->flags & IFF_NOARP) +		return; + +	if (!pskb_may_pull(skb, (sizeof(struct arphdr) + +				 (2 * skb->dev->addr_len) + +				 (2 * sizeof(u32))))) +		return; + +	skb->h.raw = skb->nh.raw = skb->data; +	arp = skb->nh.arph; + +	if ((arp->ar_hrd != htons(ARPHRD_ETHER) && +	     arp->ar_hrd != htons(ARPHRD_IEEE802)) || +	    arp->ar_pro != htons(ETH_P_IP) || +	    arp->ar_op != htons(ARPOP_REQUEST)) +		return; + +	arp_ptr = (unsigned char *)(arp+1) + skb->dev->addr_len; +	memcpy(&sip, arp_ptr, 4); +	arp_ptr += 4 + skb->dev->addr_len; +	memcpy(&tip, arp_ptr, 4); + +	/* Should we ignore arp? */ +	if (tip != htonl(np->local_ip) || LOOPBACK(tip) || MULTICAST(tip)) +		return; + +	size = sizeof(struct arphdr) + 2 * (skb->dev->addr_len + 4); +	send_skb = find_skb(np, size + LL_RESERVED_SPACE(np->dev), +			    LL_RESERVED_SPACE(np->dev)); + +	if (!send_skb) +		return; + +	send_skb->nh.raw = send_skb->data; +	arp = (struct arphdr *) skb_put(send_skb, size); +	send_skb->dev = skb->dev; +	send_skb->protocol = htons(ETH_P_ARP); + +	/* Fill the device header for the ARP frame */ + +	if (np->dev->hard_header && +	    np->dev->hard_header(send_skb, skb->dev, ptype, +				       np->remote_mac, np->local_mac, +				       send_skb->len) < 0) { +		kfree_skb(send_skb); +		return; +	} + +	/* +	 * Fill out the arp protocol part. +	 * +	 * we only support ethernet device type, +	 * which (according to RFC 1390) should always equal 1 (Ethernet). 
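+	 *
+	 * The payload filled in below follows the standard ARP layout:
+	 * ar_hrd/ar_pro/ar_hln/ar_pln/ar_op, then sender hardware address
+	 * (our dev_addr), sender IP (the tip that was asked for), target
+	 * hardware address (np->remote_mac) and finally target IP (the
+	 * requester's sip).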
+	 */ + +	arp->ar_hrd = htons(np->dev->type); +	arp->ar_pro = htons(ETH_P_IP); +	arp->ar_hln = np->dev->addr_len; +	arp->ar_pln = 4; +	arp->ar_op = htons(type); + +	arp_ptr=(unsigned char *)(arp + 1); +	memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); +	arp_ptr += np->dev->addr_len; +	memcpy(arp_ptr, &tip, 4); +	arp_ptr += 4; +	memcpy(arp_ptr, np->remote_mac, np->dev->addr_len); +	arp_ptr += np->dev->addr_len; +	memcpy(arp_ptr, &sip, 4); + +	netpoll_send_skb(np, send_skb); +} + +int __netpoll_rx(struct sk_buff *skb) +{ +	int proto, len, ulen; +	struct iphdr *iph; +	struct udphdr *uh; +	struct netpoll *np = skb->dev->np; + +	if (!np->rx_hook) +		goto out; +	if (skb->dev->type != ARPHRD_ETHER) +		goto out; + +	/* check if netpoll clients need ARP */ +	if (skb->protocol == __constant_htons(ETH_P_ARP) && +	    atomic_read(&trapped)) { +		arp_reply(skb); +		return 1; +	} + +	proto = ntohs(eth_hdr(skb)->h_proto); +	if (proto != ETH_P_IP) +		goto out; +	if (skb->pkt_type == PACKET_OTHERHOST) +		goto out; +	if (skb_shared(skb)) +		goto out; + +	iph = (struct iphdr *)skb->data; +	if (!pskb_may_pull(skb, sizeof(struct iphdr))) +		goto out; +	if (iph->ihl < 5 || iph->version != 4) +		goto out; +	if (!pskb_may_pull(skb, iph->ihl*4)) +		goto out; +	if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) +		goto out; + +	len = ntohs(iph->tot_len); +	if (skb->len < len || len < iph->ihl*4) +		goto out; + +	if (iph->protocol != IPPROTO_UDP) +		goto out; + +	len -= iph->ihl*4; +	uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); +	ulen = ntohs(uh->len); + +	if (ulen != len) +		goto out; +	if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0) +		goto out; +	if (np->local_ip && np->local_ip != ntohl(iph->daddr)) +		goto out; +	if (np->remote_ip && np->remote_ip != ntohl(iph->saddr)) +		goto out; +	if (np->local_port && np->local_port != ntohs(uh->dest)) +		goto out; + +	np->rx_hook(np, ntohs(uh->source), +		    (char *)(uh+1), +		    ulen - sizeof(struct udphdr)); + +	kfree_skb(skb); +	return 1; + +out: +	if (atomic_read(&trapped)) { +		kfree_skb(skb); +		return 1; +	} + +	return 0; +} + +int netpoll_parse_options(struct netpoll *np, char *opt) +{ +	char *cur=opt, *delim; + +	if(*cur != '@') { +		if ((delim = strchr(cur, '@')) == NULL) +			goto parse_failed; +		*delim=0; +		np->local_port=simple_strtol(cur, NULL, 10); +		cur=delim; +	} +	cur++; +	printk(KERN_INFO "%s: local port %d\n", np->name, np->local_port); + +	if(*cur != '/') { +		if ((delim = strchr(cur, '/')) == NULL) +			goto parse_failed; +		*delim=0; +		np->local_ip=ntohl(in_aton(cur)); +		cur=delim; + +		printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", +		       np->name, HIPQUAD(np->local_ip)); +	} +	cur++; + +	if ( *cur != ',') { +		/* parse out dev name */ +		if ((delim = strchr(cur, ',')) == NULL) +			goto parse_failed; +		*delim=0; +		strlcpy(np->dev_name, cur, sizeof(np->dev_name)); +		cur=delim; +	} +	cur++; + +	printk(KERN_INFO "%s: interface %s\n", np->name, np->dev_name); + +	if ( *cur != '@' ) { +		/* dst port */ +		if ((delim = strchr(cur, '@')) == NULL) +			goto parse_failed; +		*delim=0; +		np->remote_port=simple_strtol(cur, NULL, 10); +		cur=delim; +	} +	cur++; +	printk(KERN_INFO "%s: remote port %d\n", np->name, np->remote_port); + +	/* dst ip */ +	if ((delim = strchr(cur, '/')) == NULL) +		goto parse_failed; +	*delim=0; +	np->remote_ip=ntohl(in_aton(cur)); +	cur=delim+1; + +	printk(KERN_INFO "%s: remote IP %d.%d.%d.%d\n", +		       np->name, HIPQUAD(np->remote_ip)); + +	if( *cur != 0 ) +	{ +		/* MAC address */ +		if ((delim = 
strchr(cur, ':')) == NULL) +			goto parse_failed; +		*delim=0; +		np->remote_mac[0]=simple_strtol(cur, NULL, 16); +		cur=delim+1; +		if ((delim = strchr(cur, ':')) == NULL) +			goto parse_failed; +		*delim=0; +		np->remote_mac[1]=simple_strtol(cur, NULL, 16); +		cur=delim+1; +		if ((delim = strchr(cur, ':')) == NULL) +			goto parse_failed; +		*delim=0; +		np->remote_mac[2]=simple_strtol(cur, NULL, 16); +		cur=delim+1; +		if ((delim = strchr(cur, ':')) == NULL) +			goto parse_failed; +		*delim=0; +		np->remote_mac[3]=simple_strtol(cur, NULL, 16); +		cur=delim+1; +		if ((delim = strchr(cur, ':')) == NULL) +			goto parse_failed; +		*delim=0; +		np->remote_mac[4]=simple_strtol(cur, NULL, 16); +		cur=delim+1; +		np->remote_mac[5]=simple_strtol(cur, NULL, 16); +	} + +	printk(KERN_INFO "%s: remote ethernet address " +	       "%02x:%02x:%02x:%02x:%02x:%02x\n", +	       np->name, +	       np->remote_mac[0], +	       np->remote_mac[1], +	       np->remote_mac[2], +	       np->remote_mac[3], +	       np->remote_mac[4], +	       np->remote_mac[5]); + +	return 0; + + parse_failed: +	printk(KERN_INFO "%s: couldn't parse config at %s!\n", +	       np->name, cur); +	return -1; +} + +int netpoll_setup(struct netpoll *np) +{ +	struct net_device *ndev = NULL; +	struct in_device *in_dev; + +	np->poll_lock = SPIN_LOCK_UNLOCKED; +	np->poll_owner = -1; + +	if (np->dev_name) +		ndev = dev_get_by_name(np->dev_name); +	if (!ndev) { +		printk(KERN_ERR "%s: %s doesn't exist, aborting.\n", +		       np->name, np->dev_name); +		return -1; +	} + +	np->dev = ndev; +	ndev->np = np; + +	if (!ndev->poll_controller) { +		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", +		       np->name, np->dev_name); +		goto release; +	} + +	if (!netif_running(ndev)) { +		unsigned long atmost, atleast; + +		printk(KERN_INFO "%s: device %s not up yet, forcing it\n", +		       np->name, np->dev_name); + +		rtnl_shlock(); +		if (dev_change_flags(ndev, ndev->flags | IFF_UP) < 0) { +			printk(KERN_ERR "%s: failed to open %s\n", +			       np->name, np->dev_name); +			rtnl_shunlock(); +			goto release; +		} +		rtnl_shunlock(); + +		atleast = jiffies + HZ/10; + 		atmost = jiffies + 4*HZ; +		while (!netif_carrier_ok(ndev)) { +			if (time_after(jiffies, atmost)) { +				printk(KERN_NOTICE +				       "%s: timeout waiting for carrier\n", +				       np->name); +				break; +			} +			cond_resched(); +		} + +		/* If carrier appears to come up instantly, we don't +		 * trust it and pause so that we don't pump all our +		 * queued console messages into the bitbucket. 
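+		 * ("Instantly" here means before the "atleast" mark above,
+		 * i.e. within roughly HZ/10 jiffies, about 100ms, of forcing
+		 * the device up; the hard carrier timeout is 4*HZ.)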
+		 */ + +		if (time_before(jiffies, atleast)) { +			printk(KERN_NOTICE "%s: carrier detect appears" +			       " untrustworthy, waiting 4 seconds\n", +			       np->name); +			msleep(4000); +		} +	} + +	if (!memcmp(np->local_mac, "\0\0\0\0\0\0", 6) && ndev->dev_addr) +		memcpy(np->local_mac, ndev->dev_addr, 6); + +	if (!np->local_ip) { +		rcu_read_lock(); +		in_dev = __in_dev_get(ndev); + +		if (!in_dev || !in_dev->ifa_list) { +			rcu_read_unlock(); +			printk(KERN_ERR "%s: no IP address for %s, aborting\n", +			       np->name, np->dev_name); +			goto release; +		} + +		np->local_ip = ntohl(in_dev->ifa_list->ifa_local); +		rcu_read_unlock(); +		printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", +		       np->name, HIPQUAD(np->local_ip)); +	} + +	if(np->rx_hook) +		np->rx_flags = NETPOLL_RX_ENABLED; + +	return 0; + + release: +	ndev->np = NULL; +	np->dev = NULL; +	dev_put(ndev); +	return -1; +} + +void netpoll_cleanup(struct netpoll *np) +{ +	if (np->dev) +		np->dev->np = NULL; +	dev_put(np->dev); +	np->dev = NULL; +} + +int netpoll_trap(void) +{ +	return atomic_read(&trapped); +} + +void netpoll_set_trap(int trap) +{ +	if (trap) +		atomic_inc(&trapped); +	else +		atomic_dec(&trapped); +} + +EXPORT_SYMBOL(netpoll_set_trap); +EXPORT_SYMBOL(netpoll_trap); +EXPORT_SYMBOL(netpoll_parse_options); +EXPORT_SYMBOL(netpoll_setup); +EXPORT_SYMBOL(netpoll_cleanup); +EXPORT_SYMBOL(netpoll_send_udp); +EXPORT_SYMBOL(netpoll_poll); +EXPORT_SYMBOL(netpoll_queue); diff --git a/net/core/pktgen.c b/net/core/pktgen.c new file mode 100644 index 00000000000..c57b06bc79f --- /dev/null +++ b/net/core/pktgen.c @@ -0,0 +1,3132 @@ +/* + * Authors: + * Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se> + *                             Uppsala University and + *                             Swedish University of Agricultural Sciences + * + * Alexey Kuznetsov  <kuznet@ms2.inr.ac.ru> + * Ben Greear <greearb@candelatech.com> + * Jens Låås <jens.laas@data.slu.se> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * A tool for loading the network with preconfigurated packets. + * The tool is implemented as a linux module.  Parameters are output  + * device, delay (to hard_xmit), number of packets, and whether + * to use multiple SKBs or just the same one. + * pktgen uses the installed interface's output routine. + * + * Additional hacking by: + * + * Jens.Laas@data.slu.se + * Improved by ANK. 010120. + * Improved by ANK even more. 010212. + * MAC address typo fixed. 010417 --ro + * Integrated.  020301 --DaveM + * Added multiskb option 020301 --DaveM + * Scaling of results. 020417--sigurdur@linpro.no + * Significant re-work of the module: + *   *  Convert to threaded model to more efficiently be able to transmit + *       and receive on multiple interfaces at once. + *   *  Converted many counters to __u64 to allow longer runs. + *   *  Allow configuration of ranges, like min/max IP address, MACs, + *       and UDP-ports, for both source and destination, and can + *       set to use a random distribution or sequentially walk the range. + *   *  Can now change most values after starting. + *   *  Place 12-byte packet in UDP payload with magic number, + *       sequence number, and timestamp. 
+ *   *  Add receiver code that detects dropped pkts, re-ordered pkts, and
+ *       latencies (with micro-second precision).
+ *   *  Add IOCTL interface to easily get counters & configuration.
+ *   --Ben Greear <greearb@candelatech.com>
+ *
+ * Renamed multiskb to clone_skb and cleaned up sending core for two distinct
+ * skb modes. A clone_skb=0 mode for Ben's "ranges" work and a clone_skb != 0
+ * mode as a "fastpath" with a configurable number of clones after each alloc.
+ * clone_skb=0 means all packets are allocated; this also means ranges, time
+ * stamps etc. can be used. clone_skb=100 means 1 malloc is followed by 100
+ * clones.
+ *
+ * Also moved to /proc/net/pktgen/
+ * --ro
+ *
+ * Sept 10:  Fixed threading/locking.  Lots of bone-headed and more clever
+ *    mistakes.  Also merged in DaveM's patch in the -pre6 patch.
+ * --Ben Greear <greearb@candelatech.com>
+ *
+ * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br)
+ *
+ *
+ * 021124 Finished major redesign and rewrite for new functionality.
+ * See Documentation/networking/pktgen.txt for how to use this.
+ *
+ * The new operation:
+ * For each CPU one thread/process is created at start. This thread checks
+ * for running devices in the if_list and sends packets until count is 0.
+ * The thread also checks thread->control, which is used for inter-process
+ * communication; the controlling process "posts" operations to the threads
+ * this way. The if_lock should be possible to remove when add/rem_device is
+ * merged into this too.
+ *
+ * By design there should only be *one* "controlling" process. In practice
+ * multiple write accesses give unpredictable results. A "write" to /proc
+ * produces a result code that should be read back by the "writer".
+ * For practical use this should be no problem.
+ *
+ * Note: when adding devices to a specific CPU it is a good idea to also
+ * assign /proc/irq/XX/smp_affinity so TX interrupts get bound to the same
+ * CPU.
+ * --ro
+ *
+ * Fix refcount off by one if first packet fails, potential null deref,
+ * memleak 030710- KJP
+ *
+ * First "ranges" functionality for ipv6 030726 --ro
+ *
+ * Included flow support. 030802 ANK.
+ *
+ * Fixed unaligned access on IA-64 Grant Grundler <grundler@parisc-linux.org>
+ *
+ * Remove if fix from added Harald Welte <laforge@netfilter.org> 040419
+ * ia64 compilation fix from Aron Griffis <aron@hp.com> 040604
+ *
+ * New xmit() return, do_div and misc clean up by Stephen Hemminger
+ * <shemminger@osdl.org> 040923
+ *
+ * Randy Dunlap fixed u64 printk compiler warning
+ *
+ * Remove FCS from BW calculation.  Lennert Buytenhek <buytenh@wantstofly.org>
+ * New time handling.
Lennert Buytenhek <buytenh@wantstofly.org> 041213 + * + * Corrections from Nikolai Malykh (nmalykh@bilim.com)  + * Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230 + * + * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com>  + * 050103 + */ +#include <linux/sys.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/kernel.h> +#include <linux/smp_lock.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/sched.h> +#include <linux/unistd.h> +#include <linux/string.h> +#include <linux/ptrace.h> +#include <linux/errno.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/timer.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/inet.h> +#include <linux/inetdevice.h> +#include <linux/rtnetlink.h> +#include <linux/if_arp.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/udp.h> +#include <linux/proc_fs.h> +#include <linux/wait.h> +#include <net/checksum.h> +#include <net/ipv6.h> +#include <net/addrconf.h> +#include <asm/byteorder.h> +#include <linux/rcupdate.h> +#include <asm/bitops.h> +#include <asm/io.h> +#include <asm/dma.h> +#include <asm/uaccess.h> +#include <asm/div64.h> /* do_div */ +#include <asm/timex.h> + + +#define VERSION  "pktgen v2.61: Packet Generator for packet performance testing.\n" + +/* #define PG_DEBUG(a) a */ +#define PG_DEBUG(a)  + +/* The buckets are exponential in 'width' */ +#define LAT_BUCKETS_MAX 32 +#define IP_NAME_SZ 32 + +/* Device flag bits */ +#define F_IPSRC_RND   (1<<0)  /* IP-Src Random  */ +#define F_IPDST_RND   (1<<1)  /* IP-Dst Random  */ +#define F_UDPSRC_RND  (1<<2)  /* UDP-Src Random */ +#define F_UDPDST_RND  (1<<3)  /* UDP-Dst Random */ +#define F_MACSRC_RND  (1<<4)  /* MAC-Src Random */ +#define F_MACDST_RND  (1<<5)  /* MAC-Dst Random */ +#define F_TXSIZE_RND  (1<<6)  /* Transmit size is random */ +#define F_IPV6        (1<<7)  /* Interface in IPV6 Mode */ + +/* Thread control flag bits */ +#define T_TERMINATE   (1<<0)   +#define T_STOP        (1<<1)  /* Stop run */ +#define T_RUN         (1<<2)  /* Start run */ +#define T_REMDEV      (1<<3)  /* Remove all devs */ + +/* Locks */ +#define   thread_lock()        spin_lock(&_thread_lock) +#define   thread_unlock()      spin_unlock(&_thread_lock) + +/* If lock -- can be removed after some work */ +#define   if_lock(t)           spin_lock(&(t->if_lock)); +#define   if_unlock(t)           spin_unlock(&(t->if_lock)); + +/* Used to help with determining the pkts on receive */ +#define PKTGEN_MAGIC 0xbe9be955 +#define PG_PROC_DIR "pktgen" + +#define MAX_CFLOWS  65536 + +struct flow_state +{ +	__u32		cur_daddr; +	int		count; +}; + +struct pktgen_dev { + +	/* +	 * Try to keep frequent/infrequent used vars. separated. +	 */ + +        char ifname[32]; +        struct proc_dir_entry *proc_ent; +        char result[512]; +        /* proc file names */ +        char fname[80]; + +        struct pktgen_thread* pg_thread; /* the owner */ +        struct pktgen_dev *next; /* Used for chaining in the thread's run-queue */ + +        int running;  /* if this changes to false, the test will stop */ +         +        /* If min != max, then we will either do a linear iteration, or +         * we will do a random selection from within the range. 
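+         * (As the header comment above puts it, ranges such as the IP
+         * addresses, MACs and UDP ports further down in this struct can
+         * either be walked sequentially or sampled at random, depending on
+         * the F_*_RND bits in "flags" below.)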
+         */ +        __u32 flags;      + +        int min_pkt_size;    /* = ETH_ZLEN; */ +        int max_pkt_size;    /* = ETH_ZLEN; */ +        int nfrags; +        __u32 delay_us;    /* Default delay */ +        __u32 delay_ns; +        __u64 count;  /* Default No packets to send */ +        __u64 sofar;  /* How many pkts we've sent so far */ +        __u64 tx_bytes; /* How many bytes we've transmitted */ +        __u64 errors;    /* Errors when trying to transmit, pkts will be re-sent */ + +        /* runtime counters relating to clone_skb */ +        __u64 next_tx_us;          /* timestamp of when to tx next */ +        __u32 next_tx_ns; +         +        __u64 allocated_skbs; +        __u32 clone_count; +	int last_ok;           /* Was last skb sent?  +	                        * Or a failed transmit of some sort?  This will keep +                                * sequence numbers in order, for example. +				*/ +        __u64 started_at; /* micro-seconds */ +        __u64 stopped_at; /* micro-seconds */ +        __u64 idle_acc; /* micro-seconds */ +        __u32 seq_num; +         +        int clone_skb; /* Use multiple SKBs during packet gen.  If this number +                          * is greater than 1, then that many coppies of the same +                          * packet will be sent before a new packet is allocated. +                          * For instance, if you want to send 1024 identical packets +                          * before creating a new packet, set clone_skb to 1024. +                          */ +         +        char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ +        char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ +        char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ +        char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + +	struct in6_addr  in6_saddr; +	struct in6_addr  in6_daddr; +	struct in6_addr  cur_in6_daddr; +	struct in6_addr  cur_in6_saddr; +	/* For ranges */ +	struct in6_addr  min_in6_daddr; +	struct in6_addr  max_in6_daddr; +	struct in6_addr  min_in6_saddr; +	struct in6_addr  max_in6_saddr; + +        /* If we're doing ranges, random or incremental, then this +         * defines the min/max for those ranges. 
+         */ +        __u32 saddr_min; /* inclusive, source IP address */ +        __u32 saddr_max; /* exclusive, source IP address */ +        __u32 daddr_min; /* inclusive, dest IP address */ +        __u32 daddr_max; /* exclusive, dest IP address */ + +        __u16 udp_src_min; /* inclusive, source UDP port */ +        __u16 udp_src_max; /* exclusive, source UDP port */ +        __u16 udp_dst_min; /* inclusive, dest UDP port */ +        __u16 udp_dst_max; /* exclusive, dest UDP port */ + +        __u32 src_mac_count; /* How many MACs to iterate through */ +        __u32 dst_mac_count; /* How many MACs to iterate through */ +         +        unsigned char dst_mac[6]; +        unsigned char src_mac[6]; +         +        __u32 cur_dst_mac_offset; +        __u32 cur_src_mac_offset; +        __u32 cur_saddr; +        __u32 cur_daddr; +        __u16 cur_udp_dst; +        __u16 cur_udp_src; +        __u32 cur_pkt_size; +         +        __u8 hh[14]; +        /* = {  +           0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,  +            +           We fill in SRC address later +           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +           0x08, 0x00 +           }; +        */ +        __u16 pad; /* pad out the hh struct to an even 16 bytes */ + +        struct sk_buff* skb; /* skb we are to transmit next, mainly used for when we +                              * are transmitting the same one multiple times +                              */ +        struct net_device* odev; /* The out-going device.  Note that the device should +                                  * have it's pg_info pointer pointing back to this +                                  * device.  This will be set when the user specifies +                                  * the out-going device name (not when the inject is +                                  * started as it used to do.) +                                  */ +	struct flow_state *flows; +	unsigned cflows;         /* Concurrent flows (config) */ +	unsigned lflow;          /* Flow length  (config) */ +	unsigned nflows;         /* accumulated flows (stats) */ +}; + +struct pktgen_hdr { +        __u32 pgh_magic; +        __u32 seq_num; +	__u32 tv_sec; +	__u32 tv_usec; +}; + +struct pktgen_thread { +        spinlock_t if_lock; +        struct pktgen_dev *if_list;           /* All device here */ +        struct pktgen_thread* next; +        char name[32]; +        char fname[128]; /* name of proc file */ +        struct proc_dir_entry *proc_ent; +        char result[512]; +        u32 max_before_softirq; /* We'll call do_softirq to prevent starvation. */ +         +	/* Field for thread to receive "posted" events terminate, stop ifs etc.*/ + +        u32 control; +	int pid; +	int cpu; + +        wait_queue_head_t queue; +}; + +#define REMOVE 1 +#define FIND   0 + +/*  This code works around the fact that do_div cannot handle two 64-bit +    numbers, and regular 64-bit division doesn't work on x86 kernels. +    --Ben +*/ + +#define PG_DIV 0 + +/* This was emailed to LMKL by: Chris Caputo <ccaputo@alt.net> + * Function copied/adapted/optimized from: + * + *  nemesis.sourceforge.net/browse/lib/static/intmath/ix86/intmath.c.html + * + * Copyright 1994, University of Cambridge Computer Laboratory + * All Rights Reserved. + * + */ +inline static s64 divremdi3(s64 x, s64 y, int type)  +{ +        u64 a = (x < 0) ? -x : x; +        u64 b = (y < 0) ? 
-y : y; +        u64 res = 0, d = 1; + +        if (b > 0) { +                while (b < a) { +                        b <<= 1; +                        d <<= 1; +                } +        } +         +        do { +                if ( a >= b ) { +                        a -= b; +                        res += d; +                } +                b >>= 1; +                d >>= 1; +        } +        while (d); + +        if (PG_DIV == type) { +                return (((x ^ y) & (1ll<<63)) == 0) ? res : -(s64)res; +        } +        else { +                return ((x & (1ll<<63)) == 0) ? a : -(s64)a; +        } +} + +/* End of hacks to deal with 64-bit math on x86 */ + +/** Convert to miliseconds */ +static inline __u64 tv_to_ms(const struct timeval* tv)  +{ +        __u64 ms = tv->tv_usec / 1000; +        ms += (__u64)tv->tv_sec * (__u64)1000; +        return ms; +} + + +/** Convert to micro-seconds */ +static inline __u64 tv_to_us(const struct timeval* tv)  +{ +        __u64 us = tv->tv_usec; +        us += (__u64)tv->tv_sec * (__u64)1000000; +        return us; +} + +static inline __u64 pg_div(__u64 n, __u32 base) { +        __u64 tmp = n; +        do_div(tmp, base); +        /* printk("pktgen: pg_div, n: %llu  base: %d  rv: %llu\n", +                  n, base, tmp); */ +        return tmp; +} + +static inline __u64 pg_div64(__u64 n, __u64 base)  +{ +        __u64 tmp = n; +/* + * How do we know if the architectrure we are running on + * supports division with 64 bit base? + *  + */ +#if defined(__sparc_v9__) || defined(__powerpc64__) || defined(__alpha__) || defined(__x86_64__) || defined(__ia64__)  + +		do_div(tmp, base); +#else +		tmp = divremdi3(n, base, PG_DIV); +#endif +        return tmp; +} + +static inline u32 pktgen_random(void) +{ +#if 0 +	__u32 n; +	get_random_bytes(&n, 4); +	return n; +#else +	return net_random(); +#endif +} + +static inline __u64 getCurMs(void)  +{ +        struct timeval tv; +        do_gettimeofday(&tv); +        return tv_to_ms(&tv); +} + +static inline __u64 getCurUs(void)  +{ +        struct timeval tv; +        do_gettimeofday(&tv); +        return tv_to_us(&tv); +} + +static inline __u64 tv_diff(const struct timeval* a, const struct timeval* b)  +{ +        return tv_to_us(a) - tv_to_us(b); +} + + +/* old include end */ + +static char version[] __initdata = VERSION; + +static ssize_t proc_pgctrl_read(struct file* file, char __user * buf, size_t count, loff_t *ppos); +static ssize_t proc_pgctrl_write(struct file* file, const char __user * buf, size_t count, loff_t *ppos); +static int proc_if_read(char *buf , char **start, off_t offset, int len, int *eof, void *data); + +static int proc_thread_read(char *buf , char **start, off_t offset, int len, int *eof, void *data); +static int proc_if_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data); +static int proc_thread_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data); +static int create_proc_dir(void); +static int remove_proc_dir(void); + +static int pktgen_remove_device(struct pktgen_thread* t, struct pktgen_dev *i); +static int pktgen_add_device(struct pktgen_thread* t, const char* ifname); +static struct pktgen_thread* pktgen_find_thread(const char* name); +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread* t, const char* ifname); +static int pktgen_device_event(struct notifier_block *, unsigned long, void *); +static void pktgen_run_all_threads(void); +static void pktgen_stop_all_threads_ifs(void); +static 
int pktgen_stop_device(struct pktgen_dev *pkt_dev); +static void pktgen_stop(struct pktgen_thread* t); +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); +static struct pktgen_dev *pktgen_NN_threads(const char* dev_name, int remove); +static unsigned int scan_ip6(const char *s,char ip[16]); +static unsigned int fmt_ip6(char *s,const char ip[16]); + +/* Module parameters, defaults. */ +static int pg_count_d = 1000; /* 1000 pkts by default */ +static int pg_delay_d = 0; +static int pg_clone_skb_d = 0; +static int debug = 0; + +static spinlock_t _thread_lock = SPIN_LOCK_UNLOCKED; +static struct pktgen_thread *pktgen_threads = NULL; + +static char module_fname[128]; +static struct proc_dir_entry *module_proc_ent = NULL; + +static struct notifier_block pktgen_notifier_block = { +	.notifier_call = pktgen_device_event, +}; + +static struct file_operations pktgen_fops = { +        .read     = proc_pgctrl_read, +        .write    = proc_pgctrl_write, +	/*  .ioctl    = pktgen_ioctl, later maybe */ +}; + +/* + * /proc handling functions  + * + */ + +static struct proc_dir_entry *pg_proc_dir = NULL; +static int proc_pgctrl_read_eof=0; + +static ssize_t proc_pgctrl_read(struct file* file, char __user * buf, +                                 size_t count, loff_t *ppos) +{  +	char data[200]; +	int len = 0; + +	if(proc_pgctrl_read_eof) { +		proc_pgctrl_read_eof=0; +		len = 0; +		goto out; +	} + +	sprintf(data, "%s", VERSION);  + +	len = strlen(data); + +	if(len > count) { +		len =-EFAULT; +		goto out; +	}  	 + +	if (copy_to_user(buf, data, len)) { +		len =-EFAULT; +		goto out; +	}   + +	*ppos += len; +	proc_pgctrl_read_eof=1; /* EOF next call */ + + out: +	return len; +} + +static ssize_t proc_pgctrl_write(struct file* file,const char __user * buf, +				 size_t count, loff_t *ppos) +{ +	char *data = NULL; +	int err = 0; + +        if (!capable(CAP_NET_ADMIN)){ +                err = -EPERM; +		goto out; +        } + +	data = (void*)vmalloc ((unsigned int)count); + +	if(!data) { +		err = -ENOMEM; +		goto out; +	} +	if (copy_from_user(data, buf, count)) { +		err =-EFAULT; +		goto out_free; +	}   +	data[count-1] = 0; /* Make string */ + +	if (!strcmp(data, "stop"))  +		pktgen_stop_all_threads_ifs(); + +        else if (!strcmp(data, "start"))  +		pktgen_run_all_threads(); + +	else  +		printk("pktgen: Unknown command: %s\n", data); + +	err = count; + + out_free: +	vfree (data); + out: +        return err; +} + +static int proc_if_read(char *buf , char **start, off_t offset, +                           int len, int *eof, void *data) +{ +	char *p; +	int i; +        struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data); +        __u64 sa; +        __u64 stopped; +        __u64 now = getCurUs(); +         +	p = buf; +	p += sprintf(p, "Params: count %llu  min_pkt_size: %u  max_pkt_size: %u\n", +		     (unsigned long long) pkt_dev->count, +		     pkt_dev->min_pkt_size, pkt_dev->max_pkt_size); + +	p += sprintf(p, "     frags: %d  delay: %u  clone_skb: %d  ifname: %s\n", +                     pkt_dev->nfrags, 1000*pkt_dev->delay_us+pkt_dev->delay_ns, pkt_dev->clone_skb, pkt_dev->ifname); + +	p += sprintf(p, "     flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow); + + +	if(pkt_dev->flags & F_IPV6) { +		char b1[128], b2[128], b3[128]; +		fmt_ip6(b1,  pkt_dev->in6_saddr.s6_addr); +		fmt_ip6(b2,  pkt_dev->min_in6_saddr.s6_addr); +		fmt_ip6(b3,  pkt_dev->max_in6_saddr.s6_addr); +		p += sprintf(p, "     saddr: %s  min_saddr: %s  max_saddr: %s\n", b1, b2, b3); + +		fmt_ip6(b1,  
pkt_dev->in6_daddr.s6_addr); +		fmt_ip6(b2,  pkt_dev->min_in6_daddr.s6_addr); +		fmt_ip6(b3,  pkt_dev->max_in6_daddr.s6_addr); +		p += sprintf(p, "     daddr: %s  min_daddr: %s  max_daddr: %s\n", b1, b2, b3); + +	}  +	else  +		p += sprintf(p, "     dst_min: %s  dst_max: %s\n     src_min: %s  src_max: %s\n", +                     pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min, pkt_dev->src_max); + +        p += sprintf(p, "     src_mac: "); + +	if ((pkt_dev->src_mac[0] == 0) &&  +	    (pkt_dev->src_mac[1] == 0) &&  +	    (pkt_dev->src_mac[2] == 0) &&  +	    (pkt_dev->src_mac[3] == 0) &&  +	    (pkt_dev->src_mac[4] == 0) &&  +	    (pkt_dev->src_mac[5] == 0))  + +		for (i = 0; i < 6; i++)  +			p += sprintf(p, "%02X%s", pkt_dev->odev->dev_addr[i], i == 5 ? "  " : ":"); + +	else  +		for (i = 0; i < 6; i++)  +			p += sprintf(p, "%02X%s", pkt_dev->src_mac[i], i == 5 ? "  " : ":"); + +        p += sprintf(p, "dst_mac: "); +	for (i = 0; i < 6; i++)  +		p += sprintf(p, "%02X%s", pkt_dev->dst_mac[i], i == 5 ? "\n" : ":"); + +        p += sprintf(p, "     udp_src_min: %d  udp_src_max: %d  udp_dst_min: %d  udp_dst_max: %d\n", +                     pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, +                     pkt_dev->udp_dst_max); + +        p += sprintf(p, "     src_mac_count: %d  dst_mac_count: %d \n     Flags: ", +                     pkt_dev->src_mac_count, pkt_dev->dst_mac_count); + + +        if (pkt_dev->flags &  F_IPV6)  +                p += sprintf(p, "IPV6  "); + +        if (pkt_dev->flags &  F_IPSRC_RND)  +                p += sprintf(p, "IPSRC_RND  "); + +        if (pkt_dev->flags & F_IPDST_RND)  +                p += sprintf(p, "IPDST_RND  "); +         +        if (pkt_dev->flags & F_TXSIZE_RND)  +                p += sprintf(p, "TXSIZE_RND  "); +         +        if (pkt_dev->flags & F_UDPSRC_RND)  +                p += sprintf(p, "UDPSRC_RND  "); +         +        if (pkt_dev->flags & F_UDPDST_RND)  +                p += sprintf(p, "UDPDST_RND  "); +         +        if (pkt_dev->flags & F_MACSRC_RND)  +                p += sprintf(p, "MACSRC_RND  "); +         +        if (pkt_dev->flags & F_MACDST_RND)  +                p += sprintf(p, "MACDST_RND  "); + +         +        p += sprintf(p, "\n"); +         +        sa = pkt_dev->started_at; +        stopped = pkt_dev->stopped_at; +        if (pkt_dev->running)  +                stopped = now; /* not really stopped, more like last-running-at */ +         +        p += sprintf(p, "Current:\n     pkts-sofar: %llu  errors: %llu\n     started: %lluus  stopped: %lluus idle: %lluus\n", +		     (unsigned long long) pkt_dev->sofar, +		     (unsigned long long) pkt_dev->errors, +		     (unsigned long long) sa, +		     (unsigned long long) stopped,  +		     (unsigned long long) pkt_dev->idle_acc); + +        p += sprintf(p, "     seq_num: %d  cur_dst_mac_offset: %d  cur_src_mac_offset: %d\n", +                     pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset, pkt_dev->cur_src_mac_offset); + +	if(pkt_dev->flags & F_IPV6) { +		char b1[128], b2[128]; +		fmt_ip6(b1,  pkt_dev->cur_in6_daddr.s6_addr); +		fmt_ip6(b2,  pkt_dev->cur_in6_saddr.s6_addr); +		p += sprintf(p, "     cur_saddr: %s  cur_daddr: %s\n", b2, b1); +	}  +	else  +		p += sprintf(p, "     cur_saddr: 0x%x  cur_daddr: 0x%x\n", +                     pkt_dev->cur_saddr, pkt_dev->cur_daddr); + + +	p += sprintf(p, "     cur_udp_dst: %d  cur_udp_src: %d\n", +                     pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); + +	p += sprintf(p, "     flows: 
%u\n", pkt_dev->nflows); + +	if (pkt_dev->result[0]) +		p += sprintf(p, "Result: %s\n", pkt_dev->result); +	else +		p += sprintf(p, "Result: Idle\n"); +	*eof = 1; + +	return p - buf; +} + + +static int count_trail_chars(const char __user *user_buffer, unsigned int maxlen) +{ +	int i; + +	for (i = 0; i < maxlen; i++) { +                char c; +                if (get_user(c, &user_buffer[i])) +                        return -EFAULT; +                switch (c) { +		case '\"': +		case '\n': +		case '\r': +		case '\t': +		case ' ': +		case '=': +			break; +		default: +			goto done; +		}; +	} +done: +	return i; +} + +static unsigned long num_arg(const char __user *user_buffer, unsigned long maxlen,  +			     unsigned long *num) +{ +	int i = 0; +	*num = 0; +   +	for(; i < maxlen; i++) { +                char c; +                if (get_user(c, &user_buffer[i])) +                        return -EFAULT; +                if ((c >= '0') && (c <= '9')) { +			*num *= 10; +			*num += c -'0'; +		} else +			break; +	} +	return i; +} + +static int strn_len(const char __user *user_buffer, unsigned int maxlen) +{ +	int i = 0; + +	for(; i < maxlen; i++) { +                char c; +                if (get_user(c, &user_buffer[i])) +                        return -EFAULT; +                switch (c) { +		case '\"': +		case '\n': +		case '\r': +		case '\t': +		case ' ': +			goto done_str; +			break; +		default: +			break; +		}; +	} +done_str: + +	return i; +} + +static int proc_if_write(struct file *file, const char __user *user_buffer, +                            unsigned long count, void *data) +{ +	int i = 0, max, len; +	char name[16], valstr[32]; +	unsigned long value = 0; +        struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data); +        char* pg_result = NULL; +        int tmp = 0; +	char buf[128]; +         +        pg_result = &(pkt_dev->result[0]); +         +	if (count < 1) { +		printk("pktgen: wrong command format\n"); +		return -EINVAL; +	} +   +	max = count - i; +	tmp = count_trail_chars(&user_buffer[i], max); +        if (tmp < 0) {  +		printk("pktgen: illegal format\n"); +		return tmp;  +	} +        i += tmp; +         +	/* Read variable name */ + +	len = strn_len(&user_buffer[i], sizeof(name) - 1); +        if (len < 0) { return len; } +	memset(name, 0, sizeof(name)); +	if (copy_from_user(name, &user_buffer[i], len) ) +		return -EFAULT; +	i += len; +   +	max = count -i; +	len = count_trail_chars(&user_buffer[i], max); +        if (len < 0)  +                return len; +         +	i += len; + +	if (debug) { +                char tb[count + 1]; +                if (copy_from_user(tb, user_buffer, count)) +			return -EFAULT; +                tb[count] = 0; +		printk("pktgen: %s,%lu  buffer -:%s:-\n", name, count, tb); +        } + +	if (!strcmp(name, "min_pkt_size")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +		if (value < 14+20+8) +			value = 14+20+8; +                if (value != pkt_dev->min_pkt_size) { +                        pkt_dev->min_pkt_size = value; +                        pkt_dev->cur_pkt_size = value; +                } +		sprintf(pg_result, "OK: min_pkt_size=%u", pkt_dev->min_pkt_size); +		return count; +	} + +        if (!strcmp(name, "max_pkt_size")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +		if (value < 14+20+8) +			value = 14+20+8; +                if (value != pkt_dev->max_pkt_size) { +                        pkt_dev->max_pkt_size = 
value; +                        pkt_dev->cur_pkt_size = value; +                } +		sprintf(pg_result, "OK: max_pkt_size=%u", pkt_dev->max_pkt_size); +		return count; +	} + +        /* Shortcut for min = max */ + +	if (!strcmp(name, "pkt_size")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +		if (value < 14+20+8) +			value = 14+20+8; +                if (value != pkt_dev->min_pkt_size) { +                        pkt_dev->min_pkt_size = value; +                        pkt_dev->max_pkt_size = value; +                        pkt_dev->cur_pkt_size = value; +                } +		sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size); +		return count; +	} + +        if (!strcmp(name, "debug")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +                debug = value; +		sprintf(pg_result, "OK: debug=%u", debug); +		return count; +	} + +        if (!strcmp(name, "frags")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +		pkt_dev->nfrags = value; +		sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags); +		return count; +	} +	if (!strcmp(name, "delay")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +		if (value == 0x7FFFFFFF) { +			pkt_dev->delay_us = 0x7FFFFFFF; +			pkt_dev->delay_ns = 0; +		} else { +			pkt_dev->delay_us = value / 1000; +			pkt_dev->delay_ns = value % 1000; +		} +		sprintf(pg_result, "OK: delay=%u", 1000*pkt_dev->delay_us+pkt_dev->delay_ns); +		return count; +	} + 	if (!strcmp(name, "udp_src_min")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +                if (value != pkt_dev->udp_src_min) { +                        pkt_dev->udp_src_min = value; +                        pkt_dev->cur_udp_src = value; +                }        +		sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min); +		return count; +	} + 	if (!strcmp(name, "udp_dst_min")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +                if (value != pkt_dev->udp_dst_min) { +                        pkt_dev->udp_dst_min = value; +                        pkt_dev->cur_udp_dst = value; +                } +		sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min); +		return count; +	} + 	if (!strcmp(name, "udp_src_max")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +                if (value != pkt_dev->udp_src_max) { +                        pkt_dev->udp_src_max = value; +                        pkt_dev->cur_udp_src = value; +                } +		sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max); +		return count; +	} + 	if (!strcmp(name, "udp_dst_max")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +                if (value != pkt_dev->udp_dst_max) { +                        pkt_dev->udp_dst_max = value; +                        pkt_dev->cur_udp_dst = value; +                } +		sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max); +		return count; +	} +	if (!strcmp(name, "clone_skb")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +                pkt_dev->clone_skb = value; +	 +		sprintf(pg_result, "OK: clone_skb=%d", 
pkt_dev->clone_skb); +		return count; +	} +	if (!strcmp(name, "count")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +		pkt_dev->count = value; +		sprintf(pg_result, "OK: count=%llu", +			(unsigned long long) pkt_dev->count); +		return count; +	} +	if (!strcmp(name, "src_mac_count")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +		if (pkt_dev->src_mac_count != value) { +                        pkt_dev->src_mac_count = value; +                        pkt_dev->cur_src_mac_offset = 0; +                } +		sprintf(pg_result, "OK: src_mac_count=%d", pkt_dev->src_mac_count); +		return count; +	} +	if (!strcmp(name, "dst_mac_count")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +		if (pkt_dev->dst_mac_count != value) { +                        pkt_dev->dst_mac_count = value; +                        pkt_dev->cur_dst_mac_offset = 0; +                } +		sprintf(pg_result, "OK: dst_mac_count=%d", pkt_dev->dst_mac_count); +		return count; +	} +	if (!strcmp(name, "flag")) { +                char f[32]; +                memset(f, 0, 32); +		len = strn_len(&user_buffer[i], sizeof(f) - 1); +                if (len < 0) { return len; } +		if (copy_from_user(f, &user_buffer[i], len)) +			return -EFAULT; +		i += len; +                if (strcmp(f, "IPSRC_RND") == 0)  +                        pkt_dev->flags |= F_IPSRC_RND; +                 +                else if (strcmp(f, "!IPSRC_RND") == 0)  +                        pkt_dev->flags &= ~F_IPSRC_RND; +                 +                else if (strcmp(f, "TXSIZE_RND") == 0)  +                        pkt_dev->flags |= F_TXSIZE_RND; +                 +                else if (strcmp(f, "!TXSIZE_RND") == 0)  +                        pkt_dev->flags &= ~F_TXSIZE_RND; +                 +                else if (strcmp(f, "IPDST_RND") == 0)  +                        pkt_dev->flags |= F_IPDST_RND; +                 +                else if (strcmp(f, "!IPDST_RND") == 0)  +                        pkt_dev->flags &= ~F_IPDST_RND; +                 +                else if (strcmp(f, "UDPSRC_RND") == 0)  +                        pkt_dev->flags |= F_UDPSRC_RND; +                 +                else if (strcmp(f, "!UDPSRC_RND") == 0)  +                        pkt_dev->flags &= ~F_UDPSRC_RND; +                 +                else if (strcmp(f, "UDPDST_RND") == 0)  +                        pkt_dev->flags |= F_UDPDST_RND; +                 +                else if (strcmp(f, "!UDPDST_RND") == 0)  +                        pkt_dev->flags &= ~F_UDPDST_RND; +                 +                else if (strcmp(f, "MACSRC_RND") == 0)  +                        pkt_dev->flags |= F_MACSRC_RND; +                 +                else if (strcmp(f, "!MACSRC_RND") == 0)  +                        pkt_dev->flags &= ~F_MACSRC_RND; +                 +                else if (strcmp(f, "MACDST_RND") == 0)  +                        pkt_dev->flags |= F_MACDST_RND; +                 +                else if (strcmp(f, "!MACDST_RND") == 0)  +                        pkt_dev->flags &= ~F_MACDST_RND; +                 +                else { +                        sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s", +                                f, +                                "IPSRC_RND, IPDST_RND, TXSIZE_RND, UDPSRC_RND, UDPDST_RND, MACSRC_RND, MACDST_RND\n"); +                        return count; +                } +		sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); +		return count; +	} +	if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { +		len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); +                if (len < 0) { return len; } + +                if (copy_from_user(buf, &user_buffer[i], len)) +			return -EFAULT; +                buf[len] = 0; +                if (strcmp(buf, pkt_dev->dst_min) != 0) { +                        memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); +                        strncpy(pkt_dev->dst_min, buf, len); +                        pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); +                        pkt_dev->cur_daddr = pkt_dev->daddr_min; +                } +                if(debug) +                        printk("pktgen: dst_min set to: %s\n", pkt_dev->dst_min); +                i += len; +		sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min); +		return count; +	} +	if (!strcmp(name, "dst_max")) { +		len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); +                if (len < 0) { return len; } + +                if (copy_from_user(buf, &user_buffer[i], len)) +			return -EFAULT; + +                buf[len] = 0; +                if (strcmp(buf, pkt_dev->dst_max) != 0) { +                        memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); +                        strncpy(pkt_dev->dst_max, buf, len); +                        pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); +                        pkt_dev->cur_daddr = pkt_dev->daddr_max; +                } +		if(debug) +			printk("pktgen: dst_max set to: %s\n", pkt_dev->dst_max); +		i += len; +		sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max); +		return count; +	} +	if (!strcmp(name, "dst6")) { +		len = strn_len(&user_buffer[i], sizeof(buf) - 1); +                if (len < 0) return len;  + +		pkt_dev->flags |= F_IPV6; + +                if (copy_from_user(buf, &user_buffer[i], len)) +			return -EFAULT; +                buf[len] = 0; + +		scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); +		fmt_ip6(buf,  pkt_dev->in6_daddr.s6_addr); + +		ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr); + +                if(debug)  +			printk("pktgen: dst6 set to: %s\n", buf); + +                i += len; +		sprintf(pg_result, "OK: dst6=%s", buf); +		return count; +	} +	if (!strcmp(name, "dst6_min")) { +		len = strn_len(&user_buffer[i], sizeof(buf) - 1); +                if (len < 0) return len;  + +		pkt_dev->flags |= F_IPV6; + +                if (copy_from_user(buf, &user_buffer[i], len)) +			return -EFAULT; +                buf[len] = 0; + +		scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); +		fmt_ip6(buf,  pkt_dev->min_in6_daddr.s6_addr); + +		ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->min_in6_daddr); +                if(debug)  +			printk("pktgen: dst6_min set to: %s\n", buf); + +                i += len; +		sprintf(pg_result, "OK: dst6_min=%s", buf); +		return count; +	} +	if (!strcmp(name, "dst6_max")) { +		len = strn_len(&user_buffer[i], sizeof(buf) - 1); +                if (len < 0) return len;  + +		pkt_dev->flags |= F_IPV6; + +                if (copy_from_user(buf, &user_buffer[i], len)) +			return -EFAULT; +                buf[len] = 0; + +		scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); +		fmt_ip6(buf,  
pkt_dev->max_in6_daddr.s6_addr); + +                if(debug)  +			printk("pktgen: dst6_max set to: %s\n", buf); + +                i += len; +		sprintf(pg_result, "OK: dst6_max=%s", buf); +		return count; +	} +	if (!strcmp(name, "src6")) { +		len = strn_len(&user_buffer[i], sizeof(buf) - 1); +                if (len < 0) return len;  + +		pkt_dev->flags |= F_IPV6; + +                if (copy_from_user(buf, &user_buffer[i], len)) +			return -EFAULT; +                buf[len] = 0; + +		scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); +		fmt_ip6(buf,  pkt_dev->in6_saddr.s6_addr); + +		ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr); + +                if(debug)  +			printk("pktgen: src6 set to: %s\n", buf); +		 +                i += len; +		sprintf(pg_result, "OK: src6=%s", buf); +		return count; +	} +	if (!strcmp(name, "src_min")) { +		len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); +                if (len < 0) { return len; } +                if (copy_from_user(buf, &user_buffer[i], len)) +			return -EFAULT; +                buf[len] = 0; +                if (strcmp(buf, pkt_dev->src_min) != 0) { +                        memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); +                        strncpy(pkt_dev->src_min, buf, len); +                        pkt_dev->saddr_min = in_aton(pkt_dev->src_min); +                        pkt_dev->cur_saddr = pkt_dev->saddr_min; +                } +		if(debug) +			printk("pktgen: src_min set to: %s\n", pkt_dev->src_min); +		i += len; +		sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min); +		return count; +	} +	if (!strcmp(name, "src_max")) { +		len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); +                if (len < 0) { return len; } +                if (copy_from_user(buf, &user_buffer[i], len)) +			return -EFAULT; +                buf[len] = 0; +                if (strcmp(buf, pkt_dev->src_max) != 0) { +                        memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); +                        strncpy(pkt_dev->src_max, buf, len); +                        pkt_dev->saddr_max = in_aton(pkt_dev->src_max); +                        pkt_dev->cur_saddr = pkt_dev->saddr_max; +                } +		if(debug) +			printk("pktgen: src_max set to: %s\n", pkt_dev->src_max); +		i += len; +		sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max); +		return count; +	} +	if (!strcmp(name, "dst_mac")) { +		char *v = valstr; +                unsigned char old_dmac[6]; +		unsigned char *m = pkt_dev->dst_mac; +                memcpy(old_dmac, pkt_dev->dst_mac, 6); +                 +		len = strn_len(&user_buffer[i], sizeof(valstr) - 1); +                if (len < 0) { return len; } +		memset(valstr, 0, sizeof(valstr)); +		if( copy_from_user(valstr, &user_buffer[i], len)) +			return -EFAULT; +		i += len; + +		for(*m = 0;*v && m < pkt_dev->dst_mac + 6; v++) { +			if (*v >= '0' && *v <= '9') { +				*m *= 16; +				*m += *v - '0'; +			} +			if (*v >= 'A' && *v <= 'F') { +				*m *= 16; +				*m += *v - 'A' + 10; +			} +			if (*v >= 'a' && *v <= 'f') { +				*m *= 16; +				*m += *v - 'a' + 10; +			} +			if (*v == ':') { +				m++; +				*m = 0; +			} +		} + +		/* Set up Dest MAC */ +                if (memcmp(old_dmac, pkt_dev->dst_mac, 6) != 0)  +                        memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, 6); +                 +		sprintf(pg_result, "OK: dstmac"); +		return count; +	} +	if (!strcmp(name, "src_mac")) { +		char *v = valstr; +		unsigned char *m = pkt_dev->src_mac; + +		len = strn_len(&user_buffer[i], 
sizeof(valstr) - 1); +                if (len < 0) { return len; } +		memset(valstr, 0, sizeof(valstr)); +		if( copy_from_user(valstr, &user_buffer[i], len))  +			return -EFAULT; +		i += len; + +		for(*m = 0;*v && m < pkt_dev->src_mac + 6; v++) { +			if (*v >= '0' && *v <= '9') { +				*m *= 16; +				*m += *v - '0'; +			} +			if (*v >= 'A' && *v <= 'F') { +				*m *= 16; +				*m += *v - 'A' + 10; +			} +			if (*v >= 'a' && *v <= 'f') { +				*m *= 16; +				*m += *v - 'a' + 10; +			} +			if (*v == ':') { +				m++; +				*m = 0; +			} +		}	   + +                sprintf(pg_result, "OK: srcmac"); +		return count; +	} + +        if (!strcmp(name, "clear_counters")) { +                pktgen_clear_counters(pkt_dev); +                sprintf(pg_result, "OK: Clearing counters.\n"); +                return count; +        } + +	if (!strcmp(name, "flows")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +		if (value > MAX_CFLOWS) +			value = MAX_CFLOWS; + +		pkt_dev->cflows = value; +		sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows); +		return count; +	} + +	if (!strcmp(name, "flowlen")) { +		len = num_arg(&user_buffer[i], 10, &value); +                if (len < 0) { return len; } +		i += len; +		pkt_dev->lflow = value; +		sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); +		return count; +	} +         +	sprintf(pkt_dev->result, "No such parameter \"%s\"", name); +	return -EINVAL; +} + +static int proc_thread_read(char *buf , char **start, off_t offset, +                               int len, int *eof, void *data) +{ +	char *p; +        struct pktgen_thread *t = (struct pktgen_thread*)(data); +        struct pktgen_dev *pkt_dev = NULL; + + +        if (!t) { +                printk("pktgen: ERROR: could not find thread in proc_thread_read\n"); +                return -EINVAL; +        } + +	p = buf; +	p += sprintf(p, "Name: %s  max_before_softirq: %d\n", +                     t->name, t->max_before_softirq); + +        p += sprintf(p, "Running: "); +         +        if_lock(t); +        for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next)  +		if(pkt_dev->running) +			p += sprintf(p, "%s ", pkt_dev->ifname); +         +        p += sprintf(p, "\nStopped: "); + +        for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next)  +		if(!pkt_dev->running) +			p += sprintf(p, "%s ", pkt_dev->ifname); + +	if (t->result[0]) +		p += sprintf(p, "\nResult: %s\n", t->result); +	else +		p += sprintf(p, "\nResult: NA\n"); + +	*eof = 1; + +        if_unlock(t); + +	return p - buf; +} + +static int proc_thread_write(struct file *file, const char __user *user_buffer, +                                unsigned long count, void *data) +{ +	int i = 0, max, len, ret; +	char name[40]; +        struct pktgen_thread *t; +        char *pg_result; +        unsigned long value = 0; +         +	if (count < 1) { +		//	sprintf(pg_result, "Wrong command format"); +		return -EINVAL; +	} +   +	max = count - i; +        len = count_trail_chars(&user_buffer[i], max); +        if (len < 0)  +		return len;  +      +	i += len; +   +	/* Read variable name */ + +	len = strn_len(&user_buffer[i], sizeof(name) - 1); +        if (len < 0)   +		return len;  +	 +	memset(name, 0, sizeof(name)); +	if (copy_from_user(name, &user_buffer[i], len)) +		return -EFAULT; +	i += len; +   +	max = count -i; +	len = count_trail_chars(&user_buffer[i], max); +        if (len < 0)   +		return len;  +	 +	i += len; + +	if (debug)  +		printk("pktgen: t=%s, count=%lu\n", name, count); +         
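+	/*
+	 * The thread-level commands handled below are "add_device <ifname>",
+	 * "rem_device_all" and "max_before_softirq <n>", written to the
+	 * per-thread file under /proc/net/pktgen/, e.g. (thread file name
+	 * illustrative only):
+	 *
+	 *   echo "add_device eth0" > /proc/net/pktgen/kpktgend_0
+	 */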
+ +        t = (struct pktgen_thread*)(data); +	if(!t) { +		printk("pktgen: ERROR: No thread\n"); +		ret = -EINVAL; +		goto out; +	} + +	pg_result = &(t->result[0]); + +        if (!strcmp(name, "add_device")) { +                char f[32]; +                memset(f, 0, 32); +		len = strn_len(&user_buffer[i], sizeof(f) - 1); +                if (len < 0) {  +			ret = len;  +			goto out; +		} +		if( copy_from_user(f, &user_buffer[i], len) ) +			return -EFAULT; +		i += len; +		thread_lock(); +                pktgen_add_device(t, f); +		thread_unlock(); +                ret = count; +                sprintf(pg_result, "OK: add_device=%s", f); +		goto out; +	} + +        if (!strcmp(name, "rem_device_all")) { +		thread_lock(); +		t->control |= T_REMDEV; +		thread_unlock(); +		current->state = TASK_INTERRUPTIBLE; +		schedule_timeout(HZ/8);  /* Propagate thread->control  */ +		ret = count; +                sprintf(pg_result, "OK: rem_device_all"); +		goto out; +	} + +        if (!strcmp(name, "max_before_softirq")) { +                len = num_arg(&user_buffer[i], 10, &value); +		thread_lock(); +                t->max_before_softirq = value; +		thread_unlock(); +                ret = count; +                sprintf(pg_result, "OK: max_before_softirq=%lu", value); +		goto out; +	} + +	ret = -EINVAL; + out: + +	return ret; +} + +static int create_proc_dir(void) +{ +        int     len; +        /*  does proc_dir already exists */ +        len = strlen(PG_PROC_DIR); + +        for (pg_proc_dir = proc_net->subdir; pg_proc_dir; pg_proc_dir=pg_proc_dir->next) { +                if ((pg_proc_dir->namelen == len) && +		    (! memcmp(pg_proc_dir->name, PG_PROC_DIR, len)))  +                        break; +        } +         +        if (!pg_proc_dir)  +                pg_proc_dir = create_proc_entry(PG_PROC_DIR, S_IFDIR, proc_net); +         +        if (!pg_proc_dir)  +                return -ENODEV; +         +        return 0; +} + +static int remove_proc_dir(void) +{ +        remove_proc_entry(PG_PROC_DIR, proc_net); +        return 0; +} + +/* Think find or remove for NN */ +static struct pktgen_dev *__pktgen_NN_threads(const char* ifname, int remove)  +{ +	struct pktgen_thread *t; +	struct pktgen_dev *pkt_dev = NULL; + +        t = pktgen_threads; +                 +	while (t) { +		pkt_dev = pktgen_find_dev(t, ifname); +		if (pkt_dev) { +		                if(remove) {  +				        if_lock(t); +				        pktgen_remove_device(t, pkt_dev); +				        if_unlock(t); +				} +			break; +		} +		t = t->next; +	} +        return pkt_dev; +} + +static struct pktgen_dev *pktgen_NN_threads(const char* ifname, int remove)  +{ +	struct pktgen_dev *pkt_dev = NULL; +	thread_lock(); +	pkt_dev = __pktgen_NN_threads(ifname, remove); +        thread_unlock(); +	return pkt_dev; +} + +static int pktgen_device_event(struct notifier_block *unused, unsigned long event, void *ptr)  +{ +	struct net_device *dev = (struct net_device *)(ptr); + +	/* It is OK that we do not hold the group lock right now, +	 * as we run under the RTNL lock. +	 */ + +	switch (event) { +	case NETDEV_CHANGEADDR: +	case NETDEV_GOING_DOWN: +	case NETDEV_DOWN: +	case NETDEV_UP: +		/* Ignore for now */ +		break; +		 +	case NETDEV_UNREGISTER: +                pktgen_NN_threads(dev->name, REMOVE); +		break; +	}; + +	return NOTIFY_DONE; +} + +/* Associate pktgen_dev with a device. 
*/ + +static struct net_device* pktgen_setup_dev(struct pktgen_dev *pkt_dev) { +	struct net_device *odev; + +	/* Clean old setups */ + +	if (pkt_dev->odev) { +		dev_put(pkt_dev->odev); +                pkt_dev->odev = NULL; +        } + +	odev = dev_get_by_name(pkt_dev->ifname); + +	if (!odev) { +		printk("pktgen: no such netdevice: \"%s\"\n", pkt_dev->ifname); +		goto out; +	} +	if (odev->type != ARPHRD_ETHER) { +		printk("pktgen: not an ethernet device: \"%s\"\n", pkt_dev->ifname); +		goto out_put; +	} +	if (!netif_running(odev)) { +		printk("pktgen: device is down: \"%s\"\n", pkt_dev->ifname); +		goto out_put; +	} +	pkt_dev->odev = odev; +	 +        return pkt_dev->odev; + +out_put: +	dev_put(odev); +out: + 	return NULL; + +} + +/* Read pkt_dev from the interface and set up internal pktgen_dev + * structure to have the right information to create/send packets + */ +static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) +{ +	/* Try once more, just in case it works now. */ +        if (!pkt_dev->odev)  +                pktgen_setup_dev(pkt_dev); +         +        if (!pkt_dev->odev) { +                printk("pktgen: ERROR: pkt_dev->odev == NULL in setup_inject.\n"); +                sprintf(pkt_dev->result, "ERROR: pkt_dev->odev == NULL in setup_inject.\n"); +                return; +        } +         +        /* Default to the interface's mac if not explicitly set. */ + +	if ((pkt_dev->src_mac[0] == 0) &&  +	    (pkt_dev->src_mac[1] == 0) &&  +	    (pkt_dev->src_mac[2] == 0) &&  +	    (pkt_dev->src_mac[3] == 0) &&  +	    (pkt_dev->src_mac[4] == 0) &&  +	    (pkt_dev->src_mac[5] == 0)) { + +	       memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, 6); +       } +        /* Set up Dest MAC */ +        memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, 6); + +        /* Set up pkt size */ +        pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; +	 +	if(pkt_dev->flags & F_IPV6) { +		/* +		 * Skip this automatic address setting until locks or functions  +		 * gets exported +		 */ + +#ifdef NOTNOW +		int i, set = 0, err=1; +		struct inet6_dev *idev; + +		for(i=0; i< IN6_ADDR_HSIZE; i++) +			if(pkt_dev->cur_in6_saddr.s6_addr[i]) { +				set = 1; +				break; +			} + +		if(!set) { +			 +			/* +			 * Use linklevel address if unconfigured. 
+			 * +			 * use ipv6_get_lladdr if/when it's get exported +			 */ + + +			read_lock(&addrconf_lock); +			if ((idev = __in6_dev_get(pkt_dev->odev)) != NULL) { +				struct inet6_ifaddr *ifp; + +				read_lock_bh(&idev->lock); +				for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { +					if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { +						ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &ifp->addr); +						err = 0; +						break; +					} +				} +				read_unlock_bh(&idev->lock); +			} +			read_unlock(&addrconf_lock); +			if(err)	printk("pktgen: ERROR: IPv6 link address not availble.\n"); +		} +#endif +	}  +	else { +		pkt_dev->saddr_min = 0; +		pkt_dev->saddr_max = 0; +		if (strlen(pkt_dev->src_min) == 0) { +			 +			struct in_device *in_dev;  + +			rcu_read_lock(); +			in_dev = __in_dev_get(pkt_dev->odev); +			if (in_dev) { +				if (in_dev->ifa_list) { +					pkt_dev->saddr_min = in_dev->ifa_list->ifa_address; +					pkt_dev->saddr_max = pkt_dev->saddr_min; +				} +				__in_dev_put(in_dev);	 +			} +			rcu_read_unlock(); +		} +		else { +			pkt_dev->saddr_min = in_aton(pkt_dev->src_min); +			pkt_dev->saddr_max = in_aton(pkt_dev->src_max); +		} + +		pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); +		pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); +	} +        /* Initialize current values. */ +        pkt_dev->cur_dst_mac_offset = 0; +        pkt_dev->cur_src_mac_offset = 0; +        pkt_dev->cur_saddr = pkt_dev->saddr_min; +        pkt_dev->cur_daddr = pkt_dev->daddr_min; +        pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; +        pkt_dev->cur_udp_src = pkt_dev->udp_src_min; +	pkt_dev->nflows = 0; +} + +static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) +{ +	__u64 start; +	__u64 now; + +	start = now = getCurUs(); +	printk(KERN_INFO "sleeping for %d\n", (int)(spin_until_us - now)); +	while (now < spin_until_us) { +		/* TODO: optimise sleeping behavior */ +		if (spin_until_us - now > (1000000/HZ)+1) { +			current->state = TASK_INTERRUPTIBLE; +			schedule_timeout(1); +		} else if (spin_until_us - now > 100) { +			do_softirq(); +			if (!pkt_dev->running) +				return; +			if (need_resched()) +				schedule(); +		} + +		now = getCurUs(); +	} + +	pkt_dev->idle_acc += now - start; +} + + +/* Increment/randomize headers according to flags and current values + * for IP src/dest, UDP src/dst port, MAC-Addr src/dst + */ +static void mod_cur_headers(struct pktgen_dev *pkt_dev) {         +        __u32 imn; +        __u32 imx; +	int  flow = 0; + +	if(pkt_dev->cflows)  { +		flow = pktgen_random() % pkt_dev->cflows; +		 +		if (pkt_dev->flows[flow].count > pkt_dev->lflow) +			pkt_dev->flows[flow].count = 0; +	}						 + + +	/*  Deal with source MAC */ +        if (pkt_dev->src_mac_count > 1) { +                __u32 mc; +                __u32 tmp; + +                if (pkt_dev->flags & F_MACSRC_RND)  +                        mc = pktgen_random() % (pkt_dev->src_mac_count); +                else { +                        mc = pkt_dev->cur_src_mac_offset++; +                        if (pkt_dev->cur_src_mac_offset > pkt_dev->src_mac_count)  +                                pkt_dev->cur_src_mac_offset = 0; +                } + +                tmp = pkt_dev->src_mac[5] + (mc & 0xFF); +                pkt_dev->hh[11] = tmp; +                tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); +                pkt_dev->hh[10] = tmp; +                tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); +                pkt_dev->hh[9] = tmp; +                tmp = 
(pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); +                pkt_dev->hh[8] = tmp; +                tmp = (pkt_dev->src_mac[1] + (tmp >> 8)); +                pkt_dev->hh[7] = tmp;         +        } + +        /*  Deal with Destination MAC */ +        if (pkt_dev->dst_mac_count > 1) { +                __u32 mc; +                __u32 tmp; + +                if (pkt_dev->flags & F_MACDST_RND)  +                        mc = pktgen_random() % (pkt_dev->dst_mac_count); + +                else { +                        mc = pkt_dev->cur_dst_mac_offset++; +                        if (pkt_dev->cur_dst_mac_offset > pkt_dev->dst_mac_count) { +                                pkt_dev->cur_dst_mac_offset = 0; +                        } +                } + +                tmp = pkt_dev->dst_mac[5] + (mc & 0xFF); +                pkt_dev->hh[5] = tmp; +                tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); +                pkt_dev->hh[4] = tmp; +                tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); +                pkt_dev->hh[3] = tmp; +                tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); +                pkt_dev->hh[2] = tmp; +                tmp = (pkt_dev->dst_mac[1] + (tmp >> 8)); +                pkt_dev->hh[1] = tmp;         +        } + +        if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { +                if (pkt_dev->flags & F_UDPSRC_RND)  +                        pkt_dev->cur_udp_src = ((pktgen_random() % (pkt_dev->udp_src_max - pkt_dev->udp_src_min)) + pkt_dev->udp_src_min); + +                else { +			pkt_dev->cur_udp_src++; +			if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max) +				pkt_dev->cur_udp_src = pkt_dev->udp_src_min; +                } +        } + +        if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { +                if (pkt_dev->flags & F_UDPDST_RND) { +                        pkt_dev->cur_udp_dst = ((pktgen_random() % (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)) + pkt_dev->udp_dst_min); +                } +                else { +			pkt_dev->cur_udp_dst++; +			if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)  +				pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; +                } +        } + +	if (!(pkt_dev->flags & F_IPV6)) { + +		if ((imn = ntohl(pkt_dev->saddr_min)) < (imx = ntohl(pkt_dev->saddr_max))) { +			__u32 t; +			if (pkt_dev->flags & F_IPSRC_RND)  +				t = ((pktgen_random() % (imx - imn)) + imn); +			else { +				t = ntohl(pkt_dev->cur_saddr); +				t++; +				if (t > imx) { +					t = imn; +				} +			} +			pkt_dev->cur_saddr = htonl(t); +		} +		 +		if (pkt_dev->cflows && pkt_dev->flows[flow].count != 0) { +			pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr; +		} else { + +			if ((imn = ntohl(pkt_dev->daddr_min)) < (imx = ntohl(pkt_dev->daddr_max))) { +				__u32 t; +				if (pkt_dev->flags & F_IPDST_RND) { + +					t = ((pktgen_random() % (imx - imn)) + imn); +					t = htonl(t); + +					while( LOOPBACK(t) || MULTICAST(t) || BADCLASS(t) || ZERONET(t) ||  LOCAL_MCAST(t) ) { +						t = ((pktgen_random() % (imx - imn)) + imn); +						t = htonl(t); +					} +					pkt_dev->cur_daddr = t; +				} +				 +				else { +					t = ntohl(pkt_dev->cur_daddr); +					t++; +					if (t > imx) { +						t = imn; +					} +					pkt_dev->cur_daddr = htonl(t); +				} +			} +			if(pkt_dev->cflows) {	 +				pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr; +				pkt_dev->nflows++; +			} +		} +	} +	else /* IPV6 * */ +	{ +		if(pkt_dev->min_in6_daddr.s6_addr32[0] == 0 && +		   
pkt_dev->min_in6_daddr.s6_addr32[1] == 0 && +		   pkt_dev->min_in6_daddr.s6_addr32[2] == 0 && +		   pkt_dev->min_in6_daddr.s6_addr32[3] == 0); +		else { +			int i; + +			/* Only random destinations yet */ + +			for(i=0; i < 4; i++) { +				pkt_dev->cur_in6_daddr.s6_addr32[i] = +					((pktgen_random() | +					  pkt_dev->min_in6_daddr.s6_addr32[i]) & +					 pkt_dev->max_in6_daddr.s6_addr32[i]); +			} + 		} +	} + +        if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { +                __u32 t; +                if (pkt_dev->flags & F_TXSIZE_RND) { +                        t = ((pktgen_random() % (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)) +                             + pkt_dev->min_pkt_size); +                } +                else { +			t = pkt_dev->cur_pkt_size + 1; +			if (t > pkt_dev->max_pkt_size)  +				t = pkt_dev->min_pkt_size; +                } +                pkt_dev->cur_pkt_size = t; +        } + +	pkt_dev->flows[flow].count++; +} + + +static struct sk_buff *fill_packet_ipv4(struct net_device *odev,  +				   struct pktgen_dev *pkt_dev) +{ +	struct sk_buff *skb = NULL; +	__u8 *eth; +	struct udphdr *udph; +	int datalen, iplen; +	struct iphdr *iph; +        struct pktgen_hdr *pgh = NULL; +         +	skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); +	if (!skb) { +		sprintf(pkt_dev->result, "No memory"); +		return NULL; +	} + +	skb_reserve(skb, 16); + +	/*  Reserve for ethernet and IP header  */ +	eth = (__u8 *) skb_push(skb, 14); +	iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)); +	udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); + +        /* Update any of the values, used when we're incrementing various +         * fields. +         */ +        mod_cur_headers(pkt_dev); + +	memcpy(eth, pkt_dev->hh, 12); +	*(u16*)ð[12] = __constant_htons(ETH_P_IP); + +	datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8; /* Eth + IPh + UDPh */ +	if (datalen < sizeof(struct pktgen_hdr))  +		datalen = sizeof(struct pktgen_hdr); +         +	udph->source = htons(pkt_dev->cur_udp_src); +	udph->dest = htons(pkt_dev->cur_udp_dst); +	udph->len = htons(datalen + 8); /* DATA + udphdr */ +	udph->check = 0;  /* No checksum */ + +	iph->ihl = 5; +	iph->version = 4; +	iph->ttl = 32; +	iph->tos = 0; +	iph->protocol = IPPROTO_UDP; /* UDP */ +	iph->saddr = pkt_dev->cur_saddr; +	iph->daddr = pkt_dev->cur_daddr; +	iph->frag_off = 0; +	iplen = 20 + 8 + datalen; +	iph->tot_len = htons(iplen); +	iph->check = 0; +	iph->check = ip_fast_csum((void *) iph, iph->ihl); +	skb->protocol = __constant_htons(ETH_P_IP); +	skb->mac.raw = ((u8 *)iph) - 14; +	skb->dev = odev; +	skb->pkt_type = PACKET_HOST; + +	if (pkt_dev->nfrags <= 0)  +                pgh = (struct pktgen_hdr *)skb_put(skb, datalen); +	else { +		int frags = pkt_dev->nfrags; +		int i; + +                pgh = (struct pktgen_hdr*)(((char*)(udph)) + 8); +                 +		if (frags > MAX_SKB_FRAGS) +			frags = MAX_SKB_FRAGS; +		if (datalen > frags*PAGE_SIZE) { +			skb_put(skb, datalen-frags*PAGE_SIZE); +			datalen = frags*PAGE_SIZE; +		} + +		i = 0; +		while (datalen > 0) { +			struct page *page = alloc_pages(GFP_KERNEL, 0); +			skb_shinfo(skb)->frags[i].page = page; +			skb_shinfo(skb)->frags[i].page_offset = 0; +			skb_shinfo(skb)->frags[i].size = +				(datalen < PAGE_SIZE ? 
datalen : PAGE_SIZE); +			datalen -= skb_shinfo(skb)->frags[i].size; +			skb->len += skb_shinfo(skb)->frags[i].size; +			skb->data_len += skb_shinfo(skb)->frags[i].size; +			i++; +			skb_shinfo(skb)->nr_frags = i; +		} + +		while (i < frags) { +			int rem; + +			if (i == 0) +				break; + +			rem = skb_shinfo(skb)->frags[i - 1].size / 2; +			if (rem == 0) +				break; + +			skb_shinfo(skb)->frags[i - 1].size -= rem; + +			skb_shinfo(skb)->frags[i] = skb_shinfo(skb)->frags[i - 1]; +			get_page(skb_shinfo(skb)->frags[i].page); +			skb_shinfo(skb)->frags[i].page = skb_shinfo(skb)->frags[i - 1].page; +			skb_shinfo(skb)->frags[i].page_offset += skb_shinfo(skb)->frags[i - 1].size; +			skb_shinfo(skb)->frags[i].size = rem; +			i++; +			skb_shinfo(skb)->nr_frags = i; +		} +	} + +        /* Stamp the time, and sequence number, convert them to network byte order */ + +        if (pgh) { +              struct timeval timestamp; +	       +	      pgh->pgh_magic = htonl(PKTGEN_MAGIC); +	      pgh->seq_num   = htonl(pkt_dev->seq_num); +	       +	      do_gettimeofday(×tamp); +	      pgh->tv_sec    = htonl(timestamp.tv_sec); +	      pgh->tv_usec   = htonl(timestamp.tv_usec); +        } +        pkt_dev->seq_num++; +         +	return skb; +} + +/* + * scan_ip6, fmt_ip taken from dietlibc-0.21  + * Author Felix von Leitner <felix-dietlibc@fefe.de> + * + * Slightly modified for kernel.  + * Should be candidate for net/ipv4/utils.c + * --ro + */ + +static unsigned int scan_ip6(const char *s,char ip[16]) +{ +	unsigned int i; +	unsigned int len=0; +	unsigned long u; +	char suffix[16]; +	unsigned int prefixlen=0; +	unsigned int suffixlen=0; +	__u32 tmp; + +	for (i=0; i<16; i++) ip[i]=0; + +	for (;;) { +		if (*s == ':') { +			len++; +			if (s[1] == ':') {        /* Found "::", skip to part 2 */ +				s+=2; +				len++; +				break; +			} +			s++; +		} +		{ +			char *tmp; +			u=simple_strtoul(s,&tmp,16); +			i=tmp-s; +		} + +		if (!i) return 0; +		if (prefixlen==12 && s[i]=='.') { + +			/* the last 4 bytes may be written as IPv4 address */ + +			tmp = in_aton(s); +			memcpy((struct in_addr*)(ip+12), &tmp, sizeof(tmp)); +			return i+len; +		} +		ip[prefixlen++] = (u >> 8); +		ip[prefixlen++] = (u & 255); +		s += i; len += i; +		if (prefixlen==16) +			return len; +	} + +/* part 2, after "::" */ +	for (;;) { +		if (*s == ':') { +			if (suffixlen==0) +				break; +			s++; +			len++; +		} else if (suffixlen!=0) +			break; +		{ +			char *tmp; +			u=simple_strtol(s,&tmp,16); +			i=tmp-s; +		} +		if (!i) { +			if (*s) len--; +			break; +		} +		if (suffixlen+prefixlen<=12 && s[i]=='.') { +			tmp = in_aton(s); +			memcpy((struct in_addr*)(suffix+suffixlen), &tmp, sizeof(tmp)); +			suffixlen+=4; +			len+=strlen(s); +			break; +		} +		suffix[suffixlen++] = (u >> 8); +		suffix[suffixlen++] = (u & 255); +		s += i; len += i; +		if (prefixlen+suffixlen==16) +			break; +	} +	for (i=0; i<suffixlen; i++) +		ip[16-suffixlen+i] = suffix[i]; +	return len; +} + +static char tohex(char hexdigit) { +	return hexdigit>9?hexdigit+'a'-10:hexdigit+'0'; +} + +static int fmt_xlong(char* s,unsigned int i) { +	char* bak=s; +	*s=tohex((i>>12)&0xf); if (s!=bak || *s!='0') ++s; +	*s=tohex((i>>8)&0xf); if (s!=bak || *s!='0') ++s; +	*s=tohex((i>>4)&0xf); if (s!=bak || *s!='0') ++s; +	*s=tohex(i&0xf); +	return s-bak+1; +} + +static unsigned int fmt_ip6(char *s,const char ip[16]) { +	unsigned int len; +	unsigned int i; +	unsigned int temp; +	unsigned int compressing; +	int j; + +	len = 0; compressing = 0; +	for (j=0; j<16; j+=2) { + +#ifdef V4MAPPEDPREFIX +		
if (j==12 && !memcmp(ip,V4mappedprefix,12)) { +			inet_ntoa_r(*(struct in_addr*)(ip+12),s); +			temp=strlen(s); +			return len+temp; +		} +#endif +		temp = ((unsigned long) (unsigned char) ip[j] << 8) + +			(unsigned long) (unsigned char) ip[j+1]; +		if (temp == 0) { +			if (!compressing) { +				compressing=1; +				if (j==0) { +					*s++=':'; ++len; +				} +			} +		} else { +			if (compressing) { +				compressing=0; +				*s++=':'; ++len; +			} +			i = fmt_xlong(s,temp); len += i; s += i; +			if (j<14) { +				*s++ = ':'; +				++len; +			} +		} +	} +	if (compressing) { +		*s++=':'; ++len; +	} +	*s=0; +	return len; +} + +static struct sk_buff *fill_packet_ipv6(struct net_device *odev,  +				   struct pktgen_dev *pkt_dev) +{ +	struct sk_buff *skb = NULL; +	__u8 *eth; +	struct udphdr *udph; +	int datalen; +	struct ipv6hdr *iph; +        struct pktgen_hdr *pgh = NULL; +         +	skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); +	if (!skb) { +		sprintf(pkt_dev->result, "No memory"); +		return NULL; +	} + +	skb_reserve(skb, 16); + +	/*  Reserve for ethernet and IP header  */ +	eth = (__u8 *) skb_push(skb, 14); +	iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr)); +	udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); + + +        /* Update any of the values, used when we're incrementing various +         * fields. +         */ +	mod_cur_headers(pkt_dev); + +	 +	memcpy(eth, pkt_dev->hh, 12); +	*(u16*)ð[12] = __constant_htons(ETH_P_IPV6); +	 +         +	datalen = pkt_dev->cur_pkt_size-14-  +		sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */ + +	if (datalen < sizeof(struct pktgen_hdr)) {  +		datalen = sizeof(struct pktgen_hdr); +		if (net_ratelimit()) +			printk(KERN_INFO "pktgen: increased datalen to %d\n", datalen); +	} + +	udph->source = htons(pkt_dev->cur_udp_src); +	udph->dest = htons(pkt_dev->cur_udp_dst); +	udph->len = htons(datalen + sizeof(struct udphdr));  +	udph->check = 0;  /* No checksum */ + +	 *(u32*)iph = __constant_htonl(0x60000000); /* Version + flow */ + +	iph->hop_limit = 32; + +	iph->payload_len = htons(sizeof(struct udphdr) + datalen); +	iph->nexthdr = IPPROTO_UDP; + +	ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr); +	ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr); + +	skb->mac.raw = ((u8 *)iph) - 14; +	skb->protocol = __constant_htons(ETH_P_IPV6); +	skb->dev = odev; +	skb->pkt_type = PACKET_HOST; + +	if (pkt_dev->nfrags <= 0)  +                pgh = (struct pktgen_hdr *)skb_put(skb, datalen); +	else { +		int frags = pkt_dev->nfrags; +		int i; + +                pgh = (struct pktgen_hdr*)(((char*)(udph)) + 8); +                 +		if (frags > MAX_SKB_FRAGS) +			frags = MAX_SKB_FRAGS; +		if (datalen > frags*PAGE_SIZE) { +			skb_put(skb, datalen-frags*PAGE_SIZE); +			datalen = frags*PAGE_SIZE; +		} + +		i = 0; +		while (datalen > 0) { +			struct page *page = alloc_pages(GFP_KERNEL, 0); +			skb_shinfo(skb)->frags[i].page = page; +			skb_shinfo(skb)->frags[i].page_offset = 0; +			skb_shinfo(skb)->frags[i].size = +				(datalen < PAGE_SIZE ? 
datalen : PAGE_SIZE); +			datalen -= skb_shinfo(skb)->frags[i].size; +			skb->len += skb_shinfo(skb)->frags[i].size; +			skb->data_len += skb_shinfo(skb)->frags[i].size; +			i++; +			skb_shinfo(skb)->nr_frags = i; +		} + +		while (i < frags) { +			int rem; + +			if (i == 0) +				break; + +			rem = skb_shinfo(skb)->frags[i - 1].size / 2; +			if (rem == 0) +				break; + +			skb_shinfo(skb)->frags[i - 1].size -= rem; + +			skb_shinfo(skb)->frags[i] = skb_shinfo(skb)->frags[i - 1]; +			get_page(skb_shinfo(skb)->frags[i].page); +			skb_shinfo(skb)->frags[i].page = skb_shinfo(skb)->frags[i - 1].page; +			skb_shinfo(skb)->frags[i].page_offset += skb_shinfo(skb)->frags[i - 1].size; +			skb_shinfo(skb)->frags[i].size = rem; +			i++; +			skb_shinfo(skb)->nr_frags = i; +		} +	} + +        /* Stamp the time, and sequence number, convert them to network byte order */ +	/* should we update cloned packets too ? */ +        if (pgh) { +              struct timeval timestamp; +	       +	      pgh->pgh_magic = htonl(PKTGEN_MAGIC); +	      pgh->seq_num   = htonl(pkt_dev->seq_num); +	       +	      do_gettimeofday(×tamp); +	      pgh->tv_sec    = htonl(timestamp.tv_sec); +	      pgh->tv_usec   = htonl(timestamp.tv_usec); +        } +        pkt_dev->seq_num++; +         +	return skb; +} + +static inline struct sk_buff *fill_packet(struct net_device *odev,  +				   struct pktgen_dev *pkt_dev) +{ +	if(pkt_dev->flags & F_IPV6)  +		return fill_packet_ipv6(odev, pkt_dev); +	else +		return fill_packet_ipv4(odev, pkt_dev); +} + +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev)  +{ +        pkt_dev->seq_num = 1; +        pkt_dev->idle_acc = 0; +	pkt_dev->sofar = 0; +        pkt_dev->tx_bytes = 0; +        pkt_dev->errors = 0; +} + +/* Set up structure for sending pkts, clear counters */ + +static void pktgen_run(struct pktgen_thread *t) +{ +        struct pktgen_dev *pkt_dev = NULL; +	int started = 0; + +	PG_DEBUG(printk("pktgen: entering pktgen_run. %p\n", t)); + +	if_lock(t); +        for (pkt_dev = t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) { + +		/* +		 * setup odev and create initial packet. +		 */ +		pktgen_setup_inject(pkt_dev); + +		if(pkt_dev->odev) {  +			pktgen_clear_counters(pkt_dev); +			pkt_dev->running = 1; /* Cranke yeself! 
*/ +			pkt_dev->skb = NULL; +			pkt_dev->started_at = getCurUs(); +			pkt_dev->next_tx_us = getCurUs(); /* Transmit immediately */ +			pkt_dev->next_tx_ns = 0; +			 +			strcpy(pkt_dev->result, "Starting"); +			started++; +		} +		else  +			strcpy(pkt_dev->result, "Error starting"); +	} +	if_unlock(t); +	if(started) t->control &= ~(T_STOP); +} + +static void pktgen_stop_all_threads_ifs(void) +{ +        struct pktgen_thread *t = pktgen_threads; + +	PG_DEBUG(printk("pktgen: entering pktgen_stop_all_threads.\n")); + +	thread_lock(); +	while(t) { +		pktgen_stop(t); +		t = t->next; +	} +       thread_unlock(); +} + +static int thread_is_running(struct pktgen_thread *t ) +{ +        struct pktgen_dev *next; +        int res = 0; + +        for(next=t->if_list; next; next=next->next) {  +		if(next->running) { +			res = 1; +			break; +		} +        } +        return res; +} + +static int pktgen_wait_thread_run(struct pktgen_thread *t ) +{ +        if_lock(t); + +        while(thread_is_running(t)) { + +                if_unlock(t); + +		msleep_interruptible(100);  + +                if (signal_pending(current))  +                        goto signal; +                if_lock(t); +        } +        if_unlock(t); +        return 1; + signal: +        return 0; +} + +static int pktgen_wait_all_threads_run(void) +{ +	struct pktgen_thread *t = pktgen_threads; +	int sig = 1; +	 +	while (t) { +		sig = pktgen_wait_thread_run(t); +		if( sig == 0 ) break; +		thread_lock(); +		t=t->next; +		thread_unlock(); +	} +	if(sig == 0) { +		thread_lock(); +		while (t) { +			t->control |= (T_STOP); +			t=t->next; +		} +		thread_unlock(); +	} +	return sig; +} + +static void pktgen_run_all_threads(void) +{ +        struct pktgen_thread *t = pktgen_threads; + +	PG_DEBUG(printk("pktgen: entering pktgen_run_all_threads.\n")); + +	thread_lock(); + +	while(t) { +		t->control |= (T_RUN); +		t = t->next; +	} +	thread_unlock(); + +	current->state = TASK_INTERRUPTIBLE; +	schedule_timeout(HZ/8);  /* Propagate thread->control  */ +			 +	pktgen_wait_all_threads_run(); +} + + +static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) +{ +       __u64 total_us, bps, mbps, pps, idle; +       char *p = pkt_dev->result; + +       total_us = pkt_dev->stopped_at - pkt_dev->started_at; + +       idle = pkt_dev->idle_acc; + +       p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n", +                    (unsigned long long) total_us,  +		    (unsigned long long)(total_us - idle),  +		    (unsigned long long) idle, +                    (unsigned long long) pkt_dev->sofar,  +		    pkt_dev->cur_pkt_size, nr_frags); + +       pps = pkt_dev->sofar * USEC_PER_SEC; + +       while ((total_us >> 32) != 0) { +               pps >>= 1; +               total_us >>= 1; +       } + +       do_div(pps, total_us); +        +       bps = pps * 8 * pkt_dev->cur_pkt_size; + +       mbps = bps; +       do_div(mbps, 1000000); +       p += sprintf(p, "  %llupps %lluMb/sec (%llubps) errors: %llu", +                    (unsigned long long) pps,  +		    (unsigned long long) mbps,  +		    (unsigned long long) bps,  +		    (unsigned long long) pkt_dev->errors); +} +  + +/* Set stopped-at timer, remove from running list, do counters & statistics */ + +static int pktgen_stop_device(struct pktgen_dev *pkt_dev)  +{ +	 +        if (!pkt_dev->running) { +                printk("pktgen: interface: %s is already stopped\n", pkt_dev->ifname); +                return -EINVAL; +        } + +        pkt_dev->stopped_at = getCurUs(); +        
pkt_dev->running = 0; + +	show_results(pkt_dev, skb_shinfo(pkt_dev->skb)->nr_frags); + +	if (pkt_dev->skb)  +		kfree_skb(pkt_dev->skb); + +	pkt_dev->skb = NULL; +	 +        return 0; +} + +static struct pktgen_dev *next_to_run(struct pktgen_thread *t ) +{ +	struct pktgen_dev *next, *best = NULL; +         +	if_lock(t); + +	for(next=t->if_list; next ; next=next->next) { +		if(!next->running) continue; +		if(best == NULL) best=next; +		else if ( next->next_tx_us < best->next_tx_us)  +			best =  next; +	} +	if_unlock(t); +        return best; +} + +static void pktgen_stop(struct pktgen_thread *t) { +        struct pktgen_dev *next = NULL; + +	PG_DEBUG(printk("pktgen: entering pktgen_stop.\n")); + +        if_lock(t); + +        for(next=t->if_list; next; next=next->next) +                pktgen_stop_device(next); + +        if_unlock(t); +} + +static void pktgen_rem_all_ifs(struct pktgen_thread *t)  +{ +        struct pktgen_dev *cur, *next = NULL; +         +        /* Remove all devices, free mem */ +  +        if_lock(t); + +        for(cur=t->if_list; cur; cur=next) {  +		next = cur->next; +		pktgen_remove_device(t, cur); +	} + +        if_unlock(t); +} + +static void pktgen_rem_thread(struct pktgen_thread *t)  +{ +        /* Remove from the thread list */ + +	struct pktgen_thread *tmp = pktgen_threads; + +        if (strlen(t->fname)) +                remove_proc_entry(t->fname, NULL); + +       thread_lock(); + +	if (tmp == t) +		pktgen_threads = tmp->next; +	else { +		while (tmp) { +			if (tmp->next == t) { +				tmp->next = t->next; +				t->next = NULL; +				break; +			} +			tmp = tmp->next; +		} +	} +        thread_unlock(); +} + +static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) +{ +	struct net_device *odev = NULL; +	__u64 idle_start = 0; +	int ret; + +	odev = pkt_dev->odev; +	 +	if (pkt_dev->delay_us || pkt_dev->delay_ns) { +		u64 now; + +		now = getCurUs(); +		if (now < pkt_dev->next_tx_us) +			spin(pkt_dev, pkt_dev->next_tx_us); + +		/* This is max DELAY, this has special meaning of +		 * "never transmit" +		 */ +		if (pkt_dev->delay_us == 0x7FFFFFFF) { +			pkt_dev->next_tx_us = getCurUs() + pkt_dev->delay_us; +			pkt_dev->next_tx_ns = pkt_dev->delay_ns; +			goto out; +		} +	} +	 +	if (netif_queue_stopped(odev) || need_resched()) { +		idle_start = getCurUs(); +		 +		if (!netif_running(odev)) { +			pktgen_stop_device(pkt_dev); +			goto out; +		} +		if (need_resched())  +			schedule(); +		 +		pkt_dev->idle_acc += getCurUs() - idle_start; +		 +		if (netif_queue_stopped(odev)) { +			pkt_dev->next_tx_us = getCurUs(); /* TODO */ +			pkt_dev->next_tx_ns = 0; +			goto out; /* Try the next interface */ +		} +	} +	 +	if (pkt_dev->last_ok || !pkt_dev->skb) { +		if ((++pkt_dev->clone_count >= pkt_dev->clone_skb ) || (!pkt_dev->skb)) { +			/* build a new pkt */ +			if (pkt_dev->skb)  +				kfree_skb(pkt_dev->skb); +			 +			pkt_dev->skb = fill_packet(odev, pkt_dev); +			if (pkt_dev->skb == NULL) { +				printk("pktgen: ERROR: couldn't allocate skb in fill_packet.\n"); +				schedule(); +				pkt_dev->clone_count--; /* back out increment, OOM */ +				goto out; +			} +			pkt_dev->allocated_skbs++; +			pkt_dev->clone_count = 0; /* reset counter */ +		} +	} +	 +	spin_lock_bh(&odev->xmit_lock); +	if (!netif_queue_stopped(odev)) { + +		atomic_inc(&(pkt_dev->skb->users)); +retry_now: +		ret = odev->hard_start_xmit(pkt_dev->skb, odev); +		if (likely(ret == NETDEV_TX_OK)) { +			pkt_dev->last_ok = 1;     +			pkt_dev->sofar++; +			pkt_dev->seq_num++; +			pkt_dev->tx_bytes += 
pkt_dev->cur_pkt_size; +			 +		} else if (ret == NETDEV_TX_LOCKED  +			   && (odev->features & NETIF_F_LLTX)) { +			cpu_relax(); +			goto retry_now; +		} else {  /* Retry it next time */ +			 +			atomic_dec(&(pkt_dev->skb->users)); +			 +			if (debug && net_ratelimit()) +				printk(KERN_INFO "pktgen: Hard xmit error\n"); +			 +			pkt_dev->errors++; +			pkt_dev->last_ok = 0; +		} + +		pkt_dev->next_tx_us = getCurUs(); +		pkt_dev->next_tx_ns = 0; + +		pkt_dev->next_tx_us += pkt_dev->delay_us; +		pkt_dev->next_tx_ns += pkt_dev->delay_ns; + +		if (pkt_dev->next_tx_ns > 1000) { +			pkt_dev->next_tx_us++; +			pkt_dev->next_tx_ns -= 1000; +		} +	}  + +	else {  /* Retry it next time */ +                pkt_dev->last_ok = 0; +                pkt_dev->next_tx_us = getCurUs(); /* TODO */ +		pkt_dev->next_tx_ns = 0; +        } + +	spin_unlock_bh(&odev->xmit_lock); +	 +	/* If pkt_dev->count is zero, then run forever */ +	if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { +		if (atomic_read(&(pkt_dev->skb->users)) != 1) { +			idle_start = getCurUs(); +			while (atomic_read(&(pkt_dev->skb->users)) != 1) { +				if (signal_pending(current)) { +					break; +				} +				schedule(); +			} +			pkt_dev->idle_acc += getCurUs() - idle_start; +		} +                 +		/* Done with this */ +		pktgen_stop_device(pkt_dev); +	}  + out:; + } + +/*  + * Main loop of the thread goes here + */ + +static void pktgen_thread_worker(struct pktgen_thread *t)  +{ +	DEFINE_WAIT(wait); +        struct pktgen_dev *pkt_dev = NULL; +	int cpu = t->cpu; +	sigset_t tmpsig; +	u32 max_before_softirq; +        u32 tx_since_softirq = 0; + +	daemonize("pktgen/%d", cpu); + +        /* Block all signals except SIGKILL, SIGSTOP and SIGTERM */ + +        spin_lock_irq(¤t->sighand->siglock); +        tmpsig = current->blocked; +        siginitsetinv(¤t->blocked,  +                      sigmask(SIGKILL) |  +                      sigmask(SIGSTOP)|  +                      sigmask(SIGTERM)); + +        recalc_sigpending(); +        spin_unlock_irq(¤t->sighand->siglock); + +	/* Migrate to the right CPU */ +	set_cpus_allowed(current, cpumask_of_cpu(cpu)); +        if (smp_processor_id() != cpu) +                BUG(); + +	init_waitqueue_head(&t->queue); + +	t->control &= ~(T_TERMINATE); +	t->control &= ~(T_RUN); +	t->control &= ~(T_STOP); +	t->control &= ~(T_REMDEV); + +        t->pid = current->pid;         + +        PG_DEBUG(printk("pktgen: starting pktgen/%d:  pid=%d\n", cpu, current->pid)); + +	max_before_softirq = t->max_before_softirq; +         +        __set_current_state(TASK_INTERRUPTIBLE); +        mb(); + +        while (1) { +		 +		__set_current_state(TASK_RUNNING); + +		/* +		 * Get next dev to xmit -- if any. +		 */ + +                pkt_dev = next_to_run(t); +                 +                if (pkt_dev) { + +			pktgen_xmit(pkt_dev); + +			/* +			 * We like to stay RUNNING but must also give +			 * others fair share. +			 */ + +			tx_since_softirq += pkt_dev->last_ok; + +			if (tx_since_softirq > max_before_softirq) { +				if (local_softirq_pending()) +					do_softirq(); +				tx_since_softirq = 0; +			} +		} else { +			prepare_to_wait(&(t->queue), &wait, TASK_INTERRUPTIBLE); +			schedule_timeout(HZ/10); +			finish_wait(&(t->queue), &wait); +		} + +                /*  +		 * Back from sleep, either due to the timeout or signal. +		 * We check if we have any "posted" work for us. 
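+		 * "Posted" work arrives as control flags (T_TERMINATE, T_STOP,
+		 * T_RUN, T_REMDEV) set on t->control by the proc interface,
+		 * and by module unload for T_TERMINATE.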
+		 */ + +                if (t->control & T_TERMINATE || signal_pending(current))  +                        /* we received a request to terminate ourself */ +                        break; +		 + +		if(t->control & T_STOP) { +			pktgen_stop(t); +			t->control &= ~(T_STOP); +		} + +		if(t->control & T_RUN) { +			pktgen_run(t); +			t->control &= ~(T_RUN); +		} + +		if(t->control & T_REMDEV) { +			pktgen_rem_all_ifs(t); +			t->control &= ~(T_REMDEV); +		} + +		if (need_resched())  +			schedule(); +        }  + +        PG_DEBUG(printk("pktgen: %s stopping all device\n", t->name)); +        pktgen_stop(t); + +        PG_DEBUG(printk("pktgen: %s removing all device\n", t->name)); +        pktgen_rem_all_ifs(t); + +        PG_DEBUG(printk("pktgen: %s removing thread.\n", t->name)); +        pktgen_rem_thread(t); +} + +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char* ifname)  +{ +        struct pktgen_dev *pkt_dev = NULL; +        if_lock(t); + +        for(pkt_dev=t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) { +                if (strcmp(pkt_dev->ifname, ifname) == 0) { +                        break; +                } +        } + +        if_unlock(t); +	PG_DEBUG(printk("pktgen: find_dev(%s) returning %p\n", ifname,pkt_dev)); +        return pkt_dev; +} + +/*  + * Adds a dev at front of if_list.  + */ + +static int add_dev_to_thread(struct pktgen_thread *t, struct pktgen_dev *pkt_dev)  +{ +	int rv = 0; +	 +        if_lock(t); + +        if (pkt_dev->pg_thread) { +                printk("pktgen: ERROR:  already assigned to a thread.\n"); +                rv = -EBUSY; +                goto out; +        } +	pkt_dev->next =t->if_list; t->if_list=pkt_dev; +        pkt_dev->pg_thread = t; +	pkt_dev->running = 0; + + out: +        if_unlock(t);         +        return rv; +} + +/* Called under thread lock */ + +static int pktgen_add_device(struct pktgen_thread *t, const char* ifname)  +{ +        struct pktgen_dev *pkt_dev; +	 +	/* We don't allow a device to be on several threads */ + +	if( (pkt_dev = __pktgen_NN_threads(ifname, FIND)) == NULL) { +						    +		pkt_dev = kmalloc(sizeof(struct pktgen_dev), GFP_KERNEL); +                if (!pkt_dev)  +                        return -ENOMEM; + +                memset(pkt_dev, 0, sizeof(struct pktgen_dev)); + +		pkt_dev->flows = vmalloc(MAX_CFLOWS*sizeof(struct flow_state)); +		if (pkt_dev->flows == NULL) { +			kfree(pkt_dev); +			return -ENOMEM; +		} +		memset(pkt_dev->flows, 0, MAX_CFLOWS*sizeof(struct flow_state)); + +		pkt_dev->min_pkt_size = ETH_ZLEN; +                pkt_dev->max_pkt_size = ETH_ZLEN; +                pkt_dev->nfrags = 0; +                pkt_dev->clone_skb = pg_clone_skb_d; +                pkt_dev->delay_us = pg_delay_d / 1000; +                pkt_dev->delay_ns = pg_delay_d % 1000; +                pkt_dev->count = pg_count_d; +                pkt_dev->sofar = 0; +                pkt_dev->udp_src_min = 9; /* sink port */ +                pkt_dev->udp_src_max = 9; +                pkt_dev->udp_dst_min = 9; +                pkt_dev->udp_dst_max = 9; + +                strncpy(pkt_dev->ifname, ifname, 31); +                sprintf(pkt_dev->fname, "net/%s/%s", PG_PROC_DIR, ifname); + +                if (! 
pktgen_setup_dev(pkt_dev)) { +                        printk("pktgen: ERROR: pktgen_setup_dev failed.\n"); +			if (pkt_dev->flows) +				vfree(pkt_dev->flows); +                        kfree(pkt_dev); +                        return -ENODEV; +                } + +                pkt_dev->proc_ent = create_proc_entry(pkt_dev->fname, 0600, NULL); +                if (!pkt_dev->proc_ent) { +                        printk("pktgen: cannot create %s procfs entry.\n", pkt_dev->fname); +			if (pkt_dev->flows) +				vfree(pkt_dev->flows); +                        kfree(pkt_dev); +                        return -EINVAL; +                } +                pkt_dev->proc_ent->read_proc = proc_if_read; +                pkt_dev->proc_ent->write_proc = proc_if_write; +                pkt_dev->proc_ent->data = (void*)(pkt_dev); +		pkt_dev->proc_ent->owner = THIS_MODULE; + +                return add_dev_to_thread(t, pkt_dev); +        } +        else { +                printk("pktgen: ERROR: interface already used.\n"); +                return -EBUSY; +        } +} + +static struct pktgen_thread *pktgen_find_thread(const char* name)  +{ +        struct pktgen_thread *t = NULL; + +       thread_lock(); + +        t = pktgen_threads; +        while (t) { +                if (strcmp(t->name, name) == 0)  +                        break; + +                t = t->next; +        } +        thread_unlock(); +        return t; +} + +static int pktgen_create_thread(const char* name, int cpu)  +{ +        struct pktgen_thread *t = NULL; + +        if (strlen(name) > 31) { +                printk("pktgen: ERROR:  Thread name cannot be more than 31 characters.\n"); +                return -EINVAL; +        } +         +        if (pktgen_find_thread(name)) { +                printk("pktgen: ERROR: thread: %s already exists\n", name); +                return -EINVAL; +        } + +        t = (struct pktgen_thread*)(kmalloc(sizeof(struct pktgen_thread), GFP_KERNEL)); +        if (!t) { +                printk("pktgen: ERROR: out of memory, can't create new thread.\n"); +                return -ENOMEM; +        } + +        memset(t, 0, sizeof(struct pktgen_thread)); +        strcpy(t->name, name); +        spin_lock_init(&t->if_lock); +	t->cpu = cpu; +         +        sprintf(t->fname, "net/%s/%s", PG_PROC_DIR, t->name); +        t->proc_ent = create_proc_entry(t->fname, 0600, NULL); +        if (!t->proc_ent) { +                printk("pktgen: cannot create %s procfs entry.\n", t->fname); +                kfree(t); +                return -EINVAL; +        } +        t->proc_ent->read_proc = proc_thread_read; +        t->proc_ent->write_proc = proc_thread_write; +        t->proc_ent->data = (void*)(t); +        t->proc_ent->owner = THIS_MODULE; + +        t->next = pktgen_threads; +        pktgen_threads = t; + +	if (kernel_thread((void *) pktgen_thread_worker, (void *) t,  +			  CLONE_FS | CLONE_FILES | CLONE_SIGHAND) < 0) +		printk("pktgen: kernel_thread() failed for cpu %d\n", t->cpu); + +	return 0; +} + +/*  + * Removes a device from the thread if_list.  
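+ * Called via pktgen_remove_device(), whose call paths take the thread's
+ * if_lock; the entry is only unlinked here, freeing is done by
+ * pktgen_remove_device() itself.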
+ */ +static void _rem_dev_from_if_list(struct pktgen_thread *t, struct pktgen_dev *pkt_dev)  +{ +	struct pktgen_dev *i, *prev = NULL; + +	i = t->if_list; + +	while(i) { +		if(i == pkt_dev) { +			if(prev) prev->next = i->next; +			else t->if_list = NULL; +			break; +		} +		prev = i; +		i=i->next; +	} +} + +static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_dev)  +{ + +	PG_DEBUG(printk("pktgen: remove_device pkt_dev=%p\n", pkt_dev)); + +        if (pkt_dev->running) {  +                printk("pktgen:WARNING: trying to remove a running interface, stopping it now.\n"); +                pktgen_stop_device(pkt_dev); +        } +         +        /* Dis-associate from the interface */ + +	if (pkt_dev->odev) { +		dev_put(pkt_dev->odev); +                pkt_dev->odev = NULL; +        } +         +	/* And update the thread if_list */ + +	_rem_dev_from_if_list(t, pkt_dev); + +        /* Clean up proc file system */ + +        if (strlen(pkt_dev->fname))  +                remove_proc_entry(pkt_dev->fname, NULL); + +	if (pkt_dev->flows) +		vfree(pkt_dev->flows); +	kfree(pkt_dev); +        return 0; +} + +static int __init pg_init(void)  +{ +	int cpu; +	printk(version); + +        module_fname[0] = 0; + +	create_proc_dir(); + +        sprintf(module_fname, "net/%s/pgctrl", PG_PROC_DIR); +        module_proc_ent = create_proc_entry(module_fname, 0600, NULL); +        if (!module_proc_ent) { +                printk("pktgen: ERROR: cannot create %s procfs entry.\n", module_fname); +                return -EINVAL; +        } + +        module_proc_ent->proc_fops =  &pktgen_fops; +        module_proc_ent->data = NULL; + +	/* Register us to receive netdevice events */ +	register_netdevice_notifier(&pktgen_notifier_block); +         +	for (cpu = 0; cpu < NR_CPUS ; cpu++) { +		char buf[30]; + +		if (!cpu_online(cpu)) +			continue; + +                sprintf(buf, "kpktgend_%i", cpu); +                pktgen_create_thread(buf, cpu); +        } +        return 0;         +} + +static void __exit pg_cleanup(void) +{ +	wait_queue_head_t queue; +	init_waitqueue_head(&queue); + +        /* Stop all interfaces & threads */         + +        while (pktgen_threads) { +                struct pktgen_thread *t = pktgen_threads; +                pktgen_threads->control |= (T_TERMINATE); + +		wait_event_interruptible_timeout(queue, (t != pktgen_threads), HZ); +        } + +        /* Un-register us from receiving netdevice events */ +	unregister_netdevice_notifier(&pktgen_notifier_block); + +        /* Clean up proc file system */ + +        remove_proc_entry(module_fname, NULL); +         +	remove_proc_dir(); +} + + +module_init(pg_init); +module_exit(pg_cleanup); + +MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se"); +MODULE_DESCRIPTION("Packet Generator tool"); +MODULE_LICENSE("GPL"); +module_param(pg_count_d, int, 0); +module_param(pg_delay_d, int, 0); +module_param(pg_clone_skb_d, int, 0); +module_param(debug, int, 0); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c new file mode 100644 index 00000000000..d69ad90e581 --- /dev/null +++ b/net/core/rtnetlink.c @@ -0,0 +1,711 @@ +/* + * INET		An implementation of the TCP/IP protocol suite for the LINUX + *		operating system.  INET is implemented using the  BSD Socket + *		interface as the means of communication with the user level. + * + *		Routing netlink socket interface: protocol independent part. 
+ * + * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + *		This program is free software; you can redistribute it and/or + *		modify it under the terms of the GNU General Public License + *		as published by the Free Software Foundation; either version + *		2 of the License, or (at your option) any later version. + * + *	Fixes: + *	Vitaly E. Lavrov		RTA_OK arithmetics was wrong. + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/security.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/string.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/arp.h> +#include <net/route.h> +#include <net/udp.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +DECLARE_MUTEX(rtnl_sem); + +void rtnl_lock(void) +{ +	rtnl_shlock(); +} + +int rtnl_lock_interruptible(void) +{ +	return down_interruptible(&rtnl_sem); +} +  +void rtnl_unlock(void) +{ +	rtnl_shunlock(); + +	netdev_run_todo(); +} + +int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len) +{ +	memset(tb, 0, sizeof(struct rtattr*)*maxattr); + +	while (RTA_OK(rta, len)) { +		unsigned flavor = rta->rta_type; +		if (flavor && flavor <= maxattr) +			tb[flavor-1] = rta; +		rta = RTA_NEXT(rta, len); +	} +	return 0; +} + +struct sock *rtnl; + +struct rtnetlink_link * rtnetlink_links[NPROTO]; + +static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] = +{ +	NLMSG_LENGTH(sizeof(struct ifinfomsg)), +	NLMSG_LENGTH(sizeof(struct ifaddrmsg)), +	NLMSG_LENGTH(sizeof(struct rtmsg)), +	NLMSG_LENGTH(sizeof(struct ndmsg)), +	NLMSG_LENGTH(sizeof(struct rtmsg)), +	NLMSG_LENGTH(sizeof(struct tcmsg)), +	NLMSG_LENGTH(sizeof(struct tcmsg)), +	NLMSG_LENGTH(sizeof(struct tcmsg)), +	NLMSG_LENGTH(sizeof(struct tcamsg)) +}; + +static const int rta_max[(RTM_MAX+1-RTM_BASE)/4] = +{ +	IFLA_MAX, +	IFA_MAX, +	RTA_MAX, +	NDA_MAX, +	RTA_MAX, +	TCA_MAX, +	TCA_MAX, +	TCA_MAX, +	TCAA_MAX +}; + +void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) +{ +	struct rtattr *rta; +	int size = RTA_LENGTH(attrlen); + +	rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); +	rta->rta_type = attrtype; +	rta->rta_len = size; +	memcpy(RTA_DATA(rta), data, attrlen); +} + +size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size) +{ +	size_t ret = RTA_PAYLOAD(rta); +	char *src = RTA_DATA(rta); + +	if (ret > 0 && src[ret - 1] == '\0') +		ret--; +	if (size > 0) { +		size_t len = (ret >= size) ? 
size - 1 : ret; +		memset(dest, 0, size); +		memcpy(dest, src, len); +	} +	return ret; +} + +int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) +{ +	int err = 0; + +	NETLINK_CB(skb).dst_groups = group; +	if (echo) +		atomic_inc(&skb->users); +	netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); +	if (echo) +		err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); +	return err; +} + +int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) +{ +	struct rtattr *mx = (struct rtattr*)skb->tail; +	int i; + +	RTA_PUT(skb, RTA_METRICS, 0, NULL); +	for (i=0; i<RTAX_MAX; i++) { +		if (metrics[i]) +			RTA_PUT(skb, i+1, sizeof(u32), metrics+i); +	} +	mx->rta_len = skb->tail - (u8*)mx; +	if (mx->rta_len == RTA_LENGTH(0)) +		skb_trim(skb, (u8*)mx - skb->data); +	return 0; + +rtattr_failure: +	skb_trim(skb, (u8*)mx - skb->data); +	return -1; +} + + +static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, +				 int type, u32 pid, u32 seq, u32 change) +{ +	struct ifinfomsg *r; +	struct nlmsghdr  *nlh; +	unsigned char	 *b = skb->tail; + +	nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r)); +	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; +	r = NLMSG_DATA(nlh); +	r->ifi_family = AF_UNSPEC; +	r->ifi_type = dev->type; +	r->ifi_index = dev->ifindex; +	r->ifi_flags = dev_get_flags(dev); +	r->ifi_change = change; + +	RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); + +	if (1) { +		u32 txqlen = dev->tx_queue_len; +		RTA_PUT(skb, IFLA_TXQLEN, sizeof(txqlen), &txqlen); +	} + +	if (1) { +		u32 weight = dev->weight; +		RTA_PUT(skb, IFLA_WEIGHT, sizeof(weight), &weight); +	} + +	if (1) { +		struct rtnl_link_ifmap map = { +			.mem_start   = dev->mem_start, +			.mem_end     = dev->mem_end, +			.base_addr   = dev->base_addr, +			.irq         = dev->irq, +			.dma         = dev->dma, +			.port        = dev->if_port, +		}; +		RTA_PUT(skb, IFLA_MAP, sizeof(map), &map); +	} + +	if (dev->addr_len) { +		RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); +		RTA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); +	} + +	if (1) { +		u32 mtu = dev->mtu; +		RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); +	} + +	if (dev->ifindex != dev->iflink) { +		u32 iflink = dev->iflink; +		RTA_PUT(skb, IFLA_LINK, sizeof(iflink), &iflink); +	} + +	if (dev->qdisc_sleeping) +		RTA_PUT(skb, IFLA_QDISC, +			strlen(dev->qdisc_sleeping->ops->id) + 1, +			dev->qdisc_sleeping->ops->id); +	 +	if (dev->master) { +		u32 master = dev->master->ifindex; +		RTA_PUT(skb, IFLA_MASTER, sizeof(master), &master); +	} + +	if (dev->get_stats) { +		unsigned long *stats = (unsigned long*)dev->get_stats(dev); +		if (stats) { +			struct rtattr  *a; +			__u32	       *s; +			int		i; +			int		n = sizeof(struct rtnl_link_stats)/4; + +			a = __RTA_PUT(skb, IFLA_STATS, n*4); +			s = RTA_DATA(a); +			for (i=0; i<n; i++) +				s[i] = stats[i]; +		} +	} +	nlh->nlmsg_len = skb->tail - b; +	return skb->len; + +nlmsg_failure: +rtattr_failure: +	skb_trim(skb, b - skb->data); +	return -1; +} + +static int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ +	int idx; +	int s_idx = cb->args[0]; +	struct net_device *dev; + +	read_lock(&dev_base_lock); +	for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { +		if (idx < s_idx) +			continue; +		if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0) +			break; +	} +	read_unlock(&dev_base_lock); +	cb->args[0] = idx; + +	return skb->len; +} + +static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, 
void *arg) +{ +	struct ifinfomsg  *ifm = NLMSG_DATA(nlh); +	struct rtattr    **ida = arg; +	struct net_device *dev; +	int err, send_addr_notify = 0; + +	if (ifm->ifi_index >= 0) +		dev = dev_get_by_index(ifm->ifi_index); +	else if (ida[IFLA_IFNAME - 1]) { +		char ifname[IFNAMSIZ]; + +		if (rtattr_strlcpy(ifname, ida[IFLA_IFNAME - 1], +		                   IFNAMSIZ) >= IFNAMSIZ) +			return -EINVAL; +		dev = dev_get_by_name(ifname); +	} else +		return -EINVAL; + +	if (!dev) +		return -ENODEV; + +	err = -EINVAL; + +	if (ifm->ifi_flags) +		dev_change_flags(dev, ifm->ifi_flags); + +	if (ida[IFLA_MAP - 1]) { +		struct rtnl_link_ifmap *u_map; +		struct ifmap k_map; + +		if (!dev->set_config) { +			err = -EOPNOTSUPP; +			goto out; +		} + +		if (!netif_device_present(dev)) { +			err = -ENODEV; +			goto out; +		} +		 +		if (ida[IFLA_MAP - 1]->rta_len != RTA_LENGTH(sizeof(*u_map))) +			goto out; + +		u_map = RTA_DATA(ida[IFLA_MAP - 1]); + +		k_map.mem_start = (unsigned long) u_map->mem_start; +		k_map.mem_end = (unsigned long) u_map->mem_end; +		k_map.base_addr = (unsigned short) u_map->base_addr; +		k_map.irq = (unsigned char) u_map->irq; +		k_map.dma = (unsigned char) u_map->dma; +		k_map.port = (unsigned char) u_map->port; + +		err = dev->set_config(dev, &k_map); + +		if (err) +			goto out; +	} + +	if (ida[IFLA_ADDRESS - 1]) { +		if (!dev->set_mac_address) { +			err = -EOPNOTSUPP; +			goto out; +		} +		if (!netif_device_present(dev)) { +			err = -ENODEV; +			goto out; +		} +		if (ida[IFLA_ADDRESS - 1]->rta_len != RTA_LENGTH(dev->addr_len)) +			goto out; + +		err = dev->set_mac_address(dev, RTA_DATA(ida[IFLA_ADDRESS - 1])); +		if (err) +			goto out; +		send_addr_notify = 1; +	} + +	if (ida[IFLA_BROADCAST - 1]) { +		if (ida[IFLA_BROADCAST - 1]->rta_len != RTA_LENGTH(dev->addr_len)) +			goto out; +		memcpy(dev->broadcast, RTA_DATA(ida[IFLA_BROADCAST - 1]), +		       dev->addr_len); +		send_addr_notify = 1; +	} + +	if (ida[IFLA_MTU - 1]) { +		if (ida[IFLA_MTU - 1]->rta_len != RTA_LENGTH(sizeof(u32))) +			goto out; +		err = dev_set_mtu(dev, *((u32 *) RTA_DATA(ida[IFLA_MTU - 1]))); + +		if (err) +			goto out; + +	} + +	if (ida[IFLA_TXQLEN - 1]) { +		if (ida[IFLA_TXQLEN - 1]->rta_len != RTA_LENGTH(sizeof(u32))) +			goto out; + +		dev->tx_queue_len = *((u32 *) RTA_DATA(ida[IFLA_TXQLEN - 1])); +	} + +	if (ida[IFLA_WEIGHT - 1]) { +		if (ida[IFLA_WEIGHT - 1]->rta_len != RTA_LENGTH(sizeof(u32))) +			goto out; + +		dev->weight = *((u32 *) RTA_DATA(ida[IFLA_WEIGHT - 1])); +	} + +	if (ifm->ifi_index >= 0 && ida[IFLA_IFNAME - 1]) { +		char ifname[IFNAMSIZ]; + +		if (rtattr_strlcpy(ifname, ida[IFLA_IFNAME - 1], +		                   IFNAMSIZ) >= IFNAMSIZ) +			goto out; +		err = dev_change_name(dev, ifname); +		if (err) +			goto out; +	} + +	err = 0; + +out: +	if (send_addr_notify) +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + +	dev_put(dev); +	return err; +} + +static int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb) +{ +	int idx; +	int s_idx = cb->family; + +	if (s_idx == 0) +		s_idx = 1; +	for (idx=1; idx<NPROTO; idx++) { +		int type = cb->nlh->nlmsg_type-RTM_BASE; +		if (idx < s_idx || idx == PF_PACKET) +			continue; +		if (rtnetlink_links[idx] == NULL || +		    rtnetlink_links[idx][type].dumpit == NULL) +			continue; +		if (idx > s_idx) +			memset(&cb->args[0], 0, sizeof(cb->args)); +		if (rtnetlink_links[idx][type].dumpit(skb, cb)) +			break; +	} +	cb->family = idx; + +	return skb->len; +} + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) +{ +	struct 
sk_buff *skb; +	int size = NLMSG_SPACE(sizeof(struct ifinfomsg) + +			       sizeof(struct rtnl_link_ifmap) + +			       sizeof(struct rtnl_link_stats) + 128); + +	skb = alloc_skb(size, GFP_KERNEL); +	if (!skb) +		return; + +	if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, change) < 0) { +		kfree_skb(skb); +		return; +	} +	NETLINK_CB(skb).dst_groups = RTMGRP_LINK; +	netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL); +} + +static int rtnetlink_done(struct netlink_callback *cb) +{ +	return 0; +} + +/* Protected by RTNL sempahore.  */ +static struct rtattr **rta_buf; +static int rtattr_max; + +/* Process one rtnetlink message. */ + +static __inline__ int +rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) +{ +	struct rtnetlink_link *link; +	struct rtnetlink_link *link_tab; +	int sz_idx, kind; +	int min_len; +	int family; +	int type; +	int err; + +	/* Only requests are handled by kernel now */ +	if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) +		return 0; + +	type = nlh->nlmsg_type; + +	/* A control message: ignore them */ +	if (type < RTM_BASE) +		return 0; + +	/* Unknown message: reply with EINVAL */ +	if (type > RTM_MAX) +		goto err_inval; + +	type -= RTM_BASE; + +	/* All the messages must have at least 1 byte length */ +	if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) +		return 0; + +	family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; +	if (family >= NPROTO) { +		*errp = -EAFNOSUPPORT; +		return -1; +	} + +	link_tab = rtnetlink_links[family]; +	if (link_tab == NULL) +		link_tab = rtnetlink_links[PF_UNSPEC]; +	link = &link_tab[type]; + +	sz_idx = type>>2; +	kind = type&3; + +	if (kind != 2 && security_netlink_recv(skb)) { +		*errp = -EPERM; +		return -1; +	} + +	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { +		u32 rlen; + +		if (link->dumpit == NULL) +			link = &(rtnetlink_links[PF_UNSPEC][type]); + +		if (link->dumpit == NULL) +			goto err_inval; + +		if ((*errp = netlink_dump_start(rtnl, skb, nlh, +						link->dumpit, +						rtnetlink_done)) != 0) { +			return -1; +		} +		rlen = NLMSG_ALIGN(nlh->nlmsg_len); +		if (rlen > skb->len) +			rlen = skb->len; +		skb_pull(skb, rlen); +		return -1; +	} + +	memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); + +	min_len = rtm_min[sz_idx]; +	if (nlh->nlmsg_len < min_len) +		goto err_inval; + +	if (nlh->nlmsg_len > min_len) { +		int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); +		struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len); + +		while (RTA_OK(attr, attrlen)) { +			unsigned flavor = attr->rta_type; +			if (flavor) { +				if (flavor > rta_max[sz_idx]) +					goto err_inval; +				rta_buf[flavor-1] = attr; +			} +			attr = RTA_NEXT(attr, attrlen); +		} +	} + +	if (link->doit == NULL) +		link = &(rtnetlink_links[PF_UNSPEC][type]); +	if (link->doit == NULL) +		goto err_inval; +	err = link->doit(skb, nlh, (void *)&rta_buf[0]); + +	*errp = err; +	return err; + +err_inval: +	*errp = -EINVAL; +	return -1; +} + +/*  + * Process one packet of messages. + * Malformed skbs with wrong lengths of messages are discarded silently. 
+ */ + +static inline int rtnetlink_rcv_skb(struct sk_buff *skb) +{ +	int err; +	struct nlmsghdr * nlh; + +	while (skb->len >= NLMSG_SPACE(0)) { +		u32 rlen; + +		nlh = (struct nlmsghdr *)skb->data; +		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) +			return 0; +		rlen = NLMSG_ALIGN(nlh->nlmsg_len); +		if (rlen > skb->len) +			rlen = skb->len; +		if (rtnetlink_rcv_msg(skb, nlh, &err)) { +			/* Not error, but we must interrupt processing here: +			 *   Note, that in this case we do not pull message +			 *   from skb, it will be processed later. +			 */ +			if (err == 0) +				return -1; +			netlink_ack(skb, nlh, err); +		} else if (nlh->nlmsg_flags&NLM_F_ACK) +			netlink_ack(skb, nlh, 0); +		skb_pull(skb, rlen); +	} + +	return 0; +} + +/* + *  rtnetlink input queue processing routine: + *	- try to acquire shared lock. If it is failed, defer processing. + *	- feed skbs to rtnetlink_rcv_skb, until it refuse a message, + *	  that will occur, when a dump started and/or acquisition of + *	  exclusive lock failed. + */ + +static void rtnetlink_rcv(struct sock *sk, int len) +{ +	do { +		struct sk_buff *skb; + +		if (rtnl_shlock_nowait()) +			return; + +		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { +			if (rtnetlink_rcv_skb(skb)) { +				if (skb->len) +					skb_queue_head(&sk->sk_receive_queue, +						       skb); +				else +					kfree_skb(skb); +				break; +			} +			kfree_skb(skb); +		} + +		up(&rtnl_sem); + +		netdev_run_todo(); +	} while (rtnl && rtnl->sk_receive_queue.qlen); +} + +static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ +	[RTM_GETLINK  - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo }, +	[RTM_SETLINK  - RTM_BASE] = { .doit   = do_setlink	      }, +	[RTM_GETADDR  - RTM_BASE] = { .dumpit = rtnetlink_dump_all    }, +	[RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all    }, +	[RTM_NEWNEIGH - RTM_BASE] = { .doit   = neigh_add	      }, +	[RTM_DELNEIGH - RTM_BASE] = { .doit   = neigh_delete	      }, +	[RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info	      } +}; + +static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) +{ +	struct net_device *dev = ptr; +	switch (event) { +	case NETDEV_UNREGISTER: +		rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); +		break; +	case NETDEV_REGISTER: +		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); +		break; +	case NETDEV_UP: +	case NETDEV_DOWN: +		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); +		break; +	case NETDEV_CHANGE: +	case NETDEV_GOING_DOWN: +		break; +	default: +		rtmsg_ifinfo(RTM_NEWLINK, dev, 0); +		break; +	} +	return NOTIFY_DONE; +} + +static struct notifier_block rtnetlink_dev_notifier = { +	.notifier_call	= rtnetlink_event, +}; + +void __init rtnetlink_init(void) +{ +	int i; + +	rtattr_max = 0; +	for (i = 0; i < ARRAY_SIZE(rta_max); i++) +		if (rta_max[i] > rtattr_max) +			rtattr_max = rta_max[i]; +	rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL); +	if (!rta_buf) +		panic("rtnetlink_init: cannot allocate rta_buf\n"); + +	rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv); +	if (rtnl == NULL) +		panic("rtnetlink_init: cannot initialize rtnetlink\n"); +	netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); +	register_netdevice_notifier(&rtnetlink_dev_notifier); +	rtnetlink_links[PF_UNSPEC] = link_rtnetlink_table; +	rtnetlink_links[PF_PACKET] = link_rtnetlink_table; +} + +EXPORT_SYMBOL(__rta_fill); +EXPORT_SYMBOL(rtattr_strlcpy); +EXPORT_SYMBOL(rtattr_parse); +EXPORT_SYMBOL(rtnetlink_links); +EXPORT_SYMBOL(rtnetlink_put_metrics); 
+EXPORT_SYMBOL(rtnl); +EXPORT_SYMBOL(rtnl_lock); +EXPORT_SYMBOL(rtnl_lock_interruptible); +EXPORT_SYMBOL(rtnl_sem); +EXPORT_SYMBOL(rtnl_unlock); diff --git a/net/core/scm.c b/net/core/scm.c new file mode 100644 index 00000000000..a2ebf30f6aa --- /dev/null +++ b/net/core/scm.c @@ -0,0 +1,291 @@ +/* scm.c - Socket level control messages processing. + * + * Author:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + *              Alignment and value checking mods by Craig Metz + * + *		This program is free software; you can redistribute it and/or + *		modify it under the terms of the GNU General Public License + *		as published by the Free Software Foundation; either version + *		2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/signal.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/file.h> +#include <linux/fcntl.h> +#include <linux/net.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/security.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/compat.h> +#include <net/scm.h> + + +/* + *	Only allow a user to send credentials, that they could set with  + *	setu(g)id. + */ + +static __inline__ int scm_check_creds(struct ucred *creds) +{ +	if ((creds->pid == current->tgid || capable(CAP_SYS_ADMIN)) && +	    ((creds->uid == current->uid || creds->uid == current->euid || +	      creds->uid == current->suid) || capable(CAP_SETUID)) && +	    ((creds->gid == current->gid || creds->gid == current->egid || +	      creds->gid == current->sgid) || capable(CAP_SETGID))) { +	       return 0; +	} +	return -EPERM; +} + +static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) +{ +	int *fdp = (int*)CMSG_DATA(cmsg); +	struct scm_fp_list *fpl = *fplp; +	struct file **fpp; +	int i, num; + +	num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); + +	if (num <= 0) +		return 0; + +	if (num > SCM_MAX_FD) +		return -EINVAL; + +	if (!fpl) +	{ +		fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); +		if (!fpl) +			return -ENOMEM; +		*fplp = fpl; +		fpl->count = 0; +	} +	fpp = &fpl->fp[fpl->count]; + +	if (fpl->count + num > SCM_MAX_FD) +		return -EINVAL; +	 +	/* +	 *	Verify the descriptors and increment the usage count. +	 */ +	  +	for (i=0; i< num; i++) +	{ +		int fd = fdp[i]; +		struct file *file; + +		if (fd < 0 || !(file = fget(fd))) +			return -EBADF; +		*fpp++ = file; +		fpl->count++; +	} +	return num; +} + +void __scm_destroy(struct scm_cookie *scm) +{ +	struct scm_fp_list *fpl = scm->fp; +	int i; + +	if (fpl) { +		scm->fp = NULL; +		for (i=fpl->count-1; i>=0; i--) +			fput(fpl->fp[i]); +		kfree(fpl); +	} +} + +int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) +{ +	struct cmsghdr *cmsg; +	int err; + +	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) +	{ +		err = -EINVAL; + +		/* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ +		/* The first check was omitted in <= 2.2.5. The reasoning was +		   that parser checks cmsg_len in any case, so that +		   additional check would be work duplication. +		   But if cmsg_level is not SOL_SOCKET, we do not check  +		   for too short ancillary data object at all! Oops. +		   OK, let's add it... 
+		 */ +		if (!CMSG_OK(msg, cmsg)) +			goto error; + +		if (cmsg->cmsg_level != SOL_SOCKET) +			continue; + +		switch (cmsg->cmsg_type) +		{ +		case SCM_RIGHTS: +			err=scm_fp_copy(cmsg, &p->fp); +			if (err<0) +				goto error; +			break; +		case SCM_CREDENTIALS: +			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred))) +				goto error; +			memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred)); +			err = scm_check_creds(&p->creds); +			if (err) +				goto error; +			break; +		default: +			goto error; +		} +	} + +	if (p->fp && !p->fp->count) +	{ +		kfree(p->fp); +		p->fp = NULL; +	} +	return 0; +	 +error: +	scm_destroy(p); +	return err; +} + +int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) +{ +	struct cmsghdr __user *cm = (struct cmsghdr __user *)msg->msg_control; +	struct cmsghdr cmhdr; +	int cmlen = CMSG_LEN(len); +	int err; + +	if (MSG_CMSG_COMPAT & msg->msg_flags) +		return put_cmsg_compat(msg, level, type, len, data); + +	if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { +		msg->msg_flags |= MSG_CTRUNC; +		return 0; /* XXX: return error? check spec. */ +	} +	if (msg->msg_controllen < cmlen) { +		msg->msg_flags |= MSG_CTRUNC; +		cmlen = msg->msg_controllen; +	} +	cmhdr.cmsg_level = level; +	cmhdr.cmsg_type = type; +	cmhdr.cmsg_len = cmlen; + +	err = -EFAULT; +	if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) +		goto out;  +	if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) +		goto out; +	cmlen = CMSG_SPACE(len); +	msg->msg_control += cmlen; +	msg->msg_controllen -= cmlen; +	err = 0; +out: +	return err; +} + +void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) +{ +	struct cmsghdr __user *cm = (struct cmsghdr __user*)msg->msg_control; + +	int fdmax = 0; +	int fdnum = scm->fp->count; +	struct file **fp = scm->fp->fp; +	int __user *cmfptr; +	int err = 0, i; + +	if (MSG_CMSG_COMPAT & msg->msg_flags) { +		scm_detach_fds_compat(msg, scm); +		return; +	} + +	if (msg->msg_controllen > sizeof(struct cmsghdr)) +		fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr)) +			 / sizeof(int)); + +	if (fdnum < fdmax) +		fdmax = fdnum; + +	for (i=0, cmfptr=(int __user *)CMSG_DATA(cm); i<fdmax; i++, cmfptr++) +	{ +		int new_fd; +		err = security_file_receive(fp[i]); +		if (err) +			break; +		err = get_unused_fd(); +		if (err < 0) +			break; +		new_fd = err; +		err = put_user(new_fd, cmfptr); +		if (err) { +			put_unused_fd(new_fd); +			break; +		} +		/* Bump the usage count and install the file. */ +		get_file(fp[i]); +		fd_install(new_fd, fp[i]); +	} + +	if (i > 0) +	{ +		int cmlen = CMSG_LEN(i*sizeof(int)); +		if (!err) +			err = put_user(SOL_SOCKET, &cm->cmsg_level); +		if (!err) +			err = put_user(SCM_RIGHTS, &cm->cmsg_type); +		if (!err) +			err = put_user(cmlen, &cm->cmsg_len); +		if (!err) { +			cmlen = CMSG_SPACE(i*sizeof(int)); +			msg->msg_control += cmlen; +			msg->msg_controllen -= cmlen; +		} +	} +	if (i < fdnum || (fdnum && fdmax <= 0)) +		msg->msg_flags |= MSG_CTRUNC; + +	/* +	 * All of the files that fit in the message have had their +	 * usage counts incremented, so we just free the list. 
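/*
 * Editor's sketch, not part of the commit: passing a descriptor over an
 * AF_UNIX socketpair from userspace.  The SCM_RIGHTS control message
 * built here is what __scm_send()/scm_fp_copy() above parse on the
 * sending side; on the receiving side scm_detach_fds() installs a fresh
 * descriptor and put_cmsg() writes the rebuilt cmsg back to the caller.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/uio.h>

int main(void)
{
	int sv[2], out = 1 /* pass our own stdout */, passed = -1;
	char dummy = 'x';
	struct iovec iov = { &dummy, 1 };
	union { struct cmsghdr align; char buf[CMSG_SPACE(sizeof(int))]; } u;
	struct msghdr msg;
	struct cmsghdr *cmsg;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
		return 1;

	/* Sender: one SCM_RIGHTS cmsg carrying a single descriptor. */
	memset(&msg, 0, sizeof(msg));
	memset(&u, 0, sizeof(u));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = u.buf;
	msg.msg_controllen = sizeof(u.buf);
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type  = SCM_RIGHTS;
	cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &out, sizeof(int));
	if (sendmsg(sv[0], &msg, 0) != 1)
		return 1;

	/* Receiver: the kernel has installed a brand-new fd for us. */
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = u.buf;
	msg.msg_controllen = sizeof(u.buf);
	if (recvmsg(sv[1], &msg, 0) != 1)
		return 1;
	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_RIGHTS)
		memcpy(&passed, CMSG_DATA(cmsg), sizeof(int));

	if (passed >= 0)
		write(passed, "wrote through the passed fd\n", 28);
	return 0;
}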
+	 */ +	__scm_destroy(scm); +} + +struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) +{ +	struct scm_fp_list *new_fpl; +	int i; + +	if (!fpl) +		return NULL; + +	new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); +	if (new_fpl) { +		for (i=fpl->count-1; i>=0; i--) +			get_file(fpl->fp[i]); +		memcpy(new_fpl, fpl, sizeof(*fpl)); +	} +	return new_fpl; +} + +EXPORT_SYMBOL(__scm_destroy); +EXPORT_SYMBOL(__scm_send); +EXPORT_SYMBOL(put_cmsg); +EXPORT_SYMBOL(scm_detach_fds); +EXPORT_SYMBOL(scm_fp_dup); diff --git a/net/core/skbuff.c b/net/core/skbuff.c new file mode 100644 index 00000000000..bf02ca9f80a --- /dev/null +++ b/net/core/skbuff.c @@ -0,0 +1,1460 @@ +/* + *	Routines having to do with the 'struct sk_buff' memory handlers. + * + *	Authors:	Alan Cox <iiitac@pyr.swan.ac.uk> + *			Florian La Roche <rzsfl@rz.uni-sb.de> + * + *	Version:	$Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $ + * + *	Fixes: + *		Alan Cox	:	Fixed the worst of the load + *					balancer bugs. + *		Dave Platt	:	Interrupt stacking fix. + *	Richard Kooijman	:	Timestamp fixes. + *		Alan Cox	:	Changed buffer format. + *		Alan Cox	:	destructor hook for AF_UNIX etc. + *		Linus Torvalds	:	Better skb_clone. + *		Alan Cox	:	Added skb_copy. + *		Alan Cox	:	Added all the changed routines Linus + *					only put in the headers + *		Ray VanTassle	:	Fixed --skb->lock in free + *		Alan Cox	:	skb_copy copy arp field + *		Andi Kleen	:	slabified it. + *		Robert Olsson	:	Removed skb_head_pool + * + *	NOTE: + *		The __skb_ routines should be called with interrupts + *	disabled, or you better be *real* sure that the operation is atomic + *	with respect to whatever list is being frobbed (e.g. via lock_sock() + *	or via disabling bottom half handlers, etc). + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. + */ + +/* + *	The functions in this file will not compile correctly with gcc 2.4.x + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#ifdef CONFIG_NET_CLS_ACT +#include <net/pkt_sched.h> +#endif +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/cache.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/highmem.h> + +#include <net/protocol.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/checksum.h> +#include <net/xfrm.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +static kmem_cache_t *skbuff_head_cache; + +/* + *	Keep out-of-line to prevent kernel bloat. + *	__builtin_return_address is not used because it is not always + *	reliable. + */ + +/** + *	skb_over_panic	- 	private function + *	@skb: buffer + *	@sz: size + *	@here: address + * + *	Out of line support code for skb_put(). Not user callable. + */ +void skb_over_panic(struct sk_buff *skb, int sz, void *here) +{ +	printk(KERN_INFO "skput:over: %p:%d put:%d dev:%s", +		here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); +	BUG(); +} + +/** + *	skb_under_panic	- 	private function + *	@skb: buffer + *	@sz: size + *	@here: address + * + *	Out of line support code for skb_push(). Not user callable. 
+ */ + +void skb_under_panic(struct sk_buff *skb, int sz, void *here) +{ +	printk(KERN_INFO "skput:under: %p:%d put:%d dev:%s", +               here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); +	BUG(); +} + +/* 	Allocate a new skbuff. We do this ourselves so we can fill in a few + *	'private' fields and also do memory statistics to find all the + *	[BEEP] leaks. + * + */ + +/** + *	alloc_skb	-	allocate a network buffer + *	@size: size to allocate + *	@gfp_mask: allocation mask + * + *	Allocate a new &sk_buff. The returned buffer has no headroom and a + *	tail room of size bytes. The object has a reference count of one. + *	The return is the buffer. On a failure the return is %NULL. + * + *	Buffers may only be allocated from interrupts using a @gfp_mask of + *	%GFP_ATOMIC. + */ +struct sk_buff *alloc_skb(unsigned int size, int gfp_mask) +{ +	struct sk_buff *skb; +	u8 *data; + +	/* Get the HEAD */ +	skb = kmem_cache_alloc(skbuff_head_cache, +			       gfp_mask & ~__GFP_DMA); +	if (!skb) +		goto out; + +	/* Get the DATA. Size must match skb_add_mtu(). */ +	size = SKB_DATA_ALIGN(size); +	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); +	if (!data) +		goto nodata; + +	memset(skb, 0, offsetof(struct sk_buff, truesize)); +	skb->truesize = size + sizeof(struct sk_buff); +	atomic_set(&skb->users, 1); +	skb->head = data; +	skb->data = data; +	skb->tail = data; +	skb->end  = data + size; + +	atomic_set(&(skb_shinfo(skb)->dataref), 1); +	skb_shinfo(skb)->nr_frags  = 0; +	skb_shinfo(skb)->tso_size = 0; +	skb_shinfo(skb)->tso_segs = 0; +	skb_shinfo(skb)->frag_list = NULL; +out: +	return skb; +nodata: +	kmem_cache_free(skbuff_head_cache, skb); +	skb = NULL; +	goto out; +} + +/** + *	alloc_skb_from_cache	-	allocate a network buffer + *	@cp: kmem_cache from which to allocate the data area + *           (object size must be big enough for @size bytes + skb overheads) + *	@size: size to allocate + *	@gfp_mask: allocation mask + * + *	Allocate a new &sk_buff. The returned buffer has no headroom and + *	tail room of size bytes. The object has a reference count of one. + *	The return is the buffer. On a failure the return is %NULL. + * + *	Buffers may only be allocated from interrupts using a @gfp_mask of + *	%GFP_ATOMIC. + */ +struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, +				     unsigned int size, int gfp_mask) +{ +	struct sk_buff *skb; +	u8 *data; + +	/* Get the HEAD */ +	skb = kmem_cache_alloc(skbuff_head_cache, +			       gfp_mask & ~__GFP_DMA); +	if (!skb) +		goto out; + +	/* Get the DATA. 
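/*
 * Editor's sketch, not part of the commit: the usual way a driver or
 * protocol builds a buffer with the allocator above.  Headroom is
 * reserved up front, payload is appended with skb_put(), and headers
 * are then prepended with skb_push().  Overrunning skb->end or
 * skb->head is exactly what skb_over_panic()/skb_under_panic() report.
 * EXAMPLE_HDRLEN and the helper name are hypothetical.
 */
#include <linux/skbuff.h>
#include <linux/string.h>

#define EXAMPLE_HDRLEN	32	/* hypothetical total header size */

static struct sk_buff *example_build_skb(const void *payload, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(EXAMPLE_HDRLEN + len, GFP_ATOMIC);
	if (!skb)
		return NULL;

	skb_reserve(skb, EXAMPLE_HDRLEN);	  /* headroom for headers   */
	memcpy(skb_put(skb, len), payload, len);  /* tail grows toward end  */
	memset(skb_push(skb, EXAMPLE_HDRLEN), 0,  /* data grows toward head */
	       EXAMPLE_HDRLEN);

	return skb;	/* caller owns the single reference; kfree_skb() drops it */
}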
*/ +	size = SKB_DATA_ALIGN(size); +	data = kmem_cache_alloc(cp, gfp_mask); +	if (!data) +		goto nodata; + +	memset(skb, 0, offsetof(struct sk_buff, truesize)); +	skb->truesize = size + sizeof(struct sk_buff); +	atomic_set(&skb->users, 1); +	skb->head = data; +	skb->data = data; +	skb->tail = data; +	skb->end  = data + size; + +	atomic_set(&(skb_shinfo(skb)->dataref), 1); +	skb_shinfo(skb)->nr_frags  = 0; +	skb_shinfo(skb)->tso_size = 0; +	skb_shinfo(skb)->tso_segs = 0; +	skb_shinfo(skb)->frag_list = NULL; +out: +	return skb; +nodata: +	kmem_cache_free(skbuff_head_cache, skb); +	skb = NULL; +	goto out; +} + + +static void skb_drop_fraglist(struct sk_buff *skb) +{ +	struct sk_buff *list = skb_shinfo(skb)->frag_list; + +	skb_shinfo(skb)->frag_list = NULL; + +	do { +		struct sk_buff *this = list; +		list = list->next; +		kfree_skb(this); +	} while (list); +} + +static void skb_clone_fraglist(struct sk_buff *skb) +{ +	struct sk_buff *list; + +	for (list = skb_shinfo(skb)->frag_list; list; list = list->next) +		skb_get(list); +} + +void skb_release_data(struct sk_buff *skb) +{ +	if (!skb->cloned || +	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, +			       &skb_shinfo(skb)->dataref)) { +		if (skb_shinfo(skb)->nr_frags) { +			int i; +			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +				put_page(skb_shinfo(skb)->frags[i].page); +		} + +		if (skb_shinfo(skb)->frag_list) +			skb_drop_fraglist(skb); + +		kfree(skb->head); +	} +} + +/* + *	Free an skbuff by memory without cleaning the state. + */ +void kfree_skbmem(struct sk_buff *skb) +{ +	skb_release_data(skb); +	kmem_cache_free(skbuff_head_cache, skb); +} + +/** + *	__kfree_skb - private function + *	@skb: buffer + * + *	Free an sk_buff. Release anything attached to the buffer. + *	Clean the state. This is an internal helper function. Users should + *	always call kfree_skb + */ + +void __kfree_skb(struct sk_buff *skb) +{ +	if (skb->list) { +	 	printk(KERN_WARNING "Warning: kfree_skb passed an skb still " +		       "on a list (from %p).\n", NET_CALLER(skb)); +		BUG(); +	} + +	dst_release(skb->dst); +#ifdef CONFIG_XFRM +	secpath_put(skb->sp); +#endif +	if(skb->destructor) { +		if (in_irq()) +			printk(KERN_WARNING "Warning: kfree_skb on " +					    "hard IRQ %p\n", NET_CALLER(skb)); +		skb->destructor(skb); +	} +#ifdef CONFIG_NETFILTER +	nf_conntrack_put(skb->nfct); +#ifdef CONFIG_BRIDGE_NETFILTER +	nf_bridge_put(skb->nf_bridge); +#endif +#endif +/* XXX: IS this still necessary? - JHS */ +#ifdef CONFIG_NET_SCHED +	skb->tc_index = 0; +#ifdef CONFIG_NET_CLS_ACT +	skb->tc_verd = 0; +	skb->tc_classid = 0; +#endif +#endif + +	kfree_skbmem(skb); +} + +/** + *	skb_clone	-	duplicate an sk_buff + *	@skb: buffer to clone + *	@gfp_mask: allocation priority + * + *	Duplicate an &sk_buff. The new one is not owned by a socket. Both + *	copies share the same packet data but not structure. The new + *	buffer has a reference count of 1. If the allocation fails the + *	function returns %NULL otherwise the new buffer is returned. + * + *	If this function is called from an interrupt gfp_mask() must be + *	%GFP_ATOMIC. 
+ */ + +struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) +{ +	struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + +	if (!n)  +		return NULL; + +#define C(x) n->x = skb->x + +	n->next = n->prev = NULL; +	n->list = NULL; +	n->sk = NULL; +	C(stamp); +	C(dev); +	C(real_dev); +	C(h); +	C(nh); +	C(mac); +	C(dst); +	dst_clone(skb->dst); +	C(sp); +#ifdef CONFIG_INET +	secpath_get(skb->sp); +#endif +	memcpy(n->cb, skb->cb, sizeof(skb->cb)); +	C(len); +	C(data_len); +	C(csum); +	C(local_df); +	n->cloned = 1; +	n->nohdr = 0; +	C(pkt_type); +	C(ip_summed); +	C(priority); +	C(protocol); +	C(security); +	n->destructor = NULL; +#ifdef CONFIG_NETFILTER +	C(nfmark); +	C(nfcache); +	C(nfct); +	nf_conntrack_get(skb->nfct); +	C(nfctinfo); +#ifdef CONFIG_NETFILTER_DEBUG +	C(nf_debug); +#endif +#ifdef CONFIG_BRIDGE_NETFILTER +	C(nf_bridge); +	nf_bridge_get(skb->nf_bridge); +#endif +#endif /*CONFIG_NETFILTER*/ +#if defined(CONFIG_HIPPI) +	C(private); +#endif +#ifdef CONFIG_NET_SCHED +	C(tc_index); +#ifdef CONFIG_NET_CLS_ACT +	n->tc_verd = SET_TC_VERD(skb->tc_verd,0); +	n->tc_verd = CLR_TC_OK2MUNGE(skb->tc_verd); +	n->tc_verd = CLR_TC_MUNGED(skb->tc_verd); +	C(input_dev); +	C(tc_classid); +#endif + +#endif +	C(truesize); +	atomic_set(&n->users, 1); +	C(head); +	C(data); +	C(tail); +	C(end); + +	atomic_inc(&(skb_shinfo(skb)->dataref)); +	skb->cloned = 1; + +	return n; +} + +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ +	/* +	 *	Shift between the two data areas in bytes +	 */ +	unsigned long offset = new->data - old->data; + +	new->list	= NULL; +	new->sk		= NULL; +	new->dev	= old->dev; +	new->real_dev	= old->real_dev; +	new->priority	= old->priority; +	new->protocol	= old->protocol; +	new->dst	= dst_clone(old->dst); +#ifdef CONFIG_INET +	new->sp		= secpath_get(old->sp); +#endif +	new->h.raw	= old->h.raw + offset; +	new->nh.raw	= old->nh.raw + offset; +	new->mac.raw	= old->mac.raw + offset; +	memcpy(new->cb, old->cb, sizeof(old->cb)); +	new->local_df	= old->local_df; +	new->pkt_type	= old->pkt_type; +	new->stamp	= old->stamp; +	new->destructor = NULL; +	new->security	= old->security; +#ifdef CONFIG_NETFILTER +	new->nfmark	= old->nfmark; +	new->nfcache	= old->nfcache; +	new->nfct	= old->nfct; +	nf_conntrack_get(old->nfct); +	new->nfctinfo	= old->nfctinfo; +#ifdef CONFIG_NETFILTER_DEBUG +	new->nf_debug	= old->nf_debug; +#endif +#ifdef CONFIG_BRIDGE_NETFILTER +	new->nf_bridge	= old->nf_bridge; +	nf_bridge_get(old->nf_bridge); +#endif +#endif +#ifdef CONFIG_NET_SCHED +#ifdef CONFIG_NET_CLS_ACT +	new->tc_verd = old->tc_verd; +#endif +	new->tc_index	= old->tc_index; +#endif +	atomic_set(&new->users, 1); +	skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; +	skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; +} + +/** + *	skb_copy	-	create private copy of an sk_buff + *	@skb: buffer to copy + *	@gfp_mask: allocation priority + * + *	Make a copy of both an &sk_buff and its data. This is used when the + *	caller wishes to modify the data and needs a private copy of the + *	data to alter. Returns %NULL on failure or the pointer to the buffer + *	on success. The returned buffer has a reference count of 1. + * + *	As by-product this function converts non-linear &sk_buff to linear + *	one, so that &sk_buff becomes completely private and caller is allowed + *	to modify all the data of returned buffer. This means that this + *	function is not recommended for use in circumstances when only + *	header is going to be modified. Use pskb_copy() instead. 
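/*
 * Editor's sketch, not part of the commit: picking the right duplicate.
 * skb_clone() shares the packet data (cheap, read-only use),
 * pskb_copy() privatises only the linear header, and skb_copy()
 * privatises and linearises everything, as the comment above says.
 * The helper name and flags are hypothetical.
 */
#include <linux/skbuff.h>

static struct sk_buff *example_dup(struct sk_buff *skb,
				   int will_write_payload,
				   int will_write_header)
{
	if (will_write_payload)
		return skb_copy(skb, GFP_ATOMIC);	/* private, linear copy */
	if (will_write_header)
		return pskb_copy(skb, GFP_ATOMIC);	/* private header only  */
	return skb_clone(skb, GFP_ATOMIC);		/* shared data          */
}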
+ */ + +struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) +{ +	int headerlen = skb->data - skb->head; +	/* +	 *	Allocate the copy buffer +	 */ +	struct sk_buff *n = alloc_skb(skb->end - skb->head + skb->data_len, +				      gfp_mask); +	if (!n) +		return NULL; + +	/* Set the data pointer */ +	skb_reserve(n, headerlen); +	/* Set the tail pointer and length */ +	skb_put(n, skb->len); +	n->csum	     = skb->csum; +	n->ip_summed = skb->ip_summed; + +	if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) +		BUG(); + +	copy_skb_header(n, skb); +	return n; +} + + +/** + *	pskb_copy	-	create copy of an sk_buff with private head. + *	@skb: buffer to copy + *	@gfp_mask: allocation priority + * + *	Make a copy of both an &sk_buff and part of its data, located + *	in header. Fragmented data remain shared. This is used when + *	the caller wishes to modify only header of &sk_buff and needs + *	private copy of the header to alter. Returns %NULL on failure + *	or the pointer to the buffer on success. + *	The returned buffer has a reference count of 1. + */ + +struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask) +{ +	/* +	 *	Allocate the copy buffer +	 */ +	struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask); + +	if (!n) +		goto out; + +	/* Set the data pointer */ +	skb_reserve(n, skb->data - skb->head); +	/* Set the tail pointer and length */ +	skb_put(n, skb_headlen(skb)); +	/* Copy the bytes */ +	memcpy(n->data, skb->data, n->len); +	n->csum	     = skb->csum; +	n->ip_summed = skb->ip_summed; + +	n->data_len  = skb->data_len; +	n->len	     = skb->len; + +	if (skb_shinfo(skb)->nr_frags) { +		int i; + +		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { +			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; +			get_page(skb_shinfo(n)->frags[i].page); +		} +		skb_shinfo(n)->nr_frags = i; +	} + +	if (skb_shinfo(skb)->frag_list) { +		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; +		skb_clone_fraglist(n); +	} + +	copy_skb_header(n, skb); +out: +	return n; +} + +/** + *	pskb_expand_head - reallocate header of &sk_buff + *	@skb: buffer to reallocate + *	@nhead: room to add at head + *	@ntail: room to add at tail + *	@gfp_mask: allocation priority + * + *	Expands (or creates identical copy, if &nhead and &ntail are zero) + *	header of skb. &sk_buff itself is not changed. &sk_buff MUST have + *	reference count of 1. Returns zero in the case of success or error, + *	if expansion failed. In the last case, &sk_buff is not changed. + * + *	All the pointers pointing into skb header may change and must be + *	reloaded after call to this function. + */ + +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask) +{ +	int i; +	u8 *data; +	int size = nhead + (skb->end - skb->head) + ntail; +	long off; + +	if (skb_shared(skb)) +		BUG(); + +	size = SKB_DATA_ALIGN(size); + +	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); +	if (!data) +		goto nodata; + +	/* Copy only real data... and, alas, header. This should be +	 * optimized for the cases when header is void. 
*/ +	memcpy(data + nhead, skb->head, skb->tail - skb->head); +	memcpy(data + size, skb->end, sizeof(struct skb_shared_info)); + +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +		get_page(skb_shinfo(skb)->frags[i].page); + +	if (skb_shinfo(skb)->frag_list) +		skb_clone_fraglist(skb); + +	skb_release_data(skb); + +	off = (data + nhead) - skb->head; + +	skb->head     = data; +	skb->end      = data + size; +	skb->data    += off; +	skb->tail    += off; +	skb->mac.raw += off; +	skb->h.raw   += off; +	skb->nh.raw  += off; +	skb->cloned   = 0; +	skb->nohdr    = 0; +	atomic_set(&skb_shinfo(skb)->dataref, 1); +	return 0; + +nodata: +	return -ENOMEM; +} + +/* Make private copy of skb with writable head and some headroom */ + +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) +{ +	struct sk_buff *skb2; +	int delta = headroom - skb_headroom(skb); + +	if (delta <= 0) +		skb2 = pskb_copy(skb, GFP_ATOMIC); +	else { +		skb2 = skb_clone(skb, GFP_ATOMIC); +		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, +					     GFP_ATOMIC)) { +			kfree_skb(skb2); +			skb2 = NULL; +		} +	} +	return skb2; +} + + +/** + *	skb_copy_expand	-	copy and expand sk_buff + *	@skb: buffer to copy + *	@newheadroom: new free bytes at head + *	@newtailroom: new free bytes at tail + *	@gfp_mask: allocation priority + * + *	Make a copy of both an &sk_buff and its data and while doing so + *	allocate additional space. + * + *	This is used when the caller wishes to modify the data and needs a + *	private copy of the data to alter as well as more space for new fields. + *	Returns %NULL on failure or the pointer to the buffer + *	on success. The returned buffer has a reference count of 1. + * + *	You must pass %GFP_ATOMIC as the allocation priority if this function + *	is called from an interrupt. + * + *	BUG ALERT: ip_summed is not copied. Why does this work? Is it used + *	only by netfilter in the cases when checksum is recalculated? --ANK + */ +struct sk_buff *skb_copy_expand(const struct sk_buff *skb, +				int newheadroom, int newtailroom, int gfp_mask) +{ +	/* +	 *	Allocate the copy buffer +	 */ +	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, +				      gfp_mask); +	int head_copy_len, head_copy_off; + +	if (!n) +		return NULL; + +	skb_reserve(n, newheadroom); + +	/* Set the tail pointer and length */ +	skb_put(n, skb->len); + +	head_copy_len = skb_headroom(skb); +	head_copy_off = 0; +	if (newheadroom <= head_copy_len) +		head_copy_len = newheadroom; +	else +		head_copy_off = newheadroom - head_copy_len; + +	/* Copy the linear header and data. */ +	if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, +			  skb->len + head_copy_len)) +		BUG(); + +	copy_skb_header(n, skb); + +	return n; +} + +/** + *	skb_pad			-	zero pad the tail of an skb + *	@skb: buffer to pad + *	@pad: space to pad + * + *	Ensure that a buffer is followed by a padding area that is zero + *	filled. Used by network drivers which may DMA or transfer data + *	beyond the buffer end onto the wire. + * + *	May return NULL in out of memory cases. + */ +  +struct sk_buff *skb_pad(struct sk_buff *skb, int pad) +{ +	struct sk_buff *nskb; +	 +	/* If the skbuff is non linear tailroom is always zero.. */ +	if (skb_tailroom(skb) >= pad) { +		memset(skb->data+skb->len, 0, pad); +		return skb; +	} +	 +	nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC); +	kfree_skb(skb); +	if (nskb) +		memset(nskb->data+nskb->len, 0, pad); +	return nskb; +}	 +  +/* Trims skb to length len. 
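/*
 * Editor's sketch, not part of the commit: making sure there is private
 * headroom before prepending a new header.  If the buffer is shared or
 * cloned, or simply too tight, skb_realloc_headroom() above hands back
 * a private copy with at least the requested headroom; the old
 * reference is dropped and the caller continues with the copy.
 */
#include <linux/skbuff.h>

static struct sk_buff *example_make_headroom(struct sk_buff *skb,
					     unsigned int needed)
{
	struct sk_buff *nskb;

	if (!skb_shared(skb) && !skb_cloned(skb) &&
	    skb_headroom(skb) >= needed)
		return skb;		/* already private and roomy enough */

	nskb = skb_realloc_headroom(skb, needed);
	kfree_skb(skb);			/* drop the old reference */
	return nskb;			/* NULL on allocation failure */
}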
It can change skb pointers, if "realloc" is 1. + * If realloc==0 and trimming is impossible without change of data, + * it is BUG(). + */ + +int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) +{ +	int offset = skb_headlen(skb); +	int nfrags = skb_shinfo(skb)->nr_frags; +	int i; + +	for (i = 0; i < nfrags; i++) { +		int end = offset + skb_shinfo(skb)->frags[i].size; +		if (end > len) { +			if (skb_cloned(skb)) { +				if (!realloc) +					BUG(); +				if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) +					return -ENOMEM; +			} +			if (len <= offset) { +				put_page(skb_shinfo(skb)->frags[i].page); +				skb_shinfo(skb)->nr_frags--; +			} else { +				skb_shinfo(skb)->frags[i].size = len - offset; +			} +		} +		offset = end; +	} + +	if (offset < len) { +		skb->data_len -= skb->len - len; +		skb->len       = len; +	} else { +		if (len <= skb_headlen(skb)) { +			skb->len      = len; +			skb->data_len = 0; +			skb->tail     = skb->data + len; +			if (skb_shinfo(skb)->frag_list && !skb_cloned(skb)) +				skb_drop_fraglist(skb); +		} else { +			skb->data_len -= skb->len - len; +			skb->len       = len; +		} +	} + +	return 0; +} + +/** + *	__pskb_pull_tail - advance tail of skb header + *	@skb: buffer to reallocate + *	@delta: number of bytes to advance tail + * + *	The function makes a sense only on a fragmented &sk_buff, + *	it expands header moving its tail forward and copying necessary + *	data from fragmented part. + * + *	&sk_buff MUST have reference count of 1. + * + *	Returns %NULL (and &sk_buff does not change) if pull failed + *	or value of new tail of skb in the case of success. + * + *	All the pointers pointing into skb header may change and must be + *	reloaded after call to this function. + */ + +/* Moves tail of skb head forward, copying data from fragmented part, + * when it is necessary. + * 1. It may fail due to malloc failure. + * 2. It may change skb pointers. + * + * It is pretty complicated. Luckily, it is called only in exceptional cases. + */ +unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) +{ +	/* If skb has not enough free space at tail, get new one +	 * plus 128 bytes for future expansions. If we have enough +	 * room at tail, reallocate without expansion only if skb is cloned. +	 */ +	int i, k, eat = (skb->tail + delta) - skb->end; + +	if (eat > 0 || skb_cloned(skb)) { +		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, +				     GFP_ATOMIC)) +			return NULL; +	} + +	if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta)) +		BUG(); + +	/* Optimization: no fragments, no reasons to preestimate +	 * size of pulled pages. Superb. +	 */ +	if (!skb_shinfo(skb)->frag_list) +		goto pull_pages; + +	/* Estimate size of pulled pages. */ +	eat = delta; +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { +		if (skb_shinfo(skb)->frags[i].size >= eat) +			goto pull_pages; +		eat -= skb_shinfo(skb)->frags[i].size; +	} + +	/* If we need update frag list, we are in troubles. +	 * Certainly, it possible to add an offset to skb data, +	 * but taking into account that pulling is expected to +	 * be very rare operation, it is worth to fight against +	 * further bloating skb head and crucify ourselves here instead. +	 * Pure masohism, indeed. 8)8) +	 */ +	if (eat) { +		struct sk_buff *list = skb_shinfo(skb)->frag_list; +		struct sk_buff *clone = NULL; +		struct sk_buff *insp = NULL; + +		do { +			if (!list) +				BUG(); + +			if (list->len <= eat) { +				/* Eaten as whole. 
*/ +				eat -= list->len; +				list = list->next; +				insp = list; +			} else { +				/* Eaten partially. */ + +				if (skb_shared(list)) { +					/* Sucks! We need to fork list. :-( */ +					clone = skb_clone(list, GFP_ATOMIC); +					if (!clone) +						return NULL; +					insp = list->next; +					list = clone; +				} else { +					/* This may be pulled without +					 * problems. */ +					insp = list; +				} +				if (!pskb_pull(list, eat)) { +					if (clone) +						kfree_skb(clone); +					return NULL; +				} +				break; +			} +		} while (eat); + +		/* Free pulled out fragments. */ +		while ((list = skb_shinfo(skb)->frag_list) != insp) { +			skb_shinfo(skb)->frag_list = list->next; +			kfree_skb(list); +		} +		/* And insert new clone at head. */ +		if (clone) { +			clone->next = list; +			skb_shinfo(skb)->frag_list = clone; +		} +	} +	/* Success! Now we may commit changes to skb data. */ + +pull_pages: +	eat = delta; +	k = 0; +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { +		if (skb_shinfo(skb)->frags[i].size <= eat) { +			put_page(skb_shinfo(skb)->frags[i].page); +			eat -= skb_shinfo(skb)->frags[i].size; +		} else { +			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; +			if (eat) { +				skb_shinfo(skb)->frags[k].page_offset += eat; +				skb_shinfo(skb)->frags[k].size -= eat; +				eat = 0; +			} +			k++; +		} +	} +	skb_shinfo(skb)->nr_frags = k; + +	skb->tail     += delta; +	skb->data_len -= delta; + +	return skb->tail; +} + +/* Copy some data bits from skb to kernel buffer. */ + +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) +{ +	int i, copy; +	int start = skb_headlen(skb); + +	if (offset > (int)skb->len - len) +		goto fault; + +	/* Copy header. */ +	if ((copy = start - offset) > 0) { +		if (copy > len) +			copy = len; +		memcpy(to, skb->data + offset, copy); +		if ((len -= copy) == 0) +			return 0; +		offset += copy; +		to     += copy; +	} + +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { +		int end; + +		BUG_TRAP(start <= offset + len); + +		end = start + skb_shinfo(skb)->frags[i].size; +		if ((copy = end - offset) > 0) { +			u8 *vaddr; + +			if (copy > len) +				copy = len; + +			vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); +			memcpy(to, +			       vaddr + skb_shinfo(skb)->frags[i].page_offset+ +			       offset - start, copy); +			kunmap_skb_frag(vaddr); + +			if ((len -= copy) == 0) +				return 0; +			offset += copy; +			to     += copy; +		} +		start = end; +	} + +	if (skb_shinfo(skb)->frag_list) { +		struct sk_buff *list = skb_shinfo(skb)->frag_list; + +		for (; list; list = list->next) { +			int end; + +			BUG_TRAP(start <= offset + len); + +			end = start + list->len; +			if ((copy = end - offset) > 0) { +				if (copy > len) +					copy = len; +				if (skb_copy_bits(list, offset - start, +						  to, copy)) +					goto fault; +				if ((len -= copy) == 0) +					return 0; +				offset += copy; +				to     += copy; +			} +			start = end; +		} +	} +	if (!len) +		return 0; + +fault: +	return -EFAULT; +} + +/* Checksum skb data. */ + +unsigned int skb_checksum(const struct sk_buff *skb, int offset, +			  int len, unsigned int csum) +{ +	int start = skb_headlen(skb); +	int i, copy = start - offset; +	int pos = 0; + +	/* Checksum header. 
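/*
 * Editor's sketch, not part of the commit: skb_copy_bits() above is the
 * safe way to read a region that may live in page fragments or on the
 * frag_list rather than in the linear head.  Here a hypothetical 8-byte
 * header at a given offset is copied onto the stack before inspection.
 */
#include <linux/types.h>
#include <linux/skbuff.h>

static int example_peek_header(const struct sk_buff *skb, int offset,
			       u8 hdr[8])
{
	/* Returns -EFAULT if the skb holds fewer than offset + 8 bytes. */
	return skb_copy_bits(skb, offset, hdr, 8);
}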
*/ +	if (copy > 0) { +		if (copy > len) +			copy = len; +		csum = csum_partial(skb->data + offset, copy, csum); +		if ((len -= copy) == 0) +			return csum; +		offset += copy; +		pos	= copy; +	} + +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { +		int end; + +		BUG_TRAP(start <= offset + len); + +		end = start + skb_shinfo(skb)->frags[i].size; +		if ((copy = end - offset) > 0) { +			unsigned int csum2; +			u8 *vaddr; +			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + +			if (copy > len) +				copy = len; +			vaddr = kmap_skb_frag(frag); +			csum2 = csum_partial(vaddr + frag->page_offset + +					     offset - start, copy, 0); +			kunmap_skb_frag(vaddr); +			csum = csum_block_add(csum, csum2, pos); +			if (!(len -= copy)) +				return csum; +			offset += copy; +			pos    += copy; +		} +		start = end; +	} + +	if (skb_shinfo(skb)->frag_list) { +		struct sk_buff *list = skb_shinfo(skb)->frag_list; + +		for (; list; list = list->next) { +			int end; + +			BUG_TRAP(start <= offset + len); + +			end = start + list->len; +			if ((copy = end - offset) > 0) { +				unsigned int csum2; +				if (copy > len) +					copy = len; +				csum2 = skb_checksum(list, offset - start, +						     copy, 0); +				csum = csum_block_add(csum, csum2, pos); +				if ((len -= copy) == 0) +					return csum; +				offset += copy; +				pos    += copy; +			} +			start = end; +		} +	} +	if (len) +		BUG(); + +	return csum; +} + +/* Both of above in one bottle. */ + +unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, +				    u8 *to, int len, unsigned int csum) +{ +	int start = skb_headlen(skb); +	int i, copy = start - offset; +	int pos = 0; + +	/* Copy header. */ +	if (copy > 0) { +		if (copy > len) +			copy = len; +		csum = csum_partial_copy_nocheck(skb->data + offset, to, +						 copy, csum); +		if ((len -= copy) == 0) +			return csum; +		offset += copy; +		to     += copy; +		pos	= copy; +	} + +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { +		int end; + +		BUG_TRAP(start <= offset + len); + +		end = start + skb_shinfo(skb)->frags[i].size; +		if ((copy = end - offset) > 0) { +			unsigned int csum2; +			u8 *vaddr; +			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + +			if (copy > len) +				copy = len; +			vaddr = kmap_skb_frag(frag); +			csum2 = csum_partial_copy_nocheck(vaddr + +							  frag->page_offset + +							  offset - start, to, +							  copy, 0); +			kunmap_skb_frag(vaddr); +			csum = csum_block_add(csum, csum2, pos); +			if (!(len -= copy)) +				return csum; +			offset += copy; +			to     += copy; +			pos    += copy; +		} +		start = end; +	} + +	if (skb_shinfo(skb)->frag_list) { +		struct sk_buff *list = skb_shinfo(skb)->frag_list; + +		for (; list; list = list->next) { +			unsigned int csum2; +			int end; + +			BUG_TRAP(start <= offset + len); + +			end = start + list->len; +			if ((copy = end - offset) > 0) { +				if (copy > len) +					copy = len; +				csum2 = skb_copy_and_csum_bits(list, +							       offset - start, +							       to, copy, 0); +				csum = csum_block_add(csum, csum2, pos); +				if ((len -= copy) == 0) +					return csum; +				offset += copy; +				to     += copy; +				pos    += copy; +			} +			start = end; +		} +	} +	if (len) +		BUG(); +	return csum; +} + +void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) +{ +	unsigned int csum; +	long csstart; + +	if (skb->ip_summed == CHECKSUM_HW) +		csstart = skb->h.raw - skb->data; +	else +		csstart = skb_headlen(skb); + +	if (csstart > skb_headlen(skb)) +		BUG(); + +	memcpy(to, skb->data, csstart); + +	csum = 0; +	if 
(csstart != skb->len) +		csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, +					      skb->len - csstart, 0); + +	if (skb->ip_summed == CHECKSUM_HW) { +		long csstuff = csstart + skb->csum; + +		*((unsigned short *)(to + csstuff)) = csum_fold(csum); +	} +} + +/** + *	skb_dequeue - remove from the head of the queue + *	@list: list to dequeue from + * + *	Remove the head of the list. The list lock is taken so the function + *	may be used safely with other locking list functions. The head item is + *	returned or %NULL if the list is empty. + */ + +struct sk_buff *skb_dequeue(struct sk_buff_head *list) +{ +	unsigned long flags; +	struct sk_buff *result; + +	spin_lock_irqsave(&list->lock, flags); +	result = __skb_dequeue(list); +	spin_unlock_irqrestore(&list->lock, flags); +	return result; +} + +/** + *	skb_dequeue_tail - remove from the tail of the queue + *	@list: list to dequeue from + * + *	Remove the tail of the list. The list lock is taken so the function + *	may be used safely with other locking list functions. The tail item is + *	returned or %NULL if the list is empty. + */ +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) +{ +	unsigned long flags; +	struct sk_buff *result; + +	spin_lock_irqsave(&list->lock, flags); +	result = __skb_dequeue_tail(list); +	spin_unlock_irqrestore(&list->lock, flags); +	return result; +} + +/** + *	skb_queue_purge - empty a list + *	@list: list to empty + * + *	Delete all buffers on an &sk_buff list. Each buffer is removed from + *	the list and one reference dropped. This function takes the list + *	lock and is atomic with respect to other list locking functions. + */ +void skb_queue_purge(struct sk_buff_head *list) +{ +	struct sk_buff *skb; +	while ((skb = skb_dequeue(list)) != NULL) +		kfree_skb(skb); +} + +/** + *	skb_queue_head - queue a buffer at the list head + *	@list: list to use + *	@newsk: buffer to queue + * + *	Queue a buffer at the start of the list. This function takes the + *	list lock and can be used safely with other locking &sk_buff functions + *	safely. + * + *	A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) +{ +	unsigned long flags; + +	spin_lock_irqsave(&list->lock, flags); +	__skb_queue_head(list, newsk); +	spin_unlock_irqrestore(&list->lock, flags); +} + +/** + *	skb_queue_tail - queue a buffer at the list tail + *	@list: list to use + *	@newsk: buffer to queue + * + *	Queue a buffer at the tail of the list. This function takes the + *	list lock and can be used safely with other locking &sk_buff functions + *	safely. + * + *	A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) +{ +	unsigned long flags; + +	spin_lock_irqsave(&list->lock, flags); +	__skb_queue_tail(list, newsk); +	spin_unlock_irqrestore(&list->lock, flags); +} +/** + *	skb_unlink	-	remove a buffer from a list + *	@skb: buffer to remove + * + *	Place a packet after a given packet in a list. The list locks are taken + *	and this function is atomic with respect to other list locked calls + * + *	Works even without knowing the list it is sitting on, which can be + *	handy at times. It also means that THE LIST MUST EXIST when you + *	unlink. Thus a list must have its contents unlinked before it is + *	destroyed. 
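/*
 * Editor's sketch, not part of the commit: the locked helpers above
 * used as a small deferred-processing queue.  skb_queue_tail() and
 * skb_dequeue() take the list spinlock themselves, so producer and
 * consumer need no extra locking; skb_queue_purge() empties the queue
 * on teardown.  The example_* names are hypothetical.
 */
#include <linux/skbuff.h>

static struct sk_buff_head example_backlog;

static void example_backlog_init(void)
{
	skb_queue_head_init(&example_backlog);
}

static void example_enqueue(struct sk_buff *skb)	/* producer side */
{
	skb_queue_tail(&example_backlog, skb);
}

static void example_drain(void)				/* consumer side */
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(&example_backlog)) != NULL)
		kfree_skb(skb);		/* real code would process it here */
}

static void example_teardown(void)
{
	skb_queue_purge(&example_backlog);
}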
+ */ +void skb_unlink(struct sk_buff *skb) +{ +	struct sk_buff_head *list = skb->list; + +	if (list) { +		unsigned long flags; + +		spin_lock_irqsave(&list->lock, flags); +		if (skb->list == list) +			__skb_unlink(skb, skb->list); +		spin_unlock_irqrestore(&list->lock, flags); +	} +} + + +/** + *	skb_append	-	append a buffer + *	@old: buffer to insert after + *	@newsk: buffer to insert + * + *	Place a packet after a given packet in a list. The list locks are taken + *	and this function is atomic with respect to other list locked calls. + *	A buffer cannot be placed on two lists at the same time. + */ + +void skb_append(struct sk_buff *old, struct sk_buff *newsk) +{ +	unsigned long flags; + +	spin_lock_irqsave(&old->list->lock, flags); +	__skb_append(old, newsk); +	spin_unlock_irqrestore(&old->list->lock, flags); +} + + +/** + *	skb_insert	-	insert a buffer + *	@old: buffer to insert before + *	@newsk: buffer to insert + * + *	Place a packet before a given packet in a list. The list locks are taken + *	and this function is atomic with respect to other list locked calls + *	A buffer cannot be placed on two lists at the same time. + */ + +void skb_insert(struct sk_buff *old, struct sk_buff *newsk) +{ +	unsigned long flags; + +	spin_lock_irqsave(&old->list->lock, flags); +	__skb_insert(newsk, old->prev, old, old->list); +	spin_unlock_irqrestore(&old->list->lock, flags); +} + +#if 0 +/* + * 	Tune the memory allocator for a new MTU size. + */ +void skb_add_mtu(int mtu) +{ +	/* Must match allocation in alloc_skb */ +	mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info); + +	kmem_add_cache_size(mtu); +} +#endif + +static inline void skb_split_inside_header(struct sk_buff *skb, +					   struct sk_buff* skb1, +					   const u32 len, const int pos) +{ +	int i; + +	memcpy(skb_put(skb1, pos - len), skb->data + len, pos - len); + +	/* And move data appendix as is. */ +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +		skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + +	skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; +	skb_shinfo(skb)->nr_frags  = 0; +	skb1->data_len		   = skb->data_len; +	skb1->len		   += skb1->data_len; +	skb->data_len		   = 0; +	skb->len		   = len; +	skb->tail		   = skb->data + len; +} + +static inline void skb_split_no_header(struct sk_buff *skb, +				       struct sk_buff* skb1, +				       const u32 len, int pos) +{ +	int i, k = 0; +	const int nfrags = skb_shinfo(skb)->nr_frags; + +	skb_shinfo(skb)->nr_frags = 0; +	skb1->len		  = skb1->data_len = skb->len - len; +	skb->len		  = len; +	skb->data_len		  = len - pos; + +	for (i = 0; i < nfrags; i++) { +		int size = skb_shinfo(skb)->frags[i].size; + +		if (pos + size > len) { +			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; + +			if (pos < len) { +				/* Split frag. +				 * We have two variants in this case: +				 * 1. Move all the frag to the second +				 *    part, if it is possible. F.e. +				 *    this approach is mandatory for TUX, +				 *    where splitting is expensive. +				 * 2. Split is accurately. We make this. +				 */ +				get_page(skb_shinfo(skb)->frags[i].page); +				skb_shinfo(skb1)->frags[0].page_offset += len - pos; +				skb_shinfo(skb1)->frags[0].size -= len - pos; +				skb_shinfo(skb)->frags[i].size	= len - pos; +				skb_shinfo(skb)->nr_frags++; +			} +			k++; +		} else +			skb_shinfo(skb)->nr_frags++; +		pos += size; +	} +	skb_shinfo(skb1)->nr_frags = k; +} + +/** + * skb_split - Split fragmented skb to two parts at length len. 
+ * @skb: the buffer to split + * @skb1: the buffer to receive the second part + * @len: new length for skb + */ +void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) +{ +	int pos = skb_headlen(skb); + +	if (len < pos)	/* Split line is inside header. */ +		skb_split_inside_header(skb, skb1, len, pos); +	else		/* Second chunk has no header, nothing to copy. */ +		skb_split_no_header(skb, skb1, len, pos); +} + +void __init skb_init(void) +{ +	skbuff_head_cache = kmem_cache_create("skbuff_head_cache", +					      sizeof(struct sk_buff), +					      0, +					      SLAB_HWCACHE_ALIGN, +					      NULL, NULL); +	if (!skbuff_head_cache) +		panic("cannot create skbuff cache"); +} + +EXPORT_SYMBOL(___pskb_trim); +EXPORT_SYMBOL(__kfree_skb); +EXPORT_SYMBOL(__pskb_pull_tail); +EXPORT_SYMBOL(alloc_skb); +EXPORT_SYMBOL(pskb_copy); +EXPORT_SYMBOL(pskb_expand_head); +EXPORT_SYMBOL(skb_checksum); +EXPORT_SYMBOL(skb_clone); +EXPORT_SYMBOL(skb_clone_fraglist); +EXPORT_SYMBOL(skb_copy); +EXPORT_SYMBOL(skb_copy_and_csum_bits); +EXPORT_SYMBOL(skb_copy_and_csum_dev); +EXPORT_SYMBOL(skb_copy_bits); +EXPORT_SYMBOL(skb_copy_expand); +EXPORT_SYMBOL(skb_over_panic); +EXPORT_SYMBOL(skb_pad); +EXPORT_SYMBOL(skb_realloc_headroom); +EXPORT_SYMBOL(skb_under_panic); +EXPORT_SYMBOL(skb_dequeue); +EXPORT_SYMBOL(skb_dequeue_tail); +EXPORT_SYMBOL(skb_insert); +EXPORT_SYMBOL(skb_queue_purge); +EXPORT_SYMBOL(skb_queue_head); +EXPORT_SYMBOL(skb_queue_tail); +EXPORT_SYMBOL(skb_unlink); +EXPORT_SYMBOL(skb_append); +EXPORT_SYMBOL(skb_split); diff --git a/net/core/sock.c b/net/core/sock.c new file mode 100644 index 00000000000..629ab4a5b45 --- /dev/null +++ b/net/core/sock.c @@ -0,0 +1,1565 @@ +/* + * INET		An implementation of the TCP/IP protocol suite for the LINUX + *		operating system.  INET is implemented using the  BSD Socket + *		interface as the means of communication with the user level. + * + *		Generic socket support routines. Memory allocators, socket lock/release + *		handler for protocols to use and generic option handler. + * + * + * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $ + * + * Authors:	Ross Biro, <bir7@leland.Stanford.Edu> + *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + *		Florian La Roche, <flla@stud.uni-sb.de> + *		Alan Cox, <A.Cox@swansea.ac.uk> + * + * Fixes: + *		Alan Cox	: 	Numerous verify_area() problems + *		Alan Cox	:	Connecting on a connecting socket + *					now returns an error for tcp. + *		Alan Cox	:	sock->protocol is set correctly. + *					and is not sometimes left as 0. + *		Alan Cox	:	connect handles icmp errors on a + *					connect properly. Unfortunately there + *					is a restart syscall nasty there. I + *					can't match BSD without hacking the C + *					library. Ideas urgently sought! + *		Alan Cox	:	Disallow bind() to addresses that are + *					not ours - especially broadcast ones!! + *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost) + *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets, + *					instead they leave that for the DESTROY timer. + *		Alan Cox	:	Clean up error flag in accept + *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer + *					was buggy. Put a remove_sock() in the handler + *					for memory when we hit 0. Also altered the timer + *					code. The ACK stuff can wait and needs major  + *					TCP layer surgery. + *		Alan Cox	:	Fixed TCP ack bug, removed remove sock + *					and fixed timer/inet_bh race. 
+ *		Alan Cox	:	Added zapped flag for TCP + *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code + *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb + *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources + *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing. + *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... + *		Rick Sladkey	:	Relaxed UDP rules for matching packets. + *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support + *	Pauline Middelink	:	identd support + *		Alan Cox	:	Fixed connect() taking signals I think. + *		Alan Cox	:	SO_LINGER supported + *		Alan Cox	:	Error reporting fixes + *		Anonymous	:	inet_create tidied up (sk->reuse setting) + *		Alan Cox	:	inet sockets don't set sk->type! + *		Alan Cox	:	Split socket option code + *		Alan Cox	:	Callbacks + *		Alan Cox	:	Nagle flag for Charles & Johannes stuff + *		Alex		:	Removed restriction on inet fioctl + *		Alan Cox	:	Splitting INET from NET core + *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt() + *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code + *		Alan Cox	:	Split IP from generic code + *		Alan Cox	:	New kfree_skbmem() + *		Alan Cox	:	Make SO_DEBUG superuser only. + *		Alan Cox	:	Allow anyone to clear SO_DEBUG + *					(compatibility fix) + *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput. + *		Alan Cox	:	Allocator for a socket is settable. + *		Alan Cox	:	SO_ERROR includes soft errors. + *		Alan Cox	:	Allow NULL arguments on some SO_ opts + *		Alan Cox	: 	Generic socket allocation to make hooks + *					easier (suggested by Craig Metz). + *		Michael Pall	:	SO_ERROR returns positive errno again + *              Steve Whitehouse:       Added default destructor to free + *                                      protocol private data. + *              Steve Whitehouse:       Added various other default routines + *                                      common to several socket families. + *              Chris Evans     :       Call suser() check last on F_SETOWN + *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER. + *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s() + *		Andi Kleen	:	Fix write_space callback + *		Chris Evans	:	Security fixes - signedness again + *		Arnaldo C. Melo :       cleanups, use skb_queue_purge + * + * To Fix: + * + * + *		This program is free software; you can redistribute it and/or + *		modify it under the terms of the GNU General Public License + *		as published by the Free Software Foundation; either version + *		2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/poll.h> +#include <linux/tcp.h> +#include <linux/init.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +#include <linux/netdevice.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/xfrm.h> +#include <linux/ipsec.h> + +#include <linux/filter.h> + +#ifdef CONFIG_INET +#include <net/tcp.h> +#endif + +/* Take into consideration the size of the struct sk_buff overhead in the + * determination of these values, since that is non-constant across + * platforms.  This makes socket queueing behavior and performance + * not depend upon such differences. + */ +#define _SK_MEM_PACKETS		256 +#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256) +#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) + +/* Run time adjustable parameters. */ +__u32 sysctl_wmem_max = SK_WMEM_MAX; +__u32 sysctl_rmem_max = SK_RMEM_MAX; +__u32 sysctl_wmem_default = SK_WMEM_MAX; +__u32 sysctl_rmem_default = SK_RMEM_MAX; + +/* Maximal space eaten by iovec or ancilliary data plus some space */ +int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512); + +static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) +{ +	struct timeval tv; + +	if (optlen < sizeof(tv)) +		return -EINVAL; +	if (copy_from_user(&tv, optval, sizeof(tv))) +		return -EFAULT; + +	*timeo_p = MAX_SCHEDULE_TIMEOUT; +	if (tv.tv_sec == 0 && tv.tv_usec == 0) +		return 0; +	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) +		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); +	return 0; +} + +static void sock_warn_obsolete_bsdism(const char *name) +{ +	static int warned; +	static char warncomm[TASK_COMM_LEN]; +	if (strcmp(warncomm, current->comm) && warned < 5) {  +		strcpy(warncomm,  current->comm);  +		printk(KERN_WARNING "process `%s' is using obsolete " +		       "%s SO_BSDCOMPAT\n", warncomm, name); +		warned++; +	} +} + +static void sock_disable_timestamp(struct sock *sk) +{	 +	if (sock_flag(sk, SOCK_TIMESTAMP)) {  +		sock_reset_flag(sk, SOCK_TIMESTAMP); +		net_disable_timestamp(); +	} +} + + +/* + *	This is meant for all protocols to use and covers goings on + *	at the socket level. Everything here is generic. + */ + +int sock_setsockopt(struct socket *sock, int level, int optname, +		    char __user *optval, int optlen) +{ +	struct sock *sk=sock->sk; +	struct sk_filter *filter; +	int val; +	int valbool; +	struct linger ling; +	int ret = 0; +	 +	/* +	 *	Options without arguments +	 */ + +#ifdef SO_DONTLINGER		/* Compatibility item... 
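/*
 * Editor's sketch, not part of the commit: the userspace side of
 * sock_set_timeout() above.  A zeroed timeval means "wait forever"
 * (MAX_SCHEDULE_TIMEOUT); anything else is converted to jiffies with
 * the microseconds rounded up to the next tick.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>

static int example_set_rcv_timeout(int fd, long sec, long usec)
{
	struct timeval tv;

	memset(&tv, 0, sizeof(tv));
	tv.tv_sec  = sec;
	tv.tv_usec = usec;
	/* Blocking reads on fd now fail with EAGAIN once the timer expires. */
	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}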
*/ +	switch (optname) { +		case SO_DONTLINGER: +			sock_reset_flag(sk, SOCK_LINGER); +			return 0; +	} +#endif	 +		 +  	if(optlen<sizeof(int)) +  		return(-EINVAL); +  	 +	if (get_user(val, (int __user *)optval)) +		return -EFAULT; +	 +  	valbool = val?1:0; + +	lock_sock(sk); + +  	switch(optname)  +  	{ +		case SO_DEBUG:	 +			if(val && !capable(CAP_NET_ADMIN)) +			{ +				ret = -EACCES; +			} +			else if (valbool) +				sock_set_flag(sk, SOCK_DBG); +			else +				sock_reset_flag(sk, SOCK_DBG); +			break; +		case SO_REUSEADDR: +			sk->sk_reuse = valbool; +			break; +		case SO_TYPE: +		case SO_ERROR: +			ret = -ENOPROTOOPT; +		  	break; +		case SO_DONTROUTE: +			if (valbool) +				sock_set_flag(sk, SOCK_LOCALROUTE); +			else +				sock_reset_flag(sk, SOCK_LOCALROUTE); +			break; +		case SO_BROADCAST: +			sock_valbool_flag(sk, SOCK_BROADCAST, valbool); +			break; +		case SO_SNDBUF: +			/* Don't error on this BSD doesn't and if you think +			   about it this is right. Otherwise apps have to +			   play 'guess the biggest size' games. RCVBUF/SNDBUF +			   are treated in BSD as hints */ +			    +			if (val > sysctl_wmem_max) +				val = sysctl_wmem_max; + +			sk->sk_userlocks |= SOCK_SNDBUF_LOCK; +			if ((val * 2) < SOCK_MIN_SNDBUF) +				sk->sk_sndbuf = SOCK_MIN_SNDBUF; +			else +				sk->sk_sndbuf = val * 2; + +			/* +			 *	Wake up sending tasks if we +			 *	upped the value. +			 */ +			sk->sk_write_space(sk); +			break; + +		case SO_RCVBUF: +			/* Don't error on this BSD doesn't and if you think +			   about it this is right. Otherwise apps have to +			   play 'guess the biggest size' games. RCVBUF/SNDBUF +			   are treated in BSD as hints */ +			   +			if (val > sysctl_rmem_max) +				val = sysctl_rmem_max; + +			sk->sk_userlocks |= SOCK_RCVBUF_LOCK; +			/* FIXME: is this lower bound the right one? */ +			if ((val * 2) < SOCK_MIN_RCVBUF) +				sk->sk_rcvbuf = SOCK_MIN_RCVBUF; +			else +				sk->sk_rcvbuf = val * 2; +			break; + +		case SO_KEEPALIVE: +#ifdef CONFIG_INET +			if (sk->sk_protocol == IPPROTO_TCP) +				tcp_set_keepalive(sk, valbool); +#endif +			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); +			break; + +	 	case SO_OOBINLINE: +			sock_valbool_flag(sk, SOCK_URGINLINE, valbool); +			break; + +	 	case SO_NO_CHECK: +			sk->sk_no_check = valbool; +			break; + +		case SO_PRIORITY: +			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))  +				sk->sk_priority = val; +			else +				ret = -EPERM; +			break; + +		case SO_LINGER: +			if(optlen<sizeof(ling)) { +				ret = -EINVAL;	/* 1003.1g */ +				break; +			} +			if (copy_from_user(&ling,optval,sizeof(ling))) { +				ret = -EFAULT; +				break; +			} +			if (!ling.l_onoff) +				sock_reset_flag(sk, SOCK_LINGER); +			else { +#if (BITS_PER_LONG == 32) +				if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) +					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; +				else +#endif +					sk->sk_lingertime = ling.l_linger * HZ; +				sock_set_flag(sk, SOCK_LINGER); +			} +			break; + +		case SO_BSDCOMPAT: +			sock_warn_obsolete_bsdism("setsockopt"); +			break; + +		case SO_PASSCRED: +			if (valbool) +				set_bit(SOCK_PASSCRED, &sock->flags); +			else +				clear_bit(SOCK_PASSCRED, &sock->flags); +			break; + +		case SO_TIMESTAMP: +			if (valbool)  { +				sock_set_flag(sk, SOCK_RCVTSTAMP); +				sock_enable_timestamp(sk); +			} else +				sock_reset_flag(sk, SOCK_RCVTSTAMP); +			break; + +		case SO_RCVLOWAT: +			if (val < 0) +				val = INT_MAX; +			sk->sk_rcvlowat = val ? 
: 1; +			break; + +		case SO_RCVTIMEO: +			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); +			break; + +		case SO_SNDTIMEO: +			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); +			break; + +#ifdef CONFIG_NETDEVICES +		case SO_BINDTODEVICE: +		{ +			char devname[IFNAMSIZ];  + +			/* Sorry... */  +			if (!capable(CAP_NET_RAW)) { +				ret = -EPERM; +				break; +			} + +			/* Bind this socket to a particular device like "eth0", +			 * as specified in the passed interface name. If the +			 * name is "" or the option length is zero the socket  +			 * is not bound.  +			 */  + +			if (!valbool) { +				sk->sk_bound_dev_if = 0; +			} else { +				if (optlen > IFNAMSIZ)  +					optlen = IFNAMSIZ;  +				if (copy_from_user(devname, optval, optlen)) { +					ret = -EFAULT; +					break; +				} + +				/* Remove any cached route for this socket. */ +				sk_dst_reset(sk); + +				if (devname[0] == '\0') { +					sk->sk_bound_dev_if = 0; +				} else { +					struct net_device *dev = dev_get_by_name(devname); +					if (!dev) { +						ret = -ENODEV; +						break; +					} +					sk->sk_bound_dev_if = dev->ifindex; +					dev_put(dev); +				} +			} +			break; +		} +#endif + + +		case SO_ATTACH_FILTER: +			ret = -EINVAL; +			if (optlen == sizeof(struct sock_fprog)) { +				struct sock_fprog fprog; + +				ret = -EFAULT; +				if (copy_from_user(&fprog, optval, sizeof(fprog))) +					break; + +				ret = sk_attach_filter(&fprog, sk); +			} +			break; + +		case SO_DETACH_FILTER: +			spin_lock_bh(&sk->sk_lock.slock); +			filter = sk->sk_filter; +                        if (filter) { +				sk->sk_filter = NULL; +				spin_unlock_bh(&sk->sk_lock.slock); +				sk_filter_release(sk, filter); +				break; +			} +			spin_unlock_bh(&sk->sk_lock.slock); +			ret = -ENONET; +			break; + +		/* We implement the SO_SNDLOWAT etc to +		   not be settable (1003.1g 5.3) */ +		default: +		  	ret = -ENOPROTOOPT; +			break; +  	} +	release_sock(sk); +	return ret; +} + + +int sock_getsockopt(struct socket *sock, int level, int optname, +		    char __user *optval, int __user *optlen) +{ +	struct sock *sk = sock->sk; +	 +	union +	{ +  		int val; +  		struct linger ling; +		struct timeval tm; +	} v; +	 +	unsigned int lv = sizeof(int); +	int len; +  	 +  	if(get_user(len,optlen)) +  		return -EFAULT; +	if(len < 0) +		return -EINVAL; +		 +  	switch(optname)  +  	{ +		case SO_DEBUG:		 +			v.val = sock_flag(sk, SOCK_DBG); +			break; +		 +		case SO_DONTROUTE: +			v.val = sock_flag(sk, SOCK_LOCALROUTE); +			break; +		 +		case SO_BROADCAST: +			v.val = !!sock_flag(sk, SOCK_BROADCAST); +			break; + +		case SO_SNDBUF: +			v.val = sk->sk_sndbuf; +			break; +		 +		case SO_RCVBUF: +			v.val = sk->sk_rcvbuf; +			break; + +		case SO_REUSEADDR: +			v.val = sk->sk_reuse; +			break; + +		case SO_KEEPALIVE: +			v.val = !!sock_flag(sk, SOCK_KEEPOPEN); +			break; + +		case SO_TYPE: +			v.val = sk->sk_type;		  		 +			break; + +		case SO_ERROR: +			v.val = -sock_error(sk); +			if(v.val==0) +				v.val = xchg(&sk->sk_err_soft, 0); +			break; + +		case SO_OOBINLINE: +			v.val = !!sock_flag(sk, SOCK_URGINLINE); +			break; +	 +		case SO_NO_CHECK: +			v.val = sk->sk_no_check; +			break; + +		case SO_PRIORITY: +			v.val = sk->sk_priority; +			break; +		 +		case SO_LINGER:	 +			lv		= sizeof(v.ling); +			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER); + 			v.ling.l_linger	= sk->sk_lingertime / HZ; +			break; +					 +		case SO_BSDCOMPAT: +			sock_warn_obsolete_bsdism("getsockopt"); +			break; + +		case SO_TIMESTAMP: +			v.val = sock_flag(sk, SOCK_RCVTSTAMP); +			
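/*
 * Editor's sketch, not part of the commit: the SO_RCVBUF behaviour
 * implemented above, seen from userspace.  The kernel stores twice the
 * requested value (to cover struct sk_buff overhead), clamped by
 * sysctl_rmem_max, and getsockopt() reports that doubled figure back.
 */
#include <stdio.h>
#include <sys/socket.h>

static void example_show_rcvbuf(int fd)
{
	int asked = 65536, got = 0;
	socklen_t len = sizeof(got);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &asked, sizeof(asked));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);

	/* Typically prints 131072, i.e. 2 * 65536, unless rmem_max clamps. */
	printf("asked for %d, kernel reports %d\n", asked, got);
}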
break; + +		case SO_RCVTIMEO: +			lv=sizeof(struct timeval); +			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { +				v.tm.tv_sec = 0; +				v.tm.tv_usec = 0; +			} else { +				v.tm.tv_sec = sk->sk_rcvtimeo / HZ; +				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; +			} +			break; + +		case SO_SNDTIMEO: +			lv=sizeof(struct timeval); +			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { +				v.tm.tv_sec = 0; +				v.tm.tv_usec = 0; +			} else { +				v.tm.tv_sec = sk->sk_sndtimeo / HZ; +				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; +			} +			break; + +		case SO_RCVLOWAT: +			v.val = sk->sk_rcvlowat; +			break; + +		case SO_SNDLOWAT: +			v.val=1; +			break;  + +		case SO_PASSCRED: +			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0; +			break; + +		case SO_PEERCRED: +			if (len > sizeof(sk->sk_peercred)) +				len = sizeof(sk->sk_peercred); +			if (copy_to_user(optval, &sk->sk_peercred, len)) +				return -EFAULT; +			goto lenout; + +		case SO_PEERNAME: +		{ +			char address[128]; + +			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) +				return -ENOTCONN; +			if (lv < len) +				return -EINVAL; +			if (copy_to_user(optval, address, len)) +				return -EFAULT; +			goto lenout; +		} + +		/* Dubious BSD thing... Probably nobody even uses it, but +		 * the UNIX standard wants it for whatever reason... -DaveM +		 */ +		case SO_ACCEPTCONN: +			v.val = sk->sk_state == TCP_LISTEN; +			break; + +		case SO_PEERSEC: +			return security_socket_getpeersec(sock, optval, optlen, len); + +		default: +			return(-ENOPROTOOPT); +	} +	if (len > lv) +		len = lv; +	if (copy_to_user(optval, &v, len)) +		return -EFAULT; +lenout: +  	if (put_user(len, optlen)) +  		return -EFAULT; +  	return 0; +} + +/** + *	sk_alloc - All socket objects are allocated here + *	@family - protocol family + *	@priority - for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + *	@prot - struct proto associated with this new sock instance + *	@zero_it - if we should zero the newly allocated sock + */ +struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it) +{ +	struct sock *sk = NULL; +	kmem_cache_t *slab = prot->slab; + +	if (slab != NULL) +		sk = kmem_cache_alloc(slab, priority); +	else +		sk = kmalloc(prot->obj_size, priority); + +	if (sk) { +		if (zero_it) { +			memset(sk, 0, prot->obj_size); +			sk->sk_family = family; +			sk->sk_prot = prot; +			sock_lock_init(sk); +		} +		 +		if (security_sk_alloc(sk, family, priority)) { +			kmem_cache_free(slab, sk); +			sk = NULL; +		} else +			__module_get(prot->owner); +	} +	return sk; +} + +void sk_free(struct sock *sk) +{ +	struct sk_filter *filter; +	struct module *owner = sk->sk_prot->owner; + +	if (sk->sk_destruct) +		sk->sk_destruct(sk); + +	filter = sk->sk_filter; +	if (filter) { +		sk_filter_release(sk, filter); +		sk->sk_filter = NULL; +	} + +	sock_disable_timestamp(sk); + +	if (atomic_read(&sk->sk_omem_alloc)) +		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", +		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc)); + +	security_sk_free(sk); +	if (sk->sk_prot->slab != NULL) +		kmem_cache_free(sk->sk_prot->slab, sk); +	else +		kfree(sk); +	module_put(owner); +} + +void __init sk_init(void) +{ +	if (num_physpages <= 4096) { +		sysctl_wmem_max = 32767; +		sysctl_rmem_max = 32767; +		sysctl_wmem_default = 32767; +		sysctl_rmem_default = 32767; +	} else if (num_physpages >= 131072) { +		sysctl_wmem_max = 131071; +		sysctl_rmem_max = 131071; +	} +} + +/* + *	Simple resource managers for sockets. 
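The sock_setsockopt()/sock_getsockopt() pair above covers the generic SOL_SOCKET options. One behaviour worth calling out is that SO_SNDBUF and SO_RCVBUF requests are first clamped to wmem_max/rmem_max and then doubled before being stored, so reading the option back reports roughly twice the requested size. A minimal user-space sketch of that round trip (illustrative only, not part of this commit; it assumes an ordinary Linux libc and a request that fits under wmem_max):

#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int requested = 65536, reported = 0;
	socklen_t len = sizeof(reported);

	if (fd < 0)
		return 1;
	/* The kernel stores val * 2 (capped at wmem_max) in sk_sndbuf. */
	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &requested, sizeof(requested));
	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &reported, &len);
	printf("asked for %d bytes, kernel accounts %d\n", requested, reported);
	return 0;
}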
+ */ + + +/*  + * Write buffer destructor automatically called from kfree_skb.  + */ +void sock_wfree(struct sk_buff *skb) +{ +	struct sock *sk = skb->sk; + +	/* In case it might be waiting for more memory. */ +	atomic_sub(skb->truesize, &sk->sk_wmem_alloc); +	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) +		sk->sk_write_space(sk); +	sock_put(sk); +} + +/*  + * Read buffer destructor automatically called from kfree_skb.  + */ +void sock_rfree(struct sk_buff *skb) +{ +	struct sock *sk = skb->sk; + +	atomic_sub(skb->truesize, &sk->sk_rmem_alloc); +} + + +int sock_i_uid(struct sock *sk) +{ +	int uid; + +	read_lock(&sk->sk_callback_lock); +	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; +	read_unlock(&sk->sk_callback_lock); +	return uid; +} + +unsigned long sock_i_ino(struct sock *sk) +{ +	unsigned long ino; + +	read_lock(&sk->sk_callback_lock); +	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; +	read_unlock(&sk->sk_callback_lock); +	return ino; +} + +/* + * Allocate a skb from the socket's send buffer. + */ +struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority) +{ +	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { +		struct sk_buff * skb = alloc_skb(size, priority); +		if (skb) { +			skb_set_owner_w(skb, sk); +			return skb; +		} +	} +	return NULL; +} + +/* + * Allocate a skb from the socket's receive buffer. + */  +struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority) +{ +	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { +		struct sk_buff *skb = alloc_skb(size, priority); +		if (skb) { +			skb_set_owner_r(skb, sk); +			return skb; +		} +	} +	return NULL; +} + +/*  + * Allocate a memory block from the socket's option memory buffer. + */  +void *sock_kmalloc(struct sock *sk, int size, int priority) +{ +	if ((unsigned)size <= sysctl_optmem_max && +	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { +		void *mem; +		/* First do the add, to avoid the race if kmalloc + 		 * might sleep. +		 */ +		atomic_add(size, &sk->sk_omem_alloc); +		mem = kmalloc(size, priority); +		if (mem) +			return mem; +		atomic_sub(size, &sk->sk_omem_alloc); +	} +	return NULL; +} + +/* + * Free an option memory block. + */ +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ +	kfree(mem); +	atomic_sub(size, &sk->sk_omem_alloc); +} + +/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. +   I think, these locks should be removed for datagram sockets. 
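sock_kmalloc() above charges the allocation against sk_omem_alloc before calling kmalloc(), so the accounting stays correct even when the allocation sleeps, and sock_kfree_s() must be handed the same size to undo the charge. A hypothetical in-kernel usage sketch (example_set_opt() and its length handling are invented for illustration; it assumes the same <net/sock.h> environment as this file):

#include <net/sock.h>
#include <asm/uaccess.h>

/* Copy an option blob from user space into memory that is charged to
 * the socket, then release it with the original size so sk_omem_alloc
 * returns to its previous value. */
static int example_set_opt(struct sock *sk, const void __user *uptr, int len)
{
	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
	int err = 0;

	if (!buf)
		return -ENOBUFS;
	if (copy_from_user(buf, uptr, len))
		err = -EFAULT;
	/* ... otherwise hand buf over to the socket's private state ... */
	sock_kfree_s(sk, buf, len);	/* size must match the allocation */
	return err;
}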
+ */ +static long sock_wait_for_wmem(struct sock * sk, long timeo) +{ +	DEFINE_WAIT(wait); + +	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); +	for (;;) { +		if (!timeo) +			break; +		if (signal_pending(current)) +			break; +		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); +		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) +			break; +		if (sk->sk_shutdown & SEND_SHUTDOWN) +			break; +		if (sk->sk_err) +			break; +		timeo = schedule_timeout(timeo); +	} +	finish_wait(sk->sk_sleep, &wait); +	return timeo; +} + + +/* + *	Generic send/receive buffer handlers + */ + +static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, +					    unsigned long header_len, +					    unsigned long data_len, +					    int noblock, int *errcode) +{ +	struct sk_buff *skb; +	unsigned int gfp_mask; +	long timeo; +	int err; + +	gfp_mask = sk->sk_allocation; +	if (gfp_mask & __GFP_WAIT) +		gfp_mask |= __GFP_REPEAT; + +	timeo = sock_sndtimeo(sk, noblock); +	while (1) { +		err = sock_error(sk); +		if (err != 0) +			goto failure; + +		err = -EPIPE; +		if (sk->sk_shutdown & SEND_SHUTDOWN) +			goto failure; + +		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { +			skb = alloc_skb(header_len, sk->sk_allocation); +			if (skb) { +				int npages; +				int i; + +				/* No pages, we're done... */ +				if (!data_len) +					break; + +				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; +				skb->truesize += data_len; +				skb_shinfo(skb)->nr_frags = npages; +				for (i = 0; i < npages; i++) { +					struct page *page; +					skb_frag_t *frag; + +					page = alloc_pages(sk->sk_allocation, 0); +					if (!page) { +						err = -ENOBUFS; +						skb_shinfo(skb)->nr_frags = i; +						kfree_skb(skb); +						goto failure; +					} + +					frag = &skb_shinfo(skb)->frags[i]; +					frag->page = page; +					frag->page_offset = 0; +					frag->size = (data_len >= PAGE_SIZE ? +						      PAGE_SIZE : +						      data_len); +					data_len -= PAGE_SIZE; +				} + +				/* Full success... */ +				break; +			} +			err = -ENOBUFS; +			goto failure; +		} +		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); +		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +		err = -EAGAIN; +		if (!timeo) +			goto failure; +		if (signal_pending(current)) +			goto interrupted; +		timeo = sock_wait_for_wmem(sk, timeo); +	} + +	skb_set_owner_w(skb, sk); +	return skb; + +interrupted: +	err = sock_intr_errno(timeo); +failure: +	*errcode = err; +	return NULL; +} + +struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,  +				    int noblock, int *errcode) +{ +	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); +} + +static void __lock_sock(struct sock *sk) +{ +	DEFINE_WAIT(wait); + +	for(;;) { +		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, +					TASK_UNINTERRUPTIBLE); +		spin_unlock_bh(&sk->sk_lock.slock); +		schedule(); +		spin_lock_bh(&sk->sk_lock.slock); +		if(!sock_owned_by_user(sk)) +			break; +	} +	finish_wait(&sk->sk_lock.wq, &wait); +} + +static void __release_sock(struct sock *sk) +{ +	struct sk_buff *skb = sk->sk_backlog.head; + +	do { +		sk->sk_backlog.head = sk->sk_backlog.tail = NULL; +		bh_unlock_sock(sk); + +		do { +			struct sk_buff *next = skb->next; + +			skb->next = NULL; +			sk->sk_backlog_rcv(sk, skb); + +			/* +			 * We are in process context here with softirqs +			 * disabled, use cond_resched_softirq() to preempt. 
+			 * This is safe to do because we've taken the backlog +			 * queue private: +			 */ +			cond_resched_softirq(); + +			skb = next; +		} while (skb != NULL); + +		bh_lock_sock(sk); +	} while((skb = sk->sk_backlog.head) != NULL); +} + +/** + * sk_wait_data - wait for data to arrive at sk_receive_queue + * sk - sock to wait on + * timeo - for how long + * + * Now socket state including sk->sk_err is changed only under lock, + * hence we may omit checks after joining wait queue. + * We check receive queue before schedule() only as optimization; + * it is very likely that release_sock() added new data. + */ +int sk_wait_data(struct sock *sk, long *timeo) +{ +	int rc; +	DEFINE_WAIT(wait); + +	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); +	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); +	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); +	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); +	finish_wait(sk->sk_sleep, &wait); +	return rc; +} + +EXPORT_SYMBOL(sk_wait_data); + +/* + * Set of default routines for initialising struct proto_ops when + * the protocol does not support a particular function. In certain + * cases where it makes no sense for a protocol to have a "do nothing" + * function, some default processing is provided. + */ + +int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ +	return -EOPNOTSUPP; +} + +int sock_no_connect(struct socket *sock, struct sockaddr *saddr,  +		    int len, int flags) +{ +	return -EOPNOTSUPP; +} + +int sock_no_socketpair(struct socket *sock1, struct socket *sock2) +{ +	return -EOPNOTSUPP; +} + +int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) +{ +	return -EOPNOTSUPP; +} + +int sock_no_getname(struct socket *sock, struct sockaddr *saddr,  +		    int *len, int peer) +{ +	return -EOPNOTSUPP; +} + +unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt) +{ +	return 0; +} + +int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ +	return -EOPNOTSUPP; +} + +int sock_no_listen(struct socket *sock, int backlog) +{ +	return -EOPNOTSUPP; +} + +int sock_no_shutdown(struct socket *sock, int how) +{ +	return -EOPNOTSUPP; +} + +int sock_no_setsockopt(struct socket *sock, int level, int optname, +		    char __user *optval, int optlen) +{ +	return -EOPNOTSUPP; +} + +int sock_no_getsockopt(struct socket *sock, int level, int optname, +		    char __user *optval, int __user *optlen) +{ +	return -EOPNOTSUPP; +} + +int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, +		    size_t len) +{ +	return -EOPNOTSUPP; +} + +int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, +		    size_t len, int flags) +{ +	return -EOPNOTSUPP; +} + +int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) +{ +	/* Mirror missing mmap method error code */ +	return -ENODEV; +} + +ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ +	ssize_t res; +	struct msghdr msg = {.msg_flags = flags}; +	struct kvec iov; +	char *kaddr = kmap(page); +	iov.iov_base = kaddr + offset; +	iov.iov_len = size; +	res = kernel_sendmsg(sock, &msg, &iov, 1, size); +	kunmap(page); +	return res; +} + +/* + *	Default Socket Callbacks + */ + +static void sock_def_wakeup(struct sock *sk) +{ +	read_lock(&sk->sk_callback_lock); +	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) +		wake_up_interruptible_all(sk->sk_sleep); +	read_unlock(&sk->sk_callback_lock); 
+} + +static void sock_def_error_report(struct sock *sk) +{ +	read_lock(&sk->sk_callback_lock); +	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) +		wake_up_interruptible(sk->sk_sleep); +	sk_wake_async(sk,0,POLL_ERR);  +	read_unlock(&sk->sk_callback_lock); +} + +static void sock_def_readable(struct sock *sk, int len) +{ +	read_lock(&sk->sk_callback_lock); +	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) +		wake_up_interruptible(sk->sk_sleep); +	sk_wake_async(sk,1,POLL_IN); +	read_unlock(&sk->sk_callback_lock); +} + +static void sock_def_write_space(struct sock *sk) +{ +	read_lock(&sk->sk_callback_lock); + +	/* Do not wake up a writer until he can make "significant" +	 * progress.  --DaveM +	 */ +	if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { +		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) +			wake_up_interruptible(sk->sk_sleep); + +		/* Should agree with poll, otherwise some programs break */ +		if (sock_writeable(sk)) +			sk_wake_async(sk, 2, POLL_OUT); +	} + +	read_unlock(&sk->sk_callback_lock); +} + +static void sock_def_destruct(struct sock *sk) +{ +	if (sk->sk_protinfo) +		kfree(sk->sk_protinfo); +} + +void sk_send_sigurg(struct sock *sk) +{ +	if (sk->sk_socket && sk->sk_socket->file) +		if (send_sigurg(&sk->sk_socket->file->f_owner)) +			sk_wake_async(sk, 3, POLL_PRI); +} + +void sk_reset_timer(struct sock *sk, struct timer_list* timer, +		    unsigned long expires) +{ +	if (!mod_timer(timer, expires)) +		sock_hold(sk); +} + +EXPORT_SYMBOL(sk_reset_timer); + +void sk_stop_timer(struct sock *sk, struct timer_list* timer) +{ +	if (timer_pending(timer) && del_timer(timer)) +		__sock_put(sk); +} + +EXPORT_SYMBOL(sk_stop_timer); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ +	skb_queue_head_init(&sk->sk_receive_queue); +	skb_queue_head_init(&sk->sk_write_queue); +	skb_queue_head_init(&sk->sk_error_queue); + +	sk->sk_send_head	=	NULL; + +	init_timer(&sk->sk_timer); +	 +	sk->sk_allocation	=	GFP_KERNEL; +	sk->sk_rcvbuf		=	sysctl_rmem_default; +	sk->sk_sndbuf		=	sysctl_wmem_default; +	sk->sk_state		=	TCP_CLOSE; +	sk->sk_socket		=	sock; + +	sock_set_flag(sk, SOCK_ZAPPED); + +	if(sock) +	{ +		sk->sk_type	=	sock->type; +		sk->sk_sleep	=	&sock->wait; +		sock->sk	=	sk; +	} else +		sk->sk_sleep	=	NULL; + +	rwlock_init(&sk->sk_dst_lock); +	rwlock_init(&sk->sk_callback_lock); + +	sk->sk_state_change	=	sock_def_wakeup; +	sk->sk_data_ready	=	sock_def_readable; +	sk->sk_write_space	=	sock_def_write_space; +	sk->sk_error_report	=	sock_def_error_report; +	sk->sk_destruct		=	sock_def_destruct; + +	sk->sk_sndmsg_page	=	NULL; +	sk->sk_sndmsg_off	=	0; + +	sk->sk_peercred.pid 	=	0; +	sk->sk_peercred.uid	=	-1; +	sk->sk_peercred.gid	=	-1; +	sk->sk_write_pending	=	0; +	sk->sk_rcvlowat		=	1; +	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT; +	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT; + +	sk->sk_stamp.tv_sec     = -1L; +	sk->sk_stamp.tv_usec    = -1L; + +	atomic_set(&sk->sk_refcnt, 1); +} + +void fastcall lock_sock(struct sock *sk) +{ +	might_sleep(); +	spin_lock_bh(&(sk->sk_lock.slock)); +	if (sk->sk_lock.owner) +		__lock_sock(sk); +	sk->sk_lock.owner = (void *)1; +	spin_unlock_bh(&(sk->sk_lock.slock)); +} + +EXPORT_SYMBOL(lock_sock); + +void fastcall release_sock(struct sock *sk) +{ +	spin_lock_bh(&(sk->sk_lock.slock)); +	if (sk->sk_backlog.tail) +		__release_sock(sk); +	sk->sk_lock.owner = NULL; +        if (waitqueue_active(&(sk->sk_lock.wq))) +		wake_up(&(sk->sk_lock.wq)); +	spin_unlock_bh(&(sk->sk_lock.slock)); +} +EXPORT_SYMBOL(release_sock); + +int 
sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) +{  +	if (!sock_flag(sk, SOCK_TIMESTAMP)) +		sock_enable_timestamp(sk); +	if (sk->sk_stamp.tv_sec == -1)  +		return -ENOENT; +	if (sk->sk_stamp.tv_sec == 0) +		do_gettimeofday(&sk->sk_stamp); +	return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ? +		-EFAULT : 0;  +}  +EXPORT_SYMBOL(sock_get_timestamp); + +void sock_enable_timestamp(struct sock *sk) +{	 +	if (!sock_flag(sk, SOCK_TIMESTAMP)) {  +		sock_set_flag(sk, SOCK_TIMESTAMP); +		net_enable_timestamp(); +	} +} +EXPORT_SYMBOL(sock_enable_timestamp);  + +/* + *	Get a socket option on an socket. + * + *	FIX: POSIX 1003.1g is very ambiguous here. It states that + *	asynchronous errors should be reported by getsockopt. We assume + *	this means if you specify SO_ERROR (otherwise whats the point of it). + */ +int sock_common_getsockopt(struct socket *sock, int level, int optname, +			   char __user *optval, int __user *optlen) +{ +	struct sock *sk = sock->sk; + +	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); +} + +EXPORT_SYMBOL(sock_common_getsockopt); + +int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, +			struct msghdr *msg, size_t size, int flags) +{ +	struct sock *sk = sock->sk; +	int addr_len = 0; +	int err; + +	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, +				   flags & ~MSG_DONTWAIT, &addr_len); +	if (err >= 0) +		msg->msg_namelen = addr_len; +	return err; +} + +EXPORT_SYMBOL(sock_common_recvmsg); + +/* + *	Set socket options on an inet socket. + */ +int sock_common_setsockopt(struct socket *sock, int level, int optname, +			   char __user *optval, int optlen) +{ +	struct sock *sk = sock->sk; + +	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); +} + +EXPORT_SYMBOL(sock_common_setsockopt); + +void sk_common_release(struct sock *sk) +{ +	if (sk->sk_prot->destroy) +		sk->sk_prot->destroy(sk); + +	/* +	 * Observation: when sock_common_release is called, processes have +	 * no access to socket. But net still has. +	 * Step one, detach it from networking: +	 * +	 * A. Remove from hash tables. +	 */ + +	sk->sk_prot->unhash(sk); + +	/* +	 * In this point socket cannot receive new packets, but it is possible +	 * that some packets are in flight because some CPU runs receiver and +	 * did hash table lookup before we unhashed socket. They will achieve +	 * receive queue and will be purged by socket destructor. +	 * +	 * Also we still have packets pending on receive queue and probably, +	 * our own packets waiting in device queues. sock_destroy will drain +	 * receive queue, but transmitted packets will delay socket destruction +	 * until the last reference will be released. 
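sock_get_timestamp() above is the backend of the SIOCGSTAMP ioctl: the first call switches receive timestamping on, and -ENOENT comes back until a packet has actually been stamped. A small user-space sketch of the usual calling pattern (illustrative; it assumes a socket fd that has already received at least one datagram and that <linux/sockios.h> supplies SIOCGSTAMP):

#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <linux/sockios.h>	/* SIOCGSTAMP */

/* Print the kernel's receive timestamp of the last packet seen on fd. */
void print_last_rx_stamp(int fd)
{
	struct timeval tv;

	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
		printf("last packet received at %ld.%06ld\n",
		       (long)tv.tv_sec, (long)tv.tv_usec);
	else
		perror("SIOCGSTAMP");	/* ENOENT: nothing stamped yet */
}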
+	 */ + +	sock_orphan(sk); + +	xfrm_sk_free_policy(sk); + +#ifdef INET_REFCNT_DEBUG +	if (atomic_read(&sk->sk_refcnt) != 1) +		printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n", +		       sk, atomic_read(&sk->sk_refcnt)); +#endif +	sock_put(sk); +} + +EXPORT_SYMBOL(sk_common_release); + +static DEFINE_RWLOCK(proto_list_lock); +static LIST_HEAD(proto_list); + +int proto_register(struct proto *prot, int alloc_slab) +{ +	int rc = -ENOBUFS; + +	write_lock(&proto_list_lock); + +	if (alloc_slab) { +		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, +					       SLAB_HWCACHE_ALIGN, NULL, NULL); + +		if (prot->slab == NULL) { +			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", +			       prot->name); +			goto out_unlock; +		} +	} + +	list_add(&prot->node, &proto_list); +	rc = 0; +out_unlock: +	write_unlock(&proto_list_lock); +	return rc; +} + +EXPORT_SYMBOL(proto_register); + +void proto_unregister(struct proto *prot) +{ +	write_lock(&proto_list_lock); + +	if (prot->slab != NULL) { +		kmem_cache_destroy(prot->slab); +		prot->slab = NULL; +	} + +	list_del(&prot->node); +	write_unlock(&proto_list_lock); +} + +EXPORT_SYMBOL(proto_unregister); + +#ifdef CONFIG_PROC_FS +static inline struct proto *__proto_head(void) +{ +	return list_entry(proto_list.next, struct proto, node); +} + +static inline struct proto *proto_head(void) +{ +	return list_empty(&proto_list) ? NULL : __proto_head(); +} + +static inline struct proto *proto_next(struct proto *proto) +{ +	return proto->node.next == &proto_list ? NULL : +		list_entry(proto->node.next, struct proto, node); +} + +static inline struct proto *proto_get_idx(loff_t pos) +{ +	struct proto *proto; +	loff_t i = 0; + +	list_for_each_entry(proto, &proto_list, node) +		if (i++ == pos) +			goto out; + +	proto = NULL; +out: +	return proto; +} + +static void *proto_seq_start(struct seq_file *seq, loff_t *pos) +{ +	read_lock(&proto_list_lock); +	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	++*pos; +	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v); +} + +static void proto_seq_stop(struct seq_file *seq, void *v) +{ +	read_unlock(&proto_list_lock); +} + +static char proto_method_implemented(const void *method) +{ +	return method == NULL ? 'n' : 'y'; +} + +static void proto_seq_printf(struct seq_file *seq, struct proto *proto) +{ +	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s " +			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", +		   proto->name, +		   proto->obj_size, +		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1, +		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1, +		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", +		   proto->max_header, +		   proto->slab == NULL ? 
"no" : "yes", +		   module_name(proto->owner), +		   proto_method_implemented(proto->close), +		   proto_method_implemented(proto->connect), +		   proto_method_implemented(proto->disconnect), +		   proto_method_implemented(proto->accept), +		   proto_method_implemented(proto->ioctl), +		   proto_method_implemented(proto->init), +		   proto_method_implemented(proto->destroy), +		   proto_method_implemented(proto->shutdown), +		   proto_method_implemented(proto->setsockopt), +		   proto_method_implemented(proto->getsockopt), +		   proto_method_implemented(proto->sendmsg), +		   proto_method_implemented(proto->recvmsg), +		   proto_method_implemented(proto->sendpage), +		   proto_method_implemented(proto->bind), +		   proto_method_implemented(proto->backlog_rcv), +		   proto_method_implemented(proto->hash), +		   proto_method_implemented(proto->unhash), +		   proto_method_implemented(proto->get_port), +		   proto_method_implemented(proto->enter_memory_pressure)); +} + +static int proto_seq_show(struct seq_file *seq, void *v) +{ +	if (v == SEQ_START_TOKEN) +		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", +			   "protocol", +			   "size", +			   "sockets", +			   "memory", +			   "press", +			   "maxhdr", +			   "slab", +			   "module", +			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); +	else +		proto_seq_printf(seq, v); +	return 0; +} + +static struct seq_operations proto_seq_ops = { +	.start  = proto_seq_start, +	.next   = proto_seq_next, +	.stop   = proto_seq_stop, +	.show   = proto_seq_show, +}; + +static int proto_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &proto_seq_ops); +} + +static struct file_operations proto_seq_fops = { +	.owner		= THIS_MODULE, +	.open		= proto_seq_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release, +}; + +static int __init proto_init(void) +{ +	/* register /proc/net/protocols */ +	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0; +} + +subsys_initcall(proto_init); + +#endif /* PROC_FS */ + +EXPORT_SYMBOL(sk_alloc); +EXPORT_SYMBOL(sk_free); +EXPORT_SYMBOL(sk_send_sigurg); +EXPORT_SYMBOL(sock_alloc_send_skb); +EXPORT_SYMBOL(sock_init_data); +EXPORT_SYMBOL(sock_kfree_s); +EXPORT_SYMBOL(sock_kmalloc); +EXPORT_SYMBOL(sock_no_accept); +EXPORT_SYMBOL(sock_no_bind); +EXPORT_SYMBOL(sock_no_connect); +EXPORT_SYMBOL(sock_no_getname); +EXPORT_SYMBOL(sock_no_getsockopt); +EXPORT_SYMBOL(sock_no_ioctl); +EXPORT_SYMBOL(sock_no_listen); +EXPORT_SYMBOL(sock_no_mmap); +EXPORT_SYMBOL(sock_no_poll); +EXPORT_SYMBOL(sock_no_recvmsg); +EXPORT_SYMBOL(sock_no_sendmsg); +EXPORT_SYMBOL(sock_no_sendpage); +EXPORT_SYMBOL(sock_no_setsockopt); +EXPORT_SYMBOL(sock_no_shutdown); +EXPORT_SYMBOL(sock_no_socketpair); +EXPORT_SYMBOL(sock_rfree); +EXPORT_SYMBOL(sock_setsockopt); +EXPORT_SYMBOL(sock_wfree); +EXPORT_SYMBOL(sock_wmalloc); +EXPORT_SYMBOL(sock_i_uid); +EXPORT_SYMBOL(sock_i_ino); +#ifdef CONFIG_SYSCTL +EXPORT_SYMBOL(sysctl_optmem_max); +EXPORT_SYMBOL(sysctl_rmem_max); +EXPORT_SYMBOL(sysctl_wmem_max); +#endif diff --git a/net/core/stream.c b/net/core/stream.c new file mode 100644 index 00000000000..1e27a57b5a9 --- /dev/null +++ b/net/core/stream.c @@ -0,0 +1,287 @@ +/* + *     SUCS NET3: + * + *     Generic stream handling routines. These are generic for most + *     protocols. Even IP. Tonight 8-). + *     This is used because TCP, LLC (others too) layer all have mostly + *     identical sendmsg() and recvmsg() code. + *     So we (will) share it here. 
+ * + *     Authors:        Arnaldo Carvalho de Melo <acme@conectiva.com.br> + *                     (from old tcp.c code) + *                     Alan Cox <alan@redhat.com> (Borrowed comments 8-)) + */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/signal.h> +#include <linux/tcp.h> +#include <linux/wait.h> +#include <net/sock.h> + +/** + * sk_stream_write_space - stream socket write_space callback. + * sk - socket + * + * FIXME: write proper description + */ +void sk_stream_write_space(struct sock *sk) +{ +	struct socket *sock = sk->sk_socket; + +	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { +		clear_bit(SOCK_NOSPACE, &sock->flags); + +		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) +			wake_up_interruptible(sk->sk_sleep); +		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) +			sock_wake_async(sock, 2, POLL_OUT); +	} +} + +EXPORT_SYMBOL(sk_stream_write_space); + +/** + * sk_stream_wait_connect - Wait for a socket to get into the connected state + * @sk - sock to wait on + * @timeo_p - for how long to wait + * + * Must be called with the socket locked. + */ +int sk_stream_wait_connect(struct sock *sk, long *timeo_p) +{ +	struct task_struct *tsk = current; +	DEFINE_WAIT(wait); + +	while (1) { +		if (sk->sk_err) +			return sock_error(sk); +		if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) +			return -EPIPE; +		if (!*timeo_p) +			return -EAGAIN; +		if (signal_pending(tsk)) +			return sock_intr_errno(*timeo_p); + +		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); +		sk->sk_write_pending++; +		if (sk_wait_event(sk, timeo_p, +				  !((1 << sk->sk_state) &  +				    ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))) +			break; +		finish_wait(sk->sk_sleep, &wait); +		sk->sk_write_pending--; +	} +	return 0; +} + +EXPORT_SYMBOL(sk_stream_wait_connect); + +/** + * sk_stream_closing - Return 1 if we still have things to send in our buffers. 
+ * @sk - socket to verify + */ +static inline int sk_stream_closing(struct sock *sk) +{ +	return (1 << sk->sk_state) & +	       (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); +} + +void sk_stream_wait_close(struct sock *sk, long timeout) +{ +	if (timeout) { +		DEFINE_WAIT(wait); + +		do { +			prepare_to_wait(sk->sk_sleep, &wait, +					TASK_INTERRUPTIBLE); +			if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk))) +				break; +		} while (!signal_pending(current) && timeout); + +		finish_wait(sk->sk_sleep, &wait); +	} +} + +EXPORT_SYMBOL(sk_stream_wait_close); + +/** + * sk_stream_wait_memory - Wait for more memory for a socket + * @sk - socket to wait for memory + * @timeo_p - for how long + */ +int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +{ +	int err = 0; +	long vm_wait = 0; +	long current_timeo = *timeo_p; +	DEFINE_WAIT(wait); + +	if (sk_stream_memory_free(sk)) +		current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; + +	while (1) { +		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + +		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + +		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) +			goto do_error; +		if (!*timeo_p) +			goto do_nonblock; +		if (signal_pending(current)) +			goto do_interrupted; +		clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); +		if (sk_stream_memory_free(sk) && !vm_wait) +			break; + +		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +		sk->sk_write_pending++; +		sk_wait_event(sk, ¤t_timeo, sk_stream_memory_free(sk) && +						  vm_wait); +		sk->sk_write_pending--; + +		if (vm_wait) { +			vm_wait -= current_timeo; +			current_timeo = *timeo_p; +			if (current_timeo != MAX_SCHEDULE_TIMEOUT && +			    (current_timeo -= vm_wait) < 0) +				current_timeo = 0; +			vm_wait = 0; +		} +		*timeo_p = current_timeo; +	} +out: +	finish_wait(sk->sk_sleep, &wait); +	return err; + +do_error: +	err = -EPIPE; +	goto out; +do_nonblock: +	err = -EAGAIN; +	goto out; +do_interrupted: +	err = sock_intr_errno(*timeo_p); +	goto out; +} + +EXPORT_SYMBOL(sk_stream_wait_memory); + +void sk_stream_rfree(struct sk_buff *skb) +{ +	struct sock *sk = skb->sk; + +	atomic_sub(skb->truesize, &sk->sk_rmem_alloc); +	sk->sk_forward_alloc += skb->truesize; +} + +EXPORT_SYMBOL(sk_stream_rfree); + +int sk_stream_error(struct sock *sk, int flags, int err) +{ +	if (err == -EPIPE) +		err = sock_error(sk) ? : -EPIPE; +	if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) +		send_sig(SIGPIPE, current, 0); +	return err; +} + +EXPORT_SYMBOL(sk_stream_error); + +void __sk_stream_mem_reclaim(struct sock *sk) +{ +	if (sk->sk_forward_alloc >= SK_STREAM_MEM_QUANTUM) { +		atomic_sub(sk->sk_forward_alloc / SK_STREAM_MEM_QUANTUM, +			   sk->sk_prot->memory_allocated); +		sk->sk_forward_alloc &= SK_STREAM_MEM_QUANTUM - 1; +		if (*sk->sk_prot->memory_pressure && +		    (atomic_read(sk->sk_prot->memory_allocated) < +		     sk->sk_prot->sysctl_mem[0])) +			*sk->sk_prot->memory_pressure = 0; +	} +} + +EXPORT_SYMBOL(__sk_stream_mem_reclaim); + +int sk_stream_mem_schedule(struct sock *sk, int size, int kind) +{ +	int amt = sk_stream_pages(size); + +	sk->sk_forward_alloc += amt * SK_STREAM_MEM_QUANTUM; +	atomic_add(amt, sk->sk_prot->memory_allocated); + +	/* Under limit. */ +	if (atomic_read(sk->sk_prot->memory_allocated) < sk->sk_prot->sysctl_mem[0]) { +		if (*sk->sk_prot->memory_pressure) +			*sk->sk_prot->memory_pressure = 0; +		return 1; +	} + +	/* Over hard limit. 
*/ +	if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[2]) { +		sk->sk_prot->enter_memory_pressure(); +		goto suppress_allocation; +	} + +	/* Under pressure. */ +	if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[1]) +		sk->sk_prot->enter_memory_pressure(); + +	if (kind) { +		if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_prot->sysctl_rmem[0]) +			return 1; +	} else if (sk->sk_wmem_queued < sk->sk_prot->sysctl_wmem[0]) +		return 1; + +	if (!*sk->sk_prot->memory_pressure || +	    sk->sk_prot->sysctl_mem[2] > atomic_read(sk->sk_prot->sockets_allocated) * +				sk_stream_pages(sk->sk_wmem_queued + +						atomic_read(&sk->sk_rmem_alloc) + +						sk->sk_forward_alloc)) +		return 1; + +suppress_allocation: + +	if (!kind) { +		sk_stream_moderate_sndbuf(sk); + +		/* Fail only if socket is _under_ its sndbuf. +		 * In this case we cannot block, so that we have to fail. +		 */ +		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) +			return 1; +	} + +	/* Alas. Undo changes. */ +	sk->sk_forward_alloc -= amt * SK_STREAM_MEM_QUANTUM; +	atomic_sub(amt, sk->sk_prot->memory_allocated); +	return 0; +} + +EXPORT_SYMBOL(sk_stream_mem_schedule); + +void sk_stream_kill_queues(struct sock *sk) +{ +	/* First the read buffer. */ +	__skb_queue_purge(&sk->sk_receive_queue); + +	/* Next, the error queue. */ +	__skb_queue_purge(&sk->sk_error_queue); + +	/* Next, the write queue. */ +	BUG_TRAP(skb_queue_empty(&sk->sk_write_queue)); + +	/* Account for returned memory. */ +	sk_stream_mem_reclaim(sk); + +	BUG_TRAP(!sk->sk_wmem_queued); +	BUG_TRAP(!sk->sk_forward_alloc); + +	/* It is _impossible_ for the backlog to contain anything +	 * when we get here.  All user references to this socket +	 * have gone away, only the net layer knows can touch it. +	 */ +} + +EXPORT_SYMBOL(sk_stream_kill_queues); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c new file mode 100644 index 00000000000..c8be646cb19 --- /dev/null +++ b/net/core/sysctl_net_core.c @@ -0,0 +1,182 @@ +/* -*- linux-c -*- + * sysctl_net_core.c: sysctl interface to net core subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/core directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/config.h> +#include <linux/module.h> + +#ifdef CONFIG_SYSCTL + +extern int netdev_max_backlog; +extern int weight_p; +extern int no_cong_thresh; +extern int no_cong; +extern int lo_cong; +extern int mod_cong; +extern int netdev_fastroute; +extern int net_msg_cost; +extern int net_msg_burst; + +extern __u32 sysctl_wmem_max; +extern __u32 sysctl_rmem_max; +extern __u32 sysctl_wmem_default; +extern __u32 sysctl_rmem_default; + +extern int sysctl_core_destroy_delay; +extern int sysctl_optmem_max; +extern int sysctl_somaxconn; + +#ifdef CONFIG_NET_DIVERT +extern char sysctl_divert_version[]; +#endif /* CONFIG_NET_DIVERT */ + +/* + * This strdup() is used for creating copies of network  + * device names to be handed over to sysctl. 
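The extern declarations above are the tunables wired into the ctl_table below, so each of them surfaces as an ordinary file under /proc/sys/net/core/. A user-space sketch of reading one of them back (illustrative; it assumes procfs is mounted at /proc and uses only the standard C file API):

#include <stdio.h>

/* Read one core tunable, e.g. "wmem_max", via its /proc/sys file. */
long read_core_tunable(const char *name)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/net/core/%s", name);
	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("wmem_max = %ld\n", read_core_tunable("wmem_max"));
	return 0;
}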
+ */ +  +char *net_sysctl_strdup(const char *s) +{ +	char *rv = kmalloc(strlen(s)+1, GFP_KERNEL); +	if (rv) +		strcpy(rv, s); +	return rv; +} + +ctl_table core_table[] = { +#ifdef CONFIG_NET +	{ +		.ctl_name	= NET_CORE_WMEM_MAX, +		.procname	= "wmem_max", +		.data		= &sysctl_wmem_max, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +	{ +		.ctl_name	= NET_CORE_RMEM_MAX, +		.procname	= "rmem_max", +		.data		= &sysctl_rmem_max, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +	{ +		.ctl_name	= NET_CORE_WMEM_DEFAULT, +		.procname	= "wmem_default", +		.data		= &sysctl_wmem_default, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +	{ +		.ctl_name	= NET_CORE_RMEM_DEFAULT, +		.procname	= "rmem_default", +		.data		= &sysctl_rmem_default, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +	{ +		.ctl_name	= NET_CORE_DEV_WEIGHT, +		.procname	= "dev_weight", +		.data		= &weight_p, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +	{ +		.ctl_name	= NET_CORE_MAX_BACKLOG, +		.procname	= "netdev_max_backlog", +		.data		= &netdev_max_backlog, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +	{ +		.ctl_name	= NET_CORE_NO_CONG_THRESH, +		.procname	= "no_cong_thresh", +		.data		= &no_cong_thresh, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +	{ +		.ctl_name	= NET_CORE_NO_CONG, +		.procname	= "no_cong", +		.data		= &no_cong, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +	{ +		.ctl_name	= NET_CORE_LO_CONG, +		.procname	= "lo_cong", +		.data		= &lo_cong, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +	{ +		.ctl_name	= NET_CORE_MOD_CONG, +		.procname	= "mod_cong", +		.data		= &mod_cong, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +	{ +		.ctl_name	= NET_CORE_MSG_COST, +		.procname	= "message_cost", +		.data		= &net_msg_cost, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec_jiffies, +		.strategy	= &sysctl_jiffies, +	}, +	{ +		.ctl_name	= NET_CORE_MSG_BURST, +		.procname	= "message_burst", +		.data		= &net_msg_burst, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec, +	}, +	{ +		.ctl_name	= NET_CORE_OPTMEM_MAX, +		.procname	= "optmem_max", +		.data		= &sysctl_optmem_max, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +#ifdef CONFIG_NET_DIVERT +	{ +		.ctl_name	= NET_CORE_DIVERT_VERSION, +		.procname	= "divert_version", +		.data		= (void *)sysctl_divert_version, +		.maxlen		= 32, +		.mode		= 0444, +		.proc_handler	= &proc_dostring +	}, +#endif /* CONFIG_NET_DIVERT */ +#endif /* CONFIG_NET */ +	{ +		.ctl_name	= NET_CORE_SOMAXCONN, +		.procname	= "somaxconn", +		.data		= &sysctl_somaxconn, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec +	}, +	{ .ctl_name = 0 } +}; + +EXPORT_SYMBOL(net_sysctl_strdup); + +#endif diff --git a/net/core/utils.c b/net/core/utils.c new file mode 100644 index 00000000000..e11a8654f36 --- /dev/null +++ b/net/core/utils.c @@ -0,0 +1,155 @@ +/* + *	Generic address resultion entity + * + *	Authors: + *	net_random Alan Cox + *	net_ratelimit Andy Kleen + * + *	Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + *	This program is free software; you can redistribute it and/or + *      modify it under the terms 
of the GNU General Public License + *      as published by the Free Software Foundation; either version + *      2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/random.h> +#include <linux/percpu.h> +#include <linux/init.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + + +/* +  This is a maximally equidistributed combined Tausworthe generator +  based on code from GNU Scientific Library 1.5 (30 Jun 2004) + +   x_n = (s1_n ^ s2_n ^ s3_n)  + +   s1_{n+1} = (((s1_n & 4294967294) <<12) ^ (((s1_n <<13) ^ s1_n) >>19)) +   s2_{n+1} = (((s2_n & 4294967288) << 4) ^ (((s2_n << 2) ^ s2_n) >>25)) +   s3_{n+1} = (((s3_n & 4294967280) <<17) ^ (((s3_n << 3) ^ s3_n) >>11)) + +   The period of this generator is about 2^88. + +   From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe +   Generators", Mathematics of Computation, 65, 213 (1996), 203--213. + +   This is available on the net from L'Ecuyer's home page, + +   http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps +   ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps  + +   There is an erratum in the paper "Tables of Maximally +   Equidistributed Combined LFSR Generators", Mathematics of +   Computation, 68, 225 (1999), 261--269: +   http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps + +        ... the k_j most significant bits of z_j must be non- +        zero, for each j. (Note: this restriction also applies to the  +        computer code given in [4], but was mistakenly not mentioned in +        that paper.) +    +   This affects the seeding procedure by imposing the requirement +   s1 > 1, s2 > 7, s3 > 15. 
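The s1 recurrence above maps directly onto the shift-and-mask form used by the TAUSWORTHE() macro further down; written out once in plain C it reads as follows (a user-space illustration only, assuming 32-bit unsigned arithmetic; the s2 and s3 steps differ only in their constants):

#include <stdint.h>

/* One step of s1: a = 13, b = 19, c = 4294967294, d = 12, exactly the
 * parameters handed to TAUSWORTHE(state->s1, 13, 19, 4294967294UL, 12). */
uint32_t taus_step_s1(uint32_t s1)
{
	return ((s1 & 4294967294U) << 12) ^ (((s1 << 13) ^ s1) >> 19);
}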
+ +*/ +struct nrnd_state { +	u32 s1, s2, s3; +}; + +static DEFINE_PER_CPU(struct nrnd_state, net_rand_state); + +static u32 __net_random(struct nrnd_state *state) +{ +#define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b) + +	state->s1 = TAUSWORTHE(state->s1, 13, 19, 4294967294UL, 12); +	state->s2 = TAUSWORTHE(state->s2, 2, 25, 4294967288UL, 4); +	state->s3 = TAUSWORTHE(state->s3, 3, 11, 4294967280UL, 17); + +	return (state->s1 ^ state->s2 ^ state->s3); +} + +static void __net_srandom(struct nrnd_state *state, unsigned long s) +{ +	if (s == 0) +		s = 1;      /* default seed is 1 */ + +#define LCG(n) (69069 * n) +	state->s1 = LCG(s); +	state->s2 = LCG(state->s1); +	state->s3 = LCG(state->s2); + +	/* "warm it up" */ +	__net_random(state); +	__net_random(state); +	__net_random(state); +	__net_random(state); +	__net_random(state); +	__net_random(state); +} + + +unsigned long net_random(void) +{ +	unsigned long r; +	struct nrnd_state *state = &get_cpu_var(net_rand_state); +	r = __net_random(state); +	put_cpu_var(state); +	return r; +} + + +void net_srandom(unsigned long entropy) +{ +	struct nrnd_state *state = &get_cpu_var(net_rand_state); +	__net_srandom(state, state->s1^entropy); +	put_cpu_var(state); +} + +void __init net_random_init(void) +{ +	int i; + +	for (i = 0; i < NR_CPUS; i++) { +		struct nrnd_state *state = &per_cpu(net_rand_state,i); +		__net_srandom(state, i+jiffies); +	} +} + +static int net_random_reseed(void) +{ +	int i; +	unsigned long seed[NR_CPUS]; + +	get_random_bytes(seed, sizeof(seed)); +	for (i = 0; i < NR_CPUS; i++) { +		struct nrnd_state *state = &per_cpu(net_rand_state,i); +		__net_srandom(state, seed[i]); +	} +	return 0; +} +late_initcall(net_random_reseed); + +int net_msg_cost = 5*HZ; +int net_msg_burst = 10; + +/*  + * All net warning printk()s should be guarded by this function. + */  +int net_ratelimit(void) +{ +	return __printk_ratelimit(net_msg_cost, net_msg_burst); +} + +EXPORT_SYMBOL(net_random); +EXPORT_SYMBOL(net_ratelimit); +EXPORT_SYMBOL(net_srandom); diff --git a/net/core/wireless.c b/net/core/wireless.c new file mode 100644 index 00000000000..750cc5daeb0 --- /dev/null +++ b/net/core/wireless.c @@ -0,0 +1,1459 @@ +/* + * This file implement the Wireless Extensions APIs. + * + * Authors :	Jean Tourrilhes - HPL - <jt@hpl.hp.com> + * Copyright (c) 1997-2004 Jean Tourrilhes, All Rights Reserved. + * + * (As all part of the Linux kernel, this file is GPL) + */ + +/************************** DOCUMENTATION **************************/ +/* + * API definition : + * -------------- + * See <linux/wireless.h> for details of the APIs and the rest. + * + * History : + * ------- + * + * v1 - 5.12.01 - Jean II + *	o Created this file. 
+ * + * v2 - 13.12.01 - Jean II + *	o Move /proc/net/wireless stuff from net/core/dev.c to here + *	o Make Wireless Extension IOCTLs go through here + *	o Added iw_handler handling ;-) + *	o Added standard ioctl description + *	o Initial dumb commit strategy based on orinoco.c + * + * v3 - 19.12.01 - Jean II + *	o Make sure we don't go out of standard_ioctl[] in ioctl_standard_call + *	o Add event dispatcher function + *	o Add event description + *	o Propagate events as rtnetlink IFLA_WIRELESS option + *	o Generate event on selected SET requests + * + * v4 - 18.04.02 - Jean II + *	o Fix stupid off by one in iw_ioctl_description : IW_ESSID_MAX_SIZE + 1 + * + * v5 - 21.06.02 - Jean II + *	o Add IW_PRIV_TYPE_ADDR in priv_type_size (+cleanup) + *	o Reshuffle IW_HEADER_TYPE_XXX to map IW_PRIV_TYPE_XXX changes + *	o Add IWEVCUSTOM for driver specific event/scanning token + *	o Turn on WE_STRICT_WRITE by default + kernel warning + *	o Fix WE_STRICT_WRITE in ioctl_export_private() (32 => iw_num) + *	o Fix off-by-one in test (extra_size <= IFNAMSIZ) + * + * v6 - 9.01.03 - Jean II + *	o Add common spy support : iw_handler_set_spy(), wireless_spy_update() + *	o Add enhanced spy support : iw_handler_set_thrspy() and event. + *	o Add WIRELESS_EXT version display in /proc/net/wireless + * + * v6 - 18.06.04 - Jean II + *	o Change get_spydata() method for added safety + *	o Remove spy #ifdef, they are always on -> cleaner code + *	o Allow any size GET request if user specifies length > max + *		and if request has IW_DESCR_FLAG_NOMAX flag or is SIOCGIWPRIV + *	o Start migrating get_wireless_stats to struct iw_handler_def + *	o Add wmb() in iw_handler_set_spy() for non-coherent archs/cpus + * Based on patch from Pavel Roskin <proski@gnu.org> : + *	o Fix kernel data leak to user space in private handler handling + */ + +/***************************** INCLUDES *****************************/ + +#include <linux/config.h>		/* Not needed ??? */ +#include <linux/module.h> +#include <linux/types.h>		/* off_t */ +#include <linux/netdevice.h>		/* struct ifreq, dev_get_by_name() */ +#include <linux/proc_fs.h> +#include <linux/rtnetlink.h>		/* rtnetlink stuff */ +#include <linux/seq_file.h> +#include <linux/init.h>			/* for __init */ +#include <linux/if_arp.h>		/* ARPHRD_ETHER */ + +#include <linux/wireless.h>		/* Pretty obvious */ +#include <net/iw_handler.h>		/* New driver API */ + +#include <asm/uaccess.h>		/* copy_to_user() */ + +/**************************** CONSTANTS ****************************/ + +/* Debugging stuff */ +#undef WE_IOCTL_DEBUG		/* Debug IOCTL API */ +#undef WE_EVENT_DEBUG		/* Debug Event dispatcher */ +#undef WE_SPY_DEBUG		/* Debug enhanced spy support */ + +/* Options */ +#define WE_EVENT_NETLINK	/* Propagate events using rtnetlink */ +#define WE_SET_EVENT		/* Generate an event on some set commands */ + +/************************* GLOBAL VARIABLES *************************/ +/* + * You should not use global variables, because of re-entrancy. + * On our case, it's only const, so it's OK... + */ +/* + * Meta-data about all the standard Wireless Extension request we + * know about. 
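Each entry of the descriptor table below tells the common ioctl code how to validate and move a request: SIOCSIWESSID, for instance, is a POINT request with token_size 1 and max_tokens IW_ESSID_MAX_SIZE + 1, and those bounds are what ioctl_standard_call() later checks the user buffer against. A user-space sketch of issuing such a request (illustrative; "wlan0" and the ESSID are placeholders, and in this generation of the API the length conventionally includes the trailing NUL):

#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/wireless.h>

/* Ask the driver bound to "wlan0" to join the network "MyNet". */
int set_essid(int skfd)
{
	struct iwreq wrq;
	char essid[IW_ESSID_MAX_SIZE + 1] = "MyNet";

	memset(&wrq, 0, sizeof(wrq));
	strncpy(wrq.ifr_ifrn.ifrn_name, "wlan0", IFNAMSIZ);
	wrq.u.essid.pointer = essid;
	wrq.u.essid.length  = strlen(essid) + 1;	/* NUL included here */
	wrq.u.essid.flags   = 1;			/* ESSID is active  */

	return ioctl(skfd, SIOCSIWESSID, &wrq);
}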
+ */ +static const struct iw_ioctl_description standard_ioctl[] = { +	[SIOCSIWCOMMIT	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_NULL, +	}, +	[SIOCGIWNAME	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_CHAR, +		.flags		= IW_DESCR_FLAG_DUMP, +	}, +	[SIOCSIWNWID	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +		.flags		= IW_DESCR_FLAG_EVENT, +	}, +	[SIOCGIWNWID	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +		.flags		= IW_DESCR_FLAG_DUMP, +	}, +	[SIOCSIWFREQ	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_FREQ, +		.flags		= IW_DESCR_FLAG_EVENT, +	}, +	[SIOCGIWFREQ	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_FREQ, +		.flags		= IW_DESCR_FLAG_DUMP, +	}, +	[SIOCSIWMODE	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_UINT, +		.flags		= IW_DESCR_FLAG_EVENT, +	}, +	[SIOCGIWMODE	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_UINT, +		.flags		= IW_DESCR_FLAG_DUMP, +	}, +	[SIOCSIWSENS	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCGIWSENS	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCSIWRANGE	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_NULL, +	}, +	[SIOCGIWRANGE	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= 1, +		.max_tokens	= sizeof(struct iw_range), +		.flags		= IW_DESCR_FLAG_DUMP, +	}, +	[SIOCSIWPRIV	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_NULL, +	}, +	[SIOCGIWPRIV	- SIOCIWFIRST] = { /* (handled directly by us) */ +		.header_type	= IW_HEADER_TYPE_NULL, +	}, +	[SIOCSIWSTATS	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_NULL, +	}, +	[SIOCGIWSTATS	- SIOCIWFIRST] = { /* (handled directly by us) */ +		.header_type	= IW_HEADER_TYPE_NULL, +		.flags		= IW_DESCR_FLAG_DUMP, +	}, +	[SIOCSIWSPY	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= sizeof(struct sockaddr), +		.max_tokens	= IW_MAX_SPY, +	}, +	[SIOCGIWSPY	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= sizeof(struct sockaddr) + +				  sizeof(struct iw_quality), +		.max_tokens	= IW_MAX_SPY, +	}, +	[SIOCSIWTHRSPY	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= sizeof(struct iw_thrspy), +		.min_tokens	= 1, +		.max_tokens	= 1, +	}, +	[SIOCGIWTHRSPY	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= sizeof(struct iw_thrspy), +		.min_tokens	= 1, +		.max_tokens	= 1, +	}, +	[SIOCSIWAP	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_ADDR, +	}, +	[SIOCGIWAP	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_ADDR, +		.flags		= IW_DESCR_FLAG_DUMP, +	}, +	[SIOCGIWAPLIST	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= sizeof(struct sockaddr) + +				  sizeof(struct iw_quality), +		.max_tokens	= IW_MAX_AP, +		.flags		= IW_DESCR_FLAG_NOMAX, +	}, +	[SIOCSIWSCAN	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCGIWSCAN	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= 1, +		.max_tokens	= IW_SCAN_MAX_DATA, +		.flags		= IW_DESCR_FLAG_NOMAX, +	}, +	[SIOCSIWESSID	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= 1, +		.max_tokens	= IW_ESSID_MAX_SIZE + 1, +		.flags		= IW_DESCR_FLAG_EVENT, +	}, +	[SIOCGIWESSID	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= 1, +		.max_tokens	= IW_ESSID_MAX_SIZE + 1, +		.flags		= IW_DESCR_FLAG_DUMP, +	}, +	[SIOCSIWNICKN	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= 1, +		.max_tokens	= IW_ESSID_MAX_SIZE + 1, +	}, +	
[SIOCGIWNICKN	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= 1, +		.max_tokens	= IW_ESSID_MAX_SIZE + 1, +	}, +	[SIOCSIWRATE	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCGIWRATE	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCSIWRTS	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCGIWRTS	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCSIWFRAG	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCGIWFRAG	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCSIWTXPOW	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCGIWTXPOW	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCSIWRETRY	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCGIWRETRY	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCSIWENCODE	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= 1, +		.max_tokens	= IW_ENCODING_TOKEN_MAX, +		.flags		= IW_DESCR_FLAG_EVENT | IW_DESCR_FLAG_RESTRICT, +	}, +	[SIOCGIWENCODE	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= 1, +		.max_tokens	= IW_ENCODING_TOKEN_MAX, +		.flags		= IW_DESCR_FLAG_DUMP | IW_DESCR_FLAG_RESTRICT, +	}, +	[SIOCSIWPOWER	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +	[SIOCGIWPOWER	- SIOCIWFIRST] = { +		.header_type	= IW_HEADER_TYPE_PARAM, +	}, +}; +static const int standard_ioctl_num = (sizeof(standard_ioctl) / +				       sizeof(struct iw_ioctl_description)); + +/* + * Meta-data about all the additional standard Wireless Extension events + * we know about. + */ +static const struct iw_ioctl_description standard_event[] = { +	[IWEVTXDROP	- IWEVFIRST] = { +		.header_type	= IW_HEADER_TYPE_ADDR, +	}, +	[IWEVQUAL	- IWEVFIRST] = { +		.header_type	= IW_HEADER_TYPE_QUAL, +	}, +	[IWEVCUSTOM	- IWEVFIRST] = { +		.header_type	= IW_HEADER_TYPE_POINT, +		.token_size	= 1, +		.max_tokens	= IW_CUSTOM_MAX, +	}, +	[IWEVREGISTERED	- IWEVFIRST] = { +		.header_type	= IW_HEADER_TYPE_ADDR, +	}, +	[IWEVEXPIRED	- IWEVFIRST] = { +		.header_type	= IW_HEADER_TYPE_ADDR,  +	}, +}; +static const int standard_event_num = (sizeof(standard_event) / +				       sizeof(struct iw_ioctl_description)); + +/* Size (in bytes) of the various private data types */ +static const char iw_priv_type_size[] = { +	0,				/* IW_PRIV_TYPE_NONE */ +	1,				/* IW_PRIV_TYPE_BYTE */ +	1,				/* IW_PRIV_TYPE_CHAR */ +	0,				/* Not defined */ +	sizeof(__u32),			/* IW_PRIV_TYPE_INT */ +	sizeof(struct iw_freq),		/* IW_PRIV_TYPE_FLOAT */ +	sizeof(struct sockaddr),	/* IW_PRIV_TYPE_ADDR */ +	0,				/* Not defined */ +}; + +/* Size (in bytes) of various events */ +static const int event_type_size[] = { +	IW_EV_LCP_LEN,			/* IW_HEADER_TYPE_NULL */ +	0, +	IW_EV_CHAR_LEN,			/* IW_HEADER_TYPE_CHAR */ +	0, +	IW_EV_UINT_LEN,			/* IW_HEADER_TYPE_UINT */ +	IW_EV_FREQ_LEN,			/* IW_HEADER_TYPE_FREQ */ +	IW_EV_ADDR_LEN,			/* IW_HEADER_TYPE_ADDR */ +	0, +	IW_EV_POINT_LEN,		/* Without variable payload */ +	IW_EV_PARAM_LEN,		/* IW_HEADER_TYPE_PARAM */ +	IW_EV_QUAL_LEN,			/* IW_HEADER_TYPE_QUAL */ +}; + +/************************ COMMON SUBROUTINES ************************/ +/* + * Stuff that may be used in various place or doesn't fit in one + * of the section below. + */ + +/* ---------------------------------------------------------------- */ +/* + * Return the driver handler associated with a specific Wireless Extension. 
+ * Called from various place, so make sure it remains efficient. + */ +static inline iw_handler get_handler(struct net_device *dev, +				     unsigned int cmd) +{ +	/* Don't "optimise" the following variable, it will crash */ +	unsigned int	index;		/* *MUST* be unsigned */ + +	/* Check if we have some wireless handlers defined */ +	if(dev->wireless_handlers == NULL) +		return NULL; + +	/* Try as a standard command */ +	index = cmd - SIOCIWFIRST; +	if(index < dev->wireless_handlers->num_standard) +		return dev->wireless_handlers->standard[index]; + +	/* Try as a private command */ +	index = cmd - SIOCIWFIRSTPRIV; +	if(index < dev->wireless_handlers->num_private) +		return dev->wireless_handlers->private[index]; + +	/* Not found */ +	return NULL; +} + +/* ---------------------------------------------------------------- */ +/* + * Get statistics out of the driver + */ +static inline struct iw_statistics *get_wireless_stats(struct net_device *dev) +{ +	/* New location */ +	if((dev->wireless_handlers != NULL) && +	   (dev->wireless_handlers->get_wireless_stats != NULL)) +		return dev->wireless_handlers->get_wireless_stats(dev); + +	/* Old location, will be phased out in next WE */ +	return (dev->get_wireless_stats ? +		dev->get_wireless_stats(dev) : +		(struct iw_statistics *) NULL); +} + +/* ---------------------------------------------------------------- */ +/* + * Call the commit handler in the driver + * (if exist and if conditions are right) + * + * Note : our current commit strategy is currently pretty dumb, + * but we will be able to improve on that... + * The goal is to try to agreagate as many changes as possible + * before doing the commit. Drivers that will define a commit handler + * are usually those that need a reset after changing parameters, so + * we want to minimise the number of reset. + * A cool idea is to use a timer : at each "set" command, we re-set the + * timer, when the timer eventually fires, we call the driver. + * Hopefully, more on that later. + * + * Also, I'm waiting to see how many people will complain about the + * netif_running(dev) test. I'm open on that one... + * Hopefully, the driver will remember to do a commit in "open()" ;-) + */ +static inline int call_commit_handler(struct net_device *	dev) +{ +	if((netif_running(dev)) && +	   (dev->wireless_handlers->standard[0] != NULL)) { +		/* Call the commit handler on the driver */ +		return dev->wireless_handlers->standard[0](dev, NULL, +							   NULL, NULL); +	} else +		return 0;		/* Command completed successfully */ +} + +/* ---------------------------------------------------------------- */ +/* + * Calculate size of private arguments + */ +static inline int get_priv_size(__u16	args) +{ +	int	num = args & IW_PRIV_SIZE_MASK; +	int	type = (args & IW_PRIV_TYPE_MASK) >> 12; + +	return num * iw_priv_type_size[type]; +} + +/* ---------------------------------------------------------------- */ +/* + * Re-calculate the size of private arguments + */ +static inline int adjust_priv_size(__u16		args, +				   union iwreq_data *	wrqu) +{ +	int	num = wrqu->data.length; +	int	max = args & IW_PRIV_SIZE_MASK; +	int	type = (args & IW_PRIV_TYPE_MASK) >> 12; + +	/* Make sure the driver doesn't goof up */ +	if (max < num) +		num = max; + +	return num * iw_priv_type_size[type]; +} + + +/******************** /proc/net/wireless SUPPORT ********************/ +/* + * The /proc/net/wireless file is a human readable user-space interface + * exporting various wireless specific statistics from the wireless devices. 
+ * This is the most popular part of the Wireless Extensions ;-) + * + * This interface is a pure clone of /proc/net/dev (in net/core/dev.c). + * The content of the file is basically the content of "struct iw_statistics". + */ + +#ifdef CONFIG_PROC_FS + +/* ---------------------------------------------------------------- */ +/* + * Print one entry (line) of /proc/net/wireless + */ +static __inline__ void wireless_seq_printf_stats(struct seq_file *seq, +						 struct net_device *dev) +{ +	/* Get stats from the driver */ +	struct iw_statistics *stats = get_wireless_stats(dev); + +	if (stats) { +		seq_printf(seq, "%6s: %04x  %3d%c  %3d%c  %3d%c  %6d %6d %6d " +				"%6d %6d   %6d\n", +			   dev->name, stats->status, stats->qual.qual, +			   stats->qual.updated & IW_QUAL_QUAL_UPDATED +			   ? '.' : ' ', +			   ((__u8) stats->qual.level), +			   stats->qual.updated & IW_QUAL_LEVEL_UPDATED +			   ? '.' : ' ', +			   ((__u8) stats->qual.noise), +			   stats->qual.updated & IW_QUAL_NOISE_UPDATED +			   ? '.' : ' ', +			   stats->discard.nwid, stats->discard.code, +			   stats->discard.fragment, stats->discard.retries, +			   stats->discard.misc, stats->miss.beacon); +		stats->qual.updated = 0; +	} +} + +/* ---------------------------------------------------------------- */ +/* + * Print info for /proc/net/wireless (print all entries) + */ +static int wireless_seq_show(struct seq_file *seq, void *v) +{ +	if (v == SEQ_START_TOKEN) +		seq_printf(seq, "Inter-| sta-|   Quality        |   Discarded " +				"packets               | Missed | WE\n" +				" face | tus | link level noise |  nwid  " +				"crypt   frag  retry   misc | beacon | %d\n", +			   WIRELESS_EXT); +	else +		wireless_seq_printf_stats(seq, v); +	return 0; +} + +extern void *dev_seq_start(struct seq_file *seq, loff_t *pos); +extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos); +extern void dev_seq_stop(struct seq_file *seq, void *v); + +static struct seq_operations wireless_seq_ops = { +	.start = dev_seq_start, +	.next  = dev_seq_next, +	.stop  = dev_seq_stop, +	.show  = wireless_seq_show, +}; + +static int wireless_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &wireless_seq_ops); +} + +static struct file_operations wireless_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = wireless_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release, +}; + +int __init wireless_proc_init(void) +{ +	if (!proc_net_fops_create("wireless", S_IRUGO, &wireless_seq_fops)) +		return -ENOMEM; + +	return 0; +} +#endif	/* CONFIG_PROC_FS */ + +/************************** IOCTL SUPPORT **************************/ +/* + * The original user space API to configure all those Wireless Extensions + * is through IOCTLs. + * In there, we check if we need to call the new driver API (iw_handler) + * or just call the driver ioctl handler. + */ + +/* ---------------------------------------------------------------- */ +/* + *	Allow programatic access to /proc/net/wireless even if /proc + *	doesn't exist... Also more efficient... 
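dev_iwstats() below is that programmatic path: user space supplies a buffer through SIOCGIWSTATS and receives the same struct iw_statistics that feeds /proc/net/wireless. A sketch of the calling side (illustrative; the interface name is whatever the caller passes in, and a non-zero flags field asks the kernel to clear the updated bits, matching the test in dev_iwstats()):

#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/wireless.h>

/* Fetch link quality and discard counters for one interface. */
int get_iw_stats(int skfd, const char *ifname, struct iw_statistics *stats)
{
	struct iwreq wrq;

	memset(&wrq, 0, sizeof(wrq));
	strncpy(wrq.ifr_ifrn.ifrn_name, ifname, IFNAMSIZ);
	wrq.u.data.pointer = (void *)stats;
	wrq.u.data.length  = sizeof(*stats);
	wrq.u.data.flags   = 1;		/* clear the 'updated' flags afterwards */

	return ioctl(skfd, SIOCGIWSTATS, &wrq);
}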
+ */ +static inline int dev_iwstats(struct net_device *dev, struct ifreq *ifr) +{ +	/* Get stats from the driver */ +	struct iw_statistics *stats; + +	stats = get_wireless_stats(dev); +	if (stats != (struct iw_statistics *) NULL) { +		struct iwreq *	wrq = (struct iwreq *)ifr; + +		/* Copy statistics to the user buffer */ +		if(copy_to_user(wrq->u.data.pointer, stats, +				sizeof(struct iw_statistics))) +			return -EFAULT; + +		/* Check if we need to clear the update flag */ +		if(wrq->u.data.flags != 0) +			stats->qual.updated = 0; +		return 0; +	} else +		return -EOPNOTSUPP; +} + +/* ---------------------------------------------------------------- */ +/* + * Export the driver private handler definition + * They will be picked up by tools like iwpriv... + */ +static inline int ioctl_export_private(struct net_device *	dev, +				       struct ifreq *		ifr) +{ +	struct iwreq *				iwr = (struct iwreq *) ifr; + +	/* Check if the driver has something to export */ +	if((dev->wireless_handlers->num_private_args == 0) || +	   (dev->wireless_handlers->private_args == NULL)) +		return -EOPNOTSUPP; + +	/* Check NULL pointer */ +	if(iwr->u.data.pointer == NULL) +		return -EFAULT; + +	/* Check if there is enough buffer up there */ +	if(iwr->u.data.length < dev->wireless_handlers->num_private_args) { +		/* User space can't know in advance how large the buffer +		 * needs to be. Give it a hint, so that we can support +		 * any size buffer we want somewhat efficiently... */ +		iwr->u.data.length = dev->wireless_handlers->num_private_args; +		return -E2BIG; +	} + +	/* Set the number of available ioctls. */ +	iwr->u.data.length = dev->wireless_handlers->num_private_args; + +	/* Copy structure to the user buffer. */ +	if (copy_to_user(iwr->u.data.pointer, +			 dev->wireless_handlers->private_args, +			 sizeof(struct iw_priv_args) * iwr->u.data.length)) +		return -EFAULT; + +	return 0; +} + +/* ---------------------------------------------------------------- */ +/* + * Wrapper to call a standard Wireless Extension handler. + * We do various checks and also take care of moving data between + * user space and kernel space. + */ +static inline int ioctl_standard_call(struct net_device *	dev, +				      struct ifreq *		ifr, +				      unsigned int		cmd, +				      iw_handler		handler) +{ +	struct iwreq *				iwr = (struct iwreq *) ifr; +	const struct iw_ioctl_description *	descr; +	struct iw_request_info			info; +	int					ret = -EINVAL; + +	/* Get the description of the IOCTL */ +	if((cmd - SIOCIWFIRST) >= standard_ioctl_num) +		return -EOPNOTSUPP; +	descr = &(standard_ioctl[cmd - SIOCIWFIRST]); + +#ifdef WE_IOCTL_DEBUG +	printk(KERN_DEBUG "%s (WE) : Found standard handler for 0x%04X\n", +	       ifr->ifr_name, cmd); +	printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens); +#endif	/* WE_IOCTL_DEBUG */ + +	/* Prepare the call */ +	info.cmd = cmd; +	info.flags = 0; + +	/* Check if we have a pointer to user space data or not */ +	if(descr->header_type != IW_HEADER_TYPE_POINT) { + +		/* No extra arguments. 
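/*
 * The matching userspace side of ioctl_export_private(): ask for the table
 * with SIOCGIWPRIV and, if the kernel answers -E2BIG, retry with the count
 * it wrote back into iwr.u.data.length. A sketch only; "eth0" is a
 * placeholder interface name.
 */

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/wireless.h>

int main(int argc, char **argv)
{
	const char *ifname = argc > 1 ? argv[1] : "eth0";
	struct iw_priv_args *priv;
	struct iwreq iwr;
	int fd, i, n = 32;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return 1;

	for (;;) {
		priv = calloc(n, sizeof(*priv));
		if (!priv)
			return 1;
		memset(&iwr, 0, sizeof(iwr));
		strncpy(iwr.ifr_name, ifname, IFNAMSIZ);
		iwr.u.data.pointer = priv;
		iwr.u.data.length = n;		/* how many entries fit */
		iwr.u.data.flags = 0;
		if (ioctl(fd, SIOCGIWPRIV, &iwr) == 0)
			break;
		free(priv);
		if (errno != E2BIG) {
			perror("SIOCGIWPRIV");
			return 1;
		}
		n = iwr.u.data.length;		/* the hint from the kernel */
	}

	for (i = 0; i < iwr.u.data.length; i++)
		printf("0x%04X  set 0x%04X  get 0x%04X  %s\n",
		       priv[i].cmd, priv[i].set_args, priv[i].get_args,
		       priv[i].name);

	free(priv);
	close(fd);
	return 0;
}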
Trivial to handle */ +		ret = handler(dev, &info, &(iwr->u), NULL); + +#ifdef WE_SET_EVENT +		/* Generate an event to notify listeners of the change */ +		if((descr->flags & IW_DESCR_FLAG_EVENT) && +		   ((ret == 0) || (ret == -EIWCOMMIT))) +			wireless_send_event(dev, cmd, &(iwr->u), NULL); +#endif	/* WE_SET_EVENT */ +	} else { +		char *	extra; +		int	extra_size; +		int	user_length = 0; +		int	err; + +		/* Calculate space needed by arguments. Always allocate +		 * for max space. Easier, and won't last long... */ +		extra_size = descr->max_tokens * descr->token_size; + +		/* Check what user space is giving us */ +		if(IW_IS_SET(cmd)) { +			/* Check NULL pointer */ +			if((iwr->u.data.pointer == NULL) && +			   (iwr->u.data.length != 0)) +				return -EFAULT; +			/* Check if number of token fits within bounds */ +			if(iwr->u.data.length > descr->max_tokens) +				return -E2BIG; +			if(iwr->u.data.length < descr->min_tokens) +				return -EINVAL; +		} else { +			/* Check NULL pointer */ +			if(iwr->u.data.pointer == NULL) +				return -EFAULT; +			/* Save user space buffer size for checking */ +			user_length = iwr->u.data.length; + +			/* Don't check if user_length > max to allow forward +			 * compatibility. The test user_length < min is +			 * implied by the test at the end. */ + +			/* Support for very large requests */ +			if((descr->flags & IW_DESCR_FLAG_NOMAX) && +			   (user_length > descr->max_tokens)) { +				/* Allow userspace to GET more than max so +				 * we can support any size GET requests. +				 * There is still a limit : -ENOMEM. */ +				extra_size = user_length * descr->token_size; +				/* Note : user_length is originally a __u16, +				 * and token_size is controlled by us, +				 * so extra_size won't get negative and +				 * won't overflow... 
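/*
 * From userspace, the length checks above are what a GET on a POINT-type
 * request runs into: the caller passes a buffer and its capacity, and the
 * kernel answers -E2BIG if the reply would not fit. SIOCGIWRANGE is the
 * classic example; a sketch, with "eth0" as a placeholder name.
 */

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/wireless.h>

int main(void)
{
	struct iw_range range;
	struct iwreq iwr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	memset(&range, 0, sizeof(range));
	memset(&iwr, 0, sizeof(iwr));
	strncpy(iwr.ifr_name, "eth0", IFNAMSIZ);
	iwr.u.data.pointer = &range;		/* user buffer      */
	iwr.u.data.length = sizeof(range);	/* ... and its size */
	iwr.u.data.flags = 0;

	if (ioctl(fd, SIOCGIWRANGE, &iwr) == 0)
		printf("driver speaks WE %d, reports %d frequencies\n",
		       range.we_version_compiled, range.num_frequency);
	else
		perror("SIOCGIWRANGE");

	close(fd);
	return 0;
}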
*/ +			} +		} + +#ifdef WE_IOCTL_DEBUG +		printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n", +		       dev->name, extra_size); +#endif	/* WE_IOCTL_DEBUG */ + +		/* Create the kernel buffer */ +		extra = kmalloc(extra_size, GFP_KERNEL); +		if (extra == NULL) { +			return -ENOMEM; +		} + +		/* If it is a SET, get all the extra data in here */ +		if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) { +			err = copy_from_user(extra, iwr->u.data.pointer, +					     iwr->u.data.length * +					     descr->token_size); +			if (err) { +				kfree(extra); +				return -EFAULT; +			} +#ifdef WE_IOCTL_DEBUG +			printk(KERN_DEBUG "%s (WE) : Got %d bytes\n", +			       dev->name, +			       iwr->u.data.length * descr->token_size); +#endif	/* WE_IOCTL_DEBUG */ +		} + +		/* Call the handler */ +		ret = handler(dev, &info, &(iwr->u), extra); + +		/* If we have something to return to the user */ +		if (!ret && IW_IS_GET(cmd)) { +			/* Check if there is enough buffer up there */ +			if(user_length < iwr->u.data.length) { +				kfree(extra); +				return -E2BIG; +			} + +			err = copy_to_user(iwr->u.data.pointer, extra, +					   iwr->u.data.length * +					   descr->token_size); +			if (err) +				ret =  -EFAULT;				    +#ifdef WE_IOCTL_DEBUG +			printk(KERN_DEBUG "%s (WE) : Wrote %d bytes\n", +			       dev->name, +			       iwr->u.data.length * descr->token_size); +#endif	/* WE_IOCTL_DEBUG */ +		} + +#ifdef WE_SET_EVENT +		/* Generate an event to notify listeners of the change */ +		if((descr->flags & IW_DESCR_FLAG_EVENT) && +		   ((ret == 0) || (ret == -EIWCOMMIT))) { +			if(descr->flags & IW_DESCR_FLAG_RESTRICT) +				/* If the event is restricted, don't +				 * export the payload */ +				wireless_send_event(dev, cmd, &(iwr->u), NULL); +			else +				wireless_send_event(dev, cmd, &(iwr->u), +						    extra); +		} +#endif	/* WE_SET_EVENT */ + +		/* Cleanup - I told you it wasn't that long ;-) */ +		kfree(extra); +	} + +	/* Call commit handler if needed and defined */ +	if(ret == -EIWCOMMIT) +		ret = call_commit_handler(dev); + +	/* Here, we will generate the appropriate event if needed */ + +	return ret; +} + +/* ---------------------------------------------------------------- */ +/* + * Wrapper to call a private Wireless Extension handler. + * We do various checks and also take care of moving data between + * user space and kernel space. + * It's not as nice and slimline as the standard wrapper. The cause + * is struct iw_priv_args, which was not really designed for the + * job we are going here. + * + * IMPORTANT : This function prevent to set and get data on the same + * IOCTL and enforce the SET/GET convention. Not doing it would be + * far too hairy... + * If you need to set and get data at the same time, please don't use + * a iw_handler but process it in your ioctl handler (i.e. use the + * old driver API). 
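/*
 * Seen from the driver side, the handler(dev, &info, &(iwr->u), extra)
 * call above lands in a function with the iw_handler prototype. A
 * hypothetical set handler (mydrv_set_essid is a made-up name) showing
 * how returning -EIWCOMMIT defers the hardware reset to the commit
 * handler:
 */

#include <linux/netdevice.h>
#include <linux/wireless.h>
#include <net/iw_handler.h>

static int mydrv_set_essid(struct net_device *dev,
			   struct iw_request_info *info,
			   union iwreq_data *wrqu, char *extra)
{
	/* 'extra' already holds the wrqu->essid.length bytes that
	 * ioctl_standard_call() copied in from user space. */

	/* ... remember the new ESSID in the driver private data ... */

	/* Ask the core to run the commit handler (standard[0]) so the
	 * hardware is reprogrammed once, after all pending changes. */
	return -EIWCOMMIT;
}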
+ */ +static inline int ioctl_private_call(struct net_device *	dev, +				     struct ifreq *		ifr, +				     unsigned int		cmd, +				     iw_handler		handler) +{ +	struct iwreq *			iwr = (struct iwreq *) ifr; +	const struct iw_priv_args *	descr = NULL; +	struct iw_request_info		info; +	int				extra_size = 0; +	int				i; +	int				ret = -EINVAL; + +	/* Get the description of the IOCTL */ +	for(i = 0; i < dev->wireless_handlers->num_private_args; i++) +		if(cmd == dev->wireless_handlers->private_args[i].cmd) { +			descr = &(dev->wireless_handlers->private_args[i]); +			break; +		} + +#ifdef WE_IOCTL_DEBUG +	printk(KERN_DEBUG "%s (WE) : Found private handler for 0x%04X\n", +	       ifr->ifr_name, cmd); +	if(descr) { +		printk(KERN_DEBUG "%s (WE) : Name %s, set %X, get %X\n", +		       dev->name, descr->name, +		       descr->set_args, descr->get_args); +	} +#endif	/* WE_IOCTL_DEBUG */ + +	/* Compute the size of the set/get arguments */ +	if(descr != NULL) { +		if(IW_IS_SET(cmd)) { +			int	offset = 0;	/* For sub-ioctls */ +			/* Check for sub-ioctl handler */ +			if(descr->name[0] == '\0') +				/* Reserve one int for sub-ioctl index */ +				offset = sizeof(__u32); + +			/* Size of set arguments */ +			extra_size = get_priv_size(descr->set_args); + +			/* Does it fits in iwr ? */ +			if((descr->set_args & IW_PRIV_SIZE_FIXED) && +			   ((extra_size + offset) <= IFNAMSIZ)) +				extra_size = 0; +		} else { +			/* Size of get arguments */ +			extra_size = get_priv_size(descr->get_args); + +			/* Does it fits in iwr ? */ +			if((descr->get_args & IW_PRIV_SIZE_FIXED) && +			   (extra_size <= IFNAMSIZ)) +				extra_size = 0; +		} +	} + +	/* Prepare the call */ +	info.cmd = cmd; +	info.flags = 0; + +	/* Check if we have a pointer to user space data or not. */ +	if(extra_size == 0) { +		/* No extra arguments. Trivial to handle */ +		ret = handler(dev, &info, &(iwr->u), (char *) &(iwr->u)); +	} else { +		char *	extra; +		int	err; + +		/* Check what user space is giving us */ +		if(IW_IS_SET(cmd)) { +			/* Check NULL pointer */ +			if((iwr->u.data.pointer == NULL) && +			   (iwr->u.data.length != 0)) +				return -EFAULT; + +			/* Does it fits within bounds ? */ +			if(iwr->u.data.length > (descr->set_args & +						 IW_PRIV_SIZE_MASK)) +				return -E2BIG; +		} else { +			/* Check NULL pointer */ +			if(iwr->u.data.pointer == NULL) +				return -EFAULT; +		} + +#ifdef WE_IOCTL_DEBUG +		printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n", +		       dev->name, extra_size); +#endif	/* WE_IOCTL_DEBUG */ + +		/* Always allocate for max space. Easier, and won't last +		 * long... */ +		extra = kmalloc(extra_size, GFP_KERNEL); +		if (extra == NULL) { +			return -ENOMEM; +		} + +		/* If it is a SET, get all the extra data in here */ +		if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) { +			err = copy_from_user(extra, iwr->u.data.pointer, +					     extra_size); +			if (err) { +				kfree(extra); +				return -EFAULT; +			} +#ifdef WE_IOCTL_DEBUG +			printk(KERN_DEBUG "%s (WE) : Got %d elem\n", +			       dev->name, iwr->u.data.length); +#endif	/* WE_IOCTL_DEBUG */ +		} + +		/* Call the handler */ +		ret = handler(dev, &info, &(iwr->u), extra); + +		/* If we have something to return to the user */ +		if (!ret && IW_IS_GET(cmd)) { + +			/* Adjust for the actual length if it's variable, +			 * avoid leaking kernel bits outside. 
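/*
 * The size arithmetic above is driven by the iw_priv_args table that the
 * driver exports (and that SIOCGIWPRIV hands to iwpriv). A hypothetical
 * example (the mydrv_* names are made up): one fixed __u32 is 4 bytes,
 * which fits in IFNAMSIZ, so it travels inline in struct iwreq and
 * extra_size stays 0 here. Even command numbers are SET, odd ones GET,
 * matching IW_IS_SET()/IW_IS_GET().
 */

#include <linux/wireless.h>

#define MYDRV_SET_DEBUG		(SIOCIWFIRSTPRIV + 0)	/* even -> SET */
#define MYDRV_GET_DEBUG		(SIOCIWFIRSTPRIV + 1)	/* odd  -> GET */

static const struct iw_priv_args mydrv_priv_args[] = {
	{ MYDRV_SET_DEBUG,
	  IW_PRIV_TYPE_INT | IW_PRIV_SIZE_FIXED | 1, 0, "set_debug" },
	{ MYDRV_GET_DEBUG,
	  0, IW_PRIV_TYPE_INT | IW_PRIV_SIZE_FIXED | 1, "get_debug" },
};

/* Plugged in through iw_handler_def.private_args / .num_private_args,
 * together with the matching entries in the .private handler array. */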
*/ +			if (!(descr->get_args & IW_PRIV_SIZE_FIXED)) { +				extra_size = adjust_priv_size(descr->get_args, +							      &(iwr->u)); +			} + +			err = copy_to_user(iwr->u.data.pointer, extra, +					   extra_size); +			if (err) +				ret =  -EFAULT;				    +#ifdef WE_IOCTL_DEBUG +			printk(KERN_DEBUG "%s (WE) : Wrote %d elem\n", +			       dev->name, iwr->u.data.length); +#endif	/* WE_IOCTL_DEBUG */ +		} + +		/* Cleanup - I told you it wasn't that long ;-) */ +		kfree(extra); +	} + + +	/* Call commit handler if needed and defined */ +	if(ret == -EIWCOMMIT) +		ret = call_commit_handler(dev); + +	return ret; +} + +/* ---------------------------------------------------------------- */ +/* + * Main IOCTl dispatcher. Called from the main networking code + * (dev_ioctl() in net/core/dev.c). + * Check the type of IOCTL and call the appropriate wrapper... + */ +int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd) +{ +	struct net_device *dev; +	iw_handler	handler; + +	/* Permissions are already checked in dev_ioctl() before calling us. +	 * The copy_to/from_user() of ifr is also dealt with in there */ + +	/* Make sure the device exist */ +	if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL) +		return -ENODEV; + +	/* A bunch of special cases, then the generic case... +	 * Note that 'cmd' is already filtered in dev_ioctl() with +	 * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */ +	switch(cmd)  +	{ +		case SIOCGIWSTATS: +			/* Get Wireless Stats */ +			return dev_iwstats(dev, ifr); + +		case SIOCGIWPRIV: +			/* Check if we have some wireless handlers defined */ +			if(dev->wireless_handlers != NULL) { +				/* We export to user space the definition of +				 * the private handler ourselves */ +				return ioctl_export_private(dev, ifr); +			} +			// ## Fall-through for old API ## +		default: +			/* Generic IOCTL */ +			/* Basic check */ +			if (!netif_device_present(dev)) +				return -ENODEV; +			/* New driver API : try to find the handler */ +			handler = get_handler(dev, cmd); +			if(handler != NULL) { +				/* Standard and private are not the same */ +				if(cmd < SIOCIWFIRSTPRIV) +					return ioctl_standard_call(dev, +								   ifr, +								   cmd, +								   handler); +				else +					return ioctl_private_call(dev, +								  ifr, +								  cmd, +								  handler); +			} +			/* Old driver API : call driver ioctl handler */ +			if (dev->do_ioctl) { +				return dev->do_ioctl(dev, ifr, cmd); +			} +			return -EOPNOTSUPP; +	} +	/* Not reached */ +	return -EINVAL; +} + +/************************* EVENT PROCESSING *************************/ +/* + * Process events generated by the wireless layer or the driver. + * Most often, the event will be propagated through rtnetlink + */ + +#ifdef WE_EVENT_NETLINK +/* "rtnl" is defined in net/core/rtnetlink.c, but we need it here. + * It is declared in <linux/rtnetlink.h> */ + +/* ---------------------------------------------------------------- */ +/* + * Fill a rtnetlink message with our event data. + * Note that we propage only the specified event and don't dump the + * current wireless config. Dumping the wireless config is far too + * expensive (for each parameter, the driver need to query the hardware). 
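/*
 * The SIOCGIWSTATS special case above (dev_iwstats) can be driven from
 * userspace like this; a non-zero flags field asks the kernel to clear
 * the "updated" markers, exactly as that code shows. A sketch, with
 * "eth0" as a placeholder name.
 */

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/wireless.h>

int main(void)
{
	struct iw_statistics stats;
	struct iwreq iwr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	memset(&stats, 0, sizeof(stats));
	memset(&iwr, 0, sizeof(iwr));
	strncpy(iwr.ifr_name, "eth0", IFNAMSIZ);
	iwr.u.data.pointer = &stats;
	iwr.u.data.length = sizeof(stats);
	iwr.u.data.flags = 1;		/* non-zero: clear updated flags */

	if (ioctl(fd, SIOCGIWSTATS, &iwr) == 0)
		printf("status 0x%04x  qual %u  level %u  noise %u\n",
		       stats.status, stats.qual.qual,
		       stats.qual.level, stats.qual.noise);
	else
		perror("SIOCGIWSTATS");

	close(fd);
	return 0;
}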
+ */ +static inline int rtnetlink_fill_iwinfo(struct sk_buff *	skb, +					struct net_device *	dev, +					int			type, +					char *			event, +					int			event_len) +{ +	struct ifinfomsg *r; +	struct nlmsghdr  *nlh; +	unsigned char	 *b = skb->tail; + +	nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r)); +	r = NLMSG_DATA(nlh); +	r->ifi_family = AF_UNSPEC; +	r->ifi_type = dev->type; +	r->ifi_index = dev->ifindex; +	r->ifi_flags = dev->flags; +	r->ifi_change = 0;	/* Wireless changes don't affect those flags */ + +	/* Add the wireless events in the netlink packet */ +	RTA_PUT(skb, IFLA_WIRELESS, +		event_len, event); + +	nlh->nlmsg_len = skb->tail - b; +	return skb->len; + +nlmsg_failure: +rtattr_failure: +	skb_trim(skb, b - skb->data); +	return -1; +} + +/* ---------------------------------------------------------------- */ +/* + * Create and broadcast and send it on the standard rtnetlink socket + * This is a pure clone rtmsg_ifinfo() in net/core/rtnetlink.c + * Andrzej Krzysztofowicz mandated that I used a IFLA_XXX field + * within a RTM_NEWLINK event. + */ +static inline void rtmsg_iwinfo(struct net_device *	dev, +				char *			event, +				int			event_len) +{ +	struct sk_buff *skb; +	int size = NLMSG_GOODSIZE; + +	skb = alloc_skb(size, GFP_ATOMIC); +	if (!skb) +		return; + +	if (rtnetlink_fill_iwinfo(skb, dev, RTM_NEWLINK, +				  event, event_len) < 0) { +		kfree_skb(skb); +		return; +	} +	NETLINK_CB(skb).dst_groups = RTMGRP_LINK; +	netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_ATOMIC); +} +#endif	/* WE_EVENT_NETLINK */ + +/* ---------------------------------------------------------------- */ +/* + * Main event dispatcher. Called from other parts and drivers. + * Send the event on the appropriate channels. + * May be called from interrupt context. + */ +void wireless_send_event(struct net_device *	dev, +			 unsigned int		cmd, +			 union iwreq_data *	wrqu, +			 char *			extra) +{ +	const struct iw_ioctl_description *	descr = NULL; +	int extra_len = 0; +	struct iw_event  *event;		/* Mallocated whole event */ +	int event_len;				/* Its size */ +	int hdr_len;				/* Size of the event header */ +	/* Don't "optimise" the following variable, it will crash */ +	unsigned	cmd_index;		/* *MUST* be unsigned */ + +	/* Get the description of the IOCTL */ +	if(cmd <= SIOCIWLAST) { +		cmd_index = cmd - SIOCIWFIRST; +		if(cmd_index < standard_ioctl_num) +			descr = &(standard_ioctl[cmd_index]); +	} else { +		cmd_index = cmd - IWEVFIRST; +		if(cmd_index < standard_event_num) +			descr = &(standard_event[cmd_index]); +	} +	/* Don't accept unknown events */ +	if(descr == NULL) { +		/* Note : we don't return an error to the driver, because +		 * the driver would not know what to do about it. It can't +		 * return an error to the user, because the event is not +		 * initiated by a user request. +		 * The best the driver could do is to log an error message. +		 * We will do it ourselves instead... 
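/*
 * On the receiving end of rtmsg_iwinfo(), any process can subscribe to
 * RTMGRP_LINK and watch for the IFLA_WIRELESS attribute inside RTM_NEWLINK
 * messages. A minimal userspace listener sketch, assuming only the
 * standard rtnetlink macros from <linux/rtnetlink.h>:
 */

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct sockaddr_nl snl;
	char buf[4096];
	int fd, len;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return 1;

	memset(&snl, 0, sizeof(snl));
	snl.nl_family = AF_NETLINK;
	snl.nl_groups = RTMGRP_LINK;	/* the group rtmsg_iwinfo() targets */
	if (bind(fd, (struct sockaddr *)&snl, sizeof(snl)) < 0)
		return 1;

	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) {
			struct ifinfomsg *ifi;
			struct rtattr *rta;
			int alen;

			if (nlh->nlmsg_type != RTM_NEWLINK)
				continue;
			ifi = NLMSG_DATA(nlh);
			alen = IFLA_PAYLOAD(nlh);
			for (rta = IFLA_RTA(ifi); RTA_OK(rta, alen);
			     rta = RTA_NEXT(rta, alen))
				if (rta->rta_type == IFLA_WIRELESS)
					printf("wireless event on ifindex %d, "
					       "%d bytes of iw_event data\n",
					       ifi->ifi_index,
					       (int)RTA_PAYLOAD(rta));
		}
	}
	close(fd);
	return 0;
}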
+		 */ +	  	printk(KERN_ERR "%s (WE) : Invalid/Unknown Wireless Event (0x%04X)\n", +		       dev->name, cmd); +		return; +	} +#ifdef WE_EVENT_DEBUG +	printk(KERN_DEBUG "%s (WE) : Got event 0x%04X\n", +	       dev->name, cmd); +	printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens); +#endif	/* WE_EVENT_DEBUG */ + +	/* Check extra parameters and set extra_len */ +	if(descr->header_type == IW_HEADER_TYPE_POINT) { +		/* Check if number of token fits within bounds */ +		if(wrqu->data.length > descr->max_tokens) { +		  	printk(KERN_ERR "%s (WE) : Wireless Event too big (%d)\n", dev->name, wrqu->data.length); +			return; +		} +		if(wrqu->data.length < descr->min_tokens) { +		  	printk(KERN_ERR "%s (WE) : Wireless Event too small (%d)\n", dev->name, wrqu->data.length); +			return; +		} +		/* Calculate extra_len - extra is NULL for restricted events */ +		if(extra != NULL) +			extra_len = wrqu->data.length * descr->token_size; +#ifdef WE_EVENT_DEBUG +		printk(KERN_DEBUG "%s (WE) : Event 0x%04X, tokens %d, extra_len %d\n", dev->name, cmd, wrqu->data.length, extra_len); +#endif	/* WE_EVENT_DEBUG */ +	} + +	/* Total length of the event */ +	hdr_len = event_type_size[descr->header_type]; +	event_len = hdr_len + extra_len; + +#ifdef WE_EVENT_DEBUG +	printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, event_len %d\n", dev->name, cmd, hdr_len, event_len); +#endif	/* WE_EVENT_DEBUG */ + +	/* Create temporary buffer to hold the event */ +	event = kmalloc(event_len, GFP_ATOMIC); +	if(event == NULL) +		return; + +	/* Fill event */ +	event->len = event_len; +	event->cmd = cmd; +	memcpy(&event->u, wrqu, hdr_len - IW_EV_LCP_LEN); +	if(extra != NULL) +		memcpy(((char *) event) + hdr_len, extra, extra_len); + +#ifdef WE_EVENT_NETLINK +	/* rtnetlink event channel */ +	rtmsg_iwinfo(dev, (char *) event, event_len); +#endif	/* WE_EVENT_NETLINK */ + +	/* Cleanup */ +	kfree(event); + +	return;		/* Always success, I guess ;-) */ +} + +/********************** ENHANCED IWSPY SUPPORT **********************/ +/* + * In the old days, the driver was handling spy support all by itself. + * Now, the driver can delegate this task to Wireless Extensions. + * It needs to use those standard spy iw_handler in struct iw_handler_def, + * push data to us via wireless_spy_update() and include struct iw_spy_data + * in its private part (and advertise it in iw_handler_def->spy_offset). + * One of the main advantage of centralising spy support here is that + * it becomes much easier to improve and extend it without having to touch + * the drivers. One example is the addition of the Spy-Threshold events. + */ + +/* ---------------------------------------------------------------- */ +/* + * Return the pointer to the spy data in the driver. + * Because this is called on the Rx path via wireless_spy_update(), + * we want it to be efficient... + */ +static inline struct iw_spy_data * get_spydata(struct net_device *dev) +{ +	/* This is the new way */ +	if(dev->wireless_data) +		return(dev->wireless_data->spy_data); + +	/* This is the old way. Doesn't work for multi-headed drivers. +	 * It will be removed in the next version of WE. 
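/*
 * The payload carried by IFLA_WIRELESS (see the listener sketch further
 * up) is simply the buffer built here: packed struct iw_event records,
 * each starting with its own length. A userspace walker, assuming only
 * struct iw_event and IW_EV_LCP_LEN from <linux/wireless.h>;
 * dump_iw_events() is a made-up name.
 */

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/wireless.h>

static void dump_iw_events(const char *data, int len)
{
	const char *pos = data;
	const char *end = data + len;

	while (pos + IW_EV_LCP_LEN <= end) {
		struct iw_event ev;

		/* Copy the fixed header; the stream is packed, not aligned */
		memcpy(&ev, pos, IW_EV_LCP_LEN);
		if (ev.len < IW_EV_LCP_LEN || pos + ev.len > end)
			break;			/* truncated or corrupt */
		printf("event cmd 0x%04X, %u bytes\n",
		       (unsigned)ev.cmd, (unsigned)ev.len);
		pos += ev.len;
	}
}

int main(void)
{
	/* Fabricate one header-only event just to exercise the walker */
	char buf[sizeof(struct iw_event)];
	struct iw_event ev;

	memset(&ev, 0, sizeof(ev));
	ev.cmd = SIOCGIWAP;
	ev.len = IW_EV_LCP_LEN;
	memcpy(buf, &ev, sizeof(ev));
	dump_iw_events(buf, IW_EV_LCP_LEN);
	return 0;
}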
*/ +	return (dev->priv + dev->wireless_handlers->spy_offset); +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : set Spy List + */ +int iw_handler_set_spy(struct net_device *	dev, +		       struct iw_request_info *	info, +		       union iwreq_data *	wrqu, +		       char *			extra) +{ +	struct iw_spy_data *	spydata = get_spydata(dev); +	struct sockaddr *	address = (struct sockaddr *) extra; + +	if(!dev->wireless_data) +		/* Help user know that driver needs updating */ +		printk(KERN_DEBUG "%s (WE) : Driver using old/buggy spy support, please fix driver !\n", +		       dev->name); +	/* Make sure driver is not buggy or using the old API */ +	if(!spydata) +		return -EOPNOTSUPP; + +	/* Disable spy collection while we copy the addresses. +	 * While we copy addresses, any call to wireless_spy_update() +	 * will NOP. This is OK, as anyway the addresses are changing. */ +	spydata->spy_number = 0; + +	/* We want to operate without locking, because wireless_spy_update() +	 * most likely will happen in the interrupt handler, and therefore +	 * have its own locking constraints and needs performance. +	 * The rtnl_lock() make sure we don't race with the other iw_handlers. +	 * This make sure wireless_spy_update() "see" that the spy list +	 * is temporarily disabled. */ +	wmb(); + +	/* Are there are addresses to copy? */ +	if(wrqu->data.length > 0) { +		int i; + +		/* Copy addresses */ +		for(i = 0; i < wrqu->data.length; i++) +			memcpy(spydata->spy_address[i], address[i].sa_data, +			       ETH_ALEN); +		/* Reset stats */ +		memset(spydata->spy_stat, 0, +		       sizeof(struct iw_quality) * IW_MAX_SPY); + +#ifdef WE_SPY_DEBUG +		printk(KERN_DEBUG "iw_handler_set_spy() :  offset %ld, spydata %p, num %d\n", dev->wireless_handlers->spy_offset, spydata, wrqu->data.length); +		for (i = 0; i < wrqu->data.length; i++) +			printk(KERN_DEBUG +			       "%02X:%02X:%02X:%02X:%02X:%02X \n", +			       spydata->spy_address[i][0], +			       spydata->spy_address[i][1], +			       spydata->spy_address[i][2], +			       spydata->spy_address[i][3], +			       spydata->spy_address[i][4], +			       spydata->spy_address[i][5]); +#endif	/* WE_SPY_DEBUG */ +	} + +	/* Make sure above is updated before re-enabling */ +	wmb(); + +	/* Enable addresses */ +	spydata->spy_number = wrqu->data.length; + +	return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : get Spy List + */ +int iw_handler_get_spy(struct net_device *	dev, +		       struct iw_request_info *	info, +		       union iwreq_data *	wrqu, +		       char *			extra) +{ +	struct iw_spy_data *	spydata = get_spydata(dev); +	struct sockaddr *	address = (struct sockaddr *) extra; +	int			i; + +	/* Make sure driver is not buggy or using the old API */ +	if(!spydata) +		return -EOPNOTSUPP; + +	wrqu->data.length = spydata->spy_number; + +	/* Copy addresses. */ +	for(i = 0; i < spydata->spy_number; i++) 	{ +		memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN); +		address[i].sa_family = AF_UNIX; +	} +	/* Copy stats to the user buffer (just after). */ +	if(spydata->spy_number > 0) +		memcpy(extra  + (sizeof(struct sockaddr) *spydata->spy_number), +		       spydata->spy_stat, +		       sizeof(struct iw_quality) * spydata->spy_number); +	/* Reset updated flags. 
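/*
 * The two handlers above are driven from userspace with SIOCSIWSPY and
 * SIOCGIWSPY: an array of sockaddrs goes in, and on GET the sockaddrs
 * come back first with the iw_quality records packed right behind them,
 * matching the memcpy() layout in iw_handler_get_spy(). A sketch; the MAC
 * address and "eth0" are placeholders.
 */

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/if_arp.h>
#include <linux/wireless.h>

int main(void)
{
	unsigned char mac[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	struct sockaddr addr[IW_MAX_SPY];
	char buf[IW_MAX_SPY *
		 (sizeof(struct sockaddr) + sizeof(struct iw_quality))];
	struct iwreq iwr;
	int fd, i;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return 1;

	/* SET : hand the kernel one address to watch */
	memset(addr, 0, sizeof(addr));
	addr[0].sa_family = ARPHRD_ETHER;
	memcpy(addr[0].sa_data, mac, 6);
	memset(&iwr, 0, sizeof(iwr));
	strncpy(iwr.ifr_name, "eth0", IFNAMSIZ);
	iwr.u.data.pointer = addr;
	iwr.u.data.length = 1;
	if (ioctl(fd, SIOCSIWSPY, &iwr) < 0)
		perror("SIOCSIWSPY");

	/* GET : addresses first, then the matching iw_quality records */
	memset(&iwr, 0, sizeof(iwr));
	strncpy(iwr.ifr_name, "eth0", IFNAMSIZ);
	iwr.u.data.pointer = buf;
	iwr.u.data.length = IW_MAX_SPY;
	if (ioctl(fd, SIOCGIWSPY, &iwr) == 0) {
		struct sockaddr *sa = (struct sockaddr *)buf;
		struct iw_quality *qual = (struct iw_quality *)
			(buf + iwr.u.data.length * sizeof(struct sockaddr));

		for (i = 0; i < iwr.u.data.length; i++)
			printf("spy %d: %02X:%02X:%02X:%02X:%02X:%02X level %u\n",
			       i,
			       (unsigned char)sa[i].sa_data[0],
			       (unsigned char)sa[i].sa_data[1],
			       (unsigned char)sa[i].sa_data[2],
			       (unsigned char)sa[i].sa_data[3],
			       (unsigned char)sa[i].sa_data[4],
			       (unsigned char)sa[i].sa_data[5],
			       qual[i].level);
	}

	close(fd);
	return 0;
}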
*/ +	for(i = 0; i < spydata->spy_number; i++) +		spydata->spy_stat[i].updated = 0; +	return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : set spy threshold + */ +int iw_handler_set_thrspy(struct net_device *	dev, +			  struct iw_request_info *info, +			  union iwreq_data *	wrqu, +			  char *		extra) +{ +	struct iw_spy_data *	spydata = get_spydata(dev); +	struct iw_thrspy *	threshold = (struct iw_thrspy *) extra; + +	/* Make sure driver is not buggy or using the old API */ +	if(!spydata) +		return -EOPNOTSUPP; + +	/* Just do it */ +	memcpy(&(spydata->spy_thr_low), &(threshold->low), +	       2 * sizeof(struct iw_quality)); + +	/* Clear flag */ +	memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); + +#ifdef WE_SPY_DEBUG +	printk(KERN_DEBUG "iw_handler_set_thrspy() :  low %d ; high %d\n", spydata->spy_thr_low.level, spydata->spy_thr_high.level); +#endif	/* WE_SPY_DEBUG */ + +	return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : get spy threshold + */ +int iw_handler_get_thrspy(struct net_device *	dev, +			  struct iw_request_info *info, +			  union iwreq_data *	wrqu, +			  char *		extra) +{ +	struct iw_spy_data *	spydata = get_spydata(dev); +	struct iw_thrspy *	threshold = (struct iw_thrspy *) extra; + +	/* Make sure driver is not buggy or using the old API */ +	if(!spydata) +		return -EOPNOTSUPP; + +	/* Just do it */ +	memcpy(&(threshold->low), &(spydata->spy_thr_low), +	       2 * sizeof(struct iw_quality)); + +	return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Prepare and send a Spy Threshold event + */ +static void iw_send_thrspy_event(struct net_device *	dev, +				 struct iw_spy_data *	spydata, +				 unsigned char *	address, +				 struct iw_quality *	wstats) +{ +	union iwreq_data	wrqu; +	struct iw_thrspy	threshold; + +	/* Init */ +	wrqu.data.length = 1; +	wrqu.data.flags = 0; +	/* Copy address */ +	memcpy(threshold.addr.sa_data, address, ETH_ALEN); +	threshold.addr.sa_family = ARPHRD_ETHER; +	/* Copy stats */ +	memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality)); +	/* Copy also thresholds */ +	memcpy(&(threshold.low), &(spydata->spy_thr_low), +	       2 * sizeof(struct iw_quality)); + +#ifdef WE_SPY_DEBUG +	printk(KERN_DEBUG "iw_send_thrspy_event() : address %02X:%02X:%02X:%02X:%02X:%02X, level %d, up = %d\n", +	       threshold.addr.sa_data[0], +	       threshold.addr.sa_data[1], +	       threshold.addr.sa_data[2], +	       threshold.addr.sa_data[3], +	       threshold.addr.sa_data[4], +	       threshold.addr.sa_data[5], threshold.qual.level); +#endif	/* WE_SPY_DEBUG */ + +	/* Send event to user space */ +	wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); +} + +/* ---------------------------------------------------------------- */ +/* + * Call for the driver to update the spy data. + * For now, the spy data is a simple array. As the size of the array is + * small, this is good enough. If we wanted to support larger number of + * spy addresses, we should use something more efficient... 
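/*
 * Arming the threshold from userspace: SIOCSIWTHRSPY takes exactly one
 * struct iw_thrspy, of which iw_handler_set_thrspy() above only uses the
 * low/high members. A sketch, assuming the struct iw_thrspy layout from
 * <linux/wireless.h>; the level values are in driver-specific units and
 * "eth0" is a placeholder.
 */

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/wireless.h>

int main(void)
{
	struct iw_thrspy thr;
	struct iwreq iwr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	/* Ask for SIOCGIWTHRSPY events when a watched peer drops below
	 * 'low' or climbs back above 'high' */
	memset(&thr, 0, sizeof(thr));
	thr.low.level = 100;
	thr.high.level = 140;

	memset(&iwr, 0, sizeof(iwr));
	strncpy(iwr.ifr_name, "eth0", IFNAMSIZ);
	iwr.u.data.pointer = &thr;
	iwr.u.data.length = 1;		/* exactly one iw_thrspy */
	iwr.u.data.flags = 0;

	if (ioctl(fd, SIOCSIWTHRSPY, &iwr) < 0)
		perror("SIOCSIWTHRSPY");

	close(fd);
	return 0;
}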
+ */
+void wireless_spy_update(struct net_device *	dev,
+			 unsigned char *	address,
+			 struct iw_quality *	wstats)
+{
+	struct iw_spy_data *	spydata = get_spydata(dev);
+	int			i;
+	int			match = -1;
+
+	/* Make sure driver is not buggy or using the old API */
+	if(!spydata)
+		return;
+
+#ifdef WE_SPY_DEBUG
+	printk(KERN_DEBUG "wireless_spy_update() :  offset %ld, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_handlers->spy_offset, spydata, address[0], address[1], address[2], address[3], address[4], address[5]);
+#endif	/* WE_SPY_DEBUG */
+
+	/* Update all records that match */
+	for(i = 0; i < spydata->spy_number; i++)
+		if(!memcmp(address, spydata->spy_address[i], ETH_ALEN)) {
+			memcpy(&(spydata->spy_stat[i]), wstats,
+			       sizeof(struct iw_quality));
+			match = i;
+		}
+
+	/* Generate an event if we cross the spy threshold.
+	 * To avoid event storms, we have a simple hysteresis : we generate
+	 * an event only when we drop below the low threshold or rise above
+	 * the high threshold. */
+	if(match >= 0) {
+		if(spydata->spy_thr_under[match]) {
+			if(wstats->level > spydata->spy_thr_high.level) {
+				spydata->spy_thr_under[match] = 0;
+				iw_send_thrspy_event(dev, spydata,
+						     address, wstats);
+			}
+		} else {
+			if(wstats->level < spydata->spy_thr_low.level) {
+				spydata->spy_thr_under[match] = 1;
+				iw_send_thrspy_event(dev, spydata,
+						     address, wstats);
+			}
+		}
+	}
+}
+
+EXPORT_SYMBOL(iw_handler_get_spy);
+EXPORT_SYMBOL(iw_handler_get_thrspy);
+EXPORT_SYMBOL(iw_handler_set_spy);
+EXPORT_SYMBOL(iw_handler_set_thrspy);
+EXPORT_SYMBOL(wireless_send_event);
+EXPORT_SYMBOL(wireless_spy_update);
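/*
 * How a driver plugs into all of the above: a skeletal, hypothetical
 * hookup (all mydrv_* names are made up), assuming the WE-17/18 era
 * struct iw_handler_def and struct iw_public_data layouts. The commit
 * handler sits at standard[0] (SIOCSIWCOMMIT), which is what
 * call_commit_handler() relies on, and wireless_data->spy_data is the
 * "new way" storage used by get_spydata(); the driver's receive path
 * would then feed wireless_spy_update().
 */

#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/wireless.h>
#include <net/iw_handler.h>

static struct iw_spy_data mydrv_spy;		/* normally per device, in priv */
static struct iw_public_data mydrv_wireless_data = {
	.spy_data	= &mydrv_spy,
};

static int mydrv_commit(struct net_device *dev, struct iw_request_info *info,
			union iwreq_data *wrqu, char *extra)
{
	/* ... reprogram the hardware with the accumulated settings ... */
	return 0;
}

static struct iw_statistics *mydrv_stats(struct net_device *dev)
{
	static struct iw_statistics stats;	/* normally per device too */
	return &stats;
}

static const iw_handler mydrv_handlers[] = {
	[SIOCSIWCOMMIT - SIOCIWFIRST]	= mydrv_commit,
	[SIOCSIWSPY    - SIOCIWFIRST]	= iw_handler_set_spy,
	[SIOCGIWSPY    - SIOCIWFIRST]	= iw_handler_get_spy,
	[SIOCSIWTHRSPY - SIOCIWFIRST]	= iw_handler_set_thrspy,
	[SIOCGIWTHRSPY - SIOCIWFIRST]	= iw_handler_get_thrspy,
};

static struct iw_handler_def mydrv_handler_def = {
	.num_standard		= ARRAY_SIZE(mydrv_handlers),
	.standard		= mydrv_handlers,
	.get_wireless_stats	= mydrv_stats,
};

static void mydrv_setup_wext(struct net_device *dev)
{
	dev->wireless_handlers	= &mydrv_handler_def;	/* ioctl dispatch  */
	dev->wireless_data	= &mydrv_wireless_data;	/* spy/thr storage */
}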