-rw-r--r--  Documentation/networking/netlink_mmap.txt    | 339
-rw-r--r--  include/linux/netfilter/nfnetlink.h          |  11
-rw-r--r--  include/linux/netlink.h                      |  11
-rw-r--r--  include/linux/skbuff.h                       |   6
-rw-r--r--  include/net/netfilter/nf_conntrack.h         |   2
-rw-r--r--  include/net/netfilter/nf_conntrack_expect.h  |   4
-rw-r--r--  include/uapi/linux/netlink.h                 |  32
-rw-r--r--  include/uapi/linux/netlink_diag.h            |  10
-rw-r--r--  net/Kconfig                                  |   9
-rw-r--r--  net/core/skbuff.c                            |  30
-rw-r--r--  net/ipv4/inet_diag.c                         |   6
-rw-r--r--  net/ipv4/udp_diag.c                          |   4
-rw-r--r--  net/netfilter/nf_conntrack_core.c            |   8
-rw-r--r--  net/netfilter/nf_conntrack_expect.c          |   8
-rw-r--r--  net/netfilter/nfnetlink.c                    |  20
-rw-r--r--  net/netfilter/nfnetlink_log.c                |  12
-rw-r--r--  net/netfilter/nfnetlink_queue_core.c         |   3
-rw-r--r--  net/netlink/af_netlink.c                     | 836
-rw-r--r--  net/netlink/af_netlink.h                     |  20
-rw-r--r--  net/netlink/diag.c                           |  32
-rw-r--r--  net/sched/cls_flow.c                         |   2
21 files changed, 1331 insertions, 74 deletions
diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt new file mode 100644 index 00000000000..1c2dab40962 --- /dev/null +++ b/Documentation/networking/netlink_mmap.txt @@ -0,0 +1,339 @@ +This file documents how to use memory mapped I/O with netlink. + +Author: Patrick McHardy <kaber@trash.net> + +Overview +-------- + +Memory mapped netlink I/O can be used to increase throughput and decrease +overhead of unicast receive and transmit operations. Some netlink subsystems +require high throughput; these are mainly the netfilter subsystems +nfnetlink_queue and nfnetlink_log, but it can also help speed up large +dump operations of e.g. the routing database. + +Memory mapped netlink I/O uses two circular ring buffers for RX and TX which +are mapped into the process's address space. + +The RX ring is used by the kernel to directly construct netlink messages into +user-space memory without copying them as done with regular socket I/O; +additionally, as long as the ring contains messages, no recvmsg() or poll() +syscalls have to be issued by user-space to get more messages. + +The TX ring is used to process messages directly from user-space memory; the +kernel processes all messages contained in the ring using a single sendmsg() +call. + +Usage overview +-------------- + +In order to use memory mapped netlink I/O, user-space needs three main changes: + +- ring setup +- conversion of the RX path to get messages from the ring instead of recvmsg() +- conversion of the TX path to construct messages into the ring + +Ring setup is done using setsockopt() to provide the ring parameters to the +kernel, then a call to mmap() to map the ring into the process's address space: + +- setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &params, sizeof(params)); +- setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &params, sizeof(params)); +- ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) + +Usage of either ring is optional, but even if only the RX ring is used the +mapping still needs to be writable in order to update the frame status after +processing. + +Conversion of the reception path involves calling poll() on the file +descriptor; once the socket is readable, the frames from the ring are +processed in order until no more messages are available, as indicated by +a status word in the frame header. + +On the kernel side, in order to make use of memory mapped I/O on receive, the +originating netlink subsystem needs to support memory mapped I/O, otherwise +it will use an allocated socket buffer as usual and the contents will be +copied to the ring on transmission, nullifying most of the performance gains. +Dumps of kernel databases automatically support memory mapped I/O. + +Conversion of the transmit path involves changing message construction to +use memory from the TX ring instead of (usually) a buffer declared on the +stack and setting up the frame header appropriately. Optionally poll() can +be used to wait for free frames in the TX ring. + +Structures and definitions for using memory mapped I/O are contained in +<linux/netlink.h>. + +RX and TX rings +---------------- + +Each ring contains a number of contiguous memory blocks, containing frames of +fixed size dependent on the parameters used for ring setup. + +Ring:	[ block 0 ] +		[ frame 0 ] +		[ frame 1 ] +	[ block 1 ] +		[ frame 2 ] +		[ frame 3 ] +	...
+	[ block n ] +		[ frame 2 * n ] +		[ frame 2 * n + 1 ] + +The blocks are only visible to the kernel; from the point of view of user-space, +the ring just contains the frames in a contiguous memory zone. + +The ring parameters used for setting up the ring are defined as follows: + +struct nl_mmap_req { +	unsigned int	nm_block_size; +	unsigned int	nm_block_nr; +	unsigned int	nm_frame_size; +	unsigned int	nm_frame_nr; +}; + +Frames are grouped into blocks, where each block is a contiguous region of memory +and holds nm_block_size / nm_frame_size frames. The total number of frames in +the ring is nm_frame_nr. The following invariants hold: + +- frames_per_block = nm_block_size / nm_frame_size + +- nm_frame_nr = frames_per_block * nm_block_nr + +Some parameters are constrained, specifically: + +- nm_block_size must be a multiple of the architecture's memory page size. +  The getpagesize() function can be used to get the page size. + +- nm_frame_size must be equal to or larger than NL_MMAP_HDRLEN, IOW a frame must be +  able to hold at least the frame header + +- nm_frame_size must be smaller than or equal to nm_block_size + +- nm_frame_size must be a multiple of NL_MMAP_MSG_ALIGNMENT + +- nm_frame_nr must equal the actual number of frames as specified above. + +When the kernel can't allocate physically contiguous memory for a ring block, +it will fall back to using physically discontiguous memory. This might affect +performance negatively; in order to avoid this, the nm_frame_size parameter +should be chosen to be as small as possible for the required frame size and +the number of blocks should be increased instead. + +Ring frames +------------ + +Each frame contains a frame header, consisting of a synchronization word and some +meta-data, and the message itself. + +Frame:	[ header message ] + +The frame header is defined as follows: + +struct nl_mmap_hdr { +	unsigned int	nm_status; +	unsigned int	nm_len; +	__u32		nm_group; +	/* credentials */ +	__u32		nm_pid; +	__u32		nm_uid; +	__u32		nm_gid; +}; + +- nm_status is used for synchronizing processing between the kernel and user- +  space and specifies ownership of the frame as well as the operation to perform + +- nm_len contains the length of the message contained in the data area + +- nm_group specifies the destination multicast group of the message + +- nm_pid, nm_uid and nm_gid contain the netlink pid, UID and GID of the sending +  process. These values correspond to the data available using SOCK_PASSCRED in +  the SCM_CREDENTIALS cmsg. + +The possible values in the status word are: + +- NL_MMAP_STATUS_UNUSED: +	RX ring:	frame belongs to the kernel and contains no message +			for user-space. Appropriate action is to invoke poll() +			to wait for new messages. + +	TX ring:	frame belongs to user-space and can be used for +			message construction. + +- NL_MMAP_STATUS_RESERVED: +	RX ring only:	frame is currently used by the kernel for message +			construction and contains no valid message yet. +			Appropriate action is to invoke poll() to wait for +			new messages. + +- NL_MMAP_STATUS_VALID: +	RX ring:	frame contains a valid message. Appropriate action is +			to process the message and release the frame back to +			the kernel by setting the status to +			NL_MMAP_STATUS_UNUSED or queue the frame by setting the +			status to NL_MMAP_STATUS_SKIP. + +	TX ring:	the frame contains a valid message from user-space to +			be processed by the kernel.
After completing processing, +			the kernel will release the frame back to user-space by +			setting the status to NL_MMAP_STATUS_UNUSED. + +- NL_MMAP_STATUS_COPY: +	RX ring only:	a message is ready to be processed but could not be +			stored in the ring, either because it exceeded the +			frame size or because the originating subsystem does +			not support memory mapped I/O. Appropriate action is +			to invoke recvmsg() to receive the message and release +			the frame back to the kernel by setting the status to +			NL_MMAP_STATUS_UNUSED. + +- NL_MMAP_STATUS_SKIP: +	RX ring only:	user-space queued the message for later processing, but +			processed some messages following it in the ring. The +			kernel should skip this frame when looking for unused +			frames. + +The data area of a frame begins at an offset of NL_MMAP_HDRLEN relative to the +frame header. + +TX limitations +-------------- + +Kernel processing usually involves validation of the message received by +user-space, then processing its contents. The kernel must ensure that +userspace is not able to modify the message contents after they have been +validated. In order to do so, the message is copied from the ring frame +to an allocated buffer if either of these conditions is false: + +- only a single mapping of the ring exists +- the file descriptor is not shared between processes + +This means that for threaded programs, the kernel will fall back to copying. + +Example +------- + +Ring setup: + +	unsigned int block_size = 16 * getpagesize(); +	struct nl_mmap_req req = { +		.nm_block_size		= block_size, +		.nm_block_nr		= 64, +		.nm_frame_size		= 16384, +		.nm_frame_nr		= 64 * block_size / 16384, +	}; +	unsigned int ring_size; +	void *rx_ring, *tx_ring; + +	/* Configure ring parameters */ +	if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0) +		exit(1); +	if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0) +		exit(1); + +	/* Calculate size of each individual ring */ +	ring_size = req.nm_block_nr * req.nm_block_size; + +	/* Map RX/TX rings. The TX ring is located after the RX ring */ +	rx_ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE, +		       MAP_SHARED, fd, 0); +	if ((long)rx_ring == -1L) +		exit(1); +	tx_ring = rx_ring + ring_size; + +Message reception: + +This example assumes some ring parameters of the ring setup are available. + +	unsigned int frame_offset = 0; +	struct nl_mmap_hdr *hdr; +	struct nlmsghdr *nlh; +	unsigned char buf[16384]; +	ssize_t len; + +	while (1) { +		struct pollfd pfds[1]; + +		pfds[0].fd	= fd; +		pfds[0].events	= POLLIN | POLLERR; +		pfds[0].revents	= 0; + +		if (poll(pfds, 1, -1) < 0 && errno != EINTR) +			exit(1); + +		/* Check for errors. Error handling omitted */ +		if (pfds[0].revents & POLLERR) +			<handle error> + +		/* If no new messages, poll again */ +		if (!(pfds[0].revents & POLLIN)) +			continue; + +		/* Process all frames */ +		while (1) { +			/* Get next frame header */ +			hdr = rx_ring + frame_offset; + +			if (hdr->nm_status == NL_MMAP_STATUS_VALID) { +				/* Regular memory mapped frame */ +				nlh = (void *)hdr + NL_MMAP_HDRLEN; +				len = hdr->nm_len; + +				/* Release empty message immediately. May happen +				 * on error during message construction.
+				 */ +				if (len == 0) +					goto release; +			} else if (hdr->nm_status == NL_MMAP_STATUS_COPY) { +				/* Frame queued to socket receive queue */ +				len = recv(fd, buf, sizeof(buf), MSG_DONTWAIT); +				if (len <= 0) +					break; +				nlh = buf; +			} else +				/* No more messages to process, continue polling */ +				break; + +			process_msg(nlh); +release: +			/* Release frame back to the kernel */ +			hdr->nm_status = NL_MMAP_STATUS_UNUSED; + +			/* Advance frame offset to next frame */ +			frame_offset = (frame_offset + frame_size) % ring_size; +		} +	} + +Message transmission: + +This example assumes some ring parameters of the ring setup are available. +A single message is constructed and transmitted, to send multiple messages +at once they would be constructed in consecutive frames before a final call +to sendto(). + +	unsigned int frame_offset = 0; +	struct nl_mmap_hdr *hdr; +	struct nlmsghdr *nlh; +	struct sockaddr_nl addr = { +		.nl_family	= AF_NETLINK, +	}; + +	hdr = tx_ring + frame_offset; +	if (hdr->nm_status != NL_MMAP_STATUS_UNUSED) +		/* No frame available. Use poll() to avoid. */ +		exit(1); + +	nlh = (void *)hdr + NL_MMAP_HDRLEN; + +	/* Build message */ +	build_message(nlh); + +	/* Fill frame header: length and status need to be set */ +	hdr->nm_len	= nlh->nlmsg_len; +	hdr->nm_status	= NL_MMAP_STATUS_VALID; + +	if (sendto(fd, NULL, 0, 0, &addr, sizeof(addr)) < 0) +		exit(1); + +	/* Advance frame offset to next frame */ +	frame_offset = (frame_offset + frame_size) % ring_size; diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index ecbb8e49591..cadb7402d7a 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -29,10 +29,13 @@ extern int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);  extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);  extern int nfnetlink_has_listeners(struct net *net, unsigned int group); -extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, -			  int echo, gfp_t flags); -extern int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error); -extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags); +extern struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, +					   u32 dst_portid, gfp_t gfp_mask); +extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, +			  unsigned int group, int echo, gfp_t flags); +extern int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); +extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, +			     u32 portid, int flags);  extern void nfnl_lock(__u8 subsys_id);  extern void nfnl_unlock(__u8 subsys_id); diff --git a/include/linux/netlink.h b/include/linux/netlink.h index e0f746b7b95..6358da5eeee 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -15,11 +15,18 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)  	return (struct nlmsghdr *)skb->data;  } +enum netlink_skb_flags { +	NETLINK_SKB_MMAPED	= 0x1,		/* Packet data is mmaped */ +	NETLINK_SKB_TX		= 0x2,		/* Packet was sent by userspace */ +	NETLINK_SKB_DELIVERED	= 0x4,		/* Packet was delivered */ +}; +  struct netlink_skb_parms {  	struct scm_creds	creds;		/* Skb credentials	*/  	__u32			portid;  	__u32			dst_group; -	struct sock		*ssk; +	__u32			flags; +	struct sock		*sk;  };  #define NETLINK_CB(skb)		(*(struct netlink_skb_parms*)&((skb)->cb)) @@ -57,6 +64,8 
@@ extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group)  extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group);  extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);  extern int netlink_has_listeners(struct sock *sk, unsigned int group); +extern struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, +					 u32 dst_portid, gfp_t gfp_mask);  extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);  extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,  			     __u32 group, gfp_t allocation); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f5bed7b3195..2e0ced1af3b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -651,6 +651,12 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,  	return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);  } +extern struct sk_buff *__alloc_skb_head(gfp_t priority, int node); +static inline struct sk_buff *alloc_skb_head(gfp_t priority) +{ +	return __alloc_skb_head(priority, -1); +} +  extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);  extern int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);  extern struct sk_buff *skb_clone(struct sk_buff *skb, diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index caca0c4d6b4..644d9c223d2 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -184,7 +184,7 @@ extern int nf_conntrack_hash_check_insert(struct nf_conn *ct);  extern void nf_ct_delete_from_lists(struct nf_conn *ct);  extern void nf_ct_dying_timeout(struct nf_conn *ct); -extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report); +extern void nf_conntrack_flush_report(struct net *net, u32 portid, int report);  extern bool nf_ct_get_tuplepr(const struct sk_buff *skb,  			      unsigned int nhoff, u_int16_t l3num, diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index cbbae7621e2..3f3aecbc863 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -88,7 +88,7 @@ nf_ct_find_expectation(struct net *net, u16 zone,  		       const struct nf_conntrack_tuple *tuple);  void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, -				u32 pid, int report); +				u32 portid, int report);  static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)  {  	nf_ct_unlink_expect_report(exp, 0, 0); @@ -106,7 +106,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, u_int8_t,  		       u_int8_t, const __be16 *, const __be16 *);  void nf_ct_expect_put(struct nf_conntrack_expect *exp);  int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,  -				u32 pid, int report); +				u32 portid, int report);  static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect)  {  	return nf_ct_expect_related_report(expect, 0, 0); diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 32a354f67ba..1a85940f8ab 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -1,6 +1,7 @@  #ifndef _UAPI__LINUX_NETLINK_H  #define _UAPI__LINUX_NETLINK_H +#include <linux/kernel.h>  #include <linux/socket.h> /* for __kernel_sa_family_t */  #include <linux/types.h> @@ -105,11 +106,42 @@ struct nlmsgerr {  #define NETLINK_PKTINFO		3  #define 
NETLINK_BROADCAST_ERROR	4  #define NETLINK_NO_ENOBUFS	5 +#define NETLINK_RX_RING		6 +#define NETLINK_TX_RING		7  struct nl_pktinfo {  	__u32	group;  }; +struct nl_mmap_req { +	unsigned int	nm_block_size; +	unsigned int	nm_block_nr; +	unsigned int	nm_frame_size; +	unsigned int	nm_frame_nr; +}; + +struct nl_mmap_hdr { +	unsigned int	nm_status; +	unsigned int	nm_len; +	__u32		nm_group; +	/* credentials */ +	__u32		nm_pid; +	__u32		nm_uid; +	__u32		nm_gid; +}; + +enum nl_mmap_status { +	NL_MMAP_STATUS_UNUSED, +	NL_MMAP_STATUS_RESERVED, +	NL_MMAP_STATUS_VALID, +	NL_MMAP_STATUS_COPY, +	NL_MMAP_STATUS_SKIP, +}; + +#define NL_MMAP_MSG_ALIGNMENT		NLMSG_ALIGNTO +#define NL_MMAP_MSG_ALIGN(sz)		__ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) +#define NL_MMAP_HDRLEN			NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) +  #define NET_MAJOR 36		/* Major 36 is reserved for networking 						*/  enum { diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h index 88009a31cd0..4e31db4eea4 100644 --- a/include/uapi/linux/netlink_diag.h +++ b/include/uapi/linux/netlink_diag.h @@ -25,9 +25,18 @@ struct netlink_diag_msg {  	__u32	ndiag_cookie[2];  }; +struct netlink_diag_ring { +	__u32	ndr_block_size; +	__u32	ndr_block_nr; +	__u32	ndr_frame_size; +	__u32	ndr_frame_nr; +}; +  enum {  	NETLINK_DIAG_MEMINFO,  	NETLINK_DIAG_GROUPS, +	NETLINK_DIAG_RX_RING, +	NETLINK_DIAG_TX_RING,  	__NETLINK_DIAG_MAX,  }; @@ -38,5 +47,6 @@ enum {  #define NDIAG_SHOW_MEMINFO	0x00000001 /* show memory info of a socket */  #define NDIAG_SHOW_GROUPS	0x00000002 /* show groups of a netlink socket */ +#define NDIAG_SHOW_RING_CFG	0x00000004 /* show ring configuration */  #endif diff --git a/net/Kconfig b/net/Kconfig index 2ddc9046868..1a2221630e6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -23,6 +23,15 @@ menuconfig NET  if NET +config NETLINK_MMAP +	bool "Netlink: mmaped IO" +	help +	  This option enables support for memory mapped netlink IO. This +	  reduces overhead by avoiding copying data between kernel- and +	  userspace. + +	  If unsure, say N. +  config WANT_COMPAT_NETLINK_MESSAGES  	bool  	help diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a92d9e7d10f..898cf5c566f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -179,6 +179,33 @@ out:   *   */ +struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) +{ +	struct sk_buff *skb; + +	/* Get the HEAD */ +	skb = kmem_cache_alloc_node(skbuff_head_cache, +				    gfp_mask & ~__GFP_DMA, node); +	if (!skb) +		goto out; + +	/* +	 * Only clear those fields we need to clear, not those that we will +	 * actually initialise below. Hence, don't put any more fields after +	 * the tail pointer in struct sk_buff! 
+	 */ +	memset(skb, 0, offsetof(struct sk_buff, tail)); +	skb->data = NULL; +	skb->truesize = sizeof(struct sk_buff); +	atomic_set(&skb->users, 1); + +#ifdef NET_SKBUFF_DATA_USES_OFFSET +	skb->mac_header = ~0U; +#endif +out: +	return skb; +} +  /**   *	__alloc_skb	-	allocate a network buffer   *	@size: size to allocate @@ -584,7 +611,8 @@ static void skb_release_head_state(struct sk_buff *skb)  static void skb_release_all(struct sk_buff *skb)  {  	skb_release_head_state(skb); -	skb_release_data(skb); +	if (likely(skb->data)) +		skb_release_data(skb);  }  /** diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 8620408af57..5f648751fce 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -324,7 +324,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s  	}  	err = sk_diag_fill(sk, rep, req, -			   sk_user_ns(NETLINK_CB(in_skb).ssk), +			   sk_user_ns(NETLINK_CB(in_skb).sk),  			   NETLINK_CB(in_skb).portid,  			   nlh->nlmsg_seq, 0, nlh);  	if (err < 0) { @@ -630,7 +630,7 @@ static int inet_csk_diag_dump(struct sock *sk,  		return 0;  	return inet_csk_diag_fill(sk, skb, r, -				  sk_user_ns(NETLINK_CB(cb->skb).ssk), +				  sk_user_ns(NETLINK_CB(cb->skb).sk),  				  NETLINK_CB(cb->skb).portid,  				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);  } @@ -805,7 +805,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,  			}  			err = inet_diag_fill_req(skb, sk, req, -					       sk_user_ns(NETLINK_CB(cb->skb).ssk), +					       sk_user_ns(NETLINK_CB(cb->skb).sk),  					       NETLINK_CB(cb->skb).portid,  					       cb->nlh->nlmsg_seq, cb->nlh);  			if (err < 0) { diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 369a781851a..7927db0a927 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -25,7 +25,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,  		return 0;  	return inet_sk_diag_fill(sk, NULL, skb, req, -			sk_user_ns(NETLINK_CB(cb->skb).ssk), +			sk_user_ns(NETLINK_CB(cb->skb).sk),  			NETLINK_CB(cb->skb).portid,  			cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);  } @@ -71,7 +71,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,  		goto out;  	err = inet_sk_diag_fill(sk, NULL, rep, req, -			   sk_user_ns(NETLINK_CB(in_skb).ssk), +			   sk_user_ns(NETLINK_CB(in_skb).sk),  			   NETLINK_CB(in_skb).portid,  			   nlh->nlmsg_seq, 0, nlh);  	if (err < 0) { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 007e8c43d19..54ddc2f8e7c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1260,7 +1260,7 @@ void nf_ct_iterate_cleanup(struct net *net,  EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);  struct __nf_ct_flush_report { -	u32 pid; +	u32 portid;  	int report;  }; @@ -1275,7 +1275,7 @@ static int kill_report(struct nf_conn *i, void *data)  	/* If we fail to deliver the event, death_by_timeout() will retry */  	if (nf_conntrack_event_report(IPCT_DESTROY, i, -				      fr->pid, fr->report) < 0) +				      fr->portid, fr->report) < 0)  		return 1;  	/* Avoid the delivery of the destroy event in death_by_timeout(). 
*/ @@ -1298,10 +1298,10 @@ void nf_ct_free_hashtable(void *hash, unsigned int size)  }  EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -void nf_conntrack_flush_report(struct net *net, u32 pid, int report) +void nf_conntrack_flush_report(struct net *net, u32 portid, int report)  {  	struct __nf_ct_flush_report fr = { -		.pid 	= pid, +		.portid	= portid,  		.report = report,  	};  	nf_ct_iterate_cleanup(net, kill_report, &fr); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 8c10e3db3d9..0adfdcc68ba 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -40,7 +40,7 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly;  /* nf_conntrack_expect helper functions */  void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, -				u32 pid, int report) +				u32 portid, int report)  {  	struct nf_conn_help *master_help = nfct_help(exp->master);  	struct net *net = nf_ct_exp_net(exp); @@ -54,7 +54,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,  	hlist_del(&exp->lnode);  	master_help->expecting[exp->class]--; -	nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report); +	nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);  	nf_ct_expect_put(exp);  	NF_CT_STAT_INC(net, expect_delete); @@ -412,7 +412,7 @@ out:  }  int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,  -				u32 pid, int report) +				u32 portid, int report)  {  	int ret; @@ -425,7 +425,7 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,  	if (ret < 0)  		goto out;  	spin_unlock_bh(&nf_conntrack_lock); -	nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report); +	nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);  	return ret;  out:  	spin_unlock_bh(&nf_conntrack_lock); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index bc4c499adb1..572d87dc116 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -112,22 +112,30 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group)  }  EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, +struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, +				    u32 dst_portid, gfp_t gfp_mask) +{ +	return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); +} +EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); + +int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,  		   unsigned int group, int echo, gfp_t flags)  { -	return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); +	return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);  }  EXPORT_SYMBOL_GPL(nfnetlink_send); -int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) +int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)  { -	return netlink_set_err(net->nfnl, pid, group, error); +	return netlink_set_err(net->nfnl, portid, group, error);  }  EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, +		      int flags)  { -	return netlink_unicast(net->nfnl, skb, pid, flags); +	return netlink_unicast(net->nfnl, skb, portid, flags);  }  EXPORT_SYMBOL_GPL(nfnetlink_unicast); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 1a0be2af1dd..d4199eb9b33 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -318,7 
+318,7 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)  }  static struct sk_buff * -nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) +nfulnl_alloc_skb(u32 peer_portid, unsigned int inst_size, unsigned int pkt_size)  {  	struct sk_buff *skb;  	unsigned int n; @@ -327,13 +327,14 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)  	 * message.  WARNING: has to be <= 128k due to slab restrictions */  	n = max(inst_size, pkt_size); -	skb = alloc_skb(n, GFP_ATOMIC); +	skb = nfnetlink_alloc_skb(&init_net, n, peer_portid, GFP_ATOMIC);  	if (!skb) {  		if (n > pkt_size) {  			/* try to allocate only as much as we need for current  			 * packet */ -			skb = alloc_skb(pkt_size, GFP_ATOMIC); +			skb = nfnetlink_alloc_skb(&init_net, pkt_size, +						  peer_portid, GFP_ATOMIC);  			if (!skb)  				pr_err("nfnetlink_log: can't even alloc %u bytes\n",  				       pkt_size); @@ -696,7 +697,8 @@ nfulnl_log_packet(u_int8_t pf,  	}  	if (!inst->skb) { -		inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); +		inst->skb = nfulnl_alloc_skb(inst->peer_portid, inst->nlbufsiz, +					     size);  		if (!inst->skb)  			goto alloc_failure;  	} @@ -824,7 +826,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,  			inst = instance_create(net, group_num,  					       NETLINK_CB(skb).portid, -					       sk_user_ns(NETLINK_CB(skb).ssk)); +					       sk_user_ns(NETLINK_CB(skb).sk));  			if (IS_ERR(inst)) {  				ret = PTR_ERR(inst);  				goto out; diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 5e280b3e154..ef3cdb4bfee 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -339,7 +339,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,  	if (queue->flags & NFQA_CFG_F_CONNTRACK)  		ct = nfqnl_ct_get(entskb, &size, &ctinfo); -	skb = alloc_skb(size, GFP_ATOMIC); +	skb = nfnetlink_alloc_skb(&init_net, size, queue->peer_portid, +				  GFP_ATOMIC);  	if (!skb)  		return NULL; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ce2e0064e7f..2a3e9ba814c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -3,6 +3,7 @@   *   * 		Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>   * 				Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * 				Patrick McHardy <kaber@trash.net>   *   *		This program is free software; you can redistribute it and/or   *		modify it under the terms of the GNU General Public License @@ -55,6 +56,8 @@  #include <linux/types.h>  #include <linux/audit.h>  #include <linux/mutex.h> +#include <linux/vmalloc.h> +#include <asm/cacheflush.h>  #include <net/net_namespace.h>  #include <net/sock.h> @@ -68,6 +71,10 @@ struct listeners {  	unsigned long		masks[0];  }; +/* state bits */ +#define NETLINK_CONGESTED	0x0 + +/* flags */  #define NETLINK_KERNEL_SOCKET	0x1  #define NETLINK_RECV_PKTINFO	0x2  #define NETLINK_BROADCAST_SEND_ERROR	0x4 @@ -84,6 +91,7 @@ EXPORT_SYMBOL_GPL(nl_table);  static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);  static int netlink_dump(struct sock *sk); +static void netlink_skb_destructor(struct sk_buff *skb);  DEFINE_RWLOCK(nl_table_lock);  EXPORT_SYMBOL_GPL(nl_table_lock); @@ -103,6 +111,599 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u  	return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];  } +static void netlink_overrun(struct sock *sk) +{ +	struct netlink_sock *nlk = nlk_sk(sk); + +	if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { +		if 
(!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { +			sk->sk_err = ENOBUFS; +			sk->sk_error_report(sk); +		} +	} +	atomic_inc(&sk->sk_drops); +} + +static void netlink_rcv_wake(struct sock *sk) +{ +	struct netlink_sock *nlk = nlk_sk(sk); + +	if (skb_queue_empty(&sk->sk_receive_queue)) +		clear_bit(NETLINK_CONGESTED, &nlk->state); +	if (!test_bit(NETLINK_CONGESTED, &nlk->state)) +		wake_up_interruptible(&nlk->wait); +} + +#ifdef CONFIG_NETLINK_MMAP +static bool netlink_skb_is_mmaped(const struct sk_buff *skb) +{ +	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; +} + +static bool netlink_rx_is_mmaped(struct sock *sk) +{ +	return nlk_sk(sk)->rx_ring.pg_vec != NULL; +} + +static bool netlink_tx_is_mmaped(struct sock *sk) +{ +	return nlk_sk(sk)->tx_ring.pg_vec != NULL; +} + +static __pure struct page *pgvec_to_page(const void *addr) +{ +	if (is_vmalloc_addr(addr)) +		return vmalloc_to_page(addr); +	else +		return virt_to_page(addr); +} + +static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) +{ +	unsigned int i; + +	for (i = 0; i < len; i++) { +		if (pg_vec[i] != NULL) { +			if (is_vmalloc_addr(pg_vec[i])) +				vfree(pg_vec[i]); +			else +				free_pages((unsigned long)pg_vec[i], order); +		} +	} +	kfree(pg_vec); +} + +static void *alloc_one_pg_vec_page(unsigned long order) +{ +	void *buffer; +	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | +			  __GFP_NOWARN | __GFP_NORETRY; + +	buffer = (void *)__get_free_pages(gfp_flags, order); +	if (buffer != NULL) +		return buffer; + +	buffer = vzalloc((1 << order) * PAGE_SIZE); +	if (buffer != NULL) +		return buffer; + +	gfp_flags &= ~__GFP_NORETRY; +	return (void *)__get_free_pages(gfp_flags, order); +} + +static void **alloc_pg_vec(struct netlink_sock *nlk, +			   struct nl_mmap_req *req, unsigned int order) +{ +	unsigned int block_nr = req->nm_block_nr; +	unsigned int i; +	void **pg_vec, *ptr; + +	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); +	if (pg_vec == NULL) +		return NULL; + +	for (i = 0; i < block_nr; i++) { +		pg_vec[i] = ptr = alloc_one_pg_vec_page(order); +		if (pg_vec[i] == NULL) +			goto err1; +	} + +	return pg_vec; +err1: +	free_pg_vec(pg_vec, order, block_nr); +	return NULL; +} + +static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, +			    bool closing, bool tx_ring) +{ +	struct netlink_sock *nlk = nlk_sk(sk); +	struct netlink_ring *ring; +	struct sk_buff_head *queue; +	void **pg_vec = NULL; +	unsigned int order = 0; +	int err; + +	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; +	queue = tx_ring ? 
&sk->sk_write_queue : &sk->sk_receive_queue; + +	if (!closing) { +		if (atomic_read(&nlk->mapped)) +			return -EBUSY; +		if (atomic_read(&ring->pending)) +			return -EBUSY; +	} + +	if (req->nm_block_nr) { +		if (ring->pg_vec != NULL) +			return -EBUSY; + +		if ((int)req->nm_block_size <= 0) +			return -EINVAL; +		if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE)) +			return -EINVAL; +		if (req->nm_frame_size < NL_MMAP_HDRLEN) +			return -EINVAL; +		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) +			return -EINVAL; + +		ring->frames_per_block = req->nm_block_size / +					 req->nm_frame_size; +		if (ring->frames_per_block == 0) +			return -EINVAL; +		if (ring->frames_per_block * req->nm_block_nr != +		    req->nm_frame_nr) +			return -EINVAL; + +		order = get_order(req->nm_block_size); +		pg_vec = alloc_pg_vec(nlk, req, order); +		if (pg_vec == NULL) +			return -ENOMEM; +	} else { +		if (req->nm_frame_nr) +			return -EINVAL; +	} + +	err = -EBUSY; +	mutex_lock(&nlk->pg_vec_lock); +	if (closing || atomic_read(&nlk->mapped) == 0) { +		err = 0; +		spin_lock_bh(&queue->lock); + +		ring->frame_max		= req->nm_frame_nr - 1; +		ring->head		= 0; +		ring->frame_size	= req->nm_frame_size; +		ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE; + +		swap(ring->pg_vec_len, req->nm_block_nr); +		swap(ring->pg_vec_order, order); +		swap(ring->pg_vec, pg_vec); + +		__skb_queue_purge(queue); +		spin_unlock_bh(&queue->lock); + +		WARN_ON(atomic_read(&nlk->mapped)); +	} +	mutex_unlock(&nlk->pg_vec_lock); + +	if (pg_vec) +		free_pg_vec(pg_vec, order, req->nm_block_nr); +	return err; +} + +static void netlink_mm_open(struct vm_area_struct *vma) +{ +	struct file *file = vma->vm_file; +	struct socket *sock = file->private_data; +	struct sock *sk = sock->sk; + +	if (sk) +		atomic_inc(&nlk_sk(sk)->mapped); +} + +static void netlink_mm_close(struct vm_area_struct *vma) +{ +	struct file *file = vma->vm_file; +	struct socket *sock = file->private_data; +	struct sock *sk = sock->sk; + +	if (sk) +		atomic_dec(&nlk_sk(sk)->mapped); +} + +static const struct vm_operations_struct netlink_mmap_ops = { +	.open	= netlink_mm_open, +	.close	= netlink_mm_close, +}; + +static int netlink_mmap(struct file *file, struct socket *sock, +			struct vm_area_struct *vma) +{ +	struct sock *sk = sock->sk; +	struct netlink_sock *nlk = nlk_sk(sk); +	struct netlink_ring *ring; +	unsigned long start, size, expected; +	unsigned int i; +	int err = -EINVAL; + +	if (vma->vm_pgoff) +		return -EINVAL; + +	mutex_lock(&nlk->pg_vec_lock); + +	expected = 0; +	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { +		if (ring->pg_vec == NULL) +			continue; +		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; +	} + +	if (expected == 0) +		goto out; + +	size = vma->vm_end - vma->vm_start; +	if (size != expected) +		goto out; + +	start = vma->vm_start; +	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { +		if (ring->pg_vec == NULL) +			continue; + +		for (i = 0; i < ring->pg_vec_len; i++) { +			struct page *page; +			void *kaddr = ring->pg_vec[i]; +			unsigned int pg_num; + +			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { +				page = pgvec_to_page(kaddr); +				err = vm_insert_page(vma, start, page); +				if (err < 0) +					goto out; +				start += PAGE_SIZE; +				kaddr += PAGE_SIZE; +			} +		} +	} + +	atomic_inc(&nlk->mapped); +	vma->vm_ops = &netlink_mmap_ops; +	err = 0; +out: +	mutex_unlock(&nlk->pg_vec_lock); +	return 0; +} + +static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr) +{ +#if 
ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 +	struct page *p_start, *p_end; + +	/* First page is flushed through netlink_{get,set}_status */ +	p_start = pgvec_to_page(hdr + PAGE_SIZE); +	p_end   = pgvec_to_page((void *)hdr + NL_MMAP_MSG_HDRLEN + hdr->nm_len - 1); +	while (p_start <= p_end) { +		flush_dcache_page(p_start); +		p_start++; +	} +#endif +} + +static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) +{ +	smp_rmb(); +	flush_dcache_page(pgvec_to_page(hdr)); +	return hdr->nm_status; +} + +static void netlink_set_status(struct nl_mmap_hdr *hdr, +			       enum nl_mmap_status status) +{ +	hdr->nm_status = status; +	flush_dcache_page(pgvec_to_page(hdr)); +	smp_wmb(); +} + +static struct nl_mmap_hdr * +__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) +{ +	unsigned int pg_vec_pos, frame_off; + +	pg_vec_pos = pos / ring->frames_per_block; +	frame_off  = pos % ring->frames_per_block; + +	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); +} + +static struct nl_mmap_hdr * +netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, +		     enum nl_mmap_status status) +{ +	struct nl_mmap_hdr *hdr; + +	hdr = __netlink_lookup_frame(ring, pos); +	if (netlink_get_status(hdr) != status) +		return NULL; + +	return hdr; +} + +static struct nl_mmap_hdr * +netlink_current_frame(const struct netlink_ring *ring, +		      enum nl_mmap_status status) +{ +	return netlink_lookup_frame(ring, ring->head, status); +} + +static struct nl_mmap_hdr * +netlink_previous_frame(const struct netlink_ring *ring, +		       enum nl_mmap_status status) +{ +	unsigned int prev; + +	prev = ring->head ? ring->head - 1 : ring->frame_max; +	return netlink_lookup_frame(ring, prev, status); +} + +static void netlink_increment_head(struct netlink_ring *ring) +{ +	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; +} + +static void netlink_forward_ring(struct netlink_ring *ring) +{ +	unsigned int head = ring->head, pos = head; +	const struct nl_mmap_hdr *hdr; + +	do { +		hdr = __netlink_lookup_frame(ring, pos); +		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) +			break; +		if (hdr->nm_status != NL_MMAP_STATUS_SKIP) +			break; +		netlink_increment_head(ring); +	} while (ring->head != head); +} + +static bool netlink_dump_space(struct netlink_sock *nlk) +{ +	struct netlink_ring *ring = &nlk->rx_ring; +	struct nl_mmap_hdr *hdr; +	unsigned int n; + +	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); +	if (hdr == NULL) +		return false; + +	n = ring->head + ring->frame_max / 2; +	if (n > ring->frame_max) +		n -= ring->frame_max; + +	hdr = __netlink_lookup_frame(ring, n); + +	return hdr->nm_status == NL_MMAP_STATUS_UNUSED; +} + +static unsigned int netlink_poll(struct file *file, struct socket *sock, +				 poll_table *wait) +{ +	struct sock *sk = sock->sk; +	struct netlink_sock *nlk = nlk_sk(sk); +	unsigned int mask; +	int err; + +	if (nlk->rx_ring.pg_vec != NULL) { +		/* Memory mapped sockets don't call recvmsg(), so flow control +		 * for dumps is performed here. A dump is allowed to continue +		 * if at least half the ring is unused. 
+		 */ +		while (nlk->cb != NULL && netlink_dump_space(nlk)) { +			err = netlink_dump(sk); +			if (err < 0) { +				sk->sk_err = err; +				sk->sk_error_report(sk); +				break; +			} +		} +		netlink_rcv_wake(sk); +	} + +	mask = datagram_poll(file, sock, wait); + +	spin_lock_bh(&sk->sk_receive_queue.lock); +	if (nlk->rx_ring.pg_vec) { +		netlink_forward_ring(&nlk->rx_ring); +		if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) +			mask |= POLLIN | POLLRDNORM; +	} +	spin_unlock_bh(&sk->sk_receive_queue.lock); + +	spin_lock_bh(&sk->sk_write_queue.lock); +	if (nlk->tx_ring.pg_vec) { +		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) +			mask |= POLLOUT | POLLWRNORM; +	} +	spin_unlock_bh(&sk->sk_write_queue.lock); + +	return mask; +} + +static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) +{ +	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); +} + +static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, +				   struct netlink_ring *ring, +				   struct nl_mmap_hdr *hdr) +{ +	unsigned int size; +	void *data; + +	size = ring->frame_size - NL_MMAP_HDRLEN; +	data = (void *)hdr + NL_MMAP_HDRLEN; + +	skb->head	= data; +	skb->data	= data; +	skb_reset_tail_pointer(skb); +	skb->end	= skb->tail + size; +	skb->len	= 0; + +	skb->destructor	= netlink_skb_destructor; +	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; +	NETLINK_CB(skb).sk = sk; +} + +static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, +				u32 dst_portid, u32 dst_group, +				struct sock_iocb *siocb) +{ +	struct netlink_sock *nlk = nlk_sk(sk); +	struct netlink_ring *ring; +	struct nl_mmap_hdr *hdr; +	struct sk_buff *skb; +	unsigned int maxlen; +	bool excl = true; +	int err = 0, len = 0; + +	/* Netlink messages are validated by the receiver before processing. +	 * In order to avoid userspace changing the contents of the message +	 * after validation, the socket and the ring may only be used by a +	 * single process, otherwise we fall back to copying. 
+	 */ +	if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 || +	    atomic_read(&nlk->mapped) > 1) +		excl = false; + +	mutex_lock(&nlk->pg_vec_lock); + +	ring   = &nlk->tx_ring; +	maxlen = ring->frame_size - NL_MMAP_HDRLEN; + +	do { +		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); +		if (hdr == NULL) { +			if (!(msg->msg_flags & MSG_DONTWAIT) && +			    atomic_read(&nlk->tx_ring.pending)) +				schedule(); +			continue; +		} +		if (hdr->nm_len > maxlen) { +			err = -EINVAL; +			goto out; +		} + +		netlink_frame_flush_dcache(hdr); + +		if (likely(dst_portid == 0 && dst_group == 0 && excl)) { +			skb = alloc_skb_head(GFP_KERNEL); +			if (skb == NULL) { +				err = -ENOBUFS; +				goto out; +			} +			sock_hold(sk); +			netlink_ring_setup_skb(skb, sk, ring, hdr); +			NETLINK_CB(skb).flags |= NETLINK_SKB_TX; +			__skb_put(skb, hdr->nm_len); +			netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); +			atomic_inc(&ring->pending); +		} else { +			skb = alloc_skb(hdr->nm_len, GFP_KERNEL); +			if (skb == NULL) { +				err = -ENOBUFS; +				goto out; +			} +			__skb_put(skb, hdr->nm_len); +			memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len); +			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); +		} + +		netlink_increment_head(ring); + +		NETLINK_CB(skb).portid	  = nlk->portid; +		NETLINK_CB(skb).dst_group = dst_group; +		NETLINK_CB(skb).creds	  = siocb->scm->creds; + +		err = security_netlink_send(sk, skb); +		if (err) { +			kfree_skb(skb); +			goto out; +		} + +		if (unlikely(dst_group)) { +			atomic_inc(&skb->users); +			netlink_broadcast(sk, skb, dst_portid, dst_group, +					  GFP_KERNEL); +		} +		err = netlink_unicast(sk, skb, dst_portid, +				      msg->msg_flags & MSG_DONTWAIT); +		if (err < 0) +			goto out; +		len += err; + +	} while (hdr != NULL || +		 (!(msg->msg_flags & MSG_DONTWAIT) && +		  atomic_read(&nlk->tx_ring.pending))); + +	if (len > 0) +		err = len; +out: +	mutex_unlock(&nlk->pg_vec_lock); +	return err; +} + +static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) +{ +	struct nl_mmap_hdr *hdr; + +	hdr = netlink_mmap_hdr(skb); +	hdr->nm_len	= skb->len; +	hdr->nm_group	= NETLINK_CB(skb).dst_group; +	hdr->nm_pid	= NETLINK_CB(skb).creds.pid; +	hdr->nm_uid	= NETLINK_CB(skb).creds.uid; +	hdr->nm_gid	= NETLINK_CB(skb).creds.gid; +	netlink_frame_flush_dcache(hdr); +	netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + +	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; +	kfree_skb(skb); +} + +static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) +{ +	struct netlink_sock *nlk = nlk_sk(sk); +	struct netlink_ring *ring = &nlk->rx_ring; +	struct nl_mmap_hdr *hdr; + +	spin_lock_bh(&sk->sk_receive_queue.lock); +	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); +	if (hdr == NULL) { +		spin_unlock_bh(&sk->sk_receive_queue.lock); +		kfree_skb(skb); +		netlink_overrun(sk); +		return; +	} +	netlink_increment_head(ring); +	__skb_queue_tail(&sk->sk_receive_queue, skb); +	spin_unlock_bh(&sk->sk_receive_queue.lock); + +	hdr->nm_len	= skb->len; +	hdr->nm_group	= NETLINK_CB(skb).dst_group; +	hdr->nm_pid	= NETLINK_CB(skb).creds.pid; +	hdr->nm_uid	= NETLINK_CB(skb).creds.uid; +	hdr->nm_gid	= NETLINK_CB(skb).creds.gid; +	netlink_set_status(hdr, NL_MMAP_STATUS_COPY); +} + +#else /* CONFIG_NETLINK_MMAP */ +#define netlink_skb_is_mmaped(skb)	false +#define netlink_rx_is_mmaped(sk)	false +#define netlink_tx_is_mmaped(sk)	false +#define netlink_mmap			sock_no_mmap +#define netlink_poll			datagram_poll +#define netlink_mmap_sendmsg(sk, msg, dst_portid, 
dst_group, siocb)	0 +#endif /* CONFIG_NETLINK_MMAP */ +  static void netlink_destroy_callback(struct netlink_callback *cb)  {  	kfree_skb(cb->skb); @@ -115,6 +716,53 @@ static void netlink_consume_callback(struct netlink_callback *cb)  	kfree(cb);  } +static void netlink_skb_destructor(struct sk_buff *skb) +{ +#ifdef CONFIG_NETLINK_MMAP +	struct nl_mmap_hdr *hdr; +	struct netlink_ring *ring; +	struct sock *sk; + +	/* If a packet from the kernel to userspace was freed because of an +	 * error without being delivered to userspace, the kernel must reset +	 * the status. In the direction userspace to kernel, the status is +	 * always reset here after the packet was processed and freed. +	 */ +	if (netlink_skb_is_mmaped(skb)) { +		hdr = netlink_mmap_hdr(skb); +		sk = NETLINK_CB(skb).sk; + +		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { +			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); +			ring = &nlk_sk(sk)->tx_ring; +		} else { +			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { +				hdr->nm_len = 0; +				netlink_set_status(hdr, NL_MMAP_STATUS_VALID); +			} +			ring = &nlk_sk(sk)->rx_ring; +		} + +		WARN_ON(atomic_read(&ring->pending) == 0); +		atomic_dec(&ring->pending); +		sock_put(sk); + +		skb->data = NULL; +	} +#endif +	if (skb->sk != NULL) +		sock_rfree(skb); +} + +static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ +	WARN_ON(skb->sk != NULL); +	skb->sk = sk; +	skb->destructor = netlink_skb_destructor; +	atomic_add(skb->truesize, &sk->sk_rmem_alloc); +	sk_mem_charge(sk, skb->truesize); +} +  static void netlink_sock_destruct(struct sock *sk)  {  	struct netlink_sock *nlk = nlk_sk(sk); @@ -128,6 +776,18 @@ static void netlink_sock_destruct(struct sock *sk)  	}  	skb_queue_purge(&sk->sk_receive_queue); +#ifdef CONFIG_NETLINK_MMAP +	if (1) { +		struct nl_mmap_req req; + +		memset(&req, 0, sizeof(req)); +		if (nlk->rx_ring.pg_vec) +			netlink_set_ring(sk, &req, true, false); +		memset(&req, 0, sizeof(req)); +		if (nlk->tx_ring.pg_vec) +			netlink_set_ring(sk, &req, true, true); +	} +#endif /* CONFIG_NETLINK_MMAP */  	if (!sock_flag(sk, SOCK_DEAD)) {  		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); @@ -391,6 +1051,9 @@ static int __netlink_create(struct net *net, struct socket *sock,  		mutex_init(nlk->cb_mutex);  	}  	init_waitqueue_head(&nlk->wait); +#ifdef CONFIG_NETLINK_MMAP +	mutex_init(&nlk->pg_vec_lock); +#endif  	sk->sk_destruct = netlink_sock_destruct;  	sk->sk_protocol = protocol; @@ -722,19 +1385,6 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr,  	return 0;  } -static void netlink_overrun(struct sock *sk) -{ -	struct netlink_sock *nlk = nlk_sk(sk); - -	if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { -		if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { -			sk->sk_err = ENOBUFS; -			sk->sk_error_report(sk); -		} -	} -	atomic_inc(&sk->sk_drops); -} -  static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)  {  	struct sock *sock; @@ -787,8 +1437,9 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,  	nlk = nlk_sk(sk); -	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || -	    test_bit(0, &nlk->state)) { +	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || +	     test_bit(NETLINK_CONGESTED, &nlk->state)) && +	    !netlink_skb_is_mmaped(skb)) {  		DECLARE_WAITQUEUE(wait, current);  		if (!*timeo) {  			if (!ssk || netlink_is_kernel(ssk)) @@ -802,7 +1453,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,  		add_wait_queue(&nlk->wait, &wait);  		if 
((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || -		     test_bit(0, &nlk->state)) && +		     test_bit(NETLINK_CONGESTED, &nlk->state)) &&  		    !sock_flag(sk, SOCK_DEAD))  			*timeo = schedule_timeout(*timeo); @@ -816,7 +1467,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,  		}  		return 1;  	} -	skb_set_owner_r(skb, sk); +	netlink_skb_set_owner_r(skb, sk);  	return 0;  } @@ -824,7 +1475,14 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)  {  	int len = skb->len; -	skb_queue_tail(&sk->sk_receive_queue, skb); +#ifdef CONFIG_NETLINK_MMAP +	if (netlink_skb_is_mmaped(skb)) +		netlink_queue_mmaped_skb(sk, skb); +	else if (netlink_rx_is_mmaped(sk)) +		netlink_ring_set_copied(sk, skb); +	else +#endif /* CONFIG_NETLINK_MMAP */ +		skb_queue_tail(&sk->sk_receive_queue, skb);  	sk->sk_data_ready(sk, len);  	return len;  } @@ -847,7 +1505,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)  {  	int delta; -	skb_orphan(skb); +	WARN_ON(skb->sk != NULL); +	if (netlink_skb_is_mmaped(skb)) +		return skb;  	delta = skb->end - skb->tail;  	if (delta * 2 < skb->truesize) @@ -867,16 +1527,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)  	return skb;  } -static void netlink_rcv_wake(struct sock *sk) -{ -	struct netlink_sock *nlk = nlk_sk(sk); - -	if (skb_queue_empty(&sk->sk_receive_queue)) -		clear_bit(0, &nlk->state); -	if (!test_bit(0, &nlk->state)) -		wake_up_interruptible(&nlk->wait); -} -  static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,  				  struct sock *ssk)  { @@ -886,8 +1536,8 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,  	ret = -ECONNREFUSED;  	if (nlk->netlink_rcv != NULL) {  		ret = skb->len; -		skb_set_owner_r(skb, sk); -		NETLINK_CB(skb).ssk = ssk; +		netlink_skb_set_owner_r(skb, sk); +		NETLINK_CB(skb).sk = ssk;  		nlk->netlink_rcv(skb);  		consume_skb(skb);  	} else { @@ -933,6 +1583,69 @@ retry:  }  EXPORT_SYMBOL(netlink_unicast); +struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, +				  u32 dst_portid, gfp_t gfp_mask) +{ +#ifdef CONFIG_NETLINK_MMAP +	struct sock *sk = NULL; +	struct sk_buff *skb; +	struct netlink_ring *ring; +	struct nl_mmap_hdr *hdr; +	unsigned int maxlen; + +	sk = netlink_getsockbyportid(ssk, dst_portid); +	if (IS_ERR(sk)) +		goto out; + +	ring = &nlk_sk(sk)->rx_ring; +	/* fast-path without atomic ops for common case: non-mmaped receiver */ +	if (ring->pg_vec == NULL) +		goto out_put; + +	skb = alloc_skb_head(gfp_mask); +	if (skb == NULL) +		goto err1; + +	spin_lock_bh(&sk->sk_receive_queue.lock); +	/* check again under lock */ +	if (ring->pg_vec == NULL) +		goto out_free; + +	maxlen = ring->frame_size - NL_MMAP_HDRLEN; +	if (maxlen < size) +		goto out_free; + +	netlink_forward_ring(ring); +	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); +	if (hdr == NULL) +		goto err2; +	netlink_ring_setup_skb(skb, sk, ring, hdr); +	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); +	atomic_inc(&ring->pending); +	netlink_increment_head(ring); + +	spin_unlock_bh(&sk->sk_receive_queue.lock); +	return skb; + +err2: +	kfree_skb(skb); +	spin_unlock_bh(&sk->sk_receive_queue.lock); +	netlink_overrun(sk); +err1: +	sock_put(sk); +	return NULL; + +out_free: +	kfree_skb(skb); +	spin_unlock_bh(&sk->sk_receive_queue.lock); +out_put: +	sock_put(sk); +out: +#endif +	return alloc_skb(size, gfp_mask); +} +EXPORT_SYMBOL_GPL(netlink_alloc_skb); +  int netlink_has_listeners(struct sock *sk, unsigned int group)  {  	int res 
= 0; @@ -957,8 +1670,8 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)  	struct netlink_sock *nlk = nlk_sk(sk);  	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && -	    !test_bit(0, &nlk->state)) { -		skb_set_owner_r(skb, sk); +	    !test_bit(NETLINK_CONGESTED, &nlk->state)) { +		netlink_skb_set_owner_r(skb, sk);  		__netlink_sendskb(sk, skb);  		return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);  	} @@ -1193,7 +1906,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,  	if (level != SOL_NETLINK)  		return -ENOPROTOOPT; -	if (optlen >= sizeof(int) && +	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && +	    optlen >= sizeof(int) &&  	    get_user(val, (unsigned int __user *)optval))  		return -EFAULT; @@ -1235,13 +1949,32 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,  	case NETLINK_NO_ENOBUFS:  		if (val) {  			nlk->flags |= NETLINK_RECV_NO_ENOBUFS; -			clear_bit(0, &nlk->state); +			clear_bit(NETLINK_CONGESTED, &nlk->state);  			wake_up_interruptible(&nlk->wait);  		} else {  			nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;  		}  		err = 0;  		break; +#ifdef CONFIG_NETLINK_MMAP +	case NETLINK_RX_RING: +	case NETLINK_TX_RING: { +		struct nl_mmap_req req; + +		/* Rings might consume more memory than queue limits, require +		 * CAP_NET_ADMIN. +		 */ +		if (!capable(CAP_NET_ADMIN)) +			return -EPERM; +		if (optlen < sizeof(req)) +			return -EINVAL; +		if (copy_from_user(&req, optval, sizeof(req))) +			return -EFAULT; +		err = netlink_set_ring(sk, &req, false, +				       optname == NETLINK_TX_RING); +		break; +	} +#endif /* CONFIG_NETLINK_MMAP */  	default:  		err = -ENOPROTOOPT;  	} @@ -1352,6 +2085,13 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,  			goto out;  	} +	if (netlink_tx_is_mmaped(sk) && +	    msg->msg_iov->iov_base == NULL) { +		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, +					   siocb); +		goto out; +	} +  	err = -EMSGSIZE;  	if (len > sk->sk_sndbuf - 32)  		goto out; @@ -1684,9 +2424,13 @@ static int netlink_dump(struct sock *sk)  	alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); -	skb = sock_rmalloc(sk, alloc_size, 0, GFP_KERNEL); +	if (!netlink_rx_is_mmaped(sk) && +	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) +		goto errout_skb; +	skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL);  	if (!skb)  		goto errout_skb; +	netlink_skb_set_owner_r(skb, sk);  	len = cb->dump(skb, cb); @@ -1741,6 +2485,19 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,  	if (cb == NULL)  		return -ENOBUFS; +	/* Memory mapped dump requests need to be copied to avoid looping +	 * on the pending state in netlink_mmap_sendmsg() while the CB hold +	 * a reference to the skb. 
+	 */ +	if (netlink_skb_is_mmaped(skb)) { +		skb = skb_copy(skb, GFP_KERNEL); +		if (skb == NULL) { +			kfree(cb); +			return -ENOBUFS; +		} +	} else +		atomic_inc(&skb->users); +  	cb->dump = control->dump;  	cb->done = control->done;  	cb->nlh = nlh; @@ -1801,7 +2558,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)  	if (err)  		payload += nlmsg_len(nlh); -	skb = nlmsg_new(payload, GFP_KERNEL); +	skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), +				NETLINK_CB(in_skb).portid, GFP_KERNEL);  	if (!skb) {  		struct sock *sk; @@ -2067,7 +2825,7 @@ static const struct proto_ops netlink_ops = {  	.socketpair =	sock_no_socketpair,  	.accept =	sock_no_accept,  	.getname =	netlink_getname, -	.poll =		datagram_poll, +	.poll =		netlink_poll,  	.ioctl =	sock_no_ioctl,  	.listen =	sock_no_listen,  	.shutdown =	sock_no_shutdown, @@ -2075,7 +2833,7 @@ static const struct proto_ops netlink_ops = {  	.getsockopt =	netlink_getsockopt,  	.sendmsg =	netlink_sendmsg,  	.recvmsg =	netlink_recvmsg, -	.mmap =		sock_no_mmap, +	.mmap =		netlink_mmap,  	.sendpage =	sock_no_sendpage,  }; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index d9acb2a1d85..ed8522265f4 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -6,6 +6,20 @@  #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)  #define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long)) +struct netlink_ring { +	void			**pg_vec; +	unsigned int		head; +	unsigned int		frames_per_block; +	unsigned int		frame_size; +	unsigned int		frame_max; + +	unsigned int		pg_vec_order; +	unsigned int		pg_vec_pages; +	unsigned int		pg_vec_len; + +	atomic_t		pending; +}; +  struct netlink_sock {  	/* struct sock has to be the first member of netlink_sock */  	struct sock		sk; @@ -24,6 +38,12 @@ struct netlink_sock {  	void			(*netlink_rcv)(struct sk_buff *skb);  	void			(*netlink_bind)(int group);  	struct module		*module; +#ifdef CONFIG_NETLINK_MMAP +	struct mutex		pg_vec_lock; +	struct netlink_ring	rx_ring; +	struct netlink_ring	tx_ring; +	atomic_t		mapped; +#endif /* CONFIG_NETLINK_MMAP */  };  static inline struct netlink_sock *nlk_sk(struct sock *sk) diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 5ffb1d1cf40..4e4aa471cd0 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -7,6 +7,34 @@  #include "af_netlink.h" +static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type, +			    struct sk_buff *nlskb) +{ +	struct netlink_diag_ring ndr; + +	ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT; +	ndr.ndr_block_nr   = ring->pg_vec_len; +	ndr.ndr_frame_size = ring->frame_size; +	ndr.ndr_frame_nr   = ring->frame_max + 1; + +	return nla_put(nlskb, nl_type, sizeof(ndr), &ndr); +} + +static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) +{ +	struct netlink_sock *nlk = nlk_sk(sk); +	int ret; + +	mutex_lock(&nlk->pg_vec_lock); +	ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb); +	if (!ret) +		ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING, +				       nlskb); +	mutex_unlock(&nlk->pg_vec_lock); + +	return ret; +} +  static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)  {  	struct netlink_sock *nlk = nlk_sk(sk); @@ -51,6 +79,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,  	    sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))  		goto out_nlmsg_trim; +	if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) && +	    sk_diag_put_rings_cfg(sk, skb)) +		goto out_nlmsg_trim; +  	return nlmsg_end(skb, 
nlh);  out_nlmsg_trim: diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index aa36a8c8b33..7881e2fccbc 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -393,7 +393,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,  			return -EOPNOTSUPP;  		if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && -		    sk_user_ns(NETLINK_CB(in_skb).ssk) != &init_user_ns) +		    sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns)  			return -EOPNOTSUPP;  	}
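
For quick reference, the following is a minimal, self-contained user-space sketch that puts the pieces from the added Documentation file together: it configures an RX ring, requests an RTM_GETLINK dump (dumps are delivered through the ring, as noted above) and processes the replies frame by frame. It is illustrative only and not part of the patch: it assumes userspace headers from a kernel carrying this series with CONFIG_NETLINK_MMAP enabled; the choice of NETLINK_ROUTE, the dump request and the ring geometry are arbitrary example values, the fallback SOL_NETLINK define covers libcs that do not provide it, and error handling is reduced to exit(1).

#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270			/* not exposed by every libc */
#endif

int main(void)
{
	struct nl_mmap_req req = {
		.nm_block_size	= 16 * getpagesize(),
		.nm_block_nr	= 64,
		.nm_frame_size	= 16384,
		.nm_frame_nr	= 64 * 16 * getpagesize() / 16384,
	};
	struct {
		struct nlmsghdr	nlh;
		struct rtgenmsg	g;
	} rq = {
		.nlh.nlmsg_len	 = sizeof(rq),
		.nlh.nlmsg_type	 = RTM_GETLINK,
		.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		.g.rtgen_family	 = AF_UNSPEC,
	};
	unsigned int ring_size, frame_offset = 0;
	void *rx_ring;
	int fd, done = 0;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		exit(1);

	/* Configure and map the RX ring; the TX ring is left unused here */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
		exit(1);
	ring_size = req.nm_block_nr * req.nm_block_size;
	rx_ring = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (rx_ring == MAP_FAILED)
		exit(1);

	/* Kick off a link dump; the replies are constructed in the ring */
	if (send(fd, &rq, sizeof(rq), 0) < 0)
		exit(1);

	while (!done) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLERR };
		struct nl_mmap_hdr *hdr;

		if (poll(&pfd, 1, -1) < 0 && errno != EINTR)
			exit(1);
		if (pfd.revents & POLLERR)
			exit(1);

		/* Drain every frame currently owned by user-space */
		for (;;) {
			hdr = rx_ring + frame_offset;

			if (hdr->nm_status == NL_MMAP_STATUS_VALID) {
				struct nlmsghdr *nlh = (void *)hdr + NL_MMAP_HDRLEN;
				int len = hdr->nm_len;

				for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) {
					if (nlh->nlmsg_type == NLMSG_DONE)
						done = 1;
					else if (nlh->nlmsg_type == RTM_NEWLINK)
						printf("RTM_NEWLINK, %u bytes\n",
						       nlh->nlmsg_len);
				}
			} else if (hdr->nm_status == NL_MMAP_STATUS_COPY) {
				/* Did not fit into a frame or came from a
				 * non-mmap subsystem: use the copy path. */
				unsigned char buf[16384];

				if (recv(fd, buf, sizeof(buf), MSG_DONTWAIT) <= 0)
					break;
			} else {
				/* NL_MMAP_STATUS_UNUSED/RESERVED: poll again */
				break;
			}

			/* Hand the frame back to the kernel and advance */
			hdr->nm_status = NL_MMAP_STATUS_UNUSED;
			frame_offset = (frame_offset + req.nm_frame_size) % ring_size;
		}
	}
	return 0;
}

Note the NL_MMAP_STATUS_COPY branch: messages that exceed the frame size or that originate from subsystems which do not construct their skbs in the ring are queued with that status and must still be fetched with recvmsg(), so a ring consumer always keeps the copy fallback.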