diff options
| author | Andy Grover <andy.grover@oracle.com> | 2009-08-21 12:28:31 +0000 | 
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2009-08-23 19:13:02 -0700 | 
| commit | 70041088e3b976627ba9a183b812f39ef8a9ba0e (patch) | |
| tree | bad7b11763d7b02b185bd705fe5ed292397cbc7a /net/rds/tcp.c | |
| parent | 7d6fd5e7e97a2188d56441e4e96494c21c5994a7 (diff) | |
| download | olio-linux-3.10-70041088e3b976627ba9a183b812f39ef8a9ba0e.tar.xz olio-linux-3.10-70041088e3b976627ba9a183b812f39ef8a9ba0e.zip  | |
RDS: Add TCP transport to RDS
This code allows RDS to be tunneled over a TCP connection.
RDMA operations are disabled when using TCP transport,
but this frees RDS from the IB/RDMA stack dependency, and allows
it to be used with standard Ethernet adapters, or in a VM.
Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/rds/tcp.c')
| -rw-r--r-- | net/rds/tcp.c | 319 | 
1 files changed, 319 insertions, 0 deletions
diff --git a/net/rds/tcp.c b/net/rds/tcp.c new file mode 100644 index 00000000000..e0ac9009db1 --- /dev/null +++ b/net/rds/tcp.c @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2006 Oracle.  All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses.  You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + *     Redistribution and use in source and binary forms, with or + *     without modification, are permitted provided that the following + *     conditions are met: + * + *      - Redistributions of source code must retain the above + *        copyright notice, this list of conditions and the following + *        disclaimer. + * + *      - Redistributions in binary form must reproduce the above + *        copyright notice, this list of conditions and the following + *        disclaimer in the documentation and/or other materials + *        provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <net/tcp.h> + +#include "rds.h" +#include "tcp.h" + +/* only for info exporting */ +static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); +static LIST_HEAD(rds_tcp_tc_list); +unsigned int rds_tcp_tc_count; + +/* Track rds_tcp_connection structs so they can be cleaned up */ +static DEFINE_SPINLOCK(rds_tcp_conn_lock); +static LIST_HEAD(rds_tcp_conn_list); + +static struct kmem_cache *rds_tcp_conn_slab; + +#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024) + +/* doing it this way avoids calling tcp_sk() */ +void rds_tcp_nonagle(struct socket *sock) +{ +	mm_segment_t oldfs = get_fs(); +	int val = 1; + +	set_fs(KERNEL_DS); +	sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val, +			      sizeof(val)); +	set_fs(oldfs); +} + +void rds_tcp_tune(struct socket *sock) +{ +	struct sock *sk = sock->sk; + +	rds_tcp_nonagle(sock); + +	/* +	 * We're trying to saturate gigabit with the default, +	 * see svc_sock_setbufsize(). +	 */ +	lock_sock(sk); +	sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE; +	sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE; +	sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; +	release_sock(sk); +} + +u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) +{ +	return tcp_sk(tc->t_sock->sk)->snd_nxt; +} + +u32 rds_tcp_snd_una(struct rds_tcp_connection *tc) +{ +	return tcp_sk(tc->t_sock->sk)->snd_una; +} + +void rds_tcp_restore_callbacks(struct socket *sock, +			       struct rds_tcp_connection *tc) +{ +	rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc); +	write_lock_bh(&sock->sk->sk_callback_lock); + +	/* done under the callback_lock to serialize with write_space */ +	spin_lock(&rds_tcp_tc_list_lock); +	list_del_init(&tc->t_list_item); +	rds_tcp_tc_count--; +	spin_unlock(&rds_tcp_tc_list_lock); + +	tc->t_sock = NULL; + +	sock->sk->sk_write_space = tc->t_orig_write_space; +	sock->sk->sk_data_ready = tc->t_orig_data_ready; +	sock->sk->sk_state_change = tc->t_orig_state_change; +	sock->sk->sk_user_data = NULL; + +	write_unlock_bh(&sock->sk->sk_callback_lock); +} + +/* + * This is the only path that sets tc->t_sock.  Send and receive trust that + * it is set.  The RDS_CONN_CONNECTED bit protects those paths from being + * called while it isn't set. + */ +void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) +{ +	struct rds_tcp_connection *tc = conn->c_transport_data; + +	rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc); +	write_lock_bh(&sock->sk->sk_callback_lock); + +	/* done under the callback_lock to serialize with write_space */ +	spin_lock(&rds_tcp_tc_list_lock); +	list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); +	rds_tcp_tc_count++; +	spin_unlock(&rds_tcp_tc_list_lock); + +	/* accepted sockets need our listen data ready undone */ +	if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready) +		sock->sk->sk_data_ready = sock->sk->sk_user_data; + +	tc->t_sock = sock; +	tc->conn = conn; +	tc->t_orig_data_ready = sock->sk->sk_data_ready; +	tc->t_orig_write_space = sock->sk->sk_write_space; +	tc->t_orig_state_change = sock->sk->sk_state_change; + +	sock->sk->sk_user_data = conn; +	sock->sk->sk_data_ready = rds_tcp_data_ready; +	sock->sk->sk_write_space = rds_tcp_write_space; +	sock->sk->sk_state_change = rds_tcp_state_change; + +	write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void rds_tcp_tc_info(struct socket *sock, unsigned int len, +			    struct rds_info_iterator *iter, +			    struct rds_info_lengths *lens) +{ +	struct rds_info_tcp_socket tsinfo; +	struct rds_tcp_connection *tc; +	unsigned long flags; +	struct sockaddr_in sin; +	int sinlen; + +	spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); + +	if (len / sizeof(tsinfo) < rds_tcp_tc_count) +		goto out; + +	list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + +		sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0); +		tsinfo.local_addr = sin.sin_addr.s_addr; +		tsinfo.local_port = sin.sin_port; +		sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1); +		tsinfo.peer_addr = sin.sin_addr.s_addr; +		tsinfo.peer_port = sin.sin_port; + +		tsinfo.hdr_rem = tc->t_tinc_hdr_rem; +		tsinfo.data_rem = tc->t_tinc_data_rem; +		tsinfo.last_sent_nxt = tc->t_last_sent_nxt; +		tsinfo.last_expected_una = tc->t_last_expected_una; +		tsinfo.last_seen_una = tc->t_last_seen_una; + +		rds_info_copy(iter, &tsinfo, sizeof(tsinfo)); +	} + +out: +	lens->nr = rds_tcp_tc_count; +	lens->each = sizeof(tsinfo); + +	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); +} + +static int rds_tcp_laddr_check(__be32 addr) +{ +	if (inet_addr_type(&init_net, addr) == RTN_LOCAL) +		return 0; +	return -EADDRNOTAVAIL; +} + +static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ +	struct rds_tcp_connection *tc; + +	tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); +	if (tc == NULL) +		return -ENOMEM; + +	tc->t_sock = NULL; +	tc->t_tinc = NULL; +	tc->t_tinc_hdr_rem = sizeof(struct rds_header); +	tc->t_tinc_data_rem = 0; + +	conn->c_transport_data = tc; + +	spin_lock_irq(&rds_tcp_conn_lock); +	list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); +	spin_unlock_irq(&rds_tcp_conn_lock); + +	rdsdebug("alloced tc %p\n", conn->c_transport_data); +	return 0; +} + +static void rds_tcp_conn_free(void *arg) +{ +	struct rds_tcp_connection *tc = arg; +	rdsdebug("freeing tc %p\n", tc); +	kmem_cache_free(rds_tcp_conn_slab, tc); +} + +static void rds_tcp_destroy_conns(void) +{ +	struct rds_tcp_connection *tc, *_tc; +	LIST_HEAD(tmp_list); + +	/* avoid calling conn_destroy with irqs off */ +	spin_lock_irq(&rds_tcp_conn_lock); +	list_splice(&rds_tcp_conn_list, &tmp_list); +	INIT_LIST_HEAD(&rds_tcp_conn_list); +	spin_unlock_irq(&rds_tcp_conn_lock); + +	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { +		if (tc->conn->c_passive) +			rds_conn_destroy(tc->conn->c_passive); +		rds_conn_destroy(tc->conn); +	} +} + +void rds_tcp_exit(void) +{ +	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); +	rds_tcp_listen_stop(); +	rds_tcp_destroy_conns(); +	rds_trans_unregister(&rds_tcp_transport); +	rds_tcp_recv_exit(); +	kmem_cache_destroy(rds_tcp_conn_slab); +} +module_exit(rds_tcp_exit); + +struct rds_transport rds_tcp_transport = { +	.laddr_check		= rds_tcp_laddr_check, +	.xmit_prepare		= rds_tcp_xmit_prepare, +	.xmit_complete		= rds_tcp_xmit_complete, +	.xmit_cong_map		= rds_tcp_xmit_cong_map, +	.xmit			= rds_tcp_xmit, +	.recv			= rds_tcp_recv, +	.conn_alloc		= rds_tcp_conn_alloc, +	.conn_free		= rds_tcp_conn_free, +	.conn_connect		= rds_tcp_conn_connect, +	.conn_shutdown		= rds_tcp_conn_shutdown, +	.inc_copy_to_user	= rds_tcp_inc_copy_to_user, +	.inc_purge		= rds_tcp_inc_purge, +	.inc_free		= rds_tcp_inc_free, +	.stats_info_copy	= rds_tcp_stats_info_copy, +	.exit			= rds_tcp_exit, +	.t_owner		= THIS_MODULE, +	.t_name			= "tcp", +	.t_prefer_loopback	= 1, +}; + +int __init rds_tcp_init(void) +{ +	int ret; + +	rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", +					      sizeof(struct rds_tcp_connection), +					      0, 0, NULL); +	if (rds_tcp_conn_slab == NULL) { +		ret = -ENOMEM; +		goto out; +	} + +	ret = rds_tcp_recv_init(); +	if (ret) +		goto out_slab; + +	ret = rds_trans_register(&rds_tcp_transport); +	if (ret) +		goto out_recv; + +	ret = rds_tcp_listen_init(); +	if (ret) +		goto out_register; + +	rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + +	goto out; + +out_register: +	rds_trans_unregister(&rds_tcp_transport); +out_recv: +	rds_tcp_recv_exit(); +out_slab: +	kmem_cache_destroy(rds_tcp_conn_slab); +out: +	return ret; +} +module_init(rds_tcp_init); + +MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); +MODULE_DESCRIPTION("RDS: TCP transport"); +MODULE_LICENSE("Dual BSD/GPL"); +  |