Diffstat (limited to 'arch/sparc/kernel/ldc.c')
-rw-r--r--   arch/sparc/kernel/ldc.c   2378
1 files changed, 2378 insertions, 0 deletions
diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c new file mode 100644 index 00000000000..d68982330f6 --- /dev/null +++ b/arch/sparc/kernel/ldc.c @@ -0,0 +1,2378 @@ +/* ldc.c: Logical Domain Channel link-layer protocol driver. + * + * Copyright (C) 2007, 2008 David S. Miller <davem@davemloft.net> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/scatterlist.h> +#include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/init.h> + +#include <asm/hypervisor.h> +#include <asm/iommu.h> +#include <asm/page.h> +#include <asm/ldc.h> +#include <asm/mdesc.h> + +#define DRV_MODULE_NAME		"ldc" +#define PFX DRV_MODULE_NAME	": " +#define DRV_MODULE_VERSION	"1.1" +#define DRV_MODULE_RELDATE	"July 22, 2008" + +static char version[] __devinitdata = +	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; +#define LDC_PACKET_SIZE		64 + +/* Packet header layout for unreliable and reliable mode frames. + * When in RAW mode, packets are simply straight 64-byte payloads + * with no headers. + */ +struct ldc_packet { +	u8			type; +#define LDC_CTRL		0x01 +#define LDC_DATA		0x02 +#define LDC_ERR			0x10 + +	u8			stype; +#define LDC_INFO		0x01 +#define LDC_ACK			0x02 +#define LDC_NACK		0x04 + +	u8			ctrl; +#define LDC_VERS		0x01 /* Link Version		*/ +#define LDC_RTS			0x02 /* Request To Send		*/ +#define LDC_RTR			0x03 /* Ready To Receive	*/ +#define LDC_RDX			0x04 /* Ready for Data eXchange	*/ +#define LDC_CTRL_MSK		0x0f + +	u8			env; +#define LDC_LEN			0x3f +#define LDC_FRAG_MASK		0xc0 +#define LDC_START		0x40 +#define LDC_STOP		0x80 + +	u32			seqid; + +	union { +		u8		u_data[LDC_PACKET_SIZE - 8]; +		struct { +			u32	pad; +			u32	ackid; +			u8	r_data[LDC_PACKET_SIZE - 8 - 8]; +		} r; +	} u; +}; + +struct ldc_version { +	u16 major; +	u16 minor; +}; + +/* Ordered from largest major to lowest.  */ +static struct ldc_version ver_arr[] = { +	{ .major = 1, .minor = 0 }, +}; + +#define LDC_DEFAULT_MTU			(4 * LDC_PACKET_SIZE) +#define LDC_DEFAULT_NUM_ENTRIES		(PAGE_SIZE / LDC_PACKET_SIZE) + +struct ldc_channel; + +struct ldc_mode_ops { +	int (*write)(struct ldc_channel *, const void *, unsigned int); +	int (*read)(struct ldc_channel *, void *, unsigned int); +}; + +static const struct ldc_mode_ops raw_ops; +static const struct ldc_mode_ops nonraw_ops; +static const struct ldc_mode_ops stream_ops; + +int ldom_domaining_enabled; + +struct ldc_iommu { +	/* Protects arena alloc/free.  */ +	spinlock_t			lock; +	struct iommu_arena		arena; +	struct ldc_mtable_entry		*page_table; +}; + +struct ldc_channel { +	/* Protects all operations that depend upon channel state.  
*/ +	spinlock_t			lock; + +	unsigned long			id; + +	u8				*mssbuf; +	u32				mssbuf_len; +	u32				mssbuf_off; + +	struct ldc_packet		*tx_base; +	unsigned long			tx_head; +	unsigned long			tx_tail; +	unsigned long			tx_num_entries; +	unsigned long			tx_ra; + +	unsigned long			tx_acked; + +	struct ldc_packet		*rx_base; +	unsigned long			rx_head; +	unsigned long			rx_tail; +	unsigned long			rx_num_entries; +	unsigned long			rx_ra; + +	u32				rcv_nxt; +	u32				snd_nxt; + +	unsigned long			chan_state; + +	struct ldc_channel_config	cfg; +	void				*event_arg; + +	const struct ldc_mode_ops	*mops; + +	struct ldc_iommu		iommu; + +	struct ldc_version		ver; + +	u8				hs_state; +#define LDC_HS_CLOSED			0x00 +#define LDC_HS_OPEN			0x01 +#define LDC_HS_GOTVERS			0x02 +#define LDC_HS_SENTRTR			0x03 +#define LDC_HS_GOTRTR			0x04 +#define LDC_HS_COMPLETE			0x10 + +	u8				flags; +#define LDC_FLAG_ALLOCED_QUEUES		0x01 +#define LDC_FLAG_REGISTERED_QUEUES	0x02 +#define LDC_FLAG_REGISTERED_IRQS	0x04 +#define LDC_FLAG_RESET			0x10 + +	u8				mss; +	u8				state; + +#define LDC_IRQ_NAME_MAX		32 +	char				rx_irq_name[LDC_IRQ_NAME_MAX]; +	char				tx_irq_name[LDC_IRQ_NAME_MAX]; + +	struct hlist_head		mh_list; + +	struct hlist_node		list; +}; + +#define ldcdbg(TYPE, f, a...) \ +do {	if (lp->cfg.debug & LDC_DEBUG_##TYPE) \ +		printk(KERN_INFO PFX "ID[%lu] " f, lp->id, ## a); \ +} while (0) + +static const char *state_to_str(u8 state) +{ +	switch (state) { +	case LDC_STATE_INVALID: +		return "INVALID"; +	case LDC_STATE_INIT: +		return "INIT"; +	case LDC_STATE_BOUND: +		return "BOUND"; +	case LDC_STATE_READY: +		return "READY"; +	case LDC_STATE_CONNECTED: +		return "CONNECTED"; +	default: +		return "<UNKNOWN>"; +	} +} + +static void ldc_set_state(struct ldc_channel *lp, u8 state) +{ +	ldcdbg(STATE, "STATE (%s) --> (%s)\n", +	       state_to_str(lp->state), +	       state_to_str(state)); + +	lp->state = state; +} + +static unsigned long __advance(unsigned long off, unsigned long num_entries) +{ +	off += LDC_PACKET_SIZE; +	if (off == (num_entries * LDC_PACKET_SIZE)) +		off = 0; + +	return off; +} + +static unsigned long rx_advance(struct ldc_channel *lp, unsigned long off) +{ +	return __advance(off, lp->rx_num_entries); +} + +static unsigned long tx_advance(struct ldc_channel *lp, unsigned long off) +{ +	return __advance(off, lp->tx_num_entries); +} + +static struct ldc_packet *handshake_get_tx_packet(struct ldc_channel *lp, +						  unsigned long *new_tail) +{ +	struct ldc_packet *p; +	unsigned long t; + +	t = tx_advance(lp, lp->tx_tail); +	if (t == lp->tx_head) +		return NULL; + +	*new_tail = t; + +	p = lp->tx_base; +	return p + (lp->tx_tail / LDC_PACKET_SIZE); +} + +/* When we are in reliable or stream mode, have to track the next packet + * we haven't gotten an ACK for in the TX queue using tx_acked.  We have + * to be careful not to stomp over the queue past that point.  During + * the handshake, we don't have TX data packets pending in the queue + * and that's why handshake_get_tx_packet() need not be mindful of + * lp->tx_acked. 
+ */ +static unsigned long head_for_data(struct ldc_channel *lp) +{ +	if (lp->cfg.mode == LDC_MODE_STREAM) +		return lp->tx_acked; +	return lp->tx_head; +} + +static int tx_has_space_for(struct ldc_channel *lp, unsigned int size) +{ +	unsigned long limit, tail, new_tail, diff; +	unsigned int mss; + +	limit = head_for_data(lp); +	tail = lp->tx_tail; +	new_tail = tx_advance(lp, tail); +	if (new_tail == limit) +		return 0; + +	if (limit > new_tail) +		diff = limit - new_tail; +	else +		diff = (limit + +			((lp->tx_num_entries * LDC_PACKET_SIZE) - new_tail)); +	diff /= LDC_PACKET_SIZE; +	mss = lp->mss; + +	if (diff * mss < size) +		return 0; + +	return 1; +} + +static struct ldc_packet *data_get_tx_packet(struct ldc_channel *lp, +					     unsigned long *new_tail) +{ +	struct ldc_packet *p; +	unsigned long h, t; + +	h = head_for_data(lp); +	t = tx_advance(lp, lp->tx_tail); +	if (t == h) +		return NULL; + +	*new_tail = t; + +	p = lp->tx_base; +	return p + (lp->tx_tail / LDC_PACKET_SIZE); +} + +static int set_tx_tail(struct ldc_channel *lp, unsigned long tail) +{ +	unsigned long orig_tail = lp->tx_tail; +	int limit = 1000; + +	lp->tx_tail = tail; +	while (limit-- > 0) { +		unsigned long err; + +		err = sun4v_ldc_tx_set_qtail(lp->id, tail); +		if (!err) +			return 0; + +		if (err != HV_EWOULDBLOCK) { +			lp->tx_tail = orig_tail; +			return -EINVAL; +		} +		udelay(1); +	} + +	lp->tx_tail = orig_tail; +	return -EBUSY; +} + +/* This just updates the head value in the hypervisor using + * a polling loop with a timeout.  The caller takes care of + * upating software state representing the head change, if any. + */ +static int __set_rx_head(struct ldc_channel *lp, unsigned long head) +{ +	int limit = 1000; + +	while (limit-- > 0) { +		unsigned long err; + +		err = sun4v_ldc_rx_set_qhead(lp->id, head); +		if (!err) +			return 0; + +		if (err != HV_EWOULDBLOCK) +			return -EINVAL; + +		udelay(1); +	} + +	return -EBUSY; +} + +static int send_tx_packet(struct ldc_channel *lp, +			  struct ldc_packet *p, +			  unsigned long new_tail) +{ +	BUG_ON(p != (lp->tx_base + (lp->tx_tail / LDC_PACKET_SIZE))); + +	return set_tx_tail(lp, new_tail); +} + +static struct ldc_packet *handshake_compose_ctrl(struct ldc_channel *lp, +						 u8 stype, u8 ctrl, +						 void *data, int dlen, +						 unsigned long *new_tail) +{ +	struct ldc_packet *p = handshake_get_tx_packet(lp, new_tail); + +	if (p) { +		memset(p, 0, sizeof(*p)); +		p->type = LDC_CTRL; +		p->stype = stype; +		p->ctrl = ctrl; +		if (data) +			memcpy(p->u.u_data, data, dlen); +	} +	return p; +} + +static int start_handshake(struct ldc_channel *lp) +{ +	struct ldc_packet *p; +	struct ldc_version *ver; +	unsigned long new_tail; + +	ver = &ver_arr[0]; + +	ldcdbg(HS, "SEND VER INFO maj[%u] min[%u]\n", +	       ver->major, ver->minor); + +	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS, +				   ver, sizeof(*ver), &new_tail); +	if (p) { +		int err = send_tx_packet(lp, p, new_tail); +		if (!err) +			lp->flags &= ~LDC_FLAG_RESET; +		return err; +	} +	return -EBUSY; +} + +static int send_version_nack(struct ldc_channel *lp, +			     u16 major, u16 minor) +{ +	struct ldc_packet *p; +	struct ldc_version ver; +	unsigned long new_tail; + +	ver.major = major; +	ver.minor = minor; + +	p = handshake_compose_ctrl(lp, LDC_NACK, LDC_VERS, +				   &ver, sizeof(ver), &new_tail); +	if (p) { +		ldcdbg(HS, "SEND VER NACK maj[%u] min[%u]\n", +		       ver.major, ver.minor); + +		return send_tx_packet(lp, p, new_tail); +	} +	return -EBUSY; +} + +static int send_version_ack(struct 
ldc_channel *lp, +			    struct ldc_version *vp) +{ +	struct ldc_packet *p; +	unsigned long new_tail; + +	p = handshake_compose_ctrl(lp, LDC_ACK, LDC_VERS, +				   vp, sizeof(*vp), &new_tail); +	if (p) { +		ldcdbg(HS, "SEND VER ACK maj[%u] min[%u]\n", +		       vp->major, vp->minor); + +		return send_tx_packet(lp, p, new_tail); +	} +	return -EBUSY; +} + +static int send_rts(struct ldc_channel *lp) +{ +	struct ldc_packet *p; +	unsigned long new_tail; + +	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTS, NULL, 0, +				   &new_tail); +	if (p) { +		p->env = lp->cfg.mode; +		p->seqid = 0; +		lp->rcv_nxt = 0; + +		ldcdbg(HS, "SEND RTS env[0x%x] seqid[0x%x]\n", +		       p->env, p->seqid); + +		return send_tx_packet(lp, p, new_tail); +	} +	return -EBUSY; +} + +static int send_rtr(struct ldc_channel *lp) +{ +	struct ldc_packet *p; +	unsigned long new_tail; + +	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTR, NULL, 0, +				   &new_tail); +	if (p) { +		p->env = lp->cfg.mode; +		p->seqid = 0; + +		ldcdbg(HS, "SEND RTR env[0x%x] seqid[0x%x]\n", +		       p->env, p->seqid); + +		return send_tx_packet(lp, p, new_tail); +	} +	return -EBUSY; +} + +static int send_rdx(struct ldc_channel *lp) +{ +	struct ldc_packet *p; +	unsigned long new_tail; + +	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RDX, NULL, 0, +				   &new_tail); +	if (p) { +		p->env = 0; +		p->seqid = ++lp->snd_nxt; +		p->u.r.ackid = lp->rcv_nxt; + +		ldcdbg(HS, "SEND RDX env[0x%x] seqid[0x%x] ackid[0x%x]\n", +		       p->env, p->seqid, p->u.r.ackid); + +		return send_tx_packet(lp, p, new_tail); +	} +	return -EBUSY; +} + +static int send_data_nack(struct ldc_channel *lp, struct ldc_packet *data_pkt) +{ +	struct ldc_packet *p; +	unsigned long new_tail; +	int err; + +	p = data_get_tx_packet(lp, &new_tail); +	if (!p) +		return -EBUSY; +	memset(p, 0, sizeof(*p)); +	p->type = data_pkt->type; +	p->stype = LDC_NACK; +	p->ctrl = data_pkt->ctrl & LDC_CTRL_MSK; +	p->seqid = lp->snd_nxt + 1; +	p->u.r.ackid = lp->rcv_nxt; + +	ldcdbg(HS, "SEND DATA NACK type[0x%x] ctl[0x%x] seq[0x%x] ack[0x%x]\n", +	       p->type, p->ctrl, p->seqid, p->u.r.ackid); + +	err = send_tx_packet(lp, p, new_tail); +	if (!err) +		lp->snd_nxt++; + +	return err; +} + +static int ldc_abort(struct ldc_channel *lp) +{ +	unsigned long hv_err; + +	ldcdbg(STATE, "ABORT\n"); + +	/* We report but do not act upon the hypervisor errors because +	 * there really isn't much we can do if they fail at this point. +	 */ +	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries); +	if (hv_err) +		printk(KERN_ERR PFX "ldc_abort: " +		       "sun4v_ldc_tx_qconf(%lx,%lx,%lx) failed, err=%lu\n", +		       lp->id, lp->tx_ra, lp->tx_num_entries, hv_err); + +	hv_err = sun4v_ldc_tx_get_state(lp->id, +					&lp->tx_head, +					&lp->tx_tail, +					&lp->chan_state); +	if (hv_err) +		printk(KERN_ERR PFX "ldc_abort: " +		       "sun4v_ldc_tx_get_state(%lx,...) failed, err=%lu\n", +		       lp->id, hv_err); + +	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries); +	if (hv_err) +		printk(KERN_ERR PFX "ldc_abort: " +		       "sun4v_ldc_rx_qconf(%lx,%lx,%lx) failed, err=%lu\n", +		       lp->id, lp->rx_ra, lp->rx_num_entries, hv_err); + +	/* Refetch the RX queue state as well, because we could be invoked +	 * here in the queue processing context. +	 */ +	hv_err = sun4v_ldc_rx_get_state(lp->id, +					&lp->rx_head, +					&lp->rx_tail, +					&lp->chan_state); +	if (hv_err) +		printk(KERN_ERR PFX "ldc_abort: " +		       "sun4v_ldc_rx_get_state(%lx,...) 
failed, err=%lu\n", +		       lp->id, hv_err); + +	return -ECONNRESET; +} + +static struct ldc_version *find_by_major(u16 major) +{ +	struct ldc_version *ret = NULL; +	int i; + +	for (i = 0; i < ARRAY_SIZE(ver_arr); i++) { +		struct ldc_version *v = &ver_arr[i]; +		if (v->major <= major) { +			ret = v; +			break; +		} +	} +	return ret; +} + +static int process_ver_info(struct ldc_channel *lp, struct ldc_version *vp) +{ +	struct ldc_version *vap; +	int err; + +	ldcdbg(HS, "GOT VERSION INFO major[%x] minor[%x]\n", +	       vp->major, vp->minor); + +	if (lp->hs_state == LDC_HS_GOTVERS) { +		lp->hs_state = LDC_HS_OPEN; +		memset(&lp->ver, 0, sizeof(lp->ver)); +	} + +	vap = find_by_major(vp->major); +	if (!vap) { +		err = send_version_nack(lp, 0, 0); +	} else if (vap->major != vp->major) { +		err = send_version_nack(lp, vap->major, vap->minor); +	} else { +		struct ldc_version ver = *vp; +		if (ver.minor > vap->minor) +			ver.minor = vap->minor; +		err = send_version_ack(lp, &ver); +		if (!err) { +			lp->ver = ver; +			lp->hs_state = LDC_HS_GOTVERS; +		} +	} +	if (err) +		return ldc_abort(lp); + +	return 0; +} + +static int process_ver_ack(struct ldc_channel *lp, struct ldc_version *vp) +{ +	ldcdbg(HS, "GOT VERSION ACK major[%x] minor[%x]\n", +	       vp->major, vp->minor); + +	if (lp->hs_state == LDC_HS_GOTVERS) { +		if (lp->ver.major != vp->major || +		    lp->ver.minor != vp->minor) +			return ldc_abort(lp); +	} else { +		lp->ver = *vp; +		lp->hs_state = LDC_HS_GOTVERS; +	} +	if (send_rts(lp)) +		return ldc_abort(lp); +	return 0; +} + +static int process_ver_nack(struct ldc_channel *lp, struct ldc_version *vp) +{ +	struct ldc_version *vap; + +	if ((vp->major == 0 && vp->minor == 0) || +	    !(vap = find_by_major(vp->major))) { +		return ldc_abort(lp); +	} else { +		struct ldc_packet *p; +		unsigned long new_tail; + +		p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS, +					   vap, sizeof(*vap), +					   &new_tail); +		if (p) +			return send_tx_packet(lp, p, new_tail); +		else +			return ldc_abort(lp); +	} +} + +static int process_version(struct ldc_channel *lp, +			   struct ldc_packet *p) +{ +	struct ldc_version *vp; + +	vp = (struct ldc_version *) p->u.u_data; + +	switch (p->stype) { +	case LDC_INFO: +		return process_ver_info(lp, vp); + +	case LDC_ACK: +		return process_ver_ack(lp, vp); + +	case LDC_NACK: +		return process_ver_nack(lp, vp); + +	default: +		return ldc_abort(lp); +	} +} + +static int process_rts(struct ldc_channel *lp, +		       struct ldc_packet *p) +{ +	ldcdbg(HS, "GOT RTS stype[%x] seqid[%x] env[%x]\n", +	       p->stype, p->seqid, p->env); + +	if (p->stype     != LDC_INFO	   || +	    lp->hs_state != LDC_HS_GOTVERS || +	    p->env       != lp->cfg.mode) +		return ldc_abort(lp); + +	lp->snd_nxt = p->seqid; +	lp->rcv_nxt = p->seqid; +	lp->hs_state = LDC_HS_SENTRTR; +	if (send_rtr(lp)) +		return ldc_abort(lp); + +	return 0; +} + +static int process_rtr(struct ldc_channel *lp, +		       struct ldc_packet *p) +{ +	ldcdbg(HS, "GOT RTR stype[%x] seqid[%x] env[%x]\n", +	       p->stype, p->seqid, p->env); + +	if (p->stype     != LDC_INFO || +	    p->env       != lp->cfg.mode) +		return ldc_abort(lp); + +	lp->snd_nxt = p->seqid; +	lp->hs_state = LDC_HS_COMPLETE; +	ldc_set_state(lp, LDC_STATE_CONNECTED); +	send_rdx(lp); + +	return LDC_EVENT_UP; +} + +static int rx_seq_ok(struct ldc_channel *lp, u32 seqid) +{ +	return lp->rcv_nxt + 1 == seqid; +} + +static int process_rdx(struct ldc_channel *lp, +		       struct ldc_packet *p) +{ +	ldcdbg(HS, "GOT RDX stype[%x] seqid[%x] env[%x] 
ackid[%x]\n", +	       p->stype, p->seqid, p->env, p->u.r.ackid); + +	if (p->stype != LDC_INFO || +	    !(rx_seq_ok(lp, p->seqid))) +		return ldc_abort(lp); + +	lp->rcv_nxt = p->seqid; + +	lp->hs_state = LDC_HS_COMPLETE; +	ldc_set_state(lp, LDC_STATE_CONNECTED); + +	return LDC_EVENT_UP; +} + +static int process_control_frame(struct ldc_channel *lp, +				 struct ldc_packet *p) +{ +	switch (p->ctrl) { +	case LDC_VERS: +		return process_version(lp, p); + +	case LDC_RTS: +		return process_rts(lp, p); + +	case LDC_RTR: +		return process_rtr(lp, p); + +	case LDC_RDX: +		return process_rdx(lp, p); + +	default: +		return ldc_abort(lp); +	} +} + +static int process_error_frame(struct ldc_channel *lp, +			       struct ldc_packet *p) +{ +	return ldc_abort(lp); +} + +static int process_data_ack(struct ldc_channel *lp, +			    struct ldc_packet *ack) +{ +	unsigned long head = lp->tx_acked; +	u32 ackid = ack->u.r.ackid; + +	while (1) { +		struct ldc_packet *p = lp->tx_base + (head / LDC_PACKET_SIZE); + +		head = tx_advance(lp, head); + +		if (p->seqid == ackid) { +			lp->tx_acked = head; +			return 0; +		} +		if (head == lp->tx_tail) +			return ldc_abort(lp); +	} + +	return 0; +} + +static void send_events(struct ldc_channel *lp, unsigned int event_mask) +{ +	if (event_mask & LDC_EVENT_RESET) +		lp->cfg.event(lp->event_arg, LDC_EVENT_RESET); +	if (event_mask & LDC_EVENT_UP) +		lp->cfg.event(lp->event_arg, LDC_EVENT_UP); +	if (event_mask & LDC_EVENT_DATA_READY) +		lp->cfg.event(lp->event_arg, LDC_EVENT_DATA_READY); +} + +static irqreturn_t ldc_rx(int irq, void *dev_id) +{ +	struct ldc_channel *lp = dev_id; +	unsigned long orig_state, hv_err, flags; +	unsigned int event_mask; + +	spin_lock_irqsave(&lp->lock, flags); + +	orig_state = lp->chan_state; +	hv_err = sun4v_ldc_rx_get_state(lp->id, +					&lp->rx_head, +					&lp->rx_tail, +					&lp->chan_state); + +	ldcdbg(RX, "RX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n", +	       orig_state, lp->chan_state, lp->rx_head, lp->rx_tail); + +	event_mask = 0; + +	if (lp->cfg.mode == LDC_MODE_RAW && +	    lp->chan_state == LDC_CHANNEL_UP) { +		lp->hs_state = LDC_HS_COMPLETE; +		ldc_set_state(lp, LDC_STATE_CONNECTED); + +		event_mask |= LDC_EVENT_UP; + +		orig_state = lp->chan_state; +	} + +	/* If we are in reset state, flush the RX queue and ignore +	 * everything. +	 */ +	if (lp->flags & LDC_FLAG_RESET) { +		(void) __set_rx_head(lp, lp->rx_tail); +		goto out; +	} + +	/* Once we finish the handshake, we let the ldc_read() +	 * paths do all of the control frame and state management. +	 * Just trigger the callback. 
+	 */ +	if (lp->hs_state == LDC_HS_COMPLETE) { +handshake_complete: +		if (lp->chan_state != orig_state) { +			unsigned int event = LDC_EVENT_RESET; + +			if (lp->chan_state == LDC_CHANNEL_UP) +				event = LDC_EVENT_UP; + +			event_mask |= event; +		} +		if (lp->rx_head != lp->rx_tail) +			event_mask |= LDC_EVENT_DATA_READY; + +		goto out; +	} + +	if (lp->chan_state != orig_state) +		goto out; + +	while (lp->rx_head != lp->rx_tail) { +		struct ldc_packet *p; +		unsigned long new; +		int err; + +		p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE); + +		switch (p->type) { +		case LDC_CTRL: +			err = process_control_frame(lp, p); +			if (err > 0) +				event_mask |= err; +			break; + +		case LDC_DATA: +			event_mask |= LDC_EVENT_DATA_READY; +			err = 0; +			break; + +		case LDC_ERR: +			err = process_error_frame(lp, p); +			break; + +		default: +			err = ldc_abort(lp); +			break; +		} + +		if (err < 0) +			break; + +		new = lp->rx_head; +		new += LDC_PACKET_SIZE; +		if (new == (lp->rx_num_entries * LDC_PACKET_SIZE)) +			new = 0; +		lp->rx_head = new; + +		err = __set_rx_head(lp, new); +		if (err < 0) { +			(void) ldc_abort(lp); +			break; +		} +		if (lp->hs_state == LDC_HS_COMPLETE) +			goto handshake_complete; +	} + +out: +	spin_unlock_irqrestore(&lp->lock, flags); + +	send_events(lp, event_mask); + +	return IRQ_HANDLED; +} + +static irqreturn_t ldc_tx(int irq, void *dev_id) +{ +	struct ldc_channel *lp = dev_id; +	unsigned long flags, hv_err, orig_state; +	unsigned int event_mask = 0; + +	spin_lock_irqsave(&lp->lock, flags); + +	orig_state = lp->chan_state; +	hv_err = sun4v_ldc_tx_get_state(lp->id, +					&lp->tx_head, +					&lp->tx_tail, +					&lp->chan_state); + +	ldcdbg(TX, " TX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n", +	       orig_state, lp->chan_state, lp->tx_head, lp->tx_tail); + +	if (lp->cfg.mode == LDC_MODE_RAW && +	    lp->chan_state == LDC_CHANNEL_UP) { +		lp->hs_state = LDC_HS_COMPLETE; +		ldc_set_state(lp, LDC_STATE_CONNECTED); + +		event_mask |= LDC_EVENT_UP; +	} + +	spin_unlock_irqrestore(&lp->lock, flags); + +	send_events(lp, event_mask); + +	return IRQ_HANDLED; +} + +/* XXX ldc_alloc() and ldc_free() needs to run under a mutex so + * XXX that addition and removal from the ldc_channel_list has + * XXX atomicity, otherwise the __ldc_channel_exists() check is + * XXX totally pointless as another thread can slip into ldc_alloc() + * XXX and add a channel with the same ID.  There also needs to be + * XXX a spinlock for ldc_channel_list. 
+ */ +static HLIST_HEAD(ldc_channel_list); + +static int __ldc_channel_exists(unsigned long id) +{ +	struct ldc_channel *lp; +	struct hlist_node *n; + +	hlist_for_each_entry(lp, n, &ldc_channel_list, list) { +		if (lp->id == id) +			return 1; +	} +	return 0; +} + +static int alloc_queue(const char *name, unsigned long num_entries, +		       struct ldc_packet **base, unsigned long *ra) +{ +	unsigned long size, order; +	void *q; + +	size = num_entries * LDC_PACKET_SIZE; +	order = get_order(size); + +	q = (void *) __get_free_pages(GFP_KERNEL, order); +	if (!q) { +		printk(KERN_ERR PFX "Alloc of %s queue failed with " +		       "size=%lu order=%lu\n", name, size, order); +		return -ENOMEM; +	} + +	memset(q, 0, PAGE_SIZE << order); + +	*base = q; +	*ra = __pa(q); + +	return 0; +} + +static void free_queue(unsigned long num_entries, struct ldc_packet *q) +{ +	unsigned long size, order; + +	if (!q) +		return; + +	size = num_entries * LDC_PACKET_SIZE; +	order = get_order(size); + +	free_pages((unsigned long)q, order); +} + +/* XXX Make this configurable... XXX */ +#define LDC_IOTABLE_SIZE	(8 * 1024) + +static int ldc_iommu_init(struct ldc_channel *lp) +{ +	unsigned long sz, num_tsb_entries, tsbsize, order; +	struct ldc_iommu *iommu = &lp->iommu; +	struct ldc_mtable_entry *table; +	unsigned long hv_err; +	int err; + +	num_tsb_entries = LDC_IOTABLE_SIZE; +	tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry); + +	spin_lock_init(&iommu->lock); + +	sz = num_tsb_entries / 8; +	sz = (sz + 7UL) & ~7UL; +	iommu->arena.map = kzalloc(sz, GFP_KERNEL); +	if (!iommu->arena.map) { +		printk(KERN_ERR PFX "Alloc of arena map failed, sz=%lu\n", sz); +		return -ENOMEM; +	} + +	iommu->arena.limit = num_tsb_entries; + +	order = get_order(tsbsize); + +	table = (struct ldc_mtable_entry *) +		__get_free_pages(GFP_KERNEL, order); +	err = -ENOMEM; +	if (!table) { +		printk(KERN_ERR PFX "Alloc of MTE table failed, " +		       "size=%lu order=%lu\n", tsbsize, order); +		goto out_free_map; +	} + +	memset(table, 0, PAGE_SIZE << order); + +	iommu->page_table = table; + +	hv_err = sun4v_ldc_set_map_table(lp->id, __pa(table), +					 num_tsb_entries); +	err = -EINVAL; +	if (hv_err) +		goto out_free_table; + +	return 0; + +out_free_table: +	free_pages((unsigned long) table, order); +	iommu->page_table = NULL; + +out_free_map: +	kfree(iommu->arena.map); +	iommu->arena.map = NULL; + +	return err; +} + +static void ldc_iommu_release(struct ldc_channel *lp) +{ +	struct ldc_iommu *iommu = &lp->iommu; +	unsigned long num_tsb_entries, tsbsize, order; + +	(void) sun4v_ldc_set_map_table(lp->id, 0, 0); + +	num_tsb_entries = iommu->arena.limit; +	tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry); +	order = get_order(tsbsize); + +	free_pages((unsigned long) iommu->page_table, order); +	iommu->page_table = NULL; + +	kfree(iommu->arena.map); +	iommu->arena.map = NULL; +} + +struct ldc_channel *ldc_alloc(unsigned long id, +			      const struct ldc_channel_config *cfgp, +			      void *event_arg) +{ +	struct ldc_channel *lp; +	const struct ldc_mode_ops *mops; +	unsigned long dummy1, dummy2, hv_err; +	u8 mss, *mssbuf; +	int err; + +	err = -ENODEV; +	if (!ldom_domaining_enabled) +		goto out_err; + +	err = -EINVAL; +	if (!cfgp) +		goto out_err; + +	switch (cfgp->mode) { +	case LDC_MODE_RAW: +		mops = &raw_ops; +		mss = LDC_PACKET_SIZE; +		break; + +	case LDC_MODE_UNRELIABLE: +		mops = &nonraw_ops; +		mss = LDC_PACKET_SIZE - 8; +		break; + +	case LDC_MODE_STREAM: +		mops = &stream_ops; +		mss = LDC_PACKET_SIZE - 8 - 8; +		break; + +	
default: +		goto out_err; +	} + +	if (!cfgp->event || !event_arg || !cfgp->rx_irq || !cfgp->tx_irq) +		goto out_err; + +	hv_err = sun4v_ldc_tx_qinfo(id, &dummy1, &dummy2); +	err = -ENODEV; +	if (hv_err == HV_ECHANNEL) +		goto out_err; + +	err = -EEXIST; +	if (__ldc_channel_exists(id)) +		goto out_err; + +	mssbuf = NULL; + +	lp = kzalloc(sizeof(*lp), GFP_KERNEL); +	err = -ENOMEM; +	if (!lp) +		goto out_err; + +	spin_lock_init(&lp->lock); + +	lp->id = id; + +	err = ldc_iommu_init(lp); +	if (err) +		goto out_free_ldc; + +	lp->mops = mops; +	lp->mss = mss; + +	lp->cfg = *cfgp; +	if (!lp->cfg.mtu) +		lp->cfg.mtu = LDC_DEFAULT_MTU; + +	if (lp->cfg.mode == LDC_MODE_STREAM) { +		mssbuf = kzalloc(lp->cfg.mtu, GFP_KERNEL); +		if (!mssbuf) { +			err = -ENOMEM; +			goto out_free_iommu; +		} +		lp->mssbuf = mssbuf; +	} + +	lp->event_arg = event_arg; + +	/* XXX allow setting via ldc_channel_config to override defaults +	 * XXX or use some formula based upon mtu +	 */ +	lp->tx_num_entries = LDC_DEFAULT_NUM_ENTRIES; +	lp->rx_num_entries = LDC_DEFAULT_NUM_ENTRIES; + +	err = alloc_queue("TX", lp->tx_num_entries, +			  &lp->tx_base, &lp->tx_ra); +	if (err) +		goto out_free_mssbuf; + +	err = alloc_queue("RX", lp->rx_num_entries, +			  &lp->rx_base, &lp->rx_ra); +	if (err) +		goto out_free_txq; + +	lp->flags |= LDC_FLAG_ALLOCED_QUEUES; + +	lp->hs_state = LDC_HS_CLOSED; +	ldc_set_state(lp, LDC_STATE_INIT); + +	INIT_HLIST_NODE(&lp->list); +	hlist_add_head(&lp->list, &ldc_channel_list); + +	INIT_HLIST_HEAD(&lp->mh_list); + +	return lp; + +out_free_txq: +	free_queue(lp->tx_num_entries, lp->tx_base); + +out_free_mssbuf: +	if (mssbuf) +		kfree(mssbuf); + +out_free_iommu: +	ldc_iommu_release(lp); + +out_free_ldc: +	kfree(lp); + +out_err: +	return ERR_PTR(err); +} +EXPORT_SYMBOL(ldc_alloc); + +void ldc_free(struct ldc_channel *lp) +{ +	if (lp->flags & LDC_FLAG_REGISTERED_IRQS) { +		free_irq(lp->cfg.rx_irq, lp); +		free_irq(lp->cfg.tx_irq, lp); +	} + +	if (lp->flags & LDC_FLAG_REGISTERED_QUEUES) { +		sun4v_ldc_tx_qconf(lp->id, 0, 0); +		sun4v_ldc_rx_qconf(lp->id, 0, 0); +		lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES; +	} +	if (lp->flags & LDC_FLAG_ALLOCED_QUEUES) { +		free_queue(lp->tx_num_entries, lp->tx_base); +		free_queue(lp->rx_num_entries, lp->rx_base); +		lp->flags &= ~LDC_FLAG_ALLOCED_QUEUES; +	} + +	hlist_del(&lp->list); + +	if (lp->mssbuf) +		kfree(lp->mssbuf); + +	ldc_iommu_release(lp); + +	kfree(lp); +} +EXPORT_SYMBOL(ldc_free); + +/* Bind the channel.  This registers the LDC queues with + * the hypervisor and puts the channel into a pseudo-listening + * state.  This does not initiate a handshake, ldc_connect() does + * that. 
+ */ +int ldc_bind(struct ldc_channel *lp, const char *name) +{ +	unsigned long hv_err, flags; +	int err = -EINVAL; + +	if (!name || +	    (lp->state != LDC_STATE_INIT)) +		return -EINVAL; + +	snprintf(lp->rx_irq_name, LDC_IRQ_NAME_MAX, "%s RX", name); +	snprintf(lp->tx_irq_name, LDC_IRQ_NAME_MAX, "%s TX", name); + +	err = request_irq(lp->cfg.rx_irq, ldc_rx, +			  IRQF_SAMPLE_RANDOM | IRQF_SHARED, +			  lp->rx_irq_name, lp); +	if (err) +		return err; + +	err = request_irq(lp->cfg.tx_irq, ldc_tx, +			  IRQF_SAMPLE_RANDOM | IRQF_SHARED, +			  lp->tx_irq_name, lp); +	if (err) { +		free_irq(lp->cfg.rx_irq, lp); +		return err; +	} + + +	spin_lock_irqsave(&lp->lock, flags); + +	enable_irq(lp->cfg.rx_irq); +	enable_irq(lp->cfg.tx_irq); + +	lp->flags |= LDC_FLAG_REGISTERED_IRQS; + +	err = -ENODEV; +	hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0); +	if (hv_err) +		goto out_free_irqs; + +	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries); +	if (hv_err) +		goto out_free_irqs; + +	hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0); +	if (hv_err) +		goto out_unmap_tx; + +	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries); +	if (hv_err) +		goto out_unmap_tx; + +	lp->flags |= LDC_FLAG_REGISTERED_QUEUES; + +	hv_err = sun4v_ldc_tx_get_state(lp->id, +					&lp->tx_head, +					&lp->tx_tail, +					&lp->chan_state); +	err = -EBUSY; +	if (hv_err) +		goto out_unmap_rx; + +	lp->tx_acked = lp->tx_head; + +	lp->hs_state = LDC_HS_OPEN; +	ldc_set_state(lp, LDC_STATE_BOUND); + +	spin_unlock_irqrestore(&lp->lock, flags); + +	return 0; + +out_unmap_rx: +	lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES; +	sun4v_ldc_rx_qconf(lp->id, 0, 0); + +out_unmap_tx: +	sun4v_ldc_tx_qconf(lp->id, 0, 0); + +out_free_irqs: +	lp->flags &= ~LDC_FLAG_REGISTERED_IRQS; +	free_irq(lp->cfg.tx_irq, lp); +	free_irq(lp->cfg.rx_irq, lp); + +	spin_unlock_irqrestore(&lp->lock, flags); + +	return err; +} +EXPORT_SYMBOL(ldc_bind); + +int ldc_connect(struct ldc_channel *lp) +{ +	unsigned long flags; +	int err; + +	if (lp->cfg.mode == LDC_MODE_RAW) +		return -EINVAL; + +	spin_lock_irqsave(&lp->lock, flags); + +	if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) || +	    !(lp->flags & LDC_FLAG_REGISTERED_QUEUES) || +	    lp->hs_state != LDC_HS_OPEN) +		err = -EINVAL; +	else +		err = start_handshake(lp); + +	spin_unlock_irqrestore(&lp->lock, flags); + +	return err; +} +EXPORT_SYMBOL(ldc_connect); + +int ldc_disconnect(struct ldc_channel *lp) +{ +	unsigned long hv_err, flags; +	int err; + +	if (lp->cfg.mode == LDC_MODE_RAW) +		return -EINVAL; + +	if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) || +	    !(lp->flags & LDC_FLAG_REGISTERED_QUEUES)) +		return -EINVAL; + +	spin_lock_irqsave(&lp->lock, flags); + +	err = -ENODEV; +	hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0); +	if (hv_err) +		goto out_err; + +	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries); +	if (hv_err) +		goto out_err; + +	hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0); +	if (hv_err) +		goto out_err; + +	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries); +	if (hv_err) +		goto out_err; + +	ldc_set_state(lp, LDC_STATE_BOUND); +	lp->hs_state = LDC_HS_OPEN; +	lp->flags |= LDC_FLAG_RESET; + +	spin_unlock_irqrestore(&lp->lock, flags); + +	return 0; + +out_err: +	sun4v_ldc_tx_qconf(lp->id, 0, 0); +	sun4v_ldc_rx_qconf(lp->id, 0, 0); +	free_irq(lp->cfg.tx_irq, lp); +	free_irq(lp->cfg.rx_irq, lp); +	lp->flags &= ~(LDC_FLAG_REGISTERED_IRQS | +		       LDC_FLAG_REGISTERED_QUEUES); +	ldc_set_state(lp, LDC_STATE_INIT); + +	spin_unlock_irqrestore(&lp->lock, flags); + +	return err; 
+} +EXPORT_SYMBOL(ldc_disconnect); + +int ldc_state(struct ldc_channel *lp) +{ +	return lp->state; +} +EXPORT_SYMBOL(ldc_state); + +static int write_raw(struct ldc_channel *lp, const void *buf, unsigned int size) +{ +	struct ldc_packet *p; +	unsigned long new_tail; +	int err; + +	if (size > LDC_PACKET_SIZE) +		return -EMSGSIZE; + +	p = data_get_tx_packet(lp, &new_tail); +	if (!p) +		return -EAGAIN; + +	memcpy(p, buf, size); + +	err = send_tx_packet(lp, p, new_tail); +	if (!err) +		err = size; + +	return err; +} + +static int read_raw(struct ldc_channel *lp, void *buf, unsigned int size) +{ +	struct ldc_packet *p; +	unsigned long hv_err, new; +	int err; + +	if (size < LDC_PACKET_SIZE) +		return -EINVAL; + +	hv_err = sun4v_ldc_rx_get_state(lp->id, +					&lp->rx_head, +					&lp->rx_tail, +					&lp->chan_state); +	if (hv_err) +		return ldc_abort(lp); + +	if (lp->chan_state == LDC_CHANNEL_DOWN || +	    lp->chan_state == LDC_CHANNEL_RESETTING) +		return -ECONNRESET; + +	if (lp->rx_head == lp->rx_tail) +		return 0; + +	p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE); +	memcpy(buf, p, LDC_PACKET_SIZE); + +	new = rx_advance(lp, lp->rx_head); +	lp->rx_head = new; + +	err = __set_rx_head(lp, new); +	if (err < 0) +		err = -ECONNRESET; +	else +		err = LDC_PACKET_SIZE; + +	return err; +} + +static const struct ldc_mode_ops raw_ops = { +	.write		=	write_raw, +	.read		=	read_raw, +}; + +static int write_nonraw(struct ldc_channel *lp, const void *buf, +			unsigned int size) +{ +	unsigned long hv_err, tail; +	unsigned int copied; +	u32 seq; +	int err; + +	hv_err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail, +					&lp->chan_state); +	if (unlikely(hv_err)) +		return -EBUSY; + +	if (unlikely(lp->chan_state != LDC_CHANNEL_UP)) +		return ldc_abort(lp); + +	if (!tx_has_space_for(lp, size)) +		return -EAGAIN; + +	seq = lp->snd_nxt; +	copied = 0; +	tail = lp->tx_tail; +	while (copied < size) { +		struct ldc_packet *p = lp->tx_base + (tail / LDC_PACKET_SIZE); +		u8 *data = ((lp->cfg.mode == LDC_MODE_UNRELIABLE) ? +			    p->u.u_data : +			    p->u.r.r_data); +		int data_len; + +		p->type = LDC_DATA; +		p->stype = LDC_INFO; +		p->ctrl = 0; + +		data_len = size - copied; +		if (data_len > lp->mss) +			data_len = lp->mss; + +		BUG_ON(data_len > LDC_LEN); + +		p->env = (data_len | +			  (copied == 0 ? LDC_START : 0) | +			  (data_len == size - copied ? 
LDC_STOP : 0)); + +		p->seqid = ++seq; + +		ldcdbg(DATA, "SENT DATA [%02x:%02x:%02x:%02x:%08x]\n", +		       p->type, +		       p->stype, +		       p->ctrl, +		       p->env, +		       p->seqid); + +		memcpy(data, buf, data_len); +		buf += data_len; +		copied += data_len; + +		tail = tx_advance(lp, tail); +	} + +	err = set_tx_tail(lp, tail); +	if (!err) { +		lp->snd_nxt = seq; +		err = size; +	} + +	return err; +} + +static int rx_bad_seq(struct ldc_channel *lp, struct ldc_packet *p, +		      struct ldc_packet *first_frag) +{ +	int err; + +	if (first_frag) +		lp->rcv_nxt = first_frag->seqid - 1; + +	err = send_data_nack(lp, p); +	if (err) +		return err; + +	err = __set_rx_head(lp, lp->rx_tail); +	if (err < 0) +		return ldc_abort(lp); + +	return 0; +} + +static int data_ack_nack(struct ldc_channel *lp, struct ldc_packet *p) +{ +	if (p->stype & LDC_ACK) { +		int err = process_data_ack(lp, p); +		if (err) +			return err; +	} +	if (p->stype & LDC_NACK) +		return ldc_abort(lp); + +	return 0; +} + +static int rx_data_wait(struct ldc_channel *lp, unsigned long cur_head) +{ +	unsigned long dummy; +	int limit = 1000; + +	ldcdbg(DATA, "DATA WAIT cur_head[%lx] rx_head[%lx] rx_tail[%lx]\n", +	       cur_head, lp->rx_head, lp->rx_tail); +	while (limit-- > 0) { +		unsigned long hv_err; + +		hv_err = sun4v_ldc_rx_get_state(lp->id, +						&dummy, +						&lp->rx_tail, +						&lp->chan_state); +		if (hv_err) +			return ldc_abort(lp); + +		if (lp->chan_state == LDC_CHANNEL_DOWN || +		    lp->chan_state == LDC_CHANNEL_RESETTING) +			return -ECONNRESET; + +		if (cur_head != lp->rx_tail) { +			ldcdbg(DATA, "DATA WAIT DONE " +			       "head[%lx] tail[%lx] chan_state[%lx]\n", +			       dummy, lp->rx_tail, lp->chan_state); +			return 0; +		} + +		udelay(1); +	} +	return -EAGAIN; +} + +static int rx_set_head(struct ldc_channel *lp, unsigned long head) +{ +	int err = __set_rx_head(lp, head); + +	if (err < 0) +		return ldc_abort(lp); + +	lp->rx_head = head; +	return 0; +} + +static void send_data_ack(struct ldc_channel *lp) +{ +	unsigned long new_tail; +	struct ldc_packet *p; + +	p = data_get_tx_packet(lp, &new_tail); +	if (likely(p)) { +		int err; + +		memset(p, 0, sizeof(*p)); +		p->type = LDC_DATA; +		p->stype = LDC_ACK; +		p->ctrl = 0; +		p->seqid = lp->snd_nxt + 1; +		p->u.r.ackid = lp->rcv_nxt; + +		err = send_tx_packet(lp, p, new_tail); +		if (!err) +			lp->snd_nxt++; +	} +} + +static int read_nonraw(struct ldc_channel *lp, void *buf, unsigned int size) +{ +	struct ldc_packet *first_frag; +	unsigned long hv_err, new; +	int err, copied; + +	hv_err = sun4v_ldc_rx_get_state(lp->id, +					&lp->rx_head, +					&lp->rx_tail, +					&lp->chan_state); +	if (hv_err) +		return ldc_abort(lp); + +	if (lp->chan_state == LDC_CHANNEL_DOWN || +	    lp->chan_state == LDC_CHANNEL_RESETTING) +		return -ECONNRESET; + +	if (lp->rx_head == lp->rx_tail) +		return 0; + +	first_frag = NULL; +	copied = err = 0; +	new = lp->rx_head; +	while (1) { +		struct ldc_packet *p; +		int pkt_len; + +		BUG_ON(new == lp->rx_tail); +		p = lp->rx_base + (new / LDC_PACKET_SIZE); + +		ldcdbg(RX, "RX read pkt[%02x:%02x:%02x:%02x:%08x:%08x] " +		       "rcv_nxt[%08x]\n", +		       p->type, +		       p->stype, +		       p->ctrl, +		       p->env, +		       p->seqid, +		       p->u.r.ackid, +		       lp->rcv_nxt); + +		if (unlikely(!rx_seq_ok(lp, p->seqid))) { +			err = rx_bad_seq(lp, p, first_frag); +			copied = 0; +			break; +		} + +		if (p->type & LDC_CTRL) { +			err = process_control_frame(lp, p); +			if (err < 0) +				break; +			err = 0; +		} + +		
lp->rcv_nxt = p->seqid; + +		if (!(p->type & LDC_DATA)) { +			new = rx_advance(lp, new); +			goto no_data; +		} +		if (p->stype & (LDC_ACK | LDC_NACK)) { +			err = data_ack_nack(lp, p); +			if (err) +				break; +		} +		if (!(p->stype & LDC_INFO)) { +			new = rx_advance(lp, new); +			err = rx_set_head(lp, new); +			if (err) +				break; +			goto no_data; +		} + +		pkt_len = p->env & LDC_LEN; + +		/* Every initial packet starts with the START bit set. +		 * +		 * Singleton packets will have both START+STOP set. +		 * +		 * Fragments will have START set in the first frame, STOP +		 * set in the last frame, and neither bit set in middle +		 * frames of the packet. +		 * +		 * Therefore if we are at the beginning of a packet and +		 * we don't see START, or we are in the middle of a fragmented +		 * packet and do see START, we are unsynchronized and should +		 * flush the RX queue. +		 */ +		if ((first_frag == NULL && !(p->env & LDC_START)) || +		    (first_frag != NULL &&  (p->env & LDC_START))) { +			if (!first_frag) +				new = rx_advance(lp, new); + +			err = rx_set_head(lp, new); +			if (err) +				break; + +			if (!first_frag) +				goto no_data; +		} +		if (!first_frag) +			first_frag = p; + +		if (pkt_len > size - copied) { +			/* User didn't give us a big enough buffer, +			 * what to do?  This is a pretty serious error. +			 * +			 * Since we haven't updated the RX ring head to +			 * consume any of the packets, signal the error +			 * to the user and just leave the RX ring alone. +			 * +			 * This seems the best behavior because this allows +			 * a user of the LDC layer to start with a small +			 * RX buffer for ldc_read() calls and use -EMSGSIZE +			 * as a cue to enlarge it's read buffer. +			 */ +			err = -EMSGSIZE; +			break; +		} + +		/* Ok, we are gonna eat this one.  */ +		new = rx_advance(lp, new); + +		memcpy(buf, +		       (lp->cfg.mode == LDC_MODE_UNRELIABLE ? 
+			p->u.u_data : p->u.r.r_data), pkt_len); +		buf += pkt_len; +		copied += pkt_len; + +		if (p->env & LDC_STOP) +			break; + +no_data: +		if (new == lp->rx_tail) { +			err = rx_data_wait(lp, new); +			if (err) +				break; +		} +	} + +	if (!err) +		err = rx_set_head(lp, new); + +	if (err && first_frag) +		lp->rcv_nxt = first_frag->seqid - 1; + +	if (!err) { +		err = copied; +		if (err > 0 && lp->cfg.mode != LDC_MODE_UNRELIABLE) +			send_data_ack(lp); +	} + +	return err; +} + +static const struct ldc_mode_ops nonraw_ops = { +	.write		=	write_nonraw, +	.read		=	read_nonraw, +}; + +static int write_stream(struct ldc_channel *lp, const void *buf, +			unsigned int size) +{ +	if (size > lp->cfg.mtu) +		size = lp->cfg.mtu; +	return write_nonraw(lp, buf, size); +} + +static int read_stream(struct ldc_channel *lp, void *buf, unsigned int size) +{ +	if (!lp->mssbuf_len) { +		int err = read_nonraw(lp, lp->mssbuf, lp->cfg.mtu); +		if (err < 0) +			return err; + +		lp->mssbuf_len = err; +		lp->mssbuf_off = 0; +	} + +	if (size > lp->mssbuf_len) +		size = lp->mssbuf_len; +	memcpy(buf, lp->mssbuf + lp->mssbuf_off, size); + +	lp->mssbuf_off += size; +	lp->mssbuf_len -= size; + +	return size; +} + +static const struct ldc_mode_ops stream_ops = { +	.write		=	write_stream, +	.read		=	read_stream, +}; + +int ldc_write(struct ldc_channel *lp, const void *buf, unsigned int size) +{ +	unsigned long flags; +	int err; + +	if (!buf) +		return -EINVAL; + +	if (!size) +		return 0; + +	spin_lock_irqsave(&lp->lock, flags); + +	if (lp->hs_state != LDC_HS_COMPLETE) +		err = -ENOTCONN; +	else +		err = lp->mops->write(lp, buf, size); + +	spin_unlock_irqrestore(&lp->lock, flags); + +	return err; +} +EXPORT_SYMBOL(ldc_write); + +int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size) +{ +	unsigned long flags; +	int err; + +	if (!buf) +		return -EINVAL; + +	if (!size) +		return 0; + +	spin_lock_irqsave(&lp->lock, flags); + +	if (lp->hs_state != LDC_HS_COMPLETE) +		err = -ENOTCONN; +	else +		err = lp->mops->read(lp, buf, size); + +	spin_unlock_irqrestore(&lp->lock, flags); + +	return err; +} +EXPORT_SYMBOL(ldc_read); + +static long arena_alloc(struct ldc_iommu *iommu, unsigned long npages) +{ +	struct iommu_arena *arena = &iommu->arena; +	unsigned long n, i, start, end, limit; +	int pass; + +	limit = arena->limit; +	start = arena->hint; +	pass = 0; + +again: +	n = find_next_zero_bit(arena->map, limit, start); +	end = n + npages; +	if (unlikely(end >= limit)) { +		if (likely(pass < 1)) { +			limit = start; +			start = 0; +			pass++; +			goto again; +		} else { +			/* Scanned the whole thing, give up. 
*/ +			return -1; +		} +	} + +	for (i = n; i < end; i++) { +		if (test_bit(i, arena->map)) { +			start = i + 1; +			goto again; +		} +	} + +	for (i = n; i < end; i++) +		__set_bit(i, arena->map); + +	arena->hint = end; + +	return n; +} + +#define COOKIE_PGSZ_CODE	0xf000000000000000ULL +#define COOKIE_PGSZ_CODE_SHIFT	60ULL + +static u64 pagesize_code(void) +{ +	switch (PAGE_SIZE) { +	default: +	case (8ULL * 1024ULL): +		return 0; +	case (64ULL * 1024ULL): +		return 1; +	case (512ULL * 1024ULL): +		return 2; +	case (4ULL * 1024ULL * 1024ULL): +		return 3; +	case (32ULL * 1024ULL * 1024ULL): +		return 4; +	case (256ULL * 1024ULL * 1024ULL): +		return 5; +	} +} + +static u64 make_cookie(u64 index, u64 pgsz_code, u64 page_offset) +{ +	return ((pgsz_code << COOKIE_PGSZ_CODE_SHIFT) | +		(index << PAGE_SHIFT) | +		page_offset); +} + +static u64 cookie_to_index(u64 cookie, unsigned long *shift) +{ +	u64 szcode = cookie >> COOKIE_PGSZ_CODE_SHIFT; + +	cookie &= ~COOKIE_PGSZ_CODE; + +	*shift = szcode * 3; + +	return (cookie >> (13ULL + (szcode * 3ULL))); +} + +static struct ldc_mtable_entry *alloc_npages(struct ldc_iommu *iommu, +					     unsigned long npages) +{ +	long entry; + +	entry = arena_alloc(iommu, npages); +	if (unlikely(entry < 0)) +		return NULL; + +	return iommu->page_table + entry; +} + +static u64 perm_to_mte(unsigned int map_perm) +{ +	u64 mte_base; + +	mte_base = pagesize_code(); + +	if (map_perm & LDC_MAP_SHADOW) { +		if (map_perm & LDC_MAP_R) +			mte_base |= LDC_MTE_COPY_R; +		if (map_perm & LDC_MAP_W) +			mte_base |= LDC_MTE_COPY_W; +	} +	if (map_perm & LDC_MAP_DIRECT) { +		if (map_perm & LDC_MAP_R) +			mte_base |= LDC_MTE_READ; +		if (map_perm & LDC_MAP_W) +			mte_base |= LDC_MTE_WRITE; +		if (map_perm & LDC_MAP_X) +			mte_base |= LDC_MTE_EXEC; +	} +	if (map_perm & LDC_MAP_IO) { +		if (map_perm & LDC_MAP_R) +			mte_base |= LDC_MTE_IOMMU_R; +		if (map_perm & LDC_MAP_W) +			mte_base |= LDC_MTE_IOMMU_W; +	} + +	return mte_base; +} + +static int pages_in_region(unsigned long base, long len) +{ +	int count = 0; + +	do { +		unsigned long new = (base + PAGE_SIZE) & PAGE_MASK; + +		len -= (new - base); +		base = new; +		count++; +	} while (len > 0); + +	return count; +} + +struct cookie_state { +	struct ldc_mtable_entry		*page_table; +	struct ldc_trans_cookie		*cookies; +	u64				mte_base; +	u64				prev_cookie; +	u32				pte_idx; +	u32				nc; +}; + +static void fill_cookies(struct cookie_state *sp, unsigned long pa, +			 unsigned long off, unsigned long len) +{ +	do { +		unsigned long tlen, new = pa + PAGE_SIZE; +		u64 this_cookie; + +		sp->page_table[sp->pte_idx].mte = sp->mte_base | pa; + +		tlen = PAGE_SIZE; +		if (off) +			tlen = PAGE_SIZE - off; +		if (tlen > len) +			tlen = len; + +		this_cookie = make_cookie(sp->pte_idx, +					  pagesize_code(), off); + +		off = 0; + +		if (this_cookie == sp->prev_cookie) { +			sp->cookies[sp->nc - 1].cookie_size += tlen; +		} else { +			sp->cookies[sp->nc].cookie_addr = this_cookie; +			sp->cookies[sp->nc].cookie_size = tlen; +			sp->nc++; +		} +		sp->prev_cookie = this_cookie + tlen; + +		sp->pte_idx++; + +		len -= tlen; +		pa = new; +	} while (len > 0); +} + +static int sg_count_one(struct scatterlist *sg) +{ +	unsigned long base = page_to_pfn(sg_page(sg)) << PAGE_SHIFT; +	long len = sg->length; + +	if ((sg->offset | len) & (8UL - 1)) +		return -EFAULT; + +	return pages_in_region(base + sg->offset, len); +} + +static int sg_count_pages(struct scatterlist *sg, int num_sg) +{ +	int count; +	int i; + +	count = 0; +	for (i = 0; i < num_sg; i++) { +		
int err = sg_count_one(sg + i); +		if (err < 0) +			return err; +		count += err; +	} + +	return count; +} + +int ldc_map_sg(struct ldc_channel *lp, +	       struct scatterlist *sg, int num_sg, +	       struct ldc_trans_cookie *cookies, int ncookies, +	       unsigned int map_perm) +{ +	unsigned long i, npages, flags; +	struct ldc_mtable_entry *base; +	struct cookie_state state; +	struct ldc_iommu *iommu; +	int err; + +	if (map_perm & ~LDC_MAP_ALL) +		return -EINVAL; + +	err = sg_count_pages(sg, num_sg); +	if (err < 0) +		return err; + +	npages = err; +	if (err > ncookies) +		return -EMSGSIZE; + +	iommu = &lp->iommu; + +	spin_lock_irqsave(&iommu->lock, flags); +	base = alloc_npages(iommu, npages); +	spin_unlock_irqrestore(&iommu->lock, flags); + +	if (!base) +		return -ENOMEM; + +	state.page_table = iommu->page_table; +	state.cookies = cookies; +	state.mte_base = perm_to_mte(map_perm); +	state.prev_cookie = ~(u64)0; +	state.pte_idx = (base - iommu->page_table); +	state.nc = 0; + +	for (i = 0; i < num_sg; i++) +		fill_cookies(&state, page_to_pfn(sg_page(&sg[i])) << PAGE_SHIFT, +			     sg[i].offset, sg[i].length); + +	return state.nc; +} +EXPORT_SYMBOL(ldc_map_sg); + +int ldc_map_single(struct ldc_channel *lp, +		   void *buf, unsigned int len, +		   struct ldc_trans_cookie *cookies, int ncookies, +		   unsigned int map_perm) +{ +	unsigned long npages, pa, flags; +	struct ldc_mtable_entry *base; +	struct cookie_state state; +	struct ldc_iommu *iommu; + +	if ((map_perm & ~LDC_MAP_ALL) || (ncookies < 1)) +		return -EINVAL; + +	pa = __pa(buf); +	if ((pa | len) & (8UL - 1)) +		return -EFAULT; + +	npages = pages_in_region(pa, len); + +	iommu = &lp->iommu; + +	spin_lock_irqsave(&iommu->lock, flags); +	base = alloc_npages(iommu, npages); +	spin_unlock_irqrestore(&iommu->lock, flags); + +	if (!base) +		return -ENOMEM; + +	state.page_table = iommu->page_table; +	state.cookies = cookies; +	state.mte_base = perm_to_mte(map_perm); +	state.prev_cookie = ~(u64)0; +	state.pte_idx = (base - iommu->page_table); +	state.nc = 0; +	fill_cookies(&state, (pa & PAGE_MASK), (pa & ~PAGE_MASK), len); +	BUG_ON(state.nc != 1); + +	return state.nc; +} +EXPORT_SYMBOL(ldc_map_single); + +static void free_npages(unsigned long id, struct ldc_iommu *iommu, +			u64 cookie, u64 size) +{ +	struct iommu_arena *arena = &iommu->arena; +	unsigned long i, shift, index, npages; +	struct ldc_mtable_entry *base; + +	npages = PAGE_ALIGN(((cookie & ~PAGE_MASK) + size)) >> PAGE_SHIFT; +	index = cookie_to_index(cookie, &shift); +	base = iommu->page_table + index; + +	BUG_ON(index > arena->limit || +	       (index + npages) > arena->limit); + +	for (i = 0; i < npages; i++) { +		if (base->cookie) +			sun4v_ldc_revoke(id, cookie + (i << shift), +					 base->cookie); +		base->mte = 0; +		__clear_bit(index + i, arena->map); +	} +} + +void ldc_unmap(struct ldc_channel *lp, struct ldc_trans_cookie *cookies, +	       int ncookies) +{ +	struct ldc_iommu *iommu = &lp->iommu; +	unsigned long flags; +	int i; + +	spin_lock_irqsave(&iommu->lock, flags); +	for (i = 0; i < ncookies; i++) { +		u64 addr = cookies[i].cookie_addr; +		u64 size = cookies[i].cookie_size; + +		free_npages(lp->id, iommu, addr, size); +	} +	spin_unlock_irqrestore(&iommu->lock, flags); +} +EXPORT_SYMBOL(ldc_unmap); + +int ldc_copy(struct ldc_channel *lp, int copy_dir, +	     void *buf, unsigned int len, unsigned long offset, +	     struct ldc_trans_cookie *cookies, int ncookies) +{ +	unsigned int orig_len; +	unsigned long ra; +	int i; + +	if (copy_dir != LDC_COPY_IN && copy_dir != 
LDC_COPY_OUT) { +		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Bad copy_dir[%d]\n", +		       lp->id, copy_dir); +		return -EINVAL; +	} + +	ra = __pa(buf); +	if ((ra | len | offset) & (8UL - 1)) { +		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Unaligned buffer " +		       "ra[%lx] len[%x] offset[%lx]\n", +		       lp->id, ra, len, offset); +		return -EFAULT; +	} + +	if (lp->hs_state != LDC_HS_COMPLETE || +	    (lp->flags & LDC_FLAG_RESET)) { +		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Link down hs_state[%x] " +		       "flags[%x]\n", lp->id, lp->hs_state, lp->flags); +		return -ECONNRESET; +	} + +	orig_len = len; +	for (i = 0; i < ncookies; i++) { +		unsigned long cookie_raddr = cookies[i].cookie_addr; +		unsigned long this_len = cookies[i].cookie_size; +		unsigned long actual_len; + +		if (unlikely(offset)) { +			unsigned long this_off = offset; + +			if (this_off > this_len) +				this_off = this_len; + +			offset -= this_off; +			this_len -= this_off; +			if (!this_len) +				continue; +			cookie_raddr += this_off; +		} + +		if (this_len > len) +			this_len = len; + +		while (1) { +			unsigned long hv_err; + +			hv_err = sun4v_ldc_copy(lp->id, copy_dir, +						cookie_raddr, ra, +						this_len, &actual_len); +			if (unlikely(hv_err)) { +				printk(KERN_ERR PFX "ldc_copy: ID[%lu] " +				       "HV error %lu\n", +				       lp->id, hv_err); +				if (lp->hs_state != LDC_HS_COMPLETE || +				    (lp->flags & LDC_FLAG_RESET)) +					return -ECONNRESET; +				else +					return -EFAULT; +			} + +			cookie_raddr += actual_len; +			ra += actual_len; +			len -= actual_len; +			if (actual_len == this_len) +				break; + +			this_len -= actual_len; +		} + +		if (!len) +			break; +	} + +	/* It is caller policy what to do about short copies. +	 * For example, a networking driver can declare the +	 * packet a runt and drop it. +	 */ + +	return orig_len - len; +} +EXPORT_SYMBOL(ldc_copy); + +void *ldc_alloc_exp_dring(struct ldc_channel *lp, unsigned int len, +			  struct ldc_trans_cookie *cookies, int *ncookies, +			  unsigned int map_perm) +{ +	void *buf; +	int err; + +	if (len & (8UL - 1)) +		return ERR_PTR(-EINVAL); + +	buf = kzalloc(len, GFP_KERNEL); +	if (!buf) +		return ERR_PTR(-ENOMEM); + +	err = ldc_map_single(lp, buf, len, cookies, *ncookies, map_perm); +	if (err < 0) { +		kfree(buf); +		return ERR_PTR(err); +	} +	*ncookies = err; + +	return buf; +} +EXPORT_SYMBOL(ldc_alloc_exp_dring); + +void ldc_free_exp_dring(struct ldc_channel *lp, void *buf, unsigned int len, +			struct ldc_trans_cookie *cookies, int ncookies) +{ +	ldc_unmap(lp, cookies, ncookies); +	kfree(buf); +} +EXPORT_SYMBOL(ldc_free_exp_dring); + +static int __init ldc_init(void) +{ +	unsigned long major, minor; +	struct mdesc_handle *hp; +	const u64 *v; +	int err; +	u64 mp; + +	hp = mdesc_grab(); +	if (!hp) +		return -ENODEV; + +	mp = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform"); +	err = -ENODEV; +	if (mp == MDESC_NODE_NULL) +		goto out; + +	v = mdesc_get_property(hp, mp, "domaining-enabled", NULL); +	if (!v) +		goto out; + +	major = 1; +	minor = 0; +	if (sun4v_hvapi_register(HV_GRP_LDOM, major, &minor)) { +		printk(KERN_INFO PFX "Could not register LDOM hvapi.\n"); +		goto out; +	} + +	printk(KERN_INFO "%s", version); + +	if (!*v) { +		printk(KERN_INFO PFX "Domaining disabled.\n"); +		goto out; +	} +	ldom_domaining_enabled = 1; +	err = 0; + +out: +	mdesc_release(hp); +	return err; +} + +core_initcall(ldc_init);  |
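
For context, here is a minimal consumer sketch (not part of this commit) showing how the exported calls fit together: ldc_alloc() with an ldc_channel_config, ldc_bind(), ldc_connect(), and reads driven by the event callback. The helper names (my_port, my_ldc_event, my_port_start) and the way the channel id and IRQ numbers are obtained are hypothetical; real users such as the virtual I/O drivers get them from the machine description. The config fields used are the ones this file references (event, mode, mtu, rx_irq, tx_irq); the full struct definition lives in asm/ldc.h.

/* Hypothetical LDC consumer sketch, for illustration only. */
#include <linux/kernel.h>
#include <linux/err.h>
#include <asm/ldc.h>

struct my_port {
	struct ldc_channel	*lp;
};

/* Runs in interrupt context: ldc_rx()/ldc_tx() invoke it via send_events(). */
static void my_ldc_event(void *arg, int event)
{
	struct my_port *port = arg;
	u8 buf[64];
	int len;

	switch (event) {
	case LDC_EVENT_UP:
		printk(KERN_INFO "my_port: channel is up\n");
		break;

	case LDC_EVENT_DATA_READY:
		/* Drain whatever the peer has queued for us.  */
		while ((len = ldc_read(port->lp, buf, sizeof(buf))) > 0)
			printk(KERN_INFO "my_port: read %d bytes\n", len);
		break;

	case LDC_EVENT_RESET:
		printk(KERN_INFO "my_port: channel was reset\n");
		break;
	}
}

static int my_port_start(struct my_port *port, unsigned long chan_id,
			 unsigned int rx_irq, unsigned int tx_irq)
{
	struct ldc_channel_config cfg = {
		.event	= my_ldc_event,
		.mode	= LDC_MODE_UNRELIABLE,
		.rx_irq	= rx_irq,
		.tx_irq	= tx_irq,
		/* .mtu left at zero: ldc_alloc() falls back to LDC_DEFAULT_MTU */
	};
	int err;

	port->lp = ldc_alloc(chan_id, &cfg, port);
	if (IS_ERR(port->lp))
		return PTR_ERR(port->lp);

	err = ldc_bind(port->lp, "my-port");
	if (!err)
		err = ldc_connect(port->lp);	/* kicks off VERS/RTS/RTR/RDX */
	if (err) {
		ldc_free(port->lp);
		port->lp = NULL;
	}
	return err;
}

Once the callback reports LDC_EVENT_UP, ldc_write() may be used under the same locking rules (it returns -EAGAIN when the TX queue has no room), and teardown is ldc_disconnect() followed by ldc_free().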