[SPARC64]: Add LDOM virtual channel driver and VIO device layer.
authorDavid S. Miller <davem@sunset.davemloft.net>
Tue, 10 Jul 2007 05:22:44 +0000 (22:22 -0700)
committerDavid S. Miller <davem@sunset.davemloft.net>
Mon, 16 Jul 2007 11:03:18 +0000 (04:03 -0700)
Virtual devices on Sun Logical Domains are built on top
of a virtual channel framework.  This, with help of hypervisor
interfaces, provides a link layer protocol with basic
handshaking over which virtual device clients and servers
communicate.

Built on top of this is a VIO device protocol which has its
own handshaking and message types.  At this layer attributes
are exchanged (disk size, network device addresses, etc.)
descriptor rings are registered, and data transfers are
triggered and replied to.

Signed-off-by: David S. Miller <davem@davemloft.net>
arch/sparc64/Kconfig
arch/sparc64/kernel/Makefile
arch/sparc64/kernel/ldc.c [new file with mode: 0644]
arch/sparc64/kernel/vio.c [new file with mode: 0644]
arch/sparc64/kernel/viohs.c [new file with mode: 0644]
include/asm-sparc64/ldc.h [new file with mode: 0644]
include/asm-sparc64/vio.h [new file with mode: 0644]

index 6566d13db04fec3c5417cde60c0295319f405c9f..af59daa81058ed44d0c3b3ecb6bc3fee3600f43d 100644 (file)
@@ -305,6 +305,12 @@ config SUN_IO
        bool
        default y
 
+config SUN_LDOMS
+       bool "Sun Logical Domains support"
+       help
+         Say Y here if you want to support virtual devices via
+         Logical Domains.
+
 config PCI
        bool "PCI support"
        select ARCH_SUPPORTS_MSI
index f964bf28d21a2568fed3a436af906c3e78d238ea..719ab23b19386f6476ddec26bb259c6443fae032 100644 (file)
@@ -26,6 +26,7 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_US3_FREQ) += us3_cpufreq.o
 obj-$(CONFIG_US2E_FREQ) += us2e_cpufreq.o
 obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_SUN_LDOMS) += ldc.o vio.o viohs.o
 obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDIT)$(CONFIG_SPARC32_COMPAT) += compat_audit.o
 obj-y += $(obj-yy)
diff --git a/arch/sparc64/kernel/ldc.c b/arch/sparc64/kernel/ldc.c
new file mode 100644 (file)
index 0000000..0fa04d6
--- /dev/null
@@ -0,0 +1,2338 @@
+/* ldc.c: Logical Domain Channel link-layer protocol driver.
+ *
+ * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/init.h>
+
+#include <asm/hypervisor.h>
+#include <asm/iommu.h>
+#include <asm/page.h>
+#include <asm/ldc.h>
+#include <asm/mdesc.h>
+
+/* Driver identification strings.  PFX is the prefix put in front of
+ * this file's printk() messages.
+ */
+#define DRV_MODULE_NAME                "ldc"
+#define PFX DRV_MODULE_NAME    ": "
+#define DRV_MODULE_VERSION     "1.0"
+#define DRV_MODULE_RELDATE     "June 25, 2007"
+
+/* Banner assembled from the driver name, version and release date. */
+static char version[] __devinitdata =
+       DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+/* Size in bytes of one queue entry; queue offsets advance in these units. */
+#define LDC_PACKET_SIZE                64
+
+/* Packet header layout for unreliable and reliable mode frames.
+ * When in RAW mode, packets are simply straight 64-byte payloads
+ * with no headers.
+ *
+ * The structure is LDC_PACKET_SIZE bytes in total: an 8-byte header
+ * (type, stype, ctrl, env, seqid) followed by the payload union.
+ */
+struct ldc_packet {
+       /* Frame class; the RX path dispatches on this value. */
+       u8                      type;
+#define LDC_CTRL               0x01
+#define LDC_DATA               0x02
+#define LDC_ERR                        0x10
+
+       /* Frame sub-type: informational, positive or negative ack. */
+       u8                      stype;
+#define LDC_INFO               0x01
+#define LDC_ACK                        0x02
+#define LDC_NACK               0x04
+
+       /* Control message kind, meaningful for LDC_CTRL frames. */
+       u8                      ctrl;
+#define LDC_VERS               0x01 /* Link Version            */
+#define LDC_RTS                        0x02 /* Request To Send         */
+#define LDC_RTR                        0x03 /* Ready To Receive        */
+#define LDC_RDX                        0x04 /* Ready for Data eXchange */
+#define LDC_CTRL_MSK           0x0f
+
+       /* Envelope byte.  RTS/RTR handshake frames carry the channel
+        * mode here.  NOTE(review): the LDC_LEN / LDC_START / LDC_STOP
+        * bits look like data-frame length and fragment markers, but
+        * their use is outside this part of the file -- confirm.
+        */
+       u8                      env;
+#define LDC_LEN                        0x3f
+#define LDC_FRAG_MASK          0xc0
+#define LDC_START              0x40
+#define LDC_STOP               0x80
+
+       /* Sequence number of this frame. */
+       u32                     seqid;
+
+       union {
+               /* Payload view without an ack id: all bytes after the header. */
+               u8              u_data[LDC_PACKET_SIZE - 8];
+               /* Payload view with an ack id ahead of the data bytes. */
+               struct {
+                       u32     pad;
+                       u32     ackid;
+                       u8      r_data[LDC_PACKET_SIZE - 8 - 8];
+               } r;
+       } u;
+};
+
+/* Protocol version pair; LDC_VERS control frames carry one of these
+ * at the start of their payload.
+ */
+struct ldc_version {
+       u16 major;
+       u16 minor;
+};
+
+/* Versions this driver can speak.
+ * Ordered from largest major to lowest.  find_by_major() returns the
+ * first entry whose major does not exceed the requested one, so it
+ * depends on this ordering.
+ */
+static struct ldc_version ver_arr[] = {
+       { .major = 1, .minor = 0 },
+};
+
+/* Default MTU: four packets' worth of bytes. */
+#define LDC_DEFAULT_MTU                        (4 * LDC_PACKET_SIZE)
+/* Default queue length: as many packets as fit in one page. */
+#define LDC_DEFAULT_NUM_ENTRIES                (PAGE_SIZE / LDC_PACKET_SIZE)
+
+struct ldc_channel;
+
+/* Read/write entry points; a channel points at one of these tables
+ * through its ->mops member.
+ */
+struct ldc_mode_ops {
+       int (*write)(struct ldc_channel *, const void *, unsigned int);
+       int (*read)(struct ldc_channel *, void *, unsigned int);
+};
+
+/* Forward declarations; the initialized definitions are not in this
+ * part of the file.
+ */
+static const struct ldc_mode_ops raw_ops;
+static const struct ldc_mode_ops nonraw_ops;
+static const struct ldc_mode_ops stream_ops;
+
+int ldom_domaining_enabled;
+
+/* Per-channel map table state, set up by ldc_iommu_init() and torn
+ * down by ldc_iommu_release().
+ */
+struct ldc_iommu {
+       /* Protects arena alloc/free.  */
+       spinlock_t                      lock;
+       /* Allocation map and entry limit for the table below. */
+       struct iommu_arena              arena;
+       /* Table handed to the hypervisor via sun4v_ldc_set_map_table(). */
+       struct ldc_mtable_entry         *page_table;
+};
+
+/* Software state of one logical domain channel. */
+struct ldc_channel {
+       /* Protects all operations that depend upon channel state.  */
+       spinlock_t                      lock;
+
+       /* Channel id passed to every sun4v_ldc_*() hypervisor call. */
+       unsigned long                   id;
+
+       u8                              *mssbuf;
+       u32                             mssbuf_len;
+       u32                             mssbuf_off;
+
+       /* TX queue.  tx_head and tx_tail are byte offsets into the
+        * queue (always a multiple of LDC_PACKET_SIZE); tx_ra is the
+        * queue address given to sun4v_ldc_tx_qconf().
+        */
+       struct ldc_packet               *tx_base;
+       unsigned long                   tx_head;
+       unsigned long                   tx_tail;
+       unsigned long                   tx_num_entries;
+       unsigned long                   tx_ra;
+
+       /* Offset just past the last TX packet the peer has ACKed;
+        * used as the queue limit in reliable and stream modes.
+        */
+       unsigned long                   tx_acked;
+
+       /* RX queue, laid out like the TX queue above. */
+       struct ldc_packet               *rx_base;
+       unsigned long                   rx_head;
+       unsigned long                   rx_tail;
+       unsigned long                   rx_num_entries;
+       unsigned long                   rx_ra;
+
+       /* Sequence ids: last one accepted from the peer / last one sent. */
+       u32                             rcv_nxt;
+       u32                             snd_nxt;
+
+       /* Channel state as last reported by the hypervisor. */
+       unsigned long                   chan_state;
+
+       /* Client configuration (mode, debug mask, event callback, ...)
+        * and the opaque argument handed back to the event callback.
+        */
+       struct ldc_channel_config       cfg;
+       void                            *event_arg;
+
+       const struct ldc_mode_ops       *mops;
+
+       struct ldc_iommu                iommu;
+
+       /* Protocol version agreed on (or being negotiated) in the handshake. */
+       struct ldc_version              ver;
+
+       /* Progress of the link handshake. */
+       u8                              hs_state;
+#define LDC_HS_CLOSED                  0x00
+#define LDC_HS_OPEN                    0x01
+#define LDC_HS_GOTVERS                 0x02
+#define LDC_HS_SENTRTR                 0x03
+#define LDC_HS_GOTRTR                  0x04
+#define LDC_HS_COMPLETE                        0x10
+
+       u8                              flags;
+#define LDC_FLAG_ALLOCED_QUEUES                0x01
+#define LDC_FLAG_REGISTERED_QUEUES     0x02
+#define LDC_FLAG_REGISTERED_IRQS       0x04
+#define LDC_FLAG_RESET                 0x10
+
+       /* Payload bytes credited per packet when sizing a transmit. */
+       u8                              mss;
+       /* LDC_STATE_* link state; changed through ldc_set_state(). */
+       u8                              state;
+
+       struct hlist_head               mh_list;
+
+       /* Linkage on the global ldc_channel_list. */
+       struct hlist_node               list;
+};
+
+/* Conditional debug printk.  Emits the message only when the
+ * LDC_DEBUG_<TYPE> bit is set in the channel's cfg.debug mask.  The
+ * macro refers to a variable named 'lp' (a struct ldc_channel pointer),
+ * which must be in scope at the call site.
+ */
+#define ldcdbg(TYPE, f, a...) \
+do {   if (lp->cfg.debug & LDC_DEBUG_##TYPE) \
+               printk(KERN_INFO PFX "ID[%lu] " f, lp->id, ## a); \
+} while (0)
+
+/* Translate an LDC_STATE_* value into a printable name for debug
+ * output.  Values matching none of the known states give "<UNKNOWN>".
+ */
+static const char *state_to_str(u8 state)
+{
+       if (state == LDC_STATE_INVALID)
+               return "INVALID";
+       if (state == LDC_STATE_INIT)
+               return "INIT";
+       if (state == LDC_STATE_BOUND)
+               return "BOUND";
+       if (state == LDC_STATE_READY)
+               return "READY";
+       if (state == LDC_STATE_CONNECTED)
+               return "CONNECTED";
+
+       return "<UNKNOWN>";
+}
+
+/* Store a new link state in lp->state, logging the old and new state
+ * names when STATE debugging is enabled for the channel.
+ */
+static void ldc_set_state(struct ldc_channel *lp, u8 state)
+{
+       const char *from = state_to_str(lp->state);
+       const char *to = state_to_str(state);
+
+       ldcdbg(STATE, "STATE (%s) --> (%s)\n", from, to);
+
+       lp->state = state;
+}
+
+/* Step a queue byte offset forward by one packet.  The result wraps to
+ * zero when it lands exactly on the end of a queue holding num_entries
+ * packets.
+ */
+static unsigned long __advance(unsigned long off, unsigned long num_entries)
+{
+       unsigned long queue_bytes = num_entries * LDC_PACKET_SIZE;
+       unsigned long next = off + LDC_PACKET_SIZE;
+
+       return (next == queue_bytes) ? 0 : next;
+}
+
+/* Advance an RX queue offset by one packet, wrapping at the queue end. */
+static unsigned long rx_advance(struct ldc_channel *lp, unsigned long off)
+{
+       unsigned long entries = lp->rx_num_entries;
+
+       return __advance(off, entries);
+}
+
+/* Advance a TX queue offset by one packet, wrapping at the queue end. */
+static unsigned long tx_advance(struct ldc_channel *lp, unsigned long off)
+{
+       unsigned long entries = lp->tx_num_entries;
+
+       return __advance(off, entries);
+}
+
+/* Pick the TX queue slot at the current tail for a handshake packet.
+ *
+ * Returns NULL if advancing the tail would run into lp->tx_head.
+ * Otherwise writes the advanced tail offset to *new_tail and returns
+ * the packet slot at the current lp->tx_tail.  lp->tx_tail itself is
+ * left untouched.
+ */
+static struct ldc_packet *handshake_get_tx_packet(struct ldc_channel *lp,
+                                                 unsigned long *new_tail)
+{
+       unsigned long next = tx_advance(lp, lp->tx_tail);
+
+       if (next == lp->tx_head)
+               return NULL;
+
+       *new_tail = next;
+
+       return &lp->tx_base[lp->tx_tail / LDC_PACKET_SIZE];
+}
+
+/* In reliable and stream mode the TX queue must not be overwritten
+ * past the oldest packet that has not been ACKed yet, which tx_acked
+ * tracks.  In the other modes the plain tx_head is the limit.  No TX
+ * data packets are pending in the queue during the handshake, which
+ * is why handshake_get_tx_packet() can ignore lp->tx_acked.
+ */
+static unsigned long head_for_data(struct ldc_channel *lp)
+{
+       int wait_for_acks = (lp->cfg.mode == LDC_MODE_RELIABLE ||
+                            lp->cfg.mode == LDC_MODE_STREAM);
+
+       return wait_for_acks ? lp->tx_acked : lp->tx_head;
+}
+
+/* Tell whether the TX queue can take 'size' bytes of payload.
+ *
+ * The free room is the number of packet slots between the tail
+ * advanced by one and the limit given by head_for_data(); each slot
+ * is credited with lp->mss bytes.  Returns 1 when that capacity
+ * covers 'size', 0 otherwise (including when the queue is full).
+ */
+static int tx_has_space_for(struct ldc_channel *lp, unsigned int size)
+{
+       unsigned long queue_bytes = lp->tx_num_entries * LDC_PACKET_SIZE;
+       unsigned long limit = head_for_data(lp);
+       unsigned long next = tx_advance(lp, lp->tx_tail);
+       unsigned long slots;
+
+       if (next == limit)
+               return 0;
+
+       /* Distance from next to limit, going around the ring if needed. */
+       if (limit > next)
+               slots = limit - next;
+       else
+               slots = limit + (queue_bytes - next);
+       slots /= LDC_PACKET_SIZE;
+
+       return (slots * lp->mss < size) ? 0 : 1;
+}
+
+/* Pick the TX queue slot at the current tail for a data-phase packet.
+ *
+ * Same contract as handshake_get_tx_packet(), except that the queue
+ * counts as full when the advanced tail reaches head_for_data()
+ * rather than lp->tx_head.
+ */
+static struct ldc_packet *data_get_tx_packet(struct ldc_channel *lp,
+                                            unsigned long *new_tail)
+{
+       unsigned long limit = head_for_data(lp);
+       unsigned long next = tx_advance(lp, lp->tx_tail);
+
+       if (next == limit)
+               return NULL;
+
+       *new_tail = next;
+
+       return &lp->tx_base[lp->tx_tail / LDC_PACKET_SIZE];
+}
+
+/* Move the TX tail to 'tail', both in lp->tx_tail and in the
+ * hypervisor.
+ *
+ * The hypervisor call is retried, one microsecond apart, for up to
+ * 1000 attempts while it reports HV_EWOULDBLOCK.  Returns 0 on
+ * success, -EINVAL on any other hypervisor error and -EBUSY when the
+ * retries run out; on failure lp->tx_tail is put back to its old value.
+ */
+static int set_tx_tail(struct ldc_channel *lp, unsigned long tail)
+{
+       unsigned long saved_tail = lp->tx_tail;
+       int ret = -EBUSY;
+       int tries;
+
+       lp->tx_tail = tail;
+       for (tries = 1000; tries > 0; tries--) {
+               unsigned long hv_err = sun4v_ldc_tx_set_qtail(lp->id, tail);
+
+               if (!hv_err)
+                       return 0;
+
+               if (hv_err != HV_EWOULDBLOCK) {
+                       ret = -EINVAL;
+                       break;
+               }
+               udelay(1);
+       }
+
+       lp->tx_tail = saved_tail;
+       return ret;
+}
+
+/* Push a new RX head value to the hypervisor, polling with a timeout
+ * (up to 1000 attempts, one microsecond apart) while the call reports
+ * HV_EWOULDBLOCK.  Software state is not touched here; the caller is
+ * responsible for updating it to reflect the head change, if any.
+ *
+ * Returns 0 on success, -EINVAL on any other hypervisor error and
+ * -EBUSY when the attempts are exhausted.
+ */
+static int __set_rx_head(struct ldc_channel *lp, unsigned long head)
+{
+       int tries;
+
+       for (tries = 1000; tries > 0; tries--) {
+               unsigned long hv_err = sun4v_ldc_rx_set_qhead(lp->id, head);
+
+               if (!hv_err)
+                       return 0;
+
+               if (hv_err != HV_EWOULDBLOCK)
+                       return -EINVAL;
+
+               udelay(1);
+       }
+
+       return -EBUSY;
+}
+
+/* Hand a filled-in packet to the hypervisor by moving the TX tail to
+ * new_tail.  'p' must be the slot at the current lp->tx_tail; anything
+ * else trips BUG_ON().  Returns the result of set_tx_tail().
+ */
+static int send_tx_packet(struct ldc_channel *lp,
+                         struct ldc_packet *p,
+                         unsigned long new_tail)
+{
+       struct ldc_packet *tail_slot;
+
+       tail_slot = &lp->tx_base[lp->tx_tail / LDC_PACKET_SIZE];
+       BUG_ON(p != tail_slot);
+
+       return set_tx_tail(lp, new_tail);
+}
+
+/* Build an LDC_CTRL packet in the next free handshake TX slot.
+ *
+ * The slot is zeroed, then stype and ctrl are filled in and, when
+ * 'data' is non-NULL, dlen bytes of it are copied to the start of the
+ * payload.  Returns the packet with *new_tail set for a following
+ * send_tx_packet(), or NULL when no slot is free.
+ */
+static struct ldc_packet *handshake_compose_ctrl(struct ldc_channel *lp,
+                                                u8 stype, u8 ctrl,
+                                                void *data, int dlen,
+                                                unsigned long *new_tail)
+{
+       struct ldc_packet *pkt = handshake_get_tx_packet(lp, new_tail);
+
+       if (!pkt)
+               return NULL;
+
+       memset(pkt, 0, sizeof(*pkt));
+       pkt->type = LDC_CTRL;
+       pkt->stype = stype;
+       pkt->ctrl = ctrl;
+       if (data)
+               memcpy(pkt->u.u_data, data, dlen);
+
+       return pkt;
+}
+
+/* Open the handshake by sending a version INFO frame that offers the
+ * first entry of ver_arr.  On a successful send LDC_FLAG_RESET is
+ * cleared.  Returns 0 on success, -EBUSY if no TX slot is free, or the
+ * error from send_tx_packet().
+ */
+static int start_handshake(struct ldc_channel *lp)
+{
+       struct ldc_version *ver = &ver_arr[0];
+       struct ldc_packet *p;
+       unsigned long new_tail;
+       int err;
+
+       ldcdbg(HS, "SEND VER INFO maj[%u] min[%u]\n",
+              ver->major, ver->minor);
+
+       p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS,
+                                  ver, sizeof(*ver), &new_tail);
+       if (!p)
+               return -EBUSY;
+
+       err = send_tx_packet(lp, p, new_tail);
+       if (err)
+               return err;
+
+       lp->flags &= ~LDC_FLAG_RESET;
+       return 0;
+}
+
+/* Send a version NACK frame carrying the given major/minor pair.
+ * Returns -EBUSY if no TX slot is free, otherwise the result of
+ * send_tx_packet().
+ */
+static int send_version_nack(struct ldc_channel *lp,
+                            u16 major, u16 minor)
+{
+       struct ldc_version ver;
+       struct ldc_packet *p;
+       unsigned long new_tail;
+
+       ver.major = major;
+       ver.minor = minor;
+
+       p = handshake_compose_ctrl(lp, LDC_NACK, LDC_VERS,
+                                  &ver, sizeof(ver), &new_tail);
+       if (!p)
+               return -EBUSY;
+
+       ldcdbg(HS, "SEND VER NACK maj[%u] min[%u]\n",
+              ver.major, ver.minor);
+
+       return send_tx_packet(lp, p, new_tail);
+}
+
+/* Send a version ACK frame carrying *vp.  Returns -EBUSY if no TX
+ * slot is free, otherwise the result of send_tx_packet().
+ */
+static int send_version_ack(struct ldc_channel *lp,
+                           struct ldc_version *vp)
+{
+       unsigned long new_tail;
+       struct ldc_packet *p;
+
+       p = handshake_compose_ctrl(lp, LDC_ACK, LDC_VERS,
+                                  vp, sizeof(*vp), &new_tail);
+       if (!p)
+               return -EBUSY;
+
+       ldcdbg(HS, "SEND VER ACK maj[%u] min[%u]\n",
+              vp->major, vp->minor);
+
+       return send_tx_packet(lp, p, new_tail);
+}
+
+/* Send an RTS (Request To Send) frame.  The envelope byte carries the
+ * configured channel mode and the sequence id is zero; lp->rcv_nxt is
+ * reset to zero as well.  Returns -EBUSY if no TX slot is free,
+ * otherwise the result of send_tx_packet().
+ */
+static int send_rts(struct ldc_channel *lp)
+{
+       unsigned long new_tail;
+       struct ldc_packet *p;
+
+       p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTS, NULL, 0,
+                                  &new_tail);
+       if (!p)
+               return -EBUSY;
+
+       p->env = lp->cfg.mode;
+       p->seqid = 0;
+       lp->rcv_nxt = 0;
+
+       ldcdbg(HS, "SEND RTS env[0x%x] seqid[0x%x]\n",
+              p->env, p->seqid);
+
+       return send_tx_packet(lp, p, new_tail);
+}
+
+/* Send an RTR (Ready To Receive) frame with the configured channel
+ * mode in the envelope byte and a zero sequence id.  Returns -EBUSY
+ * if no TX slot is free, otherwise the result of send_tx_packet().
+ */
+static int send_rtr(struct ldc_channel *lp)
+{
+       unsigned long new_tail;
+       struct ldc_packet *p;
+
+       p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTR, NULL, 0,
+                                  &new_tail);
+       if (!p)
+               return -EBUSY;
+
+       p->env = lp->cfg.mode;
+       p->seqid = 0;
+
+       ldcdbg(HS, "SEND RTR env[0x%x] seqid[0x%x]\n",
+              p->env, p->seqid);
+
+       return send_tx_packet(lp, p, new_tail);
+}
+
+/* Send an RDX (Ready for Data eXchange) frame.  lp->snd_nxt is
+ * incremented and used as the frame's sequence id, and lp->rcv_nxt is
+ * sent as the ack id.  Returns -EBUSY if no TX slot is free (snd_nxt
+ * is then left alone), otherwise the result of send_tx_packet().
+ */
+static int send_rdx(struct ldc_channel *lp)
+{
+       unsigned long new_tail;
+       struct ldc_packet *p;
+
+       p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RDX, NULL, 0,
+                                  &new_tail);
+       if (!p)
+               return -EBUSY;
+
+       p->env = 0;
+       p->seqid = ++lp->snd_nxt;
+       p->u.r.ackid = lp->rcv_nxt;
+
+       ldcdbg(HS, "SEND RDX env[0x%x] seqid[0x%x] ackid[0x%x]\n",
+              p->env, p->seqid, p->u.r.ackid);
+
+       return send_tx_packet(lp, p, new_tail);
+}
+
+/* Send a NACK in reply to data_pkt.  The reply copies the type and the
+ * masked ctrl field of data_pkt, uses lp->snd_nxt as its sequence id
+ * and lp->rcv_nxt as its ack id.  lp->snd_nxt is incremented only
+ * once the frame has been sent.  Returns 0 on success, -EBUSY if no
+ * TX slot is free, or the error from send_tx_packet().
+ */
+static int send_data_nack(struct ldc_channel *lp, struct ldc_packet *data_pkt)
+{
+       unsigned long new_tail;
+       struct ldc_packet *nack;
+       int err;
+
+       nack = data_get_tx_packet(lp, &new_tail);
+       if (!nack)
+               return -EBUSY;
+
+       memset(nack, 0, sizeof(*nack));
+       nack->type = data_pkt->type;
+       nack->stype = LDC_NACK;
+       nack->ctrl = data_pkt->ctrl & LDC_CTRL_MSK;
+       nack->seqid = lp->snd_nxt;
+       nack->u.r.ackid = lp->rcv_nxt;
+
+       ldcdbg(HS, "SEND DATA NACK type[0x%x] ctl[0x%x] seq[0x%x] ack[0x%x]\n",
+              nack->type, nack->ctrl, nack->seqid, nack->u.r.ackid);
+
+       err = send_tx_packet(lp, nack, new_tail);
+       if (err)
+               return err;
+
+       lp->snd_nxt++;
+       return 0;
+}
+
+/* Abort the link: hand both queues to the hypervisor again with
+ * qconf and re-read the head/tail/channel state it reports for each.
+ * Always returns -ECONNRESET so callers can "return ldc_abort(lp);".
+ */
+static int ldc_abort(struct ldc_channel *lp)
+{
+       unsigned long err;
+
+       ldcdbg(STATE, "ABORT\n");
+
+       /* Hypervisor failures are only reported, not acted upon:
+        * there is not much that could be done about them here.
+        */
+       err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
+       if (err) {
+               printk(KERN_ERR PFX "ldc_abort: "
+                      "sun4v_ldc_tx_qconf(%lx,%lx,%lx) failed, err=%lu\n",
+                      lp->id, lp->tx_ra, lp->tx_num_entries, err);
+       }
+
+       err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail,
+                                    &lp->chan_state);
+       if (err) {
+               printk(KERN_ERR PFX "ldc_abort: "
+                      "sun4v_ldc_tx_get_state(%lx,...) failed, err=%lu\n",
+                      lp->id, err);
+       }
+
+       err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
+       if (err) {
+               printk(KERN_ERR PFX "ldc_abort: "
+                      "sun4v_ldc_rx_qconf(%lx,%lx,%lx) failed, err=%lu\n",
+                      lp->id, lp->rx_ra, lp->rx_num_entries, err);
+       }
+
+       /* The RX state is re-read too because this function may be
+        * called from the queue processing context.
+        */
+       err = sun4v_ldc_rx_get_state(lp->id, &lp->rx_head, &lp->rx_tail,
+                                    &lp->chan_state);
+       if (err) {
+               printk(KERN_ERR PFX "ldc_abort: "
+                      "sun4v_ldc_rx_get_state(%lx,...) failed, err=%lu\n",
+                      lp->id, err);
+       }
+
+       return -ECONNRESET;
+}
+
+/* Return the first ver_arr entry whose major number is not greater
+ * than 'major', or NULL if every entry has a larger major.  Relies on
+ * ver_arr being sorted from the largest major downwards.
+ */
+static struct ldc_version *find_by_major(u16 major)
+{
+       int idx;
+
+       for (idx = 0; idx < ARRAY_SIZE(ver_arr); idx++) {
+               if (ver_arr[idx].major <= major)
+                       return &ver_arr[idx];
+       }
+
+       return NULL;
+}
+
+/* Handle a version INFO frame from the peer.
+ *
+ * A version agreed earlier is dropped first (the handshake state goes
+ * back to OPEN).  Then:
+ *  - no supported major <= the peer's: NACK with version 0.0;
+ *  - the closest supported major differs: NACK proposing that version;
+ *  - same major: ACK with the minor clamped to ours and, if the ACK
+ *    was sent, record the version and move to LDC_HS_GOTVERS.
+ * Returns 0, or the result of ldc_abort() when the reply cannot be
+ * sent.
+ */
+static int process_ver_info(struct ldc_channel *lp, struct ldc_version *vp)
+{
+       struct ldc_version *ours;
+       int err;
+
+       ldcdbg(HS, "GOT VERSION INFO major[%x] minor[%x]\n",
+              vp->major, vp->minor);
+
+       if (lp->hs_state == LDC_HS_GOTVERS) {
+               lp->hs_state = LDC_HS_OPEN;
+               memset(&lp->ver, 0, sizeof(lp->ver));
+       }
+
+       ours = find_by_major(vp->major);
+       if (ours && ours->major == vp->major) {
+               struct ldc_version agreed = *vp;
+
+               if (agreed.minor > ours->minor)
+                       agreed.minor = ours->minor;
+               err = send_version_ack(lp, &agreed);
+               if (!err) {
+                       lp->ver = agreed;
+                       lp->hs_state = LDC_HS_GOTVERS;
+               }
+       } else if (ours) {
+               err = send_version_nack(lp, ours->major, ours->minor);
+       } else {
+               err = send_version_nack(lp, 0, 0);
+       }
+
+       return err ? ldc_abort(lp) : 0;
+}
+
+/* Handle a version ACK frame from the peer.
+ *
+ * If a version has already been recorded, the ACKed one must match it
+ * exactly, otherwise the link is aborted.  If not, the ACKed version
+ * is recorded and the handshake moves to LDC_HS_GOTVERS.  An RTS is
+ * sent next; failing to send it aborts the link as well.
+ */
+static int process_ver_ack(struct ldc_channel *lp, struct ldc_version *vp)
+{
+       ldcdbg(HS, "GOT VERSION ACK major[%x] minor[%x]\n",
+              vp->major, vp->minor);
+
+       if (lp->hs_state != LDC_HS_GOTVERS) {
+               lp->ver = *vp;
+               lp->hs_state = LDC_HS_GOTVERS;
+       } else if (lp->ver.major != vp->major ||
+                  lp->ver.minor != vp->minor) {
+               return ldc_abort(lp);
+       }
+
+       return send_rts(lp) ? ldc_abort(lp) : 0;
+}
+
+/* Handle a version NACK frame from the peer.
+ *
+ * A NACK of 0.0, or one naming a major with no supported version at
+ * or below it, aborts the link.  Otherwise a new version INFO frame
+ * offering the closest supported version is sent; the link is aborted
+ * if no TX slot is available for it.
+ */
+static int process_ver_nack(struct ldc_channel *lp, struct ldc_version *vp)
+{
+       struct ldc_version *offer;
+       struct ldc_packet *p;
+       unsigned long new_tail;
+
+       if (vp->major == 0 && vp->minor == 0)
+               return ldc_abort(lp);
+
+       offer = find_by_major(vp->major);
+       if (!offer)
+               return ldc_abort(lp);
+
+       p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS,
+                                  offer, sizeof(*offer),
+                                  &new_tail);
+       if (!p)
+               return ldc_abort(lp);
+
+       return send_tx_packet(lp, p, new_tail);
+}
+
+/* Dispatch a version control frame on its sub-type.  The version pair
+ * is read from the start of the frame payload.  Unknown sub-types
+ * abort the link.
+ */
+static int process_version(struct ldc_channel *lp,
+                          struct ldc_packet *p)
+{
+       struct ldc_version *vp = (struct ldc_version *) p->u.u_data;
+
+       if (p->stype == LDC_INFO)
+               return process_ver_info(lp, vp);
+
+       if (p->stype == LDC_ACK)
+               return process_ver_ack(lp, vp);
+
+       if (p->stype == LDC_NACK)
+               return process_ver_nack(lp, vp);
+
+       return ldc_abort(lp);
+}
+
+/* Handle an RTS frame.  It is accepted only as an INFO frame, while
+ * the handshake is in LDC_HS_GOTVERS, and with an envelope equal to
+ * our configured mode; anything else aborts the link.  On acceptance
+ * both sequence counters take the frame's seqid, the handshake moves
+ * to LDC_HS_SENTRTR and an RTR is sent back.
+ */
+static int process_rts(struct ldc_channel *lp,
+                      struct ldc_packet *p)
+{
+       int acceptable;
+
+       ldcdbg(HS, "GOT RTS stype[%x] seqid[%x] env[%x]\n",
+              p->stype, p->seqid, p->env);
+
+       acceptable = (p->stype == LDC_INFO &&
+                     lp->hs_state == LDC_HS_GOTVERS &&
+                     p->env == lp->cfg.mode);
+       if (!acceptable)
+               return ldc_abort(lp);
+
+       lp->snd_nxt = p->seqid;
+       lp->rcv_nxt = p->seqid;
+       lp->hs_state = LDC_HS_SENTRTR;
+
+       return send_rtr(lp) ? ldc_abort(lp) : 0;
+}
+
+/* Handle an RTR frame.  It must be an INFO frame whose envelope equals
+ * our configured mode, otherwise the link is aborted.  On acceptance
+ * snd_nxt takes the frame's seqid, the handshake is marked complete,
+ * the link state becomes CONNECTED and an RDX is sent.  Returns
+ * LDC_EVENT_UP; the result of sending the RDX is not checked.
+ */
+static int process_rtr(struct ldc_channel *lp,
+                      struct ldc_packet *p)
+{
+       int acceptable;
+
+       ldcdbg(HS, "GOT RTR stype[%x] seqid[%x] env[%x]\n",
+              p->stype, p->seqid, p->env);
+
+       acceptable = (p->stype == LDC_INFO && p->env == lp->cfg.mode);
+       if (!acceptable)
+               return ldc_abort(lp);
+
+       lp->snd_nxt = p->seqid;
+       lp->hs_state = LDC_HS_COMPLETE;
+       ldc_set_state(lp, LDC_STATE_CONNECTED);
+       (void) send_rdx(lp);
+
+       return LDC_EVENT_UP;
+}
+
+/* True when seqid is exactly one past the last sequence id received. */
+static int rx_seq_ok(struct ldc_channel *lp, u32 seqid)
+{
+       u32 expected = lp->rcv_nxt + 1;
+
+       return seqid == expected;
+}
+
+/* Handle an RDX frame.  It must be an INFO frame carrying the next
+ * expected sequence id, otherwise the link is aborted.  On acceptance
+ * rcv_nxt is updated, the handshake is marked complete and the link
+ * state becomes CONNECTED.  Returns LDC_EVENT_UP.
+ */
+static int process_rdx(struct ldc_channel *lp,
+                      struct ldc_packet *p)
+{
+       ldcdbg(HS, "GOT RDX stype[%x] seqid[%x] env[%x] ackid[%x]\n",
+              p->stype, p->seqid, p->env, p->u.r.ackid);
+
+       if (p->stype != LDC_INFO)
+               return ldc_abort(lp);
+       if (!rx_seq_ok(lp, p->seqid))
+               return ldc_abort(lp);
+
+       lp->rcv_nxt = p->seqid;
+       lp->hs_state = LDC_HS_COMPLETE;
+       ldc_set_state(lp, LDC_STATE_CONNECTED);
+
+       return LDC_EVENT_UP;
+}
+
+/* Dispatch a control frame on its ctrl field.  Returns whatever the
+ * selected handler returns; unknown control types abort the link.
+ */
+static int process_control_frame(struct ldc_channel *lp,
+                                struct ldc_packet *p)
+{
+       if (p->ctrl == LDC_VERS)
+               return process_version(lp, p);
+
+       if (p->ctrl == LDC_RTS)
+               return process_rts(lp, p);
+
+       if (p->ctrl == LDC_RTR)
+               return process_rtr(lp, p);
+
+       if (p->ctrl == LDC_RDX)
+               return process_rdx(lp, p);
+
+       return ldc_abort(lp);
+}
+
+static int process_error_frame(struct ldc_channel *lp,
+                              struct ldc_packet *p)
+{
+       return ldc_abort(lp);
+}
+
+/* Handle an ACK for transmitted data.
+ *
+ * Walks the TX queue from lp->tx_acked looking for the packet whose
+ * sequence id equals the ack id.  When found, tx_acked moves just
+ * past that packet and 0 is returned.  If the walk reaches
+ * lp->tx_head without a match the link is aborted.
+ */
+static int process_data_ack(struct ldc_channel *lp,
+                           struct ldc_packet *ack)
+{
+       unsigned long off = lp->tx_acked;
+       u32 ackid = ack->u.r.ackid;
+
+       do {
+               struct ldc_packet *sent = &lp->tx_base[off / LDC_PACKET_SIZE];
+
+               off = tx_advance(lp, off);
+
+               if (sent->seqid == ackid) {
+                       lp->tx_acked = off;
+                       return 0;
+               }
+       } while (off != lp->tx_head);
+
+       return ldc_abort(lp);
+}
+
+/* Invoke the client's event callback once for each event present in
+ * event_mask, in the fixed order RESET, UP, DATA_READY.
+ */
+static void send_events(struct ldc_channel *lp, unsigned int event_mask)
+{
+       const unsigned int order[] = {
+               LDC_EVENT_RESET,
+               LDC_EVENT_UP,
+               LDC_EVENT_DATA_READY,
+       };
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(order); i++) {
+               if (event_mask & order[i])
+                       lp->cfg.event(lp->event_arg, order[i]);
+       }
+}
+
+/* Service an RX-side notification for a channel; dev_id is the
+ * struct ldc_channel.
+ *
+ * Under lp->lock this refreshes rx_head, rx_tail and chan_state from
+ * the hypervisor and then:
+ *  - in RAW mode with the channel up, marks the link connected;
+ *  - if LDC_FLAG_RESET is set, discards everything queued;
+ *  - if the handshake is complete, only works out which events to
+ *    report and leaves the packets for the read path;
+ *  - otherwise consumes queued packets one at a time, driving the
+ *    handshake through process_control_frame().
+ * The collected events are delivered through send_events() after the
+ * lock has been dropped.  Always returns IRQ_HANDLED.
+ */
+static irqreturn_t ldc_rx(int irq, void *dev_id)
+{
+       struct ldc_channel *lp = dev_id;
+       unsigned long orig_state, hv_err, flags;
+       unsigned int event_mask;
+
+       spin_lock_irqsave(&lp->lock, flags);
+
+       orig_state = lp->chan_state;
+       hv_err = sun4v_ldc_rx_get_state(lp->id,
+                                       &lp->rx_head,
+                                       &lp->rx_tail,
+                                       &lp->chan_state);
+
+       ldcdbg(RX, "RX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n",
+              orig_state, lp->chan_state, lp->rx_head, lp->rx_tail);
+
+       event_mask = 0;
+
+       /* RAW mode has no handshake: the link counts as connected as
+        * soon as the hypervisor reports the channel up.
+        */
+       if (lp->cfg.mode == LDC_MODE_RAW &&
+           lp->chan_state == LDC_CHANNEL_UP) {
+               lp->hs_state = LDC_HS_COMPLETE;
+               ldc_set_state(lp, LDC_STATE_CONNECTED);
+
+               event_mask |= LDC_EVENT_UP;
+
+               /* Keeps the state comparison below from reporting a
+                * second event for this transition.
+                */
+               orig_state = lp->chan_state;
+       }
+
+       /* If we are in reset state, flush the RX queue and ignore
+        * everything.
+        */
+       if (lp->flags & LDC_FLAG_RESET) {
+               (void) __set_rx_head(lp, lp->rx_tail);
+               goto out;
+       }
+
+       /* Once we finish the handshake, we let the ldc_read()
+        * paths do all of the control frame and state management.
+        * Just trigger the callback.
+        */
+       if (lp->hs_state == LDC_HS_COMPLETE) {
+handshake_complete:
+               if (lp->chan_state != orig_state) {
+                       unsigned int event = LDC_EVENT_RESET;
+
+                       if (lp->chan_state == LDC_CHANNEL_UP)
+                               event = LDC_EVENT_UP;
+
+                       event_mask |= event;
+               }
+               if (lp->rx_head != lp->rx_tail)
+                       event_mask |= LDC_EVENT_DATA_READY;
+
+               goto out;
+       }
+
+       /* The channel state changed while the handshake was still in
+        * progress: do not process any queued packets.
+        */
+       if (lp->chan_state != orig_state)
+               goto out;
+
+       while (lp->rx_head != lp->rx_tail) {
+               struct ldc_packet *p;
+               unsigned long new;
+               int err;
+
+               p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE);
+
+               switch (p->type) {
+               case LDC_CTRL:
+                       /* Positive return values are event bits. */
+                       err = process_control_frame(lp, p);
+                       if (err > 0)
+                               event_mask |= err;
+                       break;
+
+               case LDC_DATA:
+                       event_mask |= LDC_EVENT_DATA_READY;
+                       err = 0;
+                       break;
+
+               case LDC_ERR:
+                       err = process_error_frame(lp, p);
+                       break;
+
+               default:
+                       err = ldc_abort(lp);
+                       break;
+               }
+
+               /* The link was aborted; the queue state has been
+                * re-read, so stop without advancing.
+                */
+               if (err < 0)
+                       break;
+
+               /* Consume the packet: step the software head with the
+                * shared ring helper, then publish it to the hypervisor.
+                */
+               new = rx_advance(lp, lp->rx_head);
+               lp->rx_head = new;
+
+               err = __set_rx_head(lp, new);
+               if (err < 0) {
+                       (void) ldc_abort(lp);
+                       break;
+               }
+               if (lp->hs_state == LDC_HS_COMPLETE)
+                       goto handshake_complete;
+       }
+
+out:
+       spin_unlock_irqrestore(&lp->lock, flags);
+
+       send_events(lp, event_mask);
+
+       return IRQ_HANDLED;
+}
+
+/* Service a TX-side notification for a channel; dev_id is the
+ * struct ldc_channel.
+ *
+ * Under lp->lock this refreshes tx_head, tx_tail and chan_state from
+ * the hypervisor.  In RAW mode with the channel up it also marks the
+ * link connected and queues an UP event.  Events are delivered
+ * through send_events() after the lock has been dropped.  Always
+ * returns IRQ_HANDLED.
+ */
+static irqreturn_t ldc_tx(int irq, void *dev_id)
+{
+       struct ldc_channel *lp = dev_id;
+       unsigned long irq_flags, hv_err, prev_state;
+       unsigned int events = 0;
+
+       spin_lock_irqsave(&lp->lock, irq_flags);
+
+       prev_state = lp->chan_state;
+       hv_err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail,
+                                       &lp->chan_state);
+
+       ldcdbg(TX, " TX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n",
+              prev_state, lp->chan_state, lp->tx_head, lp->tx_tail);
+
+       if (lp->cfg.mode != LDC_MODE_RAW ||
+           lp->chan_state != LDC_CHANNEL_UP)
+               goto unlock;
+
+       lp->hs_state = LDC_HS_COMPLETE;
+       ldc_set_state(lp, LDC_STATE_CONNECTED);
+       events |= LDC_EVENT_UP;
+
+unlock:
+       spin_unlock_irqrestore(&lp->lock, irq_flags);
+
+       send_events(lp, events);
+
+       return IRQ_HANDLED;
+}
+
+/* XXX ldc_alloc() and ldc_free() need to run under a mutex so
+ * XXX that addition and removal from the ldc_channel_list has
+ * XXX atomicity, otherwise the __ldc_channel_exists() check is
+ * XXX totally pointless as another thread can slip into ldc_alloc()
+ * XXX and add a channel with the same ID.  There also needs to be
+ * XXX a spinlock for ldc_channel_list.
+ */
+static HLIST_HEAD(ldc_channel_list);
+
+/* Return 1 if a channel with the given id is on ldc_channel_list,
+ * 0 otherwise.  No locking is done here.
+ */
+static int __ldc_channel_exists(unsigned long id)
+{
+       struct ldc_channel *chan;
+       struct hlist_node *pos;
+       int found = 0;
+
+       hlist_for_each_entry(chan, pos, &ldc_channel_list, list) {
+               if (chan->id == id) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       return found;
+}
+
+/* Allocate and zero the pages for a queue of num_entries packets.
+ *
+ * On success stores the kernel virtual address in *base and the
+ * __pa() address in *ra, and returns 0.  On allocation failure logs
+ * an error naming the queue with 'name' and returns -ENOMEM, leaving
+ * *base and *ra untouched.
+ */
+static int alloc_queue(const char *name, unsigned long num_entries,
+                      struct ldc_packet **base, unsigned long *ra)
+{
+       unsigned long bytes = num_entries * LDC_PACKET_SIZE;
+       unsigned long order = get_order(bytes);
+       void *queue = (void *) __get_free_pages(GFP_KERNEL, order);
+
+       if (queue) {
+               memset(queue, 0, PAGE_SIZE << order);
+
+               *base = queue;
+               *ra = __pa(queue);
+
+               return 0;
+       }
+
+       printk(KERN_ERR PFX "Alloc of %s queue failed with "
+              "size=%lu order=%lu\n", name, bytes, order);
+       return -ENOMEM;
+}
+
+/* Free the pages of a queue obtained from alloc_queue().  num_entries
+ * must be the value used for the allocation, since the page order is
+ * recomputed from it.  A NULL queue is ignored.
+ */
+static void free_queue(unsigned long num_entries, struct ldc_packet *q)
+{
+       unsigned long bytes = num_entries * LDC_PACKET_SIZE;
+
+       if (q)
+               free_pages((unsigned long)q, get_order(bytes));
+}
+
+/* XXX Make this configurable... XXX */
+#define LDC_IOTABLE_SIZE       (8 * 1024)
+
+/* Set up the per-channel map table.
+ *
+ * Allocates the arena map (LDC_IOTABLE_SIZE / 8 bytes, rounded up to
+ * a multiple of 8) and a zeroed table of LDC_IOTABLE_SIZE entries,
+ * then registers the table with the hypervisor.  Returns 0 on
+ * success, -ENOMEM if either allocation fails, or -EINVAL if the
+ * hypervisor refuses the table.  Everything allocated here is freed
+ * again on failure.
+ */
+static int ldc_iommu_init(struct ldc_channel *lp)
+{
+       struct ldc_iommu *iommu = &lp->iommu;
+       struct ldc_mtable_entry *mtable;
+       unsigned long nr_entries = LDC_IOTABLE_SIZE;
+       unsigned long table_bytes, map_bytes, order;
+       int err;
+
+       table_bytes = nr_entries * sizeof(struct ldc_mtable_entry);
+
+       spin_lock_init(&iommu->lock);
+
+       map_bytes = ((nr_entries / 8) + 7UL) & ~7UL;
+       iommu->arena.map = kzalloc(map_bytes, GFP_KERNEL);
+       if (!iommu->arena.map) {
+               printk(KERN_ERR PFX "Alloc of arena map failed, sz=%lu\n",
+                      map_bytes);
+               return -ENOMEM;
+       }
+
+       iommu->arena.limit = nr_entries;
+
+       order = get_order(table_bytes);
+       mtable = (struct ldc_mtable_entry *)
+               __get_free_pages(GFP_KERNEL, order);
+       if (!mtable) {
+               printk(KERN_ERR PFX "Alloc of MTE table failed, "
+                      "size=%lu order=%lu\n", table_bytes, order);
+               err = -ENOMEM;
+               goto out_free_map;
+       }
+
+       memset(mtable, 0, PAGE_SIZE << order);
+       iommu->page_table = mtable;
+
+       if (sun4v_ldc_set_map_table(lp->id, __pa(mtable), nr_entries)) {
+               err = -EINVAL;
+               goto out_free_table;
+       }
+
+       return 0;
+
+out_free_table:
+       free_pages((unsigned long) mtable, order);
+       iommu->page_table = NULL;
+
+out_free_map:
+       kfree(iommu->arena.map);
+       iommu->arena.map = NULL;
+
+       return err;
+}
+
+/* Undo ldc_iommu_init(): ask the hypervisor to drop the map table
+ * (the result is deliberately ignored), then free the table pages and
+ * the arena bitmap.
+ */
+static void ldc_iommu_release(struct ldc_channel *lp)
+{
+       struct ldc_iommu *iommu = &lp->iommu;
+       unsigned long table_bytes;
+
+       (void) sun4v_ldc_set_map_table(lp->id, 0, 0);
+
+       /* The table was sized from arena.limit entries. */
+       table_bytes = iommu->arena.limit * sizeof(struct ldc_mtable_entry);
+
+       free_pages((unsigned long) iommu->page_table,
+                  get_order(table_bytes));
+       iommu->page_table = NULL;
+
+       kfree(iommu->arena.map);
+       iommu->arena.map = NULL;
+}
+
+/* Allocate and initialise an LDC channel object for hypervisor
+ * channel 'id'.
+ *
+ * cfgp selects the mode, which picks the ops table and the per-packet
+ * payload size 'mss'.  It must also supply an event callback and both
+ * an RX and a TX irq, and event_arg must be non-NULL.  A zero
+ * cfgp->mtu is replaced by LDC_DEFAULT_MTU.
+ *
+ * Returns the new channel, already linked onto ldc_channel_list, or
+ * an ERR_PTR():
+ *   -ENODEV  domaining is disabled, or the hypervisor reports
+ *            HV_ECHANNEL for this id
+ *   -EINVAL  bad or incomplete configuration
+ *   -EEXIST  a channel with this id is already on the list
+ *   -ENOMEM  an allocation failed
+ * Errors from ldc_iommu_init() are passed through unchanged.
+ */
+struct ldc_channel *ldc_alloc(unsigned long id,
+                             const struct ldc_channel_config *cfgp,
+                             void *event_arg)
+{
+       struct ldc_channel *lp;
+       const struct ldc_mode_ops *mops;
+       unsigned long dummy1, dummy2, hv_err;
+       u8 mss, *mssbuf;
+       int err;
+
+       err = -ENODEV;
+       if (!ldom_domaining_enabled)
+               goto out_err;
+
+       err = -EINVAL;
+       if (!cfgp)
+               goto out_err;
+
+       /* The payload size shrinks by 8 bytes in unreliable mode and by
+        * 16 bytes in reliable and stream modes.
+        */
+       switch (cfgp->mode) {
+       case LDC_MODE_RAW:
+               mops = &raw_ops;
+               mss = LDC_PACKET_SIZE;
+               break;
+
+       case LDC_MODE_UNRELIABLE:
+               mops = &nonraw_ops;
+               mss = LDC_PACKET_SIZE - 8;
+               break;
+
+       case LDC_MODE_RELIABLE:
+               mops = &nonraw_ops;
+               mss = LDC_PACKET_SIZE - 8 - 8;
+               break;
+
+       case LDC_MODE_STREAM:
+               mops = &stream_ops;
+               mss = LDC_PACKET_SIZE - 8 - 8;
+               break;
+
+       default:
+               goto out_err;
+       }
+
+       if (!cfgp->event || !event_arg || !cfgp->rx_irq || !cfgp->tx_irq)
+               goto out_err;
+
+       /* Only used to probe whether the hypervisor knows this channel. */
+       hv_err = sun4v_ldc_tx_qinfo(id, &dummy1, &dummy2);
+       err = -ENODEV;
+       if (hv_err == HV_ECHANNEL)
+               goto out_err;
+
+       err = -EEXIST;
+       if (__ldc_channel_exists(id))
+               goto out_err;
+
+       mssbuf = NULL;
+
+       lp = kzalloc(sizeof(*lp), GFP_KERNEL);
+       err = -ENOMEM;
+       if (!lp)
+               goto out_err;
+
+       spin_lock_init(&lp->lock);
+
+       lp->id = id;
+
+       err = ldc_iommu_init(lp);
+       if (err)
+               goto out_free_ldc;
+
+       lp->mops = mops;
+       lp->mss = mss;
+
+       lp->cfg = *cfgp;
+       if (!lp->cfg.mtu)
+               lp->cfg.mtu = LDC_DEFAULT_MTU;
+
+       /* Stream mode keeps an MTU-sized staging buffer. */
+       if (lp->cfg.mode == LDC_MODE_STREAM) {
+               mssbuf = kzalloc(lp->cfg.mtu, GFP_KERNEL);
+               if (!mssbuf) {
+                       err = -ENOMEM;
+                       goto out_free_iommu;
+               }
+               lp->mssbuf = mssbuf;
+       }
+
+       lp->event_arg = event_arg;
+
+       /* XXX allow setting via ldc_channel_config to override defaults
+        * XXX or use some formula based upon mtu
+        */
+       lp->tx_num_entries = LDC_DEFAULT_NUM_ENTRIES;
+       lp->rx_num_entries = LDC_DEFAULT_NUM_ENTRIES;
+
+       err = alloc_queue("TX", lp->tx_num_entries,
+                         &lp->tx_base, &lp->tx_ra);
+       if (err)
+               goto out_free_mssbuf;
+
+       err = alloc_queue("RX", lp->rx_num_entries,
+                         &lp->rx_base, &lp->rx_ra);
+       if (err)
+               goto out_free_txq;
+
+       lp->flags |= LDC_FLAG_ALLOCED_QUEUES;
+
+       lp->hs_state = LDC_HS_CLOSED;
+       ldc_set_state(lp, LDC_STATE_INIT);
+
+       INIT_HLIST_NODE(&lp->list);
+       hlist_add_head(&lp->list, &ldc_channel_list);
+
+       INIT_HLIST_HEAD(&lp->mh_list);
+
+       return lp;
+
+out_free_txq:
+       free_queue(lp->tx_num_entries, lp->tx_base);
+
+out_free_mssbuf:
+       /* kfree(NULL) is a no-op, so no guard is needed for non-stream
+        * modes where mssbuf was never allocated.
+        */
+       kfree(mssbuf);
+
+out_free_iommu:
+       ldc_iommu_release(lp);
+
+out_free_ldc:
+       kfree(lp);
+
+out_err:
+       return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ldc_alloc);
+
+/* Tear down a channel created by ldc_alloc(): free its irqs if they
+ * are registered, unconfigure and free the TX/RX queues if present,
+ * unlink it from ldc_channel_list, then release the stream buffer,
+ * the map table and the channel object itself.
+ */
+void ldc_free(struct ldc_channel *lp)
+{
+       if (lp->flags & LDC_FLAG_REGISTERED_IRQS) {
+               free_irq(lp->cfg.rx_irq, lp);
+               free_irq(lp->cfg.tx_irq, lp);
+       }
+
+       if (lp->flags & LDC_FLAG_REGISTERED_QUEUES) {
+               sun4v_ldc_tx_qconf(lp->id, 0, 0);
+               sun4v_ldc_rx_qconf(lp->id, 0, 0);
+               lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES;
+       }
+       if (lp->flags & LDC_FLAG_ALLOCED_QUEUES) {
+               free_queue(lp->tx_num_entries, lp->tx_base);
+               free_queue(lp->rx_num_entries, lp->rx_base);
+               lp->flags &= ~LDC_FLAG_ALLOCED_QUEUES;
+       }
+
+       hlist_del(&lp->list);
+
+       /* kfree(NULL) is a no-op, so non-stream channels need no guard. */
+       kfree(lp->mssbuf);
+
+       ldc_iommu_release(lp);
+
+       kfree(lp);
+}
+EXPORT_SYMBOL(ldc_free);
+
+/* Bind the channel.  This registers the LDC queues with
+ * the hypervisor and puts the channel into a pseudo-listening
+ * state.  This does not initiate a handshake, ldc_connect() does
+ * that.
+ *
+ * The channel must be in LDC_STATE_INIT.  Returns 0 on success and
+ * leaves the channel in LDC_STATE_BOUND with hs_state LDC_HS_OPEN.
+ * On failure returns -EINVAL (wrong state), the request_irq() error,
+ * -ENODEV (queue configuration rejected) or -EBUSY (TX state query
+ * failed), after undoing whatever had been set up.
+ *
+ * NOTE(review): request_irq() and free_irq() are called here with
+ * lp->lock held and interrupts disabled.  Those calls are believed to
+ * be able to sleep -- confirm and consider moving them outside the
+ * locked region.
+ */
+int ldc_bind(struct ldc_channel *lp)
+{
+       unsigned long hv_err, flags;
+       int err = -EINVAL;
+
+       spin_lock_irqsave(&lp->lock, flags);
+
+       if (lp->state != LDC_STATE_INIT)
+               goto out_err;
+
+       err = request_irq(lp->cfg.rx_irq, ldc_rx,
+                         IRQF_SAMPLE_RANDOM | IRQF_SHARED,
+                         "LDC RX", lp);
+       if (err)
+               goto out_err;
+
+       err = request_irq(lp->cfg.tx_irq, ldc_tx,
+                         IRQF_SAMPLE_RANDOM | IRQF_SHARED,
+                         "LDC TX", lp);
+       if (err)
+               goto out_free_rx_irq;
+
+
+       lp->flags |= LDC_FLAG_REGISTERED_IRQS;
+
+       /* Each queue is first unconfigured, then configured with our
+        * buffer.
+        */
+       err = -ENODEV;
+       hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0);
+       if (hv_err)
+               goto out_free_tx_irq;
+
+       hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
+       if (hv_err)
+               goto out_free_tx_irq;
+
+       hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0);
+       if (hv_err)
+               goto out_unmap_tx;
+
+       hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
+       if (hv_err)
+               goto out_unmap_tx;
+
+       lp->flags |= LDC_FLAG_REGISTERED_QUEUES;
+
+       hv_err = sun4v_ldc_tx_get_state(lp->id,
+                                       &lp->tx_head,
+                                       &lp->tx_tail,
+                                       &lp->chan_state);
+       err = -EBUSY;
+       if (hv_err)
+               goto out_unmap_rx;
+
+       /* Nothing is outstanding yet: acked position starts at head. */
+       lp->tx_acked = lp->tx_head;
+
+       lp->hs_state = LDC_HS_OPEN;
+       ldc_set_state(lp, LDC_STATE_BOUND);
+
+       spin_unlock_irqrestore(&lp->lock, flags);
+
+       return 0;
+
+out_unmap_rx:
+       lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES;
+       sun4v_ldc_rx_qconf(lp->id, 0, 0);
+
+out_unmap_tx:
+       sun4v_ldc_tx_qconf(lp->id, 0, 0);
+
+out_free_tx_irq:
+       lp->flags &= ~LDC_FLAG_REGISTERED_IRQS;
+       free_irq(lp->cfg.tx_irq, lp);
+
+out_free_rx_irq:
+       free_irq(lp->cfg.rx_irq, lp);
+
+out_err:
+       spin_unlock_irqrestore(&lp->lock, flags);
+
+       return err;
+}
+EXPORT_SYMBOL(ldc_bind);
+
+/* Start the handshake on a bound channel.
+ *
+ * Returns -EINVAL for RAW mode channels, or when the queues are not
+ * both allocated and registered, or when hs_state is not LDC_HS_OPEN.
+ * Otherwise returns whatever start_handshake() returns.
+ */
+int ldc_connect(struct ldc_channel *lp)
+{
+       unsigned long irq_flags;
+       int ret = -EINVAL;
+
+       if (lp->cfg.mode == LDC_MODE_RAW)
+               return -EINVAL;
+
+       spin_lock_irqsave(&lp->lock, irq_flags);
+
+       if ((lp->flags & LDC_FLAG_ALLOCED_QUEUES) &&
+           (lp->flags & LDC_FLAG_REGISTERED_QUEUES) &&
+           lp->hs_state == LDC_HS_OPEN)
+               ret = start_handshake(lp);
+
+       spin_unlock_irqrestore(&lp->lock, irq_flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(ldc_connect);
+
+/* Drop the connection by unconfiguring and reconfiguring both queues
+ * with the hypervisor, returning the channel to LDC_STATE_BOUND with
+ * hs_state LDC_HS_OPEN and LDC_FLAG_RESET set.
+ *
+ * Returns 0 on success, -EINVAL for RAW mode channels or when the
+ * queues are not both allocated and registered, and -ENODEV when any
+ * queue configuration call fails.  In the -ENODEV case the queues are
+ * unconfigured, both irqs are freed and the channel is put back into
+ * LDC_STATE_INIT.
+ *
+ * NOTE(review): the flag test below runs before lp->lock is taken,
+ * and the error path calls free_irq() with the lock held and
+ * interrupts disabled -- confirm both are intended.
+ */
+int ldc_disconnect(struct ldc_channel *lp)
+{
+       unsigned long hv_err, flags;
+       int err;
+
+       if (lp->cfg.mode == LDC_MODE_RAW)
+               return -EINVAL;
+
+       if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) ||
+           !(lp->flags & LDC_FLAG_REGISTERED_QUEUES))
+               return -EINVAL;
+
+       spin_lock_irqsave(&lp->lock, flags);
+
+       err = -ENODEV;
+       hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0);
+       if (hv_err)
+               goto out_err;
+
+       hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
+       if (hv_err)
+               goto out_err;
+
+       hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0);
+       if (hv_err)
+               goto out_err;
+
+       hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
+       if (hv_err)
+               goto out_err;
+
+       ldc_set_state(lp, LDC_STATE_BOUND);
+       lp->hs_state = LDC_HS_OPEN;
+       lp->flags |= LDC_FLAG_RESET;
+
+       spin_unlock_irqrestore(&lp->lock, flags);
+
+       return 0;
+
+out_err:
+       /* Full unwind: the channel ends up as it was before ldc_bind(). */
+       sun4v_ldc_tx_qconf(lp->id, 0, 0);
+       sun4v_ldc_rx_qconf(lp->id, 0, 0);
+       free_irq(lp->cfg.tx_irq, lp);
+       free_irq(lp->cfg.rx_irq, lp);
+       lp->flags &= ~(LDC_FLAG_REGISTERED_IRQS |
+                      LDC_FLAG_REGISTERED_QUEUES);
+       ldc_set_state(lp, LDC_STATE_INIT);
+
+       spin_unlock_irqrestore(&lp->lock, flags);
+
+       return err;
+}
+EXPORT_SYMBOL(ldc_disconnect);
+
+/* Return the channel's current lp->state value.  No lock is taken. */
+int ldc_state(struct ldc_channel *lp)
+{
+       return lp->state;
+}
+EXPORT_SYMBOL(ldc_state);
+
+/* RAW mode write: copy at most one packet's worth of caller data
+ * straight into the next TX packet and send it.
+ *
+ * Returns size on success, -EMSGSIZE if size exceeds LDC_PACKET_SIZE,
+ * -EAGAIN if no TX packet is available, or the send_tx_packet() error.
+ */
+static int write_raw(struct ldc_channel *lp, const void *buf, unsigned int size)
+{
+       unsigned long tail;
+       struct ldc_packet *pkt;
+       int ret;
+
+       if (size > LDC_PACKET_SIZE)
+               return -EMSGSIZE;
+
+       pkt = data_get_tx_packet(lp, &tail);
+       if (pkt == NULL)
+               return -EAGAIN;
+
+       memcpy(pkt, buf, size);
+
+       ret = send_tx_packet(lp, pkt, tail);
+       if (ret)
+               return ret;
+
+       return size;
+}
+
+/* RAW mode read: hand the caller one whole packet from the RX queue.
+ *
+ * Returns LDC_PACKET_SIZE when a packet was copied, 0 when the queue
+ * is empty, -EINVAL if the buffer is smaller than a packet,
+ * -ECONNRESET if the channel is down/resetting or the head update
+ * fails, or the ldc_abort() result if the RX state query fails.
+ */
+static int read_raw(struct ldc_channel *lp, void *buf, unsigned int size)
+{
+       unsigned long next_head;
+       struct ldc_packet *pkt;
+
+       if (size < LDC_PACKET_SIZE)
+               return -EINVAL;
+
+       if (sun4v_ldc_rx_get_state(lp->id, &lp->rx_head, &lp->rx_tail,
+                                  &lp->chan_state))
+               return ldc_abort(lp);
+
+       if (lp->chan_state == LDC_CHANNEL_DOWN ||
+           lp->chan_state == LDC_CHANNEL_RESETTING)
+               return -ECONNRESET;
+
+       if (lp->rx_head == lp->rx_tail)
+               return 0;
+
+       /* rx_head is a byte offset; convert it to a packet index. */
+       pkt = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE);
+       memcpy(buf, pkt, LDC_PACKET_SIZE);
+
+       next_head = rx_advance(lp, lp->rx_head);
+       lp->rx_head = next_head;
+
+       if (__set_rx_head(lp, next_head) < 0)
+               return -ECONNRESET;
+
+       return LDC_PACKET_SIZE;
+}
+
+/* Read/write handlers selected by ldc_alloc() for LDC_MODE_RAW. */
+static const struct ldc_mode_ops raw_ops = {
+       .write          =       write_raw,
+       .read           =       read_raw,
+};
+
+/* Queue 'size' bytes as one or more DATA/INFO packets, each carrying
+ * at most lp->mss bytes, then publish them by moving the TX tail.
+ *
+ * Returns size on success.  Returns -EBUSY if the TX state query
+ * fails, the ldc_abort() result if the channel is not UP, -EAGAIN if
+ * tx_has_space_for() reports no room, or the set_tx_tail() error.
+ * lp->snd_nxt is only advanced when the tail update succeeds.
+ */
+static int write_nonraw(struct ldc_channel *lp, const void *buf,
+                       unsigned int size)
+{
+       unsigned long hv_err, tail;
+       unsigned int copied;
+       u32 seq;
+       int err;
+
+       hv_err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail,
+                                       &lp->chan_state);
+       if (unlikely(hv_err))
+               return -EBUSY;
+
+       if (unlikely(lp->chan_state != LDC_CHANNEL_UP))
+               return ldc_abort(lp);
+
+       if (!tx_has_space_for(lp, size))
+               return -EAGAIN;
+
+       seq = lp->snd_nxt;
+       copied = 0;
+       tail = lp->tx_tail;
+       while (copied < size) {
+               struct ldc_packet *p = lp->tx_base + (tail / LDC_PACKET_SIZE);
+               /* Payload location depends on the mode's header layout. */
+               u8 *data = ((lp->cfg.mode == LDC_MODE_UNRELIABLE) ?
+                           p->u.u_data :
+                           p->u.r.r_data);
+               int data_len;
+
+               p->type = LDC_DATA;
+               p->stype = LDC_INFO;
+               p->ctrl = 0;
+
+               data_len = size - copied;
+               if (data_len > lp->mss)
+                       data_len = lp->mss;
+
+               BUG_ON(data_len > LDC_LEN);
+
+               /* START marks the first fragment, STOP the last one. */
+               p->env = (data_len |
+                         (copied == 0 ? LDC_START : 0) |
+                         (data_len == size - copied ? LDC_STOP : 0));
+
+               p->seqid = ++seq;
+
+               ldcdbg(DATA, "SENT DATA [%02x:%02x:%02x:%02x:%08x]\n",
+                      p->type,
+                      p->stype,
+                      p->ctrl,
+                      p->env,
+                      p->seqid);
+
+               memcpy(data, buf, data_len);
+               buf += data_len;
+               copied += data_len;
+
+               tail = tx_advance(lp, tail);
+       }
+
+       err = set_tx_tail(lp, tail);
+       if (!err) {
+               lp->snd_nxt = seq;
+               err = size;
+       }
+
+       return err;
+}
+
+/* Handle a packet whose sequence number was rejected: rewind rcv_nxt
+ * to just before the first fragment (if one was seen), NACK the
+ * packet and discard everything in the RX queue by moving the head
+ * to the tail.
+ *
+ * Returns 0 on success, the send_data_nack() error, or the
+ * ldc_abort() result when the head update fails.
+ */
+static int rx_bad_seq(struct ldc_channel *lp, struct ldc_packet *p,
+                     struct ldc_packet *first_frag)
+{
+       int ret;
+
+       if (first_frag != NULL)
+               lp->rcv_nxt = first_frag->seqid - 1;
+
+       ret = send_data_nack(lp, p);
+       if (ret != 0)
+               return ret;
+
+       if (__set_rx_head(lp, lp->rx_tail) < 0)
+               return ldc_abort(lp);
+
+       return 0;
+}
+
+/* Act on the ACK/NACK bits of a data packet.  An ACK is handed to
+ * process_data_ack(); if that succeeds (or there was no ACK) a NACK
+ * aborts the channel.  Returns 0 when nothing went wrong.
+ */
+static int data_ack_nack(struct ldc_channel *lp, struct ldc_packet *p)
+{
+       int ret = 0;
+
+       if (p->stype & LDC_ACK)
+               ret = process_data_ack(lp, p);
+
+       if (!ret && (p->stype & LDC_NACK))
+               ret = ldc_abort(lp);
+
+       return ret;
+}
+
+/* Poll the RX queue, up to 1000 times with a 1us delay in between,
+ * until its tail moves away from cur_head.
+ *
+ * Returns 0 once more data is available, -EAGAIN if the polling
+ * budget runs out, -ECONNRESET if the channel is down or resetting,
+ * or the ldc_abort() result if the RX state query fails.
+ */
+static int rx_data_wait(struct ldc_channel *lp, unsigned long cur_head)
+{
+       unsigned long head_unused;
+       int tries;
+
+       ldcdbg(DATA, "DATA WAIT cur_head[%lx] rx_head[%lx] rx_tail[%lx]\n",
+              cur_head, lp->rx_head, lp->rx_tail);
+
+       for (tries = 1000; tries > 0; tries--) {
+               if (sun4v_ldc_rx_get_state(lp->id, &head_unused,
+                                          &lp->rx_tail, &lp->chan_state))
+                       return ldc_abort(lp);
+
+               ldcdbg(DATA, "REREAD head[%lx] tail[%lx] chan_state[%lx]\n",
+                      head_unused, lp->rx_tail, lp->chan_state);
+
+               if (lp->chan_state == LDC_CHANNEL_DOWN ||
+                   lp->chan_state == LDC_CHANNEL_RESETTING)
+                       return -ECONNRESET;
+
+               if (lp->rx_tail != cur_head) {
+                       ldcdbg(DATA, "DATA WAIT DONE\n");
+                       return 0;
+               }
+
+               udelay(1);
+       }
+
+       return -EAGAIN;
+}
+
+/* Move the RX head to 'head' and record it in lp->rx_head.  If the
+ * update fails the channel is aborted and the ldc_abort() result is
+ * returned; otherwise returns 0.
+ */
+static int rx_set_head(struct ldc_channel *lp, unsigned long head)
+{
+       if (__set_rx_head(lp, head) < 0)
+               return ldc_abort(lp);
+
+       lp->rx_head = head;
+
+       return 0;
+}
+
+/* Read one complete (possibly fragmented) message from the RX queue
+ * into buf, processing any control frames and ACK/NACKs met on the
+ * way.
+ *
+ * Returns the number of payload bytes copied, or 0 if the queue is
+ * empty.  Returns -ECONNRESET if the channel is down or resetting,
+ * -EMSGSIZE if the message does not fit in 'size' bytes (the RX head
+ * is left untouched in that case), or an error from one of the
+ * helpers.  On error, rcv_nxt is rewound to just before the first
+ * fragment seen so the message can be read again.
+ */
+static int read_nonraw(struct ldc_channel *lp, void *buf, unsigned int size)
+{
+       struct ldc_packet *first_frag;
+       unsigned long hv_err, new;
+       int err, copied;
+
+       hv_err = sun4v_ldc_rx_get_state(lp->id,
+                                       &lp->rx_head,
+                                       &lp->rx_tail,
+                                       &lp->chan_state);
+       if (hv_err)
+               return ldc_abort(lp);
+
+       if (lp->chan_state == LDC_CHANNEL_DOWN ||
+           lp->chan_state == LDC_CHANNEL_RESETTING)
+               return -ECONNRESET;
+
+       if (lp->rx_head == lp->rx_tail)
+               return 0;
+
+       first_frag = NULL;
+       copied = err = 0;
+       /* 'new' is the tentative head; it is only committed at the end. */
+       new = lp->rx_head;
+       while (1) {
+               struct ldc_packet *p;
+               int pkt_len;
+
+               BUG_ON(new == lp->rx_tail);
+               p = lp->rx_base + (new / LDC_PACKET_SIZE);
+
+               ldcdbg(RX, "RX read pkt[%02x:%02x:%02x:%02x:%08x] "
+                      "rcv_nxt[%08x]\n",
+                      p->type,
+                      p->stype,
+                      p->ctrl,
+                      p->env,
+                      p->seqid,
+                      lp->rcv_nxt);
+
+               if (unlikely(!rx_seq_ok(lp, p->seqid))) {
+                       err = rx_bad_seq(lp, p, first_frag);
+                       copied = 0;
+                       break;
+               }
+
+               if (p->type & LDC_CTRL) {
+                       err = process_control_frame(lp, p);
+                       if (err < 0)
+                               break;
+                       err = 0;
+               }
+
+               lp->rcv_nxt = p->seqid;
+
+               /* Packets carrying no data are consumed and skipped. */
+               if (!(p->type & LDC_DATA)) {
+                       new = rx_advance(lp, new);
+                       goto no_data;
+               }
+               if (p->stype & (LDC_ACK | LDC_NACK)) {
+                       err = data_ack_nack(lp, p);
+                       if (err)
+                               break;
+               }
+               if (!(p->stype & LDC_INFO)) {
+                       new = rx_advance(lp, new);
+                       goto no_data;
+               }
+
+               pkt_len = p->env & LDC_LEN;
+
+               /* Every initial packet starts with the START bit set.
+                *
+                * Singleton packets will have both START+STOP set.
+                *
+                * Fragments will have START set in the first frame, STOP
+                * set in the last frame, and neither bit set in middle
+                * frames of the packet.
+                *
+                * Therefore if we are at the beginning of a packet and
+                * we don't see START, or we are in the middle of a fragmented
+                * packet and do see START, we are unsynchronized and should
+                * flush the RX queue.
+                */
+               if ((first_frag == NULL && !(p->env & LDC_START)) ||
+                   (first_frag != NULL &&  (p->env & LDC_START))) {
+                       if (!first_frag)
+                               new = rx_advance(lp, new);
+
+                       err = rx_set_head(lp, new);
+                       if (err)
+                               break;
+
+                       if (!first_frag)
+                               goto no_data;
+               }
+               if (!first_frag)
+                       first_frag = p;
+
+               if (pkt_len > size - copied) {
+                       /* User didn't give us a big enough buffer,
+                        * what to do?  This is a pretty serious error.
+                        *
+                        * Since we haven't updated the RX ring head to
+                        * consume any of the packets, signal the error
+                        * to the user and just leave the RX ring alone.
+                        *
+                        * This seems the best behavior because this allows
+                        * a user of the LDC layer to start with a small
+                        * RX buffer for ldc_read() calls and use -EMSGSIZE
+                        * as a cue to enlarge its read buffer.
+                        */
+                       err = -EMSGSIZE;
+                       break;
+               }
+
+               /* Ok, we are gonna eat this one.  */
+               new = rx_advance(lp, new);
+
+               memcpy(buf,
+                      (lp->cfg.mode == LDC_MODE_UNRELIABLE ?
+                       p->u.u_data : p->u.r.r_data), pkt_len);
+               buf += pkt_len;
+               copied += pkt_len;
+
+               if (p->env & LDC_STOP)
+                       break;
+
+no_data:
+               /* Ran out of queued packets mid-message: wait for more. */
+               if (new == lp->rx_tail) {
+                       err = rx_data_wait(lp, new);
+                       if (err)
+                               break;
+               }
+       }
+
+       if (!err)
+               err = rx_set_head(lp, new);
+
+       if (err && first_frag)
+               lp->rcv_nxt = first_frag->seqid - 1;
+
+       if (!err)
+               err = copied;
+
+       return err;
+}
+
+/* Read/write handlers selected by ldc_alloc() for LDC_MODE_UNRELIABLE
+ * and LDC_MODE_RELIABLE.
+ */
+static const struct ldc_mode_ops nonraw_ops = {
+       .write          =       write_nonraw,
+       .read           =       read_nonraw,
+};
+
+/* Stream mode write: send at most one MTU of the caller's data via
+ * write_nonraw().  The return value is that of write_nonraw(), so a
+ * short count tells the caller how much was accepted.
+ */
+static int write_stream(struct ldc_channel *lp, const void *buf,
+                       unsigned int size)
+{
+       unsigned int len = size;
+
+       if (len > lp->cfg.mtu)
+               len = lp->cfg.mtu;
+
+       return write_nonraw(lp, buf, len);
+}
+
+/* Stream mode read: return up to 'size' bytes from the staging buffer
+ * lp->mssbuf, refilling it from the RX queue when it is empty.
+ *
+ * Returns the number of bytes copied into buf (0 if nothing was
+ * available), or a negative error from read_nonraw().
+ */
+static int read_stream(struct ldc_channel *lp, void *buf, unsigned int size)
+{
+       if (!lp->mssbuf_len) {
+               /* Always refill with room for a whole MTU: mssbuf is
+                * allocated with lp->cfg.mtu bytes in ldc_alloc().
+                * Capping the refill at the caller's 'size' would make
+                * read_nonraw() fail with -EMSGSIZE whenever the next
+                * message is larger than the caller's buffer, and since
+                * that error leaves the RX ring untouched a small
+                * reader could never make progress.
+                */
+               int err = read_nonraw(lp, lp->mssbuf, lp->cfg.mtu);
+               if (err < 0)
+                       return err;
+
+               lp->mssbuf_len = err;
+               lp->mssbuf_off = 0;
+       }
+
+       /* Hand out as much of the buffered data as the caller can take;
+        * the remainder stays buffered for the next call.
+        */
+       if (size > lp->mssbuf_len)
+               size = lp->mssbuf_len;
+       memcpy(buf, lp->mssbuf + lp->mssbuf_off, size);
+
+       lp->mssbuf_off += size;
+       lp->mssbuf_len -= size;
+
+       return size;
+}
+
+/* Read/write handlers selected by ldc_alloc() for LDC_MODE_STREAM. */
+static const struct ldc_mode_ops stream_ops = {
+       .write          =       write_stream,
+       .read           =       read_stream,
+};
+
+/* Write 'size' bytes from buf to the channel using the handler for
+ * the channel's mode, with lp->lock held.
+ *
+ * Returns -EINVAL for a NULL buf, 0 for a zero size, -ENOTCONN if
+ * the handshake has not completed, otherwise the handler's result.
+ */
+int ldc_write(struct ldc_channel *lp, const void *buf, unsigned int size)
+{
+       unsigned long irq_flags;
+       int ret = -ENOTCONN;
+
+       if (buf == NULL)
+               return -EINVAL;
+
+       if (size == 0)
+               return 0;
+
+       spin_lock_irqsave(&lp->lock, irq_flags);
+
+       if (lp->hs_state == LDC_HS_COMPLETE)
+               ret = lp->mops->write(lp, buf, size);
+
+       spin_unlock_irqrestore(&lp->lock, irq_flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(ldc_write);
+
+/* Read up to 'size' bytes from the channel into buf using the handler
+ * for the channel's mode, with lp->lock held.
+ *
+ * Returns -EINVAL for a NULL buf, 0 for a zero size, -ENOTCONN if
+ * the handshake has not completed, otherwise the handler's result.
+ */
+int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size)
+{
+       unsigned long irq_flags;
+       int ret = -ENOTCONN;
+
+       if (buf == NULL)
+               return -EINVAL;
+
+       if (size == 0)
+               return 0;
+
+       spin_lock_irqsave(&lp->lock, irq_flags);
+
+       if (lp->hs_state == LDC_HS_COMPLETE)
+               ret = lp->mops->read(lp, buf, size);
+
+       spin_unlock_irqrestore(&lp->lock, irq_flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(ldc_read);
+
+/* Find and claim 'npages' consecutive clear bits in the arena bitmap.
+ *
+ * The search starts at arena->hint and runs towards arena->limit; if
+ * that fails, a second pass covers the range from 0 up to the
+ * original starting point.  On success the bits are set, the hint is
+ * moved just past the new range and the index of the first bit is
+ * returned.  Returns -1 when both passes fail.
+ *
+ * The bitmap is modified with non-atomic bit operations.  Callers in
+ * this file hold iommu->lock around the call.
+ */
+static long arena_alloc(struct ldc_iommu *iommu, unsigned long npages)
+{
+       struct iommu_arena *arena = &iommu->arena;
+       unsigned long n, i, start, end, limit;
+       int pass;
+
+       limit = arena->limit;
+       start = arena->hint;
+       pass = 0;
+
+again:
+       n = find_next_zero_bit(arena->map, limit, start);
+       end = n + npages;
+       /* A range ending exactly at 'limit' is also rejected here. */
+       if (unlikely(end >= limit)) {
+               if (likely(pass < 1)) {
+                       /* Wrap around: rescan the part before 'start'. */
+                       limit = start;
+                       start = 0;
+                       pass++;
+                       goto again;
+               } else {
+                       /* Scanned the whole thing, give up. */
+                       return -1;
+               }
+       }
+
+       /* The first bit is free; make sure the rest of the run is too,
+        * otherwise resume the search just past the bit that is in use.
+        */
+       for (i = n; i < end; i++) {
+               if (test_bit(i, arena->map)) {
+                       start = i + 1;
+                       goto again;
+               }
+       }
+
+       for (i = n; i < end; i++)
+               __set_bit(i, arena->map);
+
+       arena->hint = end;
+
+       return n;
+}
+
+#define COOKIE_PGSZ_CODE       0xf000000000000000ULL
+#define COOKIE_PGSZ_CODE_SHIFT 60ULL
+
+/* Map PAGE_SIZE to a page size code: 8K -> 0, 64K -> 1, 512K -> 2,
+ * 4M -> 3, 32M -> 4, 256M -> 5.  Any other size yields 0.
+ */
+static u64 pagesize_code(void)
+{
+       u64 code;
+
+       switch (PAGE_SIZE) {
+       case (64ULL * 1024ULL):
+               code = 1;
+               break;
+       case (512ULL * 1024ULL):
+               code = 2;
+               break;
+       case (4ULL * 1024ULL * 1024ULL):
+               code = 3;
+               break;
+       case (32ULL * 1024ULL * 1024ULL):
+               code = 4;
+               break;
+       case (256ULL * 1024ULL * 1024ULL):
+               code = 5;
+               break;
+       case (8ULL * 1024ULL):
+       default:
+               code = 0;
+               break;
+       }
+
+       return code;
+}
+
+/* Build a cookie: page size code in the top bits (shifted by
+ * COOKIE_PGSZ_CODE_SHIFT), table index shifted by PAGE_SHIFT, and the
+ * offset within the page in the low bits.
+ */
+static u64 make_cookie(u64 index, u64 pgsz_code, u64 page_offset)
+{
+       u64 cookie = page_offset;
+
+       cookie |= index << PAGE_SHIFT;
+       cookie |= pgsz_code << COOKIE_PGSZ_CODE_SHIFT;
+
+       return cookie;
+}
+
+/* Extract the table index from a cookie.  The page size code in the
+ * top bits determines the shift: 13 bits plus 3 per code step.  The
+ * "3 per code step" part is also stored in *shift for the caller.
+ */
+static u64 cookie_to_index(u64 cookie, unsigned long *shift)
+{
+       u64 szcode = cookie >> COOKIE_PGSZ_CODE_SHIFT;
+       u64 extra = szcode * 3ULL;
+
+       *shift = extra;
+
+       return (cookie & ~COOKIE_PGSZ_CODE) >> (13ULL + extra);
+}
+
+/* Reserve 'npages' consecutive map table entries via arena_alloc().
+ * Returns a pointer to the first entry, or NULL if none are free.
+ */
+static struct ldc_mtable_entry *alloc_npages(struct ldc_iommu *iommu,
+                                            unsigned long npages)
+{
+       long idx = arena_alloc(iommu, npages);
+
+       if (likely(idx >= 0))
+               return iommu->page_table + idx;
+
+       return NULL;
+}
+
+/* Translate LDC_MAP_* permission bits into the flag portion of a map
+ * table entry, starting from the page size code.  Read/write (and,
+ * for direct maps, execute) requests are applied separately for each
+ * of the SHADOW, DIRECT and IO map kinds that is selected.
+ */
+static u64 perm_to_mte(unsigned int map_perm)
+{
+       const int want_r = (map_perm & LDC_MAP_R) != 0;
+       const int want_w = (map_perm & LDC_MAP_W) != 0;
+       const int want_x = (map_perm & LDC_MAP_X) != 0;
+       u64 mte = pagesize_code();
+
+       if (map_perm & LDC_MAP_SHADOW) {
+               if (want_r)
+                       mte |= LDC_MTE_COPY_R;
+               if (want_w)
+                       mte |= LDC_MTE_COPY_W;
+       }
+
+       if (map_perm & LDC_MAP_DIRECT) {
+               if (want_r)
+                       mte |= LDC_MTE_READ;
+               if (want_w)
+                       mte |= LDC_MTE_WRITE;
+               if (want_x)
+                       mte |= LDC_MTE_EXEC;
+       }
+
+       if (map_perm & LDC_MAP_IO) {
+               if (want_r)
+                       mte |= LDC_MTE_IOMMU_R;
+               if (want_w)
+                       mte |= LDC_MTE_IOMMU_W;
+       }
+
+       return mte;
+}
+
+/* Count the pages touched by the 'len' bytes starting at 'base'.
+ * At least one page is always counted, even for a zero length.
+ */
+static int pages_in_region(unsigned long base, long len)
+{
+       int npages = 0;
+
+       for (;;) {
+               /* Start of the page following the one 'base' is in. */
+               unsigned long next_page = (base + PAGE_SIZE) & PAGE_MASK;
+
+               npages++;
+               len -= (next_page - base);
+               if (len <= 0)
+                       break;
+
+               base = next_page;
+       }
+
+       return npages;
+}
+
+/* Working state threaded through fill_cookies() while a mapping is
+ * being built.
+ */
+struct cookie_state {
+       /* Map table whose entries are being filled in. */
+       struct ldc_mtable_entry         *page_table;
+       /* Caller-supplied output array of cookies. */
+       struct ldc_trans_cookie         *cookies;
+       /* Flag bits OR-ed with the physical address in each entry. */
+       u64                             mte_base;
+       /* Cookie value that would directly continue the previous one. */
+       u64                             prev_cookie;
+       /* Index of the next map table entry to write. */
+       u32                             pte_idx;
+       /* Number of cookies produced so far. */
+       u32                             nc;
+};
+
+/* Write map table entries for 'len' bytes starting 'off' bytes into
+ * the page at physical address 'pa', and emit the matching cookies.
+ *
+ * One table entry is written per page.  A page whose cookie directly
+ * continues the previous one extends that cookie's size instead of
+ * starting a new cookie.  sp->pte_idx and sp->nc are advanced as
+ * entries and cookies are produced.
+ */
+static void fill_cookies(struct cookie_state *sp, unsigned long pa,
+                        unsigned long off, unsigned long len)
+{
+       do {
+               unsigned long tlen, new = pa + PAGE_SIZE;
+               u64 this_cookie;
+
+               sp->page_table[sp->pte_idx].mte = sp->mte_base | pa;
+
+               /* Bytes covered within this page: the rest of the page
+                * from 'off', capped at what is left to map.
+                */
+               tlen = PAGE_SIZE;
+               if (off)
+                       tlen = PAGE_SIZE - off;
+               if (tlen > len)
+                       tlen = len;
+
+               this_cookie = make_cookie(sp->pte_idx,
+                                         pagesize_code(), off);
+
+               /* Only the first page can start at a non-zero offset. */
+               off = 0;
+
+               if (this_cookie == sp->prev_cookie) {
+                       sp->cookies[sp->nc - 1].cookie_size += tlen;
+               } else {
+                       sp->cookies[sp->nc].cookie_addr = this_cookie;
+                       sp->cookies[sp->nc].cookie_size = tlen;
+                       sp->nc++;
+               }
+               sp->prev_cookie = this_cookie + tlen;
+
+               sp->pte_idx++;
+
+               len -= tlen;
+               pa = new;
+       } while (len > 0);
+}
+
+/* Count the pages spanned by one scatterlist entry.  Returns -EFAULT
+ * if the entry's offset or length is not a multiple of 8.
+ */
+static int sg_count_one(struct scatterlist *sg)
+{
+       unsigned long page_pa = page_to_pfn(sg->page) << PAGE_SHIFT;
+       long nbytes = sg->length;
+
+       if (((sg->offset | nbytes) & (8UL - 1)) != 0)
+               return -EFAULT;
+
+       return pages_in_region(page_pa + sg->offset, nbytes);
+}
+
+/* Sum the page counts of the first num_sg scatterlist entries.
+ * Returns the total, or the first negative error reported by
+ * sg_count_one().
+ */
+static int sg_count_pages(struct scatterlist *sg, int num_sg)
+{
+       int total = 0;
+       int idx;
+
+       for (idx = 0; idx < num_sg; idx++) {
+               int n = sg_count_one(&sg[idx]);
+
+               if (n < 0)
+                       return n;
+
+               total += n;
+       }
+
+       return total;
+}
+
+/* Map a scatterlist into the channel's map table and describe the
+ * mapping with cookies.
+ *
+ * Returns the number of cookies written, -EINVAL for unknown
+ * permission bits, -EFAULT for a misaligned entry, -EMSGSIZE if the
+ * list spans more pages than 'ncookies', or -ENOMEM if the map table
+ * has no room.
+ */
+int ldc_map_sg(struct ldc_channel *lp,
+              struct scatterlist *sg, int num_sg,
+              struct ldc_trans_cookie *cookies, int ncookies,
+              unsigned int map_perm)
+{
+       struct ldc_iommu *iommu = &lp->iommu;
+       struct ldc_mtable_entry *first;
+       struct cookie_state cs;
+       unsigned long idx, irq_flags;
+       int npages;
+
+       if (map_perm & ~LDC_MAP_ALL)
+               return -EINVAL;
+
+       npages = sg_count_pages(sg, num_sg);
+       if (npages < 0)
+               return npages;
+
+       /* Worst case is one cookie per page. */
+       if (npages > ncookies)
+               return -EMSGSIZE;
+
+       spin_lock_irqsave(&iommu->lock, irq_flags);
+       first = alloc_npages(iommu, npages);
+       spin_unlock_irqrestore(&iommu->lock, irq_flags);
+
+       if (first == NULL)
+               return -ENOMEM;
+
+       cs.page_table = iommu->page_table;
+       cs.cookies = cookies;
+       cs.mte_base = perm_to_mte(map_perm);
+       cs.prev_cookie = ~(u64)0;
+       cs.pte_idx = (first - iommu->page_table);
+       cs.nc = 0;
+
+       for (idx = 0; idx < num_sg; idx++) {
+               struct scatterlist *ent = &sg[idx];
+
+               fill_cookies(&cs, page_to_pfn(ent->page) << PAGE_SHIFT,
+                            ent->offset, ent->length);
+       }
+
+       return cs.nc;
+}
+EXPORT_SYMBOL(ldc_map_sg);
+
+/* Map a single buffer into the channel's map table and describe it
+ * with one cookie.
+ *
+ * Returns 1 (the number of cookies written) on success, -EINVAL for
+ * unknown permission bits or ncookies < 1, -EFAULT if the buffer's
+ * physical address or length is not a multiple of 8, or -ENOMEM if
+ * the map table has no room.
+ */
+int ldc_map_single(struct ldc_channel *lp,
+                  void *buf, unsigned int len,
+                  struct ldc_trans_cookie *cookies, int ncookies,
+                  unsigned int map_perm)
+{
+       struct ldc_iommu *iommu = &lp->iommu;
+       struct ldc_mtable_entry *first;
+       struct cookie_state cs;
+       unsigned long phys, count, irq_flags;
+
+       if (ncookies < 1 || (map_perm & ~LDC_MAP_ALL))
+               return -EINVAL;
+
+       phys = __pa(buf);
+       if (((phys | len) & (8UL - 1)) != 0)
+               return -EFAULT;
+
+       count = pages_in_region(phys, len);
+
+       spin_lock_irqsave(&iommu->lock, irq_flags);
+       first = alloc_npages(iommu, count);
+       spin_unlock_irqrestore(&iommu->lock, irq_flags);
+
+       if (first == NULL)
+               return -ENOMEM;
+
+       cs.page_table = iommu->page_table;
+       cs.cookies = cookies;
+       cs.mte_base = perm_to_mte(map_perm);
+       cs.prev_cookie = ~(u64)0;
+       cs.pte_idx = (first - iommu->page_table);
+       cs.nc = 0;
+
+       /* Split the address into its page base and in-page offset. */
+       fill_cookies(&cs, (phys & PAGE_MASK), (phys & ~PAGE_MASK), len);
+
+       /* Consecutive table entries must have merged into one cookie. */
+       BUG_ON(cs.nc != 1);
+
+       return cs.nc;
+}
+EXPORT_SYMBOL(ldc_map_single);
+
+/* Release the mapping table entries described by one cookie.
+ *
+ * @id:     channel id handed to sun4v_ldc_revoke()
+ * @iommu:  mapping table the cookie was allocated from
+ * @cookie: cookie address; the table index and page shift are derived
+ *          from it with cookie_to_index()
+ * @size:   size in bytes covered by the cookie
+ *
+ * ldc_unmap() calls this with iommu->lock held.
+ */
+static void free_npages(unsigned long id, struct ldc_iommu *iommu,
+                       u64 cookie, u64 size)
+{
+       struct iommu_arena *arena = &iommu->arena;
+       unsigned long i, shift, index, npages;
+       struct ldc_mtable_entry *base;
+
+       npages = PAGE_ALIGN(((cookie & ~PAGE_MASK) + size)) >> PAGE_SHIFT;
+       index = cookie_to_index(cookie, &shift);
+       base = iommu->page_table + index;
+
+       BUG_ON(index > arena->limit ||
+              (index + npages) > arena->limit);
+
+       /* Step through every table entry of the range.  Without
+        * advancing 'base' only the first entry would ever be revoked
+        * and cleared, leaving the remaining entries live.
+        */
+       for (i = 0; i < npages; i++, base++) {
+               if (base->cookie)
+                       sun4v_ldc_revoke(id, cookie + (i << shift),
+                                        base->cookie);
+               base->mte = 0;
+               __clear_bit(index + i, arena->map);
+       }
+}
+
+/* Undo the mappings described by @ncookies entries of @cookies.
+ * The whole batch is released under a single hold of the iommu lock.
+ */
+void ldc_unmap(struct ldc_channel *lp, struct ldc_trans_cookie *cookies,
+              int ncookies)
+{
+       struct ldc_iommu *tbl = &lp->iommu;
+       struct ldc_trans_cookie *cur = cookies;
+       unsigned long irq_flags;
+       int left;
+
+       spin_lock_irqsave(&tbl->lock, irq_flags);
+       for (left = ncookies; left > 0; left--, cur++) {
+               u64 where = cur->cookie_addr;
+               u64 bytes = cur->cookie_size;
+
+               free_npages(lp->id, tbl, where, bytes);
+       }
+       spin_unlock_irqrestore(&tbl->lock, irq_flags);
+}
+EXPORT_SYMBOL(ldc_unmap);
+
+/* Copy data between a local buffer and memory described by cookies.
+ *
+ * @copy_dir must be LDC_COPY_IN or LDC_COPY_OUT (-EINVAL otherwise).
+ * The physical address of @buf, @len and @offset must all be multiples
+ * of 8 (-EFAULT otherwise).  The channel must have completed its
+ * handshake and must not be flagged for reset (-ECONNRESET otherwise).
+ *
+ * The first @offset bytes of the region described by @cookies are
+ * skipped, then up to @len bytes are moved with sun4v_ldc_copy().
+ * Returns the number of bytes copied, which is less than @len when
+ * the cookies run out first, or a negative errno.
+ */
+int ldc_copy(struct ldc_channel *lp, int copy_dir,
+            void *buf, unsigned int len, unsigned long offset,
+            struct ldc_trans_cookie *cookies, int ncookies)
+{
+       unsigned int wanted;
+       unsigned long local_ra;
+       int idx;
+
+       if (copy_dir != LDC_COPY_IN && copy_dir != LDC_COPY_OUT) {
+               printk(KERN_ERR PFX "ldc_copy: ID[%lu] Bad copy_dir[%d]\n",
+                      lp->id, copy_dir);
+               return -EINVAL;
+       }
+
+       local_ra = __pa(buf);
+       if ((local_ra | len | offset) & (8UL - 1)) {
+               printk(KERN_ERR PFX "ldc_copy: ID[%lu] Unaligned buffer "
+                      "ra[%lx] len[%x] offset[%lx]\n",
+                      lp->id, local_ra, len, offset);
+               return -EFAULT;
+       }
+
+       if (lp->hs_state != LDC_HS_COMPLETE ||
+           (lp->flags & LDC_FLAG_RESET)) {
+               printk(KERN_ERR PFX "ldc_copy: ID[%lu] Link down hs_state[%x] "
+                      "flags[%x]\n", lp->id, lp->hs_state, lp->flags);
+               return -ECONNRESET;
+       }
+
+       wanted = len;
+       for (idx = 0; idx < ncookies; idx++) {
+               unsigned long remote_ra = cookies[idx].cookie_addr;
+               unsigned long chunk = cookies[idx].cookie_size;
+               unsigned long done;
+
+               /* Consume the caller's start offset, possibly across
+                * several cookies.
+                */
+               if (unlikely(offset)) {
+                       unsigned long skip;
+
+                       skip = (offset > chunk) ? chunk : offset;
+                       offset -= skip;
+                       chunk -= skip;
+                       if (chunk == 0)
+                               continue;
+                       remote_ra += skip;
+               }
+
+               if (chunk > len)
+                       chunk = len;
+
+               /* The hypervisor may move fewer bytes than requested;
+                * keep going until this cookie's share is done.
+                */
+               for (;;) {
+                       unsigned long hv_err;
+
+                       hv_err = sun4v_ldc_copy(lp->id, copy_dir,
+                                               remote_ra, local_ra,
+                                               chunk, &done);
+                       if (unlikely(hv_err)) {
+                               printk(KERN_ERR PFX "ldc_copy: ID[%lu] "
+                                      "HV error %lu\n",
+                                      lp->id, hv_err);
+                               if (lp->hs_state == LDC_HS_COMPLETE &&
+                                   !(lp->flags & LDC_FLAG_RESET))
+                                       return -EFAULT;
+                               return -ECONNRESET;
+                       }
+
+                       remote_ra += done;
+                       local_ra += done;
+                       len -= done;
+                       if (done == chunk)
+                               break;
+
+                       chunk -= done;
+               }
+
+               if (len == 0)
+                       break;
+       }
+
+       /* Short copies are reported as-is; what to do about them is
+        * the caller's policy.  A networking driver, for instance, can
+        * treat the packet as a runt and drop it.
+        */
+
+       return wanted - len;
+}
+EXPORT_SYMBOL(ldc_copy);
+
+/* Allocate a zeroed buffer of @len bytes and map it for export.
+ *
+ * @len must be a multiple of 8.  On entry *@ncookies is the capacity
+ * of @cookies; on success it is replaced by the count returned from
+ * ldc_map_single().  Returns the buffer, or an ERR_PTR() on failure.
+ */
+void *ldc_alloc_exp_dring(struct ldc_channel *lp, unsigned int len,
+                         struct ldc_trans_cookie *cookies, int *ncookies,
+                         unsigned int map_perm)
+{
+       void *ring;
+       int nmapped;
+
+       if ((len & (8UL - 1)) != 0)
+               return ERR_PTR(-EINVAL);
+
+       ring = kzalloc(len, GFP_KERNEL);
+       if (ring == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       nmapped = ldc_map_single(lp, ring, len, cookies, *ncookies, map_perm);
+       if (nmapped >= 0) {
+               *ncookies = nmapped;
+               return ring;
+       }
+
+       /* Mapping failed: give the buffer back and pass the errno on. */
+       kfree(ring);
+       return ERR_PTR(nmapped);
+}
+EXPORT_SYMBOL(ldc_alloc_exp_dring);
+
+/* Unmap the @ncookies entries of @cookies and free @buf.
+ * @len is accepted for symmetry with the allocator but is not used.
+ */
+void ldc_free_exp_dring(struct ldc_channel *lp, void *buf, unsigned int len,
+                       struct ldc_trans_cookie *cookies, int ncookies)
+{
+       ldc_unmap(lp, cookies, ncookies);
+       kfree(buf);
+}
+EXPORT_SYMBOL(ldc_free_exp_dring);
+
+/* Boot-time LDC setup.
+ *
+ * Requires a "platform" machine description node carrying a
+ * "domaining-enabled" property, registers the HV_GRP_LDOM hypervisor
+ * API group (requesting 1.0), prints the version banner and sets
+ * ldom_domaining_enabled when the property value is non-zero.  Every
+ * failure path returns -ENODEV.
+ */
+static int __init ldc_init(void)
+{
+       unsigned long major = 1, minor = 0;
+       struct mdesc_node *platform;
+       const u64 *enabled;
+
+       platform = md_find_node_by_name(NULL, "platform");
+       if (platform == NULL)
+               return -ENODEV;
+
+       enabled = md_get_property(platform, "domaining-enabled", NULL);
+       if (enabled == NULL)
+               return -ENODEV;
+
+       if (sun4v_hvapi_register(HV_GRP_LDOM, major, &minor) != 0) {
+               printk(KERN_INFO PFX "Could not register LDOM hvapi.\n");
+               return -ENODEV;
+       }
+
+       printk(KERN_INFO "%s", version);
+
+       if (*enabled == 0) {
+               printk(KERN_INFO PFX "Domaining disabled.\n");
+               return -ENODEV;
+       }
+
+       ldom_domaining_enabled = 1;
+       return 0;
+}
+
+core_initcall(ldc_init);
diff --git a/arch/sparc64/kernel/vio.c b/arch/sparc64/kernel/vio.c
new file mode 100644 (file)
index 0000000..21c015e
--- /dev/null
@@ -0,0 +1,347 @@
+/* vio.c: Virtual I/O channel devices probing infrastructure.
+ *
+ *    Copyright (c) 2003-2005 IBM Corp.
+ *     Dave Engebretsen engebret@us.ibm.com
+ *     Santiago Leon santil@us.ibm.com
+ *     Hollis Blanchard <hollisb@us.ibm.com>
+ *     Stephen Rothwell
+ *
+ * Adapted to sparc64 by David S. Miller davem@davemloft.net
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+
+#include <asm/mdesc.h>
+#include <asm/vio.h>
+
+/* Search a property made of consecutive NUL-terminated strings.
+ *
+ * @list:  start of the packed string list
+ * @match: string to look for
+ * @len:   total length of @list in bytes
+ *
+ * Returns 1 if some entry equals @match, 0 otherwise.
+ */
+static inline int find_in_proplist(const char *list, const char *match,
+                                  int len)
+{
+       const char *entry = list;
+       int remaining = len;
+
+       while (remaining > 0) {
+               int step;
+
+               if (strcmp(entry, match) == 0)
+                       return 1;
+
+               /* Hop over this entry and its terminating NUL. */
+               step = strlen(entry) + 1;
+               entry += step;
+               remaining -= step;
+       }
+
+       return 0;
+}
+
+/* Find the first entry of @matches that fits @dev.
+ *
+ * The table ends at the first entry whose type and compat strings are
+ * both empty.  An entry fits when each of its non-empty fields matches:
+ * type against dev->type, compat against the dev->compat string list.
+ * Returns the matching entry or NULL.
+ */
+static const struct vio_device_id *vio_match_device(
+       const struct vio_device_id *matches,
+       const struct vio_dev *dev)
+{
+       const struct vio_device_id *id;
+
+       for (id = matches; id->type[0] || id->compat[0]; id++) {
+               if (id->type[0] &&
+                   !(dev->type && strcmp(id->type, dev->type) == 0))
+                       continue;
+
+               if (id->compat[0] &&
+                   !(dev->compat &&
+                     find_in_proplist(dev->compat, id->compat,
+                                      dev->compat_len)))
+                       continue;
+
+               return id;
+       }
+
+       return NULL;
+}
+
+/* Bus ->match hook: non-zero when @drv's id table has an entry for
+ * @dev.  A driver without an id table never matches.
+ */
+static int vio_bus_match(struct device *dev, struct device_driver *drv)
+{
+       const struct vio_device_id *table = to_vio_driver(drv)->id_table;
+
+       if (table == NULL)
+               return 0;
+
+       return vio_match_device(table, to_vio_dev(dev)) ? 1 : 0;
+}
+
+/* Bus ->probe hook: look up the id table entry for the device and
+ * hand it to the driver's probe routine.  Returns -ENODEV when the
+ * driver has no probe routine or no table entry fits.
+ */
+static int vio_device_probe(struct device *dev)
+{
+       struct vio_driver *drv = to_vio_driver(dev->driver);
+       struct vio_dev *vdev = to_vio_dev(dev);
+       const struct vio_device_id *id;
+
+       if (!drv->probe)
+               return -ENODEV;
+
+       id = vio_match_device(drv->id_table, vdev);
+       if (!id)
+               return -ENODEV;
+
+       return drv->probe(vdev, id);
+}
+
+/* Bus ->remove hook: forward to the driver's remove routine.
+ * Returns 1 when the driver does not provide one.
+ */
+static int vio_device_remove(struct device *dev)
+{
+       struct vio_driver *drv = to_vio_driver(dev->driver);
+       struct vio_dev *vdev = to_vio_dev(dev);
+
+       if (!drv->remove)
+               return 1;
+
+       return drv->remove(vdev);
+}
+
+/* sysfs show routine for "devspec": prints "vnet" for type "network",
+ * "vdisk" for type "block" and "none" for anything else (including a
+ * device without a type).
+ */
+static ssize_t devspec_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       const char *type = to_vio_dev(dev)->type;
+       const char *str;
+
+       if (type && strcmp(type, "network") == 0)
+               str = "vnet";
+       else if (type && strcmp(type, "block") == 0)
+               str = "vdisk";
+       else
+               str = "none";
+
+       return sprintf(buf, "%s\n", str);
+}
+
+/* Attribute table installed as vio_bus_type.dev_attrs: the read-only
+ * "devspec" attribute followed by the list terminator.
+ */
+static struct device_attribute vio_dev_attrs[] = {
+       __ATTR_RO(devspec),
+       __ATTR_NULL
+};
+
+/* The "vio" bus.  Drivers are attached to it by vio_register_driver()
+ * and devices by vio_create_one(); it is registered in vio_init().
+ */
+static struct bus_type vio_bus_type = {
+       .name           = "vio",
+       .dev_attrs      = vio_dev_attrs,
+       .match          = vio_bus_match,
+       .probe          = vio_device_probe,
+       .remove         = vio_device_remove,
+};
+
+/* Attach @viodrv to the vio bus and register it with the driver core.
+ * Returns the result of driver_register().
+ */
+int vio_register_driver(struct vio_driver *viodrv)
+{
+       viodrv->driver.bus = &vio_bus_type;
+
+       return driver_register(&viodrv->driver);
+}
+EXPORT_SYMBOL(vio_register_driver);
+
+/* Unregister @viodrv from the driver core. */
+void vio_unregister_driver(struct vio_driver *viodrv)
+{
+       driver_unregister(&viodrv->driver);
+}
+EXPORT_SYMBOL(vio_unregister_driver);
+
+/* Return the first node reached from @vdev's machine description node
+ * through a "fwd" arc whose name is "channel-endpoint", or NULL when
+ * there is none.
+ */
+struct mdesc_node *vio_find_endpoint(struct vio_dev *vdev)
+{
+       struct mdesc_node *mp = vdev->mp;
+       int i;
+
+       for (i = 0; i < mp->num_arcs; i++) {
+               struct mdesc_node *target;
+
+               if (strcmp(mp->arcs[i].name, "fwd") != 0)
+                       continue;
+
+               target = mp->arcs[i].arc;
+               if (strcmp(target->name, "channel-endpoint") == 0)
+                       return target;
+       }
+
+       return NULL;
+}
+EXPORT_SYMBOL(vio_find_endpoint);
+
+/* Device ->release callback: frees the vio_dev that embeds @dev.
+ *
+ * This must stay in regular kernel text.  The driver core invokes it
+ * whenever the last reference to the device is dropped, which can be
+ * long after boot, so it must not carry an init-section annotation
+ * such as __devinit.
+ */
+static void vio_dev_release(struct device *dev)
+{
+       kfree(to_vio_dev(dev));
+}
+
+/* sysfs show routine behind the "obppath" attribute: prints the
+ * full_name of the device's OBP node.  vio_create_one() only creates
+ * the file for devices whose ->dp is set.
+ */
+static ssize_t
+show_pciobppath_attr(struct device *dev, struct device_attribute *attr,
+                    char *buf)
+{
+       struct device_node *node = to_vio_dev(dev)->dp;
+
+       return snprintf(buf, PAGE_SIZE, "%s\n", node->full_name);
+}
+
+static DEVICE_ATTR(obppath, S_IRUSR | S_IRGRP | S_IROTH,
+                  show_pciobppath_attr, NULL);
+
+/* OBP "channel-devices" node, looked up by vio_init(). */
+struct device_node *cdev_node;
+
+/* Device created for the root MDESC node by create_devices(). */
+static struct vio_dev *root_vdev;
+/* Value of the root node's cfg-handle property, stored by vio_init()
+ * and passed to sun4v_build_virq() when device interrupts are built.
+ */
+static u64 cdev_cfg_handle;
+
+/* Create and register the vio_dev for one machine description node.
+ *
+ * @mp:     machine description node to represent
+ * @parent: parent device, or NULL when creating the root device
+ *
+ * The device type comes from the "device-type" property, falling back
+ * to "name".  If present, "tx-ino" and "rx-ino" are turned into virtual
+ * IRQs stored in mp->irqs[0] and mp->irqs[1].  The OBP node recorded in
+ * vdev->dp is cdev_node for the root device, the child of cdev_node
+ * whose type equals ours for children of the root, and the parent's
+ * node otherwise.  An "obppath" sysfs file is created when such a node
+ * was found.
+ *
+ * Returns the new device, or NULL on allocation or registration
+ * failure.
+ */
+static struct vio_dev *vio_create_one(struct mdesc_node *mp,
+                                     struct device *parent)
+{
+       const char *type, *compat;
+       struct device_node *dp;
+       struct vio_dev *vdev;
+       const u64 *irq;
+       int err, clen;
+
+       type = md_get_property(mp, "device-type", NULL);
+       if (!type)
+               type = md_get_property(mp, "name", NULL);
+       compat = md_get_property(mp, "device-type", &clen);
+       if (!compat)
+               clen = 0;       /* never store an indeterminate length */
+
+       vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+       if (!vdev) {
+               printk(KERN_ERR "VIO: Could not allocate vio_dev\n");
+               return NULL;
+       }
+
+       vdev->mp = mp;
+       vdev->type = type;
+       vdev->compat = compat;
+       vdev->compat_len = clen;
+
+       irq = md_get_property(mp, "tx-ino", NULL);
+       if (irq)
+               mp->irqs[0] = sun4v_build_virq(cdev_cfg_handle, *irq);
+
+       irq = md_get_property(mp, "rx-ino", NULL);
+       if (irq)
+               mp->irqs[1] = sun4v_build_virq(cdev_cfg_handle, *irq);
+
+       snprintf(vdev->dev.bus_id, BUS_ID_SIZE, "%lx", mp->node);
+       vdev->dev.parent = parent;
+       vdev->dev.bus = &vio_bus_type;
+       vdev->dev.release = vio_dev_release;
+
+       if (parent == NULL) {
+               dp = cdev_node;
+       } else if (to_vio_dev(parent) == root_vdev) {
+               /* Without a type there is nothing to compare against,
+                * so do not hand a NULL pointer to strcmp().
+                */
+               dp = NULL;
+               if (type)
+                       dp = of_get_next_child(cdev_node, NULL);
+               while (dp) {
+                       if (!strcmp(dp->type, type))
+                               break;
+
+                       dp = of_get_next_child(cdev_node, dp);
+               }
+       } else {
+               dp = to_vio_dev(parent)->dp;
+       }
+       vdev->dp = dp;
+
+       err = device_register(&vdev->dev);
+       if (err) {
+               printk(KERN_ERR "VIO: Could not register device %s, err=%d\n",
+                      vdev->dev.bus_id, err);
+               /* device_register() has already initialised the embedded
+                * kobject, so drop our reference instead of freeing the
+                * structure directly; ->release does the kfree().
+                */
+               put_device(&vdev->dev);
+               return NULL;
+       }
+       /* The obppath file is best effort: failing to create it does
+        * not undo the registration.
+        */
+       if (vdev->dp)
+               err = sysfs_create_file(&vdev->dev.kobj,
+                                       &dev_attr_obppath.attr);
+
+       return vdev;
+}
+
+/* Recursively create a device for every node reachable from @n over
+ * "fwd" arcs, each registered below @parent.  A node's own arcs are
+ * only followed when its device was created successfully.
+ */
+static void walk_tree(struct mdesc_node *n, struct vio_dev *parent)
+{
+       int i;
+
+       for (i = 0; i < n->num_arcs; i++) {
+               struct mdesc_node *child;
+               struct vio_dev *child_dev;
+
+               if (strcmp(n->arcs[i].name, "fwd") != 0)
+                       continue;
+
+               child = n->arcs[i].arc;
+
+               child_dev = vio_create_one(child, &parent->dev);
+               if (!child_dev || !child->num_arcs)
+                       continue;
+
+               walk_tree(child, child_dev);
+       }
+}
+
+/* Create the root vio device for @root, then devices for everything
+ * below it.  Nothing is walked when the root device cannot be created.
+ */
+static void create_devices(struct mdesc_node *root)
+{
+       root_vdev = vio_create_one(root, NULL);
+       if (root_vdev) {
+               walk_tree(root, root_vdev);
+               return;
+       }
+
+       printk(KERN_ERR "VIO: Coult not create root device.\n");
+}
+
+/* Names vio_init() uses to locate and validate the root MDESC node:
+ * the node name, the entry required in its "compatible" list, and the
+ * property holding the config handle.
+ */
+const char *channel_devices_node = "channel-devices";
+const char *channel_devices_compat = "SUNW,sun4v-channel-devices";
+const char *cfg_handle_prop = "cfg-handle";
+
+/* Boot-time VIO setup.
+ *
+ * Finds the channel-devices node in the machine description (its
+ * absence is not an error: 0 is returned) and in OBP, checks the
+ * "compatible" list and the config handle property, registers the vio
+ * bus and finally creates the devices.  Missing OBP node, compatible
+ * entry or config handle yield -ENODEV; a bus registration failure is
+ * returned unchanged.
+ */
+static int __init vio_init(void)
+{
+       struct mdesc_node *root;
+       const u64 *handle;
+       const char *compat_list;
+       int compat_len, err;
+
+       root = md_find_node_by_name(NULL, channel_devices_node);
+       if (root == NULL) {
+               printk(KERN_INFO "VIO: No channel-devices MDESC node.\n");
+               return 0;
+       }
+
+       cdev_node = of_find_node_by_name(NULL, "channel-devices");
+       if (cdev_node == NULL) {
+               printk(KERN_INFO "VIO: No channel-devices OBP node.\n");
+               return -ENODEV;
+       }
+
+       compat_list = md_get_property(root, "compatible", &compat_len);
+       if (compat_list == NULL) {
+               printk(KERN_ERR "VIO: Channel devices lacks compatible "
+                      "property\n");
+               return -ENODEV;
+       }
+       if (!find_in_proplist(compat_list, channel_devices_compat,
+                             compat_len)) {
+               printk(KERN_ERR "VIO: Channel devices node lacks (%s) "
+                      "compat entry.\n", channel_devices_compat);
+               return -ENODEV;
+       }
+
+       handle = md_get_property(root, cfg_handle_prop, NULL);
+       if (handle == NULL) {
+               printk(KERN_ERR "VIO: Channel devices lacks %s property\n",
+                      cfg_handle_prop);
+               return -ENODEV;
+       }
+       cdev_cfg_handle = *handle;
+
+       err = bus_register(&vio_bus_type);
+       if (!err) {
+               create_devices(root);
+               return 0;
+       }
+
+       printk(KERN_ERR "VIO: Could not register bus type err=%d\n",
+              err);
+       return err;
+}
+
+postcore_initcall(vio_init);
diff --git a/arch/sparc64/kernel/viohs.c b/arch/sparc64/kernel/viohs.c
new file mode 100644 (file)
index 0000000..3eb42e3
--- /dev/null
@@ -0,0 +1,809 @@
+/* viohs.c: LDOM Virtual I/O handshake helper layer.
+ *
+ * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/ldc.h>
+#include <asm/vio.h>
+
+/* Write @len bytes of @data to the channel, retrying while ldc_write()
+ * reports -EAGAIN.  At most 1000 attempts are made with a 1us delay
+ * after each busy result.  Returns the result of the last ldc_write().
+ */
+int vio_ldc_send(struct vio_driver_state *vio, void *data, int len)
+{
+       int err = -EINVAL;
+       int tries;
+
+       for (tries = 1000; tries > 0; tries--) {
+               err = ldc_write(vio->lp, data, len);
+               if (err != -EAGAIN)
+                       break;
+               udelay(1);
+       }
+
+       return err;
+}
+EXPORT_SYMBOL(vio_ldc_send);
+
+/* Stamp @tag with the session id from vio_send_sid() and transmit the
+ * @len byte message that starts at @tag.  Returns what vio_ldc_send()
+ * returns.
+ */
+static int send_ctrl(struct vio_driver_state *vio,
+                    struct vio_msg_tag *tag, int len)
+{
+       tag->sid = vio_send_sid(vio);
+       return vio_ldc_send(vio, tag, len);
+}
+
+/* Fill in the type, stype and stype_env fields of @tag; the sid field
+ * is left untouched.
+ */
+static void init_tag(struct vio_msg_tag *tag, u8 type, u8 stype, u16 stype_env)
+{
+       tag->type = type;
+       tag->stype = stype;
+       tag->stype_env = stype_env;
+}
+
+/* Send a VERSION INFO control message proposing @major.@minor.
+ * A new local session id is taken from sched_clock() first.
+ * Returns the result of send_ctrl().
+ */
+static int send_version(struct vio_driver_state *vio, u16 major, u16 minor)
+{
+       struct vio_ver_info info;
+
+       vio->_local_sid = (u32) sched_clock();
+
+       memset(&info, 0, sizeof(info));
+       init_tag(&info.tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_VER_INFO);
+       info.dev_class = vio->dev_class;
+       info.major = major;
+       info.minor = minor;
+
+       viodbg(HS, "SEND VERSION INFO maj[%u] min[%u] devclass[%u]\n",
+              major, minor, vio->dev_class);
+
+       return send_ctrl(vio, &info.tag, sizeof(info));
+}
+
+/* Reset the handshake state and open negotiation by proposing the
+ * first entry of the driver's version table.  Returns 0 on success or
+ * the negative error from send_version().
+ */
+static int start_handshake(struct vio_driver_state *vio)
+{
+       int err;
+
+       viodbg(HS, "START HANDSHAKE\n");
+
+       vio->hs_state = VIO_HS_INVALID;
+
+       err = send_version(vio,
+                          vio->ver_table[0].major,
+                          vio->ver_table[0].minor);
+
+       return (err < 0) ? err : 0;
+}
+
+/* React to a link event.  Only LDC_EVENT_UP is acted upon: the
+ * handshake state is reset, the descriptor rings required for the
+ * device class are recorded in dr_state, and a handshake is started.
+ */
+void vio_link_state_change(struct vio_driver_state *vio, int event)
+{
+       if (event != LDC_EVENT_UP)
+               return;
+
+       vio->hs_state = VIO_HS_INVALID;
+
+       if (vio->dev_class == VDEV_NETWORK ||
+           vio->dev_class == VDEV_NETWORK_SWITCH)
+               vio->dr_state = (VIO_DR_STATE_TXREQ |
+                                VIO_DR_STATE_RXREQ);
+       else if (vio->dev_class == VDEV_DISK)
+               vio->dr_state = VIO_DR_STATE_TXREQ;
+       else if (vio->dev_class == VDEV_DISK_SERVER)
+               vio->dr_state = VIO_DR_STATE_RXREQ;
+
+       start_handshake(vio);
+}
+EXPORT_SYMBOL(vio_link_state_change);
+
+/* Common handshake teardown.  Clears the ring-registered bits, wipes
+ * the RX ring state, frees the descriptor buffer and marks the
+ * handshake invalid.  Always returns -ECONNRESET so callers can
+ * "return handshake_failure(vio)".
+ */
+static int handshake_failure(struct vio_driver_state *vio)
+{
+       struct vio_dring_state *rx_ring = &vio->drings[VIO_DRIVER_RX_RING];
+
+       /* XXX No recovery policy yet.  One option is a timer firing
+        * XXX after 100 ms which brings the link up and retries the
+        * XXX handshake.
+        */
+
+       viodbg(HS, "HANDSHAKE FAILURE\n");
+
+       vio->hs_state = VIO_HS_INVALID;
+       vio->dr_state &= ~(VIO_DR_STATE_TXREG | VIO_DR_STATE_RXREG);
+
+       kfree(vio->desc_buf);
+       vio->desc_buf_len = 0;
+       vio->desc_buf = NULL;
+
+       memset(rx_ring, 0, sizeof(*rx_ring));
+
+       return -ECONNRESET;
+}
+
+/* Handle a control message that is not understood: log it, disconnect
+ * the channel and return -ECONNRESET.
+ */
+static int process_unknown(struct vio_driver_state *vio, void *arg)
+{
+       struct vio_msg_tag *tag = arg;
+
+       viodbg(HS, "UNKNOWN CONTROL [%02x:%02x:%04x:%08x]\n",
+              tag->type, tag->stype, tag->stype_env, tag->sid);
+
+       printk(KERN_ERR "vio: ID[%lu] Resetting connection.\n",
+              vio->channel_id);
+
+       ldc_disconnect(vio->lp);
+
+       return -ECONNRESET;
+}
+
+/* Send a DRING_REG INFO message describing the TX descriptor ring.
+ *
+ * The message is built on the stack: the fixed register header plus
+ * one cookie per dr->ncookies, which is why the buffer is sized at
+ * run time.  Returns the result of send_ctrl().
+ */
+static int send_dreg(struct vio_driver_state *vio)
+{
+       struct vio_dring_state *tx = &vio->drings[VIO_DRIVER_TX_RING];
+       union {
+               struct vio_dring_register pkt;
+               char all[sizeof(struct vio_dring_register) +
+                        (sizeof(struct ldc_trans_cookie) *
+                         tx->ncookies)];
+       } msg;
+       int idx;
+
+       memset(&msg, 0, sizeof(msg));
+       init_tag(&msg.pkt.tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_DRING_REG);
+       msg.pkt.dring_ident = 0;
+       msg.pkt.num_descr = tx->num_entries;
+       msg.pkt.descr_size = tx->entry_size;
+       msg.pkt.options = VIO_TX_DRING;
+       msg.pkt.num_cookies = tx->ncookies;
+
+       viodbg(HS, "SEND DRING_REG INFO ndesc[%u] dsz[%u] opt[0x%x] "
+              "ncookies[%u]\n",
+              msg.pkt.num_descr, msg.pkt.descr_size, msg.pkt.options,
+              msg.pkt.num_cookies);
+
+       for (idx = 0; idx < tx->ncookies; idx++) {
+               msg.pkt.cookies[idx] = tx->cookies[idx];
+
+               viodbg(HS, "DRING COOKIE(%d) [%016llx:%016llx]\n",
+                      idx,
+                      (unsigned long long) msg.pkt.cookies[idx].cookie_addr,
+                      (unsigned long long) msg.pkt.cookies[idx].cookie_size);
+       }
+
+       return send_ctrl(vio, &msg.pkt.tag, sizeof(msg));
+}
+
+/* Send an RDX INFO control message.  Returns send_ctrl()'s result. */
+static int send_rdx(struct vio_driver_state *vio)
+{
+       struct vio_rdx rdx;
+
+       memset(&rdx, 0, sizeof(rdx));
+       init_tag(&rdx.tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX);
+
+       viodbg(HS, "SEND RDX INFO\n");
+
+       return send_ctrl(vio, &rdx.tag, sizeof(rdx));
+}
+
+/* Sending the attribute message is driver specific; delegate to the
+ * driver's ops->send_attr() callback and return its result.
+ */
+static int send_attr(struct vio_driver_state *vio)
+{
+       return vio->ops->send_attr(vio);
+}
+
+/* Return the first entry of the driver's version table whose major
+ * number does not exceed @major, or NULL when every entry is newer.
+ */
+static struct vio_version *find_by_major(struct vio_driver_state *vio,
+                                        u16 major)
+{
+       int i;
+
+       for (i = 0; i < vio->ver_table_entries; i++) {
+               struct vio_version *cand = &vio->ver_table[i];
+
+               if (cand->major <= major)
+                       return cand;
+       }
+
+       return NULL;
+}
+
+/* Handle a VERSION INFO message from the peer.
+ *
+ * The reply reuses @pkt in place:
+ *  - no table entry at or below the peer's major: NACK with 0.0
+ *  - best entry has a different major: NACK proposing that entry
+ *  - same major: ACK with the lower of the two minors; once the ACK
+ *    was sent the version is recorded and VIO_HS_GOTVERS is set
+ *
+ * Returns 0, or the result of handshake_failure() when sending fails.
+ */
+static int process_ver_info(struct vio_driver_state *vio,
+                           struct vio_ver_info *pkt)
+{
+       struct vio_version agreed = { .major = 0, .minor = 0 };
+       struct vio_version *vap;
+       int acked = 0;
+       int err;
+
+       viodbg(HS, "GOT VERSION INFO maj[%u] min[%u] devclass[%u]\n",
+              pkt->major, pkt->minor, pkt->dev_class);
+
+       if (vio->hs_state != VIO_HS_INVALID) {
+               /* XXX Perhaps invoke start_handshake? XXX */
+               memset(&vio->ver, 0, sizeof(vio->ver));
+               vio->hs_state = VIO_HS_INVALID;
+       }
+
+       vap = find_by_major(vio, pkt->major);
+
+       vio->_peer_sid = pkt->tag.sid;
+
+       if (!vap) {
+               pkt->tag.stype = VIO_SUBTYPE_NACK;
+               pkt->major = 0;
+               pkt->minor = 0;
+               viodbg(HS, "SEND VERSION NACK maj[0] min[0]\n");
+       } else if (vap->major != pkt->major) {
+               pkt->tag.stype = VIO_SUBTYPE_NACK;
+               pkt->major = vap->major;
+               pkt->minor = vap->minor;
+               viodbg(HS, "SEND VERSION NACK maj[%u] min[%u]\n",
+                      pkt->major, pkt->minor);
+       } else {
+               agreed.major = pkt->major;
+               agreed.minor = pkt->minor;
+               if (agreed.minor > vap->minor)
+                       agreed.minor = vap->minor;
+               pkt->minor = agreed.minor;
+               pkt->tag.stype = VIO_SUBTYPE_ACK;
+               viodbg(HS, "SEND VERSION ACK maj[%u] min[%u]\n",
+                      pkt->major, pkt->minor);
+               acked = 1;
+       }
+
+       err = send_ctrl(vio, &pkt->tag, sizeof(*pkt));
+
+       /* Only commit the version once the ACK really went out. */
+       if (acked && err > 0) {
+               vio->ver = agreed;
+               vio->hs_state = VIO_HS_GOTVERS;
+       }
+
+       if (err < 0)
+               return handshake_failure(vio);
+
+       return 0;
+}
+
+/* Handle a VERSION ACK from the peer.
+ *
+ * If a version was already recorded the ACK must agree with it,
+ * otherwise a NACK is sent and the handshake fails.  If none was
+ * recorded the ACKed version is adopted.  VDEV_NETWORK and VDEV_DISK
+ * devices then send their attributes.
+ */
+static int process_ver_ack(struct vio_driver_state *vio,
+                          struct vio_ver_info *pkt)
+{
+       viodbg(HS, "GOT VERSION ACK maj[%u] min[%u] devclass[%u]\n",
+              pkt->major, pkt->minor, pkt->dev_class);
+
+       if (!(vio->hs_state & VIO_HS_GOTVERS)) {
+               vio->ver.major = pkt->major;
+               vio->ver.minor = pkt->minor;
+               vio->hs_state = VIO_HS_GOTVERS;
+       } else if (vio->ver.major != pkt->major ||
+                  vio->ver.minor != pkt->minor) {
+               pkt->tag.stype = VIO_SUBTYPE_NACK;
+               (void) send_ctrl(vio, &pkt->tag, sizeof(*pkt));
+               return handshake_failure(vio);
+       }
+
+       if (vio->dev_class == VDEV_NETWORK ||
+           vio->dev_class == VDEV_DISK) {
+               if (send_attr(vio) < 0)
+                       return handshake_failure(vio);
+       }
+
+       return 0;
+}
+
+/* Handle a VERSION NACK from the peer.  A NACK of 0.0, or one naming
+ * a major for which the version table has no entry, fails the
+ * handshake; otherwise the matching table entry is proposed.
+ */
+static int process_ver_nack(struct vio_driver_state *vio,
+                           struct vio_ver_info *pkt)
+{
+       struct vio_version *next;
+
+       viodbg(HS, "GOT VERSION NACK maj[%u] min[%u] devclass[%u]\n",
+              pkt->major, pkt->minor, pkt->dev_class);
+
+       if (pkt->major == 0 && pkt->minor == 0)
+               return handshake_failure(vio);
+
+       next = find_by_major(vio, pkt->major);
+       if (!next)
+               return handshake_failure(vio);
+
+       if (send_version(vio, next->major, next->minor) < 0)
+               return handshake_failure(vio);
+
+       return 0;
+}
+
+/* Dispatch a VERSION message on its subtype; an unknown subtype fails
+ * the handshake.
+ */
+static int process_ver(struct vio_driver_state *vio, struct vio_ver_info *pkt)
+{
+       if (pkt->tag.stype == VIO_SUBTYPE_INFO)
+               return process_ver_info(vio, pkt);
+
+       if (pkt->tag.stype == VIO_SUBTYPE_ACK)
+               return process_ver_ack(vio, pkt);
+
+       if (pkt->tag.stype == VIO_SUBTYPE_NACK)
+               return process_ver_nack(vio, pkt);
+
+       return handshake_failure(vio);
+}
+
+/* Handle an attribute message.  Requires a negotiated version, lets
+ * the driver's handle_attr() callback process it, then registers the
+ * TX descriptor ring if one is needed and not yet announced.
+ */
+static int process_attr(struct vio_driver_state *vio, void *pkt)
+{
+       int err;
+
+       if (!(vio->hs_state & VIO_HS_GOTVERS))
+               return handshake_failure(vio);
+
+       err = vio->ops->handle_attr(vio, pkt);
+       if (err < 0)
+               return handshake_failure(vio);
+
+       vio->hs_state |= VIO_HS_GOT_ATTR;
+
+       if (!(vio->dr_state & VIO_DR_STATE_TXREQ) ||
+           (vio->hs_state & VIO_HS_SENT_DREG))
+               return 0;
+
+       if (send_dreg(vio) < 0)
+               return handshake_failure(vio);
+
+       vio->hs_state |= VIO_HS_SENT_DREG;
+       return 0;
+}
+
+/* Return 1 when every descriptor ring this device requires (the *REQ
+ * bits of dr_state) is also registered (the matching *REG bit), and 0
+ * otherwise.
+ */
+static int all_drings_registered(struct vio_driver_state *vio)
+{
+       if ((vio->dr_state & VIO_DR_STATE_RXREQ) &&
+           !(vio->dr_state & VIO_DR_STATE_RXREG))
+               return 0;
+
+       if ((vio->dr_state & VIO_DR_STATE_TXREQ) &&
+           !(vio->dr_state & VIO_DR_STATE_TXREG))
+               return 0;
+
+       return 1;
+}
+
+/* Handle a DRING_REG INFO message: the peer registers the ring we
+ * will receive from.
+ *
+ * Rejected with a NACK (followed by handshake failure) when no RX ring
+ * is required, when one is already registered, when the descriptor
+ * buffer cannot be allocated, or when the ACK cannot be sent.  On
+ * success the ring geometry and cookies are recorded, the ring gets a
+ * new ident, and VIO_DR_STATE_RXREG is set.
+ */
+static int process_dreg_info(struct vio_driver_state *vio,
+                            struct vio_dring_register *pkt)
+{
+       struct vio_dring_state *rx;
+       int idx, ack_len;
+
+       viodbg(HS, "GOT DRING_REG INFO ident[%llx] "
+              "ndesc[%u] dsz[%u] opt[0x%x] ncookies[%u]\n",
+              (unsigned long long) pkt->dring_ident,
+              pkt->num_descr, pkt->descr_size, pkt->options,
+              pkt->num_cookies);
+
+       if (!(vio->dr_state & VIO_DR_STATE_RXREQ) ||
+           (vio->dr_state & VIO_DR_STATE_RXREG))
+               goto send_nack;
+
+       vio->desc_buf = kzalloc(pkt->descr_size, GFP_ATOMIC);
+       if (vio->desc_buf == NULL)
+               goto send_nack;
+
+       vio->desc_buf_len = pkt->descr_size;
+
+       rx = &vio->drings[VIO_DRIVER_RX_RING];
+
+       rx->num_entries = pkt->num_descr;
+       rx->entry_size = pkt->descr_size;
+       rx->ncookies = pkt->num_cookies;
+       /* NOTE(review): num_cookies comes from the peer and is not
+        * bounded here; the capacity of rx->cookies is not visible in
+        * this file -- confirm a limit is enforced somewhere.
+        */
+       for (idx = 0; idx < rx->ncookies; idx++) {
+               rx->cookies[idx] = pkt->cookies[idx];
+
+               viodbg(HS, "DRING COOKIE(%d) [%016llx:%016llx]\n",
+                      idx,
+                      (unsigned long long)
+                      pkt->cookies[idx].cookie_addr,
+                      (unsigned long long)
+                      pkt->cookies[idx].cookie_size);
+       }
+
+       /* The ACK is the received message sent back with a new ident. */
+       pkt->tag.stype = VIO_SUBTYPE_ACK;
+       pkt->dring_ident = ++rx->ident;
+
+       viodbg(HS, "SEND DRING_REG ACK ident[%llx]\n",
+              (unsigned long long) pkt->dring_ident);
+
+       ack_len = (sizeof(*pkt) +
+                  (rx->ncookies * sizeof(struct ldc_trans_cookie)));
+       if (send_ctrl(vio, &pkt->tag, ack_len) < 0)
+               goto send_nack;
+
+       vio->dr_state |= VIO_DR_STATE_RXREG;
+
+       return 0;
+
+send_nack:
+       pkt->tag.stype = VIO_SUBTYPE_NACK;
+       viodbg(HS, "SEND DRING_REG NACK\n");
+       (void) send_ctrl(vio, &pkt->tag, sizeof(*pkt));
+
+       return handshake_failure(vio);
+}
+
+/* Handle a DRING_REG ACK: the peer accepted our TX ring.  The ident
+ * it assigned is recorded and VIO_DR_STATE_TXREG is set; once all
+ * required rings are registered an RDX is sent and hs_state becomes
+ * VIO_HS_SENT_RDX.
+ */
+static int process_dreg_ack(struct vio_driver_state *vio,
+                           struct vio_dring_register *pkt)
+{
+       viodbg(HS, "GOT DRING_REG ACK ident[%llx] "
+              "ndesc[%u] dsz[%u] opt[0x%x] ncookies[%u]\n",
+              (unsigned long long) pkt->dring_ident,
+              pkt->num_descr, pkt->descr_size, pkt->options,
+              pkt->num_cookies);
+
+       if (!(vio->dr_state & VIO_DR_STATE_TXREQ))
+               return handshake_failure(vio);
+
+       vio->drings[VIO_DRIVER_TX_RING].ident = pkt->dring_ident;
+       vio->dr_state |= VIO_DR_STATE_TXREG;
+
+       if (!all_drings_registered(vio))
+               return 0;
+
+       if (send_rdx(vio) < 0)
+               return handshake_failure(vio);
+
+       vio->hs_state = VIO_HS_SENT_RDX;
+       return 0;
+}
+
+/* Handle a DRING_REG NACK: the peer refused our ring registration.
+ * The message is only logged; the handshake is then failed.
+ */
+static int process_dreg_nack(struct vio_driver_state *vio,
+                            struct vio_dring_register *pkt)
+{
+       viodbg(HS, "GOT DRING_REG NACK ident[%llx] "
+              "ndesc[%u] dsz[%u] opt[0x%x] ncookies[%u]\n",
+              (unsigned long long) pkt->dring_ident,
+              pkt->num_descr, pkt->descr_size, pkt->options,
+              pkt->num_cookies);
+
+       return handshake_failure(vio);
+}
+
+/* Dispatch a DRING_REG message on its subtype.  A version must have
+ * been negotiated first; an unknown subtype fails the handshake.
+ */
+static int process_dreg(struct vio_driver_state *vio,
+                       struct vio_dring_register *pkt)
+{
+       if (!(vio->hs_state & VIO_HS_GOTVERS))
+               return handshake_failure(vio);
+
+       if (pkt->tag.stype == VIO_SUBTYPE_INFO)
+               return process_dreg_info(vio, pkt);
+
+       if (pkt->tag.stype == VIO_SUBTYPE_ACK)
+               return process_dreg_ack(vio, pkt);
+
+       if (pkt->tag.stype == VIO_SUBTYPE_NACK)
+               return process_dreg_nack(vio, pkt);
+
+       return handshake_failure(vio);
+}
+
+/* Handle a DRING_UNREG message.  When the ident names our RX ring the
+ * ring state is wiped and the descriptor buffer freed; a message for
+ * any other ident is ignored.  Always returns 0.
+ */
+static int process_dunreg(struct vio_driver_state *vio,
+                         struct vio_dring_unregister *pkt)
+{
+       struct vio_dring_state *rx = &vio->drings[VIO_DRIVER_RX_RING];
+
+       viodbg(HS, "GOT DRING_UNREG\n");
+
+       if (pkt->dring_ident == rx->ident) {
+               vio->dr_state &= ~VIO_DR_STATE_RXREG;
+
+               memset(rx, 0, sizeof(*rx));
+
+               kfree(vio->desc_buf);
+               vio->desc_buf = NULL;
+               vio->desc_buf_len = 0;
+       }
+
+       return 0;
+}
+
+/* Handle an RDX INFO from the peer by sending the same packet back
+ * as an ACK.  On a successful send VIO_HS_SENT_RDX_ACK is recorded
+ * in hs_state; a send error fails the handshake.
+ */
+static int process_rdx_info(struct vio_driver_state *vio, struct vio_rdx *pkt)
+{
+       int err;
+
+       viodbg(HS, "GOT RDX INFO\n");
+
+       /* The received packet is reused as the reply.  */
+       pkt->tag.stype = VIO_SUBTYPE_ACK;
+
+       viodbg(HS, "SEND RDX ACK\n");
+       err = send_ctrl(vio, &pkt->tag, sizeof(*pkt));
+       if (err < 0)
+               return handshake_failure(vio);
+
+       vio->hs_state |= VIO_HS_SENT_RDX_ACK;
+       return 0;
+}
+
+/* Handle an RDX ACK.  It is only accepted when VIO_HS_SENT_RDX is
+ * set in hs_state, in which case VIO_HS_GOT_RDX_ACK is recorded;
+ * otherwise the handshake is failed.
+ */
+static int process_rdx_ack(struct vio_driver_state *vio, struct vio_rdx *pkt)
+{
+       viodbg(HS, "GOT RDX ACK\n");
+
+       if (vio->hs_state & VIO_HS_SENT_RDX) {
+               vio->hs_state |= VIO_HS_GOT_RDX_ACK;
+               return 0;
+       }
+
+       return handshake_failure(vio);
+}
+
+/* Handle an RDX NACK: log it (when HS debugging is enabled) and fail
+ * the handshake.  The packet contents are not examined.
+ */
+static int process_rdx_nack(struct vio_driver_state *vio, struct vio_rdx *pkt)
+{
+       viodbg(HS, "GOT RDX NACK\n");
+
+       return handshake_failure(vio);
+}
+
+/* Dispatch an RDX control packet according to its subtype.
+ *
+ * RDX is only accepted once all_drings_registered() is true.  If it
+ * is not, or the subtype is not one of INFO, ACK or NACK, the
+ * handshake is failed and that result is returned.
+ */
+static int process_rdx(struct vio_driver_state *vio, struct vio_rdx *pkt)
+{
+       /* Return here: the packet must not be dispatched any further
+        * once the handshake has been failed.
+        */
+       if (!all_drings_registered(vio))
+               return handshake_failure(vio);
+
+       switch (pkt->tag.stype) {
+       case VIO_SUBTYPE_INFO:
+               return process_rdx_info(vio, pkt);
+
+       case VIO_SUBTYPE_ACK:
+               return process_rdx_ack(vio, pkt);
+
+       case VIO_SUBTYPE_NACK:
+               return process_rdx_nack(vio, pkt);
+
+       default:
+               return handshake_failure(vio);
+       }
+}
+
+/* Entry point for a received VIO control packet.
+ *
+ * The packet is handed to the handler selected by tag->stype_env and
+ * that handler's result is returned.  When the handler succeeded,
+ * hs_state changed, and hs_state has a VIO_HS_COMPLETE bit set, the
+ * driver's handshake_complete() callback is invoked.
+ *
+ * NOTE(review): VIO_HS_COMPLETE is a two-bit mask and the test below
+ * is a plain bitwise AND, so either bit alone satisfies it -- confirm
+ * that this is the intended completion condition.
+ */
+int vio_control_pkt_engine(struct vio_driver_state *vio, void *pkt)
+{
+       struct vio_msg_tag *tag = pkt;
+       u8 old_hs = vio->hs_state;
+       int err;
+
+       switch (tag->stype_env) {
+       case VIO_VER_INFO:
+               err = process_ver(vio, pkt);
+               break;
+       case VIO_ATTR_INFO:
+               err = process_attr(vio, pkt);
+               break;
+       case VIO_DRING_REG:
+               err = process_dreg(vio, pkt);
+               break;
+       case VIO_DRING_UNREG:
+               err = process_dunreg(vio, pkt);
+               break;
+       case VIO_RDX:
+               err = process_rdx(vio, pkt);
+               break;
+       default:
+               err = process_unknown(vio, pkt);
+               break;
+       }
+
+       if (err)
+               return err;
+
+       /* Only notify the driver on a state transition.  */
+       if (vio->hs_state != old_hs &&
+           (vio->hs_state & VIO_HS_COMPLETE))
+               vio->ops->handshake_complete(vio);
+
+       return 0;
+}
+EXPORT_SYMBOL(vio_control_pkt_engine);
+
+/* Currently an empty placeholder: it performs no action.  */
+void vio_conn_reset(struct vio_driver_state *vio)
+{
+}
+EXPORT_SYMBOL(vio_conn_reset);
+
+/* The issue is that the Solaris virtual disk server just mirrors the
+ * SID values it gets from the client peer.  So we work around that
+ * here in vio_{validate,send}_sid() so that the drivers don't need
+ * to be aware of this crap.
+ */
+/* Check the session ID carried in the tag of a received message.
+ * Returns 0 if the message is acceptable and -EINVAL otherwise.
+ */
+int vio_validate_sid(struct vio_driver_state *vio, struct vio_msg_tag *tp)
+{
+       u32 expected;
+
+       /* VERSION+INFO packets define the new SID, so they are always
+        * let through unchecked.
+        */
+       if (tp->type == VIO_TYPE_CTRL &&
+           tp->stype == VIO_SUBTYPE_INFO &&
+           tp->stype_env == VIO_VER_INFO)
+               return 0;
+
+       /* Only VDEV_DISK compares against the local SID; every other
+        * device class compares against the peer's SID.
+        */
+       if (vio->dev_class == VDEV_DISK)
+               expected = vio->_local_sid;
+       else
+               expected = vio->_peer_sid;
+
+       if (tp->sid != expected) {
+               viodbg(DATA, "BAD SID tag->sid[%08x] peer_sid[%08x] local_sid[%08x]\n",
+                      tp->sid, vio->_peer_sid, vio->_local_sid);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(vio_validate_sid);
+
+/* Return the session ID to place in outgoing messages: the peer's
+ * SID for VDEV_DISK_SERVER, the local SID for every other device
+ * class.
+ */
+u32 vio_send_sid(struct vio_driver_state *vio)
+{
+       if (vio->dev_class == VDEV_DISK_SERVER)
+               return vio->_peer_sid;
+
+       return vio->_local_sid;
+}
+EXPORT_SYMBOL(vio_send_sid);
+
+/* Allocate the LDC channel used by a VIO driver.
+ *
+ * The channel id is read from the "id" property of vio->endpoint.
+ * A private copy of base_cfg is made and its rx_irq/tx_irq fields
+ * are overridden with the values stored in vio before the copy is
+ * passed, together with event_arg, to ldc_alloc().
+ *
+ * Returns 0 on success with vio->lp set, -ENODEV if the endpoint has
+ * no "id" property, or the error encoded in ldc_alloc()'s result.
+ */
+int vio_ldc_alloc(struct vio_driver_state *vio,
+                 struct ldc_channel_config *base_cfg,
+                 void *event_arg)
+{
+       struct ldc_channel_config cfg = *base_cfg;
+       struct ldc_channel *lp;
+       const u64 *id;
+
+       id = md_get_property(vio->endpoint, "id", NULL);
+       if (!id) {
+               printk(KERN_ERR "%s: Channel lacks id property.\n",
+                      vio->name);
+               return -ENODEV;
+       }
+
+       vio->channel_id = *id;
+
+       cfg.rx_irq = vio->rx_irq;
+       cfg.tx_irq = vio->tx_irq;
+
+       lp = ldc_alloc(vio->channel_id, &cfg, event_arg);
+       if (IS_ERR(lp))
+               return PTR_ERR(lp);
+
+       vio->lp = lp;
+
+       return 0;
+}
+EXPORT_SYMBOL(vio_ldc_alloc);
+
+/* Release the LDC channel and desc_buf held by a VIO driver.  The
+ * pointers (and desc_buf_len) are cleared afterwards so that stale
+ * values are not left behind in vio.
+ */
+void vio_ldc_free(struct vio_driver_state *vio)
+{
+       ldc_free(vio->lp);
+       vio->lp = NULL;
+
+       kfree(vio->desc_buf);
+       vio->desc_buf = NULL;
+       vio->desc_buf_len = 0;
+}
+EXPORT_SYMBOL(vio_ldc_free);
+
+/* Try to bring the LDC link of a VIO port up.
+ *
+ * Under vio->lock: a channel that is still in LDC_STATE_INIT is
+ * bound first, then a connect is attempted.  If either step fails a
+ * warning is logged and vio->timer is armed for about one second
+ * from now (as rounded by round_jiffies()).
+ */
+void vio_port_up(struct vio_driver_state *vio)
+{
+       unsigned long flags;
+       int err = 0;
+
+       spin_lock_irqsave(&vio->lock, flags);
+
+       if (ldc_state(vio->lp) == LDC_STATE_INIT) {
+               err = ldc_bind(vio->lp);
+               if (err)
+                       printk(KERN_WARNING "%s: Port %lu bind failed, "
+                              "err=%d\n",
+                              vio->name, vio->channel_id, err);
+       }
+
+       /* Connecting is skipped when the bind above failed.  */
+       if (err == 0) {
+               err = ldc_connect(vio->lp);
+               if (err)
+                       printk(KERN_WARNING "%s: Port %lu connect failed, "
+                              "err=%d\n",
+                              vio->name, vio->channel_id, err);
+       }
+
+       if (err)
+               mod_timer(&vio->timer, round_jiffies(jiffies + HZ));
+
+       spin_unlock_irqrestore(&vio->lock, flags);
+}
+EXPORT_SYMBOL(vio_port_up);
+
+/* Timer callback installed by vio_driver_init(): _arg is the
+ * vio_driver_state pointer cast to unsigned long.  It simply calls
+ * vio_port_up() again.
+ */
+static void vio_port_timer(unsigned long _arg)
+{
+       struct vio_driver_state *vio = (struct vio_driver_state *) _arg;
+
+       vio_port_up(vio);
+}
+
+/* Initialise the common VIO state of a device driver.
+ *
+ * dev_class must be one of the four VDEV_* classes.  ops must be
+ * non-NULL and provide send_attr, handle_attr and
+ * handshake_complete.  channel_endpoint, ver_table and name must be
+ * non-NULL, and ver_table_size must not be negative.  Any violation
+ * yields -EINVAL and leaves vio untouched.
+ *
+ * On success the arguments are recorded in vio, the TX and RX irqs
+ * are taken from channel_endpoint->irqs[0] and [1] respectively,
+ * the lock is initialised, and vio->timer is set up to run
+ * vio_port_timer().  Returns 0.
+ */
+int vio_driver_init(struct vio_driver_state *vio, struct vio_dev *vdev,
+                   u8 dev_class, struct mdesc_node *channel_endpoint,
+                   struct vio_version *ver_table, int ver_table_size,
+                   struct vio_driver_ops *ops, char *name)
+{
+       switch (dev_class) {
+       case VDEV_NETWORK:
+       case VDEV_NETWORK_SWITCH:
+       case VDEV_DISK:
+       case VDEV_DISK_SERVER:
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       /* ops itself must be checked before its members are read.  */
+       if (!ops ||
+           !ops->send_attr ||
+           !ops->handle_attr ||
+           !ops->handshake_complete)
+               return -EINVAL;
+
+       if (!channel_endpoint)
+               return -EINVAL;
+
+       if (!ver_table || ver_table_size < 0)
+               return -EINVAL;
+
+       if (!name)
+               return -EINVAL;
+
+       spin_lock_init(&vio->lock);
+
+       vio->name = name;
+
+       vio->dev_class = dev_class;
+       vio->vdev = vdev;
+
+       vio->endpoint = channel_endpoint;
+       vio->tx_irq = channel_endpoint->irqs[0];
+       vio->rx_irq = channel_endpoint->irqs[1];
+
+       vio->ver_table = ver_table;
+       vio->ver_table_entries = ver_table_size;
+
+       vio->ops = ops;
+
+       setup_timer(&vio->timer, vio_port_timer, (unsigned long) vio);
+
+       return 0;
+}
+EXPORT_SYMBOL(vio_driver_init);
diff --git a/include/asm-sparc64/ldc.h b/include/asm-sparc64/ldc.h
new file mode 100644 (file)
index 0000000..24fd236
--- /dev/null
@@ -0,0 +1,136 @@
+#ifndef _SPARC64_LDC_H
+#define _SPARC64_LDC_H
+
+#include <asm/hypervisor.h>
+
+extern int ldom_domaining_enabled;
+
+/* The event handler will be invoked when link state changes
+ * or data becomes available on the receive side.
+ *
+ * For non-RAW links, if the LDC_EVENT_RESET event arrives the
+ * driver should reset all of its internal state and reinvoke
+ * ldc_connect() to try and bring the link up again.
+ *
+ * For RAW links, ldc_connect() is not used.  Instead the driver
+ * just waits for the LDC_EVENT_UP event.
+ *
+ * mode takes one of the LDC_MODE_* values and debug is a mask of
+ * the LDC_DEBUG_* bits defined next to each field.
+ */
+struct ldc_channel_config {
+       void (*event)(void *arg, int event);
+
+       u32                     mtu;
+       unsigned int            rx_irq;
+       unsigned int            tx_irq;
+       u8                      mode;
+#define LDC_MODE_RAW           0x00
+#define LDC_MODE_UNRELIABLE    0x01
+#define LDC_MODE_RESERVED      0x02
+#define LDC_MODE_RELIABLE      0x03
+#define LDC_MODE_STREAM                0x04
+
+       u8                      debug;
+#define LDC_DEBUG_HS           0x01
+#define LDC_DEBUG_STATE                0x02
+#define LDC_DEBUG_RX           0x04
+#define LDC_DEBUG_TX           0x08
+#define LDC_DEBUG_DATA         0x10
+};
+
+#define LDC_EVENT_RESET                0x01
+#define LDC_EVENT_UP           0x02
+#define LDC_EVENT_DATA_READY   0x04
+
+#define LDC_STATE_INVALID      0x00
+#define LDC_STATE_INIT         0x01
+#define LDC_STATE_BOUND                0x02
+#define LDC_STATE_READY                0x03
+#define LDC_STATE_CONNECTED    0x04
+
+struct ldc_channel;
+
+/* Allocate state for a channel.  */
+extern struct ldc_channel *ldc_alloc(unsigned long id,
+                                    const struct ldc_channel_config *cfgp,
+                                    void *event_arg);
+
+/* Shut down and free state for a channel.  */
+extern void ldc_free(struct ldc_channel *lp);
+
+/* Register TX and RX queues of the link with the hypervisor.  */
+extern int ldc_bind(struct ldc_channel *lp);
+
+/* For non-RAW protocols we need to complete a handshake before
+ * communication can proceed.  ldc_connect() does that; if the
+ * handshake completes successfully, an LDC_EVENT_UP event will
+ * be sent up to the driver.
+ */
+extern int ldc_connect(struct ldc_channel *lp);
+extern int ldc_disconnect(struct ldc_channel *lp);
+
+extern int ldc_state(struct ldc_channel *lp);
+
+/* Read and write operations.  Only valid when the link is up.  */
+extern int ldc_write(struct ldc_channel *lp, const void *buf,
+                    unsigned int size);
+extern int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size);
+
+#define LDC_MAP_SHADOW 0x01
+#define LDC_MAP_DIRECT 0x02
+#define LDC_MAP_IO     0x04
+#define LDC_MAP_R      0x08
+#define LDC_MAP_W      0x10
+#define LDC_MAP_X      0x20
+#define LDC_MAP_RW     (LDC_MAP_R | LDC_MAP_W)
+#define LDC_MAP_RWX    (LDC_MAP_R | LDC_MAP_W | LDC_MAP_X)
+#define LDC_MAP_ALL    0x03f
+
+/* One address/size pair.  Arrays of these are filled in or consumed
+ * by the map, unmap, copy and exported-dring calls declared below.
+ */
+struct ldc_trans_cookie {
+       u64                     cookie_addr;
+       u64                     cookie_size;
+};
+
+struct scatterlist;
+extern int ldc_map_sg(struct ldc_channel *lp,
+                     struct scatterlist *sg, int num_sg,
+                     struct ldc_trans_cookie *cookies, int ncookies,
+                     unsigned int map_perm);
+
+extern int ldc_map_single(struct ldc_channel *lp,
+                         void *buf, unsigned int len,
+                         struct ldc_trans_cookie *cookies, int ncookies,
+                         unsigned int map_perm);
+
+extern void ldc_unmap(struct ldc_channel *lp, struct ldc_trans_cookie *cookies,
+                     int ncookies);
+
+extern int ldc_copy(struct ldc_channel *lp, int copy_dir,
+                   void *buf, unsigned int len, unsigned long offset,
+                   struct ldc_trans_cookie *cookies, int ncookies);
+
+/* Convenience wrapper: ldc_copy() in the LDC_COPY_IN direction, with
+ * every other argument passed through unchanged.
+ */
+static inline int ldc_get_dring_entry(struct ldc_channel *lp,
+                                     void *buf, unsigned int len,
+                                     unsigned long offset,
+                                     struct ldc_trans_cookie *cookies,
+                                     int ncookies)
+{
+       return ldc_copy(lp, LDC_COPY_IN, buf, len, offset, cookies, ncookies);
+}
+
+/* Convenience wrapper: ldc_copy() in the LDC_COPY_OUT direction, with
+ * every other argument passed through unchanged.
+ */
+static inline int ldc_put_dring_entry(struct ldc_channel *lp,
+                                     void *buf, unsigned int len,
+                                     unsigned long offset,
+                                     struct ldc_trans_cookie *cookies,
+                                     int ncookies)
+{
+       return ldc_copy(lp, LDC_COPY_OUT, buf, len, offset, cookies, ncookies);
+}
+
+extern void *ldc_alloc_exp_dring(struct ldc_channel *lp, unsigned int len,
+                                struct ldc_trans_cookie *cookies,
+                                int *ncookies, unsigned int map_perm);
+
+extern void ldc_free_exp_dring(struct ldc_channel *lp, void *buf,
+                              unsigned int len,
+                              struct ldc_trans_cookie *cookies, int ncookies);
+
+#endif /* _SPARC64_LDC_H */
diff --git a/include/asm-sparc64/vio.h b/include/asm-sparc64/vio.h
new file mode 100644 (file)
index 0000000..47c3da7
--- /dev/null
@@ -0,0 +1,402 @@
+#ifndef _SPARC64_VIO_H
+#define _SPARC64_VIO_H
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/mod_devicetable.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/list.h>
+
+#include <asm/ldc.h>
+#include <asm/mdesc.h>
+
+/* Tag placed at the start of the VIO message structures in this
+ * file.  type, stype and stype_env take the values #defined after
+ * each field; sid is the session ID checked by vio_validate_sid().
+ */
+struct vio_msg_tag {
+       u8                      type;
+#define VIO_TYPE_CTRL          0x01
+#define VIO_TYPE_DATA          0x02
+#define VIO_TYPE_ERR           0x04
+
+       u8                      stype;
+#define VIO_SUBTYPE_INFO       0x01
+#define VIO_SUBTYPE_ACK                0x02
+#define VIO_SUBTYPE_NACK       0x04
+
+       u16                     stype_env;
+#define VIO_VER_INFO           0x0001
+#define VIO_ATTR_INFO          0x0002
+#define VIO_DRING_REG          0x0003
+#define VIO_DRING_UNREG                0x0004
+#define VIO_RDX                        0x0005
+#define VIO_PKT_DATA           0x0040
+#define VIO_DESC_DATA          0x0041
+#define VIO_DRING_DATA         0x0042
+#define VNET_MCAST_INFO                0x0101
+
+       u32             sid;
+};
+
+/* RDX control message: a tag followed only by reserved words.  */
+struct vio_rdx {
+       struct vio_msg_tag      tag;
+       u64                     resv[6];
+};
+
+/* Version control message: a major/minor version pair and a device
+ * class, which takes one of the VDEV_* values below.
+ */
+struct vio_ver_info {
+       struct vio_msg_tag      tag;
+       u16                     major;
+       u16                     minor;
+       u8                      dev_class;
+#define VDEV_NETWORK           0x01
+#define VDEV_NETWORK_SWITCH    0x02
+#define VDEV_DISK              0x03
+#define VDEV_DISK_SERVER       0x04
+
+       u8                      resv1[3];
+       u64                     resv2[5];
+};
+
+/* Descriptor ring registration message.  cookies[] is a trailing
+ * variable-length array; num_cookies presumably gives its length
+ * (TODO confirm against the code that builds and parses it).
+ */
+struct vio_dring_register {
+       struct vio_msg_tag      tag;
+       u64                     dring_ident;
+       u32                     num_descr;
+       u32                     descr_size;
+       u16                     options;
+#define VIO_TX_DRING           0x0001
+#define VIO_RX_DRING           0x0002
+       u16                     resv;
+       u32                     num_cookies;
+       struct ldc_trans_cookie cookies[0];
+};
+
+/* Descriptor ring unregistration message; dring_ident names the ring
+ * to drop.
+ */
+struct vio_dring_unregister {
+       struct vio_msg_tag      tag;
+       u64                     dring_ident;
+       u64                     resv[5];
+};
+
+/* Data transfer modes */
+#define VIO_PKT_MODE           0x01 /* Packet based transfer   */
+#define VIO_DESC_MODE          0x02 /* In-band descriptors     */
+#define VIO_DRING_MODE         0x03 /* Descriptor rings        */
+
+/* Descriptor ring data message.  state takes one of the
+ * VIO_DRING_* values below.
+ *
+ * NOTE(review): "__par4" looks like a misspelling of "__pad4";
+ * confirm nothing refers to it before renaming.
+ */
+struct vio_dring_data {
+       struct vio_msg_tag      tag;
+       u64                     seq;
+       u64                     dring_ident;
+       u32                     start_idx;
+       u32                     end_idx;
+       u8                      state;
+#define VIO_DRING_ACTIVE       0x01
+#define VIO_DRING_STOPPED      0x02
+
+       u8                      __pad1;
+       u16                     __pad2;
+       u32                     __pad3;
+       u64                     __par4[2];
+};
+
+/* Header that begins each descriptor type in this file (see
+ * vio_disk_desc and vio_net_desc).  state takes a VIO_DESC_* value
+ * and ack a VIO_ACK_* value.
+ */
+struct vio_dring_hdr {
+       u8                      state;
+#define VIO_DESC_FREE          0x01
+#define VIO_DESC_READY         0x02
+#define VIO_DESC_ACCEPTED      0x03
+#define VIO_DESC_DONE          0x04
+       u8                      ack;
+#define VIO_ACK_ENABLE         0x01
+#define VIO_ACK_DISABLE                0x00
+
+       u16                     __pad1;
+       u32                     __pad2;
+};
+
+/* VIO disk specific structures and defines */
+/* Attribute message for virtual disks.  xfer_mode presumably takes
+ * one of the "Data transfer modes" values above (TODO confirm);
+ * vdisk_type takes a VD_DISK_TYPE_* value.
+ */
+struct vio_disk_attr_info {
+       struct vio_msg_tag      tag;
+       u8                      xfer_mode;
+       u8                      vdisk_type;
+#define VD_DISK_TYPE_SLICE     0x01 /* Slice in block device   */
+#define VD_DISK_TYPE_DISK      0x02 /* Entire block device     */
+       u16                     resv1;
+       u32                     vdisk_block_size;
+       u64                     operations;
+       u64                     vdisk_size;
+       u64                     max_xfer_size;
+       u64                     resv2[2];
+};
+
+/* Virtual disk descriptor: the common ring header followed by one
+ * request.  operation takes a VD_OP_* value; cookies[] is a trailing
+ * variable-length array, presumably ncookies entries long (TODO
+ * confirm).
+ */
+struct vio_disk_desc {
+       struct vio_dring_hdr    hdr;
+       u64                     req_id;
+       u8                      operation;
+#define VD_OP_BREAD            0x01 /* Block read                      */
+#define VD_OP_BWRITE           0x02 /* Block write                     */
+#define VD_OP_FLUSH            0x03 /* Flush disk contents             */
+#define VD_OP_GET_WCE          0x04 /* Get write-cache status          */
+#define VD_OP_SET_WCE          0x05 /* Enable/disable write-cache      */
+#define VD_OP_GET_VTOC         0x06 /* Get VTOC                        */
+#define VD_OP_SET_VTOC         0x07 /* Set VTOC                        */
+#define VD_OP_GET_DISKGEOM     0x08 /* Get disk geometry               */
+#define VD_OP_SET_DISKGEOM     0x09 /* Set disk geometry               */
+#define VD_OP_SCSICMD          0x0a /* SCSI control command            */
+#define VD_OP_GET_DEVID                0x0b /* Get device ID                   */
+#define VD_OP_GET_EFI          0x0c /* Get EFI                         */
+#define VD_OP_SET_EFI          0x0d /* Set EFI                         */
+       u8                      slice;
+       u16                     resv1;
+       u32                     status;
+       u64                     offset;
+       u64                     size;
+       u32                     ncookies;
+       u32                     resv2;
+       struct ldc_trans_cookie cookies[0];
+};
+
+#define VIO_DISK_VNAME_LEN     8
+#define VIO_DISK_ALABEL_LEN    128
+#define VIO_DISK_NUM_PART      8
+
+/* Disk VTOC layout with VIO_DISK_NUM_PART partition slots;
+ * presumably the payload of VD_OP_GET_VTOC / VD_OP_SET_VTOC (TODO
+ * confirm against the disk driver).
+ */
+struct vio_disk_vtoc {
+       u8                      volume_name[VIO_DISK_VNAME_LEN];
+       u16                     sector_size;
+       u16                     num_partitions;
+       u8                      ascii_label[VIO_DISK_ALABEL_LEN];
+       struct {
+               u16             id;
+               u16             perm_flags;
+               u32             resv;
+               u64             start_block;
+               u64             num_blocks;
+       } partitions[VIO_DISK_NUM_PART];
+};
+
+/* Disk geometry; presumably the payload of VD_OP_GET_DISKGEOM /
+ * VD_OP_SET_DISKGEOM (TODO confirm against the disk driver).
+ */
+struct vio_disk_geom {
+       u16                     num_cyl; /* Num data cylinders          */
+       u16                     alt_cyl; /* Num alternate cylinders     */
+       u16                     beg_cyl; /* Cyl off of fixed head area  */
+       u16                     num_hd;  /* Num heads                   */
+       u16                     num_sec; /* Num sectors                 */
+       u16                     ifact;   /* Interleave factor           */
+       u16                     apc;     /* Alts per cylinder (SCSI)    */
+       u16                     rpm;     /* Revolutions per minute      */
+       u16                     phy_cyl; /* Num physical cylinders      */
+       u16                     wr_skip; /* Num sects to skip, writes   */
+       u16                     rd_skip; /* Num sects to skip, reads    */
+};
+
+/* Device ID header followed by a trailing variable-length id[];
+ * len presumably gives the size of id[] (TODO confirm).
+ */
+struct vio_disk_devid {
+       u16                     resv;
+       u16                     type;
+       u32                     len;
+       char                    id[0];
+};
+
+/* EFI request header followed by a trailing variable-length data[];
+ * len presumably gives the size of data[] (TODO confirm).
+ */
+struct vio_disk_efi {
+       u64                     lba;
+       u64                     len;
+       char                    data[0];
+};
+
+/* VIO net specific structures and defines */
+/* Attribute message for virtual network devices.  addr_type takes
+ * the VNET_ADDR_* value below; xfer_mode presumably takes one of
+ * the "Data transfer modes" values (TODO confirm).
+ */
+struct vio_net_attr_info {
+       struct vio_msg_tag      tag;
+       u8                      xfer_mode;
+       u8                      addr_type;
+#define VNET_ADDR_ETHERMAC     0x01
+       u16                     ack_freq;
+       u32                     resv1;
+       u64                     addr;
+       u64                     mtu;
+       u64                     resv2[3];
+};
+
+#define VNET_NUM_MCAST         7
+
+/* Multicast message.  mcast_addr has room for VNET_NUM_MCAST
+ * addresses of 6 bytes each; count presumably says how many are in
+ * use (TODO confirm).
+ */
+struct vio_net_mcast_info {
+       struct vio_msg_tag      tag;
+       u8                      set;
+       u8                      count;
+       u8                      mcast_addr[VNET_NUM_MCAST * 6];
+       u32                     resv;
+};
+
+/* Virtual network descriptor: the common ring header, a size, and a
+ * trailing variable-length cookies[] array, presumably ncookies
+ * entries long (TODO confirm).
+ */
+struct vio_net_desc {
+       struct vio_dring_hdr    hdr;
+       u32                     size;
+       u32                     ncookies;
+       struct ldc_trans_cookie cookies[0];
+};
+
+#define VIO_MAX_RING_COOKIES   24
+
+/* Local bookkeeping for one descriptor ring.  base, entry_size, prod
+ * and cons are used by the vio_dring_*() helpers below; cookies[]
+ * holds at most VIO_MAX_RING_COOKIES entries.
+ */
+struct vio_dring_state {
+       u64                     ident;
+       void                    *base;
+       u64                     snd_nxt;
+       u64                     rcv_nxt;
+       u32                     entry_size;
+       u32                     num_entries;
+       u32                     prod;
+       u32                     cons;
+       u32                     pending;
+       int                     ncookies;
+       struct ldc_trans_cookie cookies[VIO_MAX_RING_COOKIES];
+};
+
+/* Address of the ring entry at the current producer index.  */
+static inline void *vio_dring_cur(struct vio_dring_state *dr)
+{
+       return dr->base + (dr->entry_size * dr->prod);
+}
+
+/* Address of the ring entry at the given index.  No bounds check is
+ * made against num_entries.
+ */
+static inline void *vio_dring_entry(struct vio_dring_state *dr,
+                                   unsigned int index)
+{
+       return dr->base + (dr->entry_size * index);
+}
+
+/* Returns dr->pending minus the distance from cons to prod, the
+ * latter taken modulo ring_size.
+ *
+ * NOTE(review): the BUILD_BUG_ON() is applied to a function
+ * parameter, so it presumably relies on this being inlined with a
+ * compile-time constant ring_size -- confirm at the call sites.
+ */
+static inline u32 vio_dring_avail(struct vio_dring_state *dr,
+                                 unsigned int ring_size)
+{
+       /* Ensure build-time power-of-2.  */
+       BUILD_BUG_ON(ring_size & (ring_size - 1));
+
+       return (dr->pending -
+               ((dr->prod - dr->cons) & (ring_size - 1)));
+}
+
+/* A VIO device.  It embeds a struct device; to_vio_dev() maps that
+ * member back to the containing vio_dev.
+ */
+struct vio_dev {
+       struct mdesc_node       *mp;
+       struct device_node      *dp;
+
+       const char              *type;
+       const char              *compat;
+       int                     compat_len;
+
+       struct device           dev;
+};
+
+/* A VIO driver.  It embeds a struct device_driver; to_vio_driver()
+ * maps that member back to the containing vio_driver.
+ */
+struct vio_driver {
+       struct list_head                node;
+       const struct vio_device_id      *id_table;
+       int (*probe)(struct vio_dev *dev, const struct vio_device_id *id);
+       int (*remove)(struct vio_dev *dev);
+       void (*shutdown)(struct vio_dev *dev);
+       unsigned long                   driver_data;
+       struct device_driver            driver;
+};
+
+/* One major/minor version pair, as used for the ver and ver_table
+ * members of struct vio_driver_state.
+ */
+struct vio_version {
+       u16             major;
+       u16             minor;
+};
+
+struct vio_driver_state;
+/* Callbacks a device driver supplies to the VIO handshake code.
+ * vio_driver_init() rejects a table in which any of them is NULL.
+ */
+struct vio_driver_ops {
+       int     (*send_attr)(struct vio_driver_state *vio);
+       int     (*handle_attr)(struct vio_driver_state *vio, void *pkt);
+       void    (*handshake_complete)(struct vio_driver_state *vio);
+};
+
+/* A completion bundled with an error code and a "waiting_for" value;
+ * how the latter is interpreted is up to the user of this struct
+ * (not visible in this header).
+ */
+struct vio_completion {
+       struct completion       com;
+       int                     err;
+       int                     waiting_for;
+};
+
+/* Common per-port VIO state, set up by vio_driver_init() and
+ * vio_ldc_alloc().  hs_state is a mask of VIO_HS_* bits, dr_state a
+ * mask of VIO_DR_STATE_* bits and debug a mask of VIO_DEBUG_* bits
+ * (the latter is what the viodbg() macro tests).
+ */
+struct vio_driver_state {
+       /* Protects VIO handshake and, optionally, driver private state.  */
+       spinlock_t              lock;
+
+       struct ldc_channel      *lp;
+
+       u32                     _peer_sid;
+       u32                     _local_sid;
+       struct vio_dring_state  drings[2];
+#define VIO_DRIVER_TX_RING     0
+#define VIO_DRIVER_RX_RING     1
+
+       u8                      hs_state;
+#define VIO_HS_INVALID         0x00
+#define VIO_HS_GOTVERS         0x01
+#define VIO_HS_GOT_ATTR                0x04
+#define VIO_HS_SENT_DREG       0x08
+#define VIO_HS_SENT_RDX                0x10
+#define VIO_HS_GOT_RDX_ACK     0x20
+#define VIO_HS_GOT_RDX         0x40
+#define VIO_HS_SENT_RDX_ACK    0x80
+#define VIO_HS_COMPLETE                (VIO_HS_GOT_RDX_ACK | VIO_HS_SENT_RDX_ACK)
+
+       u8                      dev_class;
+
+       u8                      dr_state;
+#define VIO_DR_STATE_TXREG     0x01
+#define VIO_DR_STATE_RXREG     0x02
+#define VIO_DR_STATE_TXREQ     0x10
+#define VIO_DR_STATE_RXREQ     0x20
+
+       u8                      debug;
+#define VIO_DEBUG_HS           0x01
+#define VIO_DEBUG_DATA         0x02
+
+       void                    *desc_buf;
+       unsigned int            desc_buf_len;
+
+       struct vio_completion   *cmp;
+
+       struct vio_dev          *vdev;
+
+       unsigned long           channel_id;
+       unsigned int            tx_irq;
+       unsigned int            rx_irq;
+
+       struct timer_list       timer;
+
+       struct vio_version      ver;
+
+       struct mdesc_node       *endpoint;
+
+       struct vio_version      *ver_table;
+       int                     ver_table_entries;
+
+       char                    *name;
+
+       struct vio_driver_ops   *ops;
+};
+
+/* Conditional debug printk.  TYPE is the suffix of a VIO_DEBUG_* bit
+ * (HS or DATA).  The macro refers to a variable named "vio", which
+ * must be a struct vio_driver_state pointer in scope at the call
+ * site.
+ */
+#define viodbg(TYPE, f, a...) \
+do {   if (vio->debug & VIO_DEBUG_##TYPE) \
+               printk(KERN_INFO "vio: ID[%lu] " f, vio->channel_id, ## a); \
+} while (0)
+
+extern int vio_register_driver(struct vio_driver *drv);
+extern void vio_unregister_driver(struct vio_driver *drv);
+
+/* Map an embedded struct device_driver back to its vio_driver.  */
+static inline struct vio_driver *to_vio_driver(struct device_driver *drv)
+{
+       return container_of(drv, struct vio_driver, driver);
+}
+
+/* Map an embedded struct device back to its vio_dev.  */
+static inline struct vio_dev *to_vio_dev(struct device *dev)
+{
+       return container_of(dev, struct vio_dev, dev);
+}
+
+extern int vio_ldc_send(struct vio_driver_state *vio, void *data, int len);
+extern void vio_link_state_change(struct vio_driver_state *vio, int event);
+extern void vio_conn_reset(struct vio_driver_state *vio);
+extern int vio_control_pkt_engine(struct vio_driver_state *vio, void *pkt);
+extern int vio_validate_sid(struct vio_driver_state *vio,
+                           struct vio_msg_tag *tp);
+extern u32 vio_send_sid(struct vio_driver_state *vio);
+extern int vio_ldc_alloc(struct vio_driver_state *vio,
+                        struct ldc_channel_config *base_cfg, void *event_arg);
+extern void vio_ldc_free(struct vio_driver_state *vio);
+extern int vio_driver_init(struct vio_driver_state *vio, struct vio_dev *vdev,
+                          u8 dev_class, struct mdesc_node *channel_endpoint,
+                          struct vio_version *ver_table, int ver_table_size,
+                          struct vio_driver_ops *ops, char *name);
+
+extern struct mdesc_node *vio_find_endpoint(struct vio_dev *vdev);
+extern void vio_port_up(struct vio_driver_state *vio);
+
+#endif /* _SPARC64_VIO_H */