Patchwork [PATCHv2] Support for booting from virtio disks

login
register
mail settings
Submitter Gleb Natapov
Date May 10, 2010, 8:11 a.m.
Message ID <20100510081118.GI24787@redhat.com>
Download mbox | patch
Permalink /patch/52012/
State New
Headers show

Comments

Gleb Natapov - May 10, 2010, 8:11 a.m.
This patch adds native support for booting from virtio disks to Seabios.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
---

Changelog:
 v1->v2:
  - free memory in case of vq initialization error.
  - change license of virtio ring/pci to LGPLv3 with permission
    of Laurent Vivier (aka the author).


--
			Gleb.
Stefan Hajnoczi - May 10, 2010, 8:25 a.m.
> diff --git a/src/virtio-blk.c b/src/virtio-blk.c
> new file mode 100644
> index 0000000..a41c336
> --- /dev/null
> +++ b/src/virtio-blk.c
> @@ -0,0 +1,155 @@
> +// Virtio blovl boot support.

Just noticed the "blovl" typo.

> +        char *desc = malloc_tmphigh(MAXDESCSIZE);
> +        struct virtiodrive_s *vdrive_g = malloc_fseg(sizeof(*vdrive_g));
> +        struct vring_virtqueue *vq = malloc_low(sizeof(*vq));
> +        if (!vdrive_g || !desc || !vq) {
> +            warn_noalloc();
> +            return;
> +        }

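This error return can still leak: if only some of the three allocations
succeed, the ones that did succeed are not freed before returning.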

Stefan
Gleb Natapov - May 10, 2010, 8:29 a.m.
On Mon, May 10, 2010 at 09:25:20AM +0100, Stefan Hajnoczi wrote:
> > diff --git a/src/virtio-blk.c b/src/virtio-blk.c
> > new file mode 100644
> > index 0000000..a41c336
> > --- /dev/null
> > +++ b/src/virtio-blk.c
> > @@ -0,0 +1,155 @@
> > +// Virtio blovl boot support.
> 
> Just noticed the "blovl" typo.
> 
> > +        char *desc = malloc_tmphigh(MAXDESCSIZE);
> > +        struct virtiodrive_s *vdrive_g = malloc_fseg(sizeof(*vdrive_g));
> > +        struct vring_virtqueue *vq = malloc_low(sizeof(*vq));
> > +        if (!vdrive_g || !desc || !vq) {
> > +            warn_noalloc();
> > +            return;
> > +        }
> 
> This error return can still leak.
> 
Oh Gosh, programming is hard. Why don't we write bios in python?

--
			Gleb.
Anthony Liguori - May 10, 2010, 3:48 p.m.
On 05/10/2010 03:11 AM, Gleb Natapov wrote:
> This patch adds native support for booting from virtio disks to Seabios.
>
> Signed-off-by: Gleb Natapov<gleb@redhat.com>
>    

A related problem that I think we need to solve is how to indicate to
Seabios which device we want to boot from.

With your patch, a user can select a virtio device explicitly, or, if they
use only one virtio device, it will Just Work.

However, if a user uses IDE and virtio, or a user has multiple disks, 
they cannot select a device via -boot.

Is this something we need to address?  I don't think we'd break libvirt 
if we didn't.

Regards,

Anthony Liguori

> ---
>
> Changelog:
>   v1->v2:
>    - free memory in case of vq initialization error.
>    - change license of virtio ring/pci to LGPLv3 with permission
>      of Laurent Vivier (aka the author).
>
> diff --git a/Makefile b/Makefile
> index 327a1bf..d0b8881 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -14,7 +14,8 @@ OUT=out/
>   SRCBOTH=misc.c pmm.c stacks.c output.c util.c block.c floppy.c ata.c mouse.c \
>           kbd.c pci.c serial.c clock.c pic.c cdrom.c ps2port.c smp.c resume.c \
>           pnpbios.c pirtable.c vgahooks.c ramdisk.c pcibios.c blockcmd.c \
> -        usb.c usb-uhci.c usb-ohci.c usb-ehci.c usb-hid.c usb-msc.c
> +        usb.c usb-uhci.c usb-ohci.c usb-ehci.c usb-hid.c usb-msc.c \
> +        virtio-ring.c virtio-pci.c virtio-blk.c
>   SRC16=$(SRCBOTH) system.c disk.c apm.c font.c
>   SRC32FLAT=$(SRCBOTH) post.c shadow.c memmap.c coreboot.c boot.c \
>         acpi.c smm.c mptable.c smbios.c pciinit.c optionroms.c mtrr.c \
> diff --git a/src/block.c b/src/block.c
> index ddf441f..b6b1902 100644
> --- a/src/block.c
> +++ b/src/block.c
> @@ -11,6 +11,7 @@
>   #include "util.h" // dprintf
>   #include "ata.h" // process_ata_op
>   #include "usb-msc.h" // process_usb_op
> +#include "virtio-blk.h" // process_virtio_op
>
>   struct drives_s Drives VAR16VISIBLE;
>
> @@ -289,6 +290,8 @@ process_op(struct disk_op_s *op)
>           return process_cdemu_op(op);
>       case DTYPE_USB:
>           return process_usb_op(op);
> +    case DTYPE_VIRTIO:
> +	return process_virtio_op(op);
>       default:
>           op->count = 0;
>           return DISK_RET_EPARAM;
> diff --git a/src/config.h b/src/config.h
> index b101174..ad569c6 100644
> --- a/src/config.h
> +++ b/src/config.h
> @@ -136,6 +136,9 @@
>   #define CONFIG_SUBMODEL_ID   0x00
>   #define CONFIG_BIOS_REVISION 0x01
>
> +// Support boot from virtio storage
> +#define CONFIG_VIRTIO_BLK 1
> +
>   // Various memory addresses used by the code.
>   #define BUILD_STACK_ADDR          0x7000
>   #define BUILD_S3RESUME_STACK_ADDR 0x1000
> diff --git a/src/disk.h b/src/disk.h
> index 0cd1b74..9e5b083 100644
> --- a/src/disk.h
> +++ b/src/disk.h
> @@ -197,6 +197,7 @@ struct drive_s {
>   #define DTYPE_RAMDISK  0x04
>   #define DTYPE_CDEMU    0x05
>   #define DTYPE_USB      0x06
> +#define DTYPE_VIRTIO   0x07
>
>   #define MAXDESCSIZE 80
>
> diff --git a/src/pci_ids.h b/src/pci_ids.h
> index 1800f1d..e1cded2 100644
> --- a/src/pci_ids.h
> +++ b/src/pci_ids.h
> @@ -2605,3 +2605,6 @@
>   #define PCI_DEVICE_ID_RME_DIGI32	0x9896
>   #define PCI_DEVICE_ID_RME_DIGI32_PRO	0x9897
>   #define PCI_DEVICE_ID_RME_DIGI32_8	0x9898
> +
> +#define PCI_VENDOR_ID_REDHAT_QUMRANET	0x1af4
> +#define PCI_DEVICE_ID_VIRTIO_BLK	0x1001
> diff --git a/src/post.c b/src/post.c
> index 638b0f7..25535e2 100644
> --- a/src/post.c
> +++ b/src/post.c
> @@ -23,6 +23,7 @@
>   #include "smbios.h" // smbios_init
>   #include "paravirt.h" // qemu_cfg_port_probe
>   #include "ps2port.h" // ps2port_setup
> +#include "virtio-blk.h" // virtio_blk_setup
>
>   void
>   __set_irq(int vector, void *loc)
> @@ -184,6 +185,7 @@ init_hw(void)
>       floppy_setup();
>       ata_setup();
>       ramdisk_setup();
> +    virtio_blk_setup();
>   }
>
>   // Main setup code.
> diff --git a/src/virtio-blk.c b/src/virtio-blk.c
> new file mode 100644
> index 0000000..a41c336
> --- /dev/null
> +++ b/src/virtio-blk.c
> @@ -0,0 +1,155 @@
> +// Virtio blovl boot support.
> +//
> +// Copyright (C) 2010 Red Hat Inc.
> +//
> +// Authors:
> +//  Gleb Natapov<gnatapov@redhat.com>
> +//
> +// This file may be distributed under the terms of the GNU LGPLv3 license.
> +
> +#include "util.h" // dprintf
> +#include "pci.h" // foreachpci
> +#include "config.h" // CONFIG_*
> +#include "virtio-pci.h"
> +#include "virtio-blk.h"
> +#include "disk.h"
> +
> +struct virtiodrive_s {
> +    struct drive_s drive;
> +    struct vring_virtqueue *vq;
> +    u16 ioaddr;
> +};
> +
> +static int
> +virtio_blk_read(struct disk_op_s *op)
> +{
> +    struct virtiodrive_s *vdrive_g =
> +        container_of(op->drive_g, struct virtiodrive_s, drive);
> +    struct vring_virtqueue *vq = GET_GLOBAL(vdrive_g->vq);
> +    struct virtio_blk_outhdr hdr = {
> +        .type = VIRTIO_BLK_T_IN,
> +        .ioprio = 0,
> +        .sector = op->lba,
> +    };
> +    u8 status = VIRTIO_BLK_S_UNSUPP;
> +    struct vring_list sg[] = {
> +        {
> +            .addr	= MAKE_FLATPTR(GET_SEG(SS),&hdr),
> +            .length	= sizeof(hdr),
> +        },
> +        {
> +            .addr	= op->buf_fl,
> +            .length	= GET_GLOBAL(vdrive_g->drive.blksize) * op->count,
> +        },
> +        {
> +            .addr	= MAKE_FLATPTR(GET_SEG(SS),&status),
> +            .length	= sizeof(status),
> +        },
> +    };
> +
> +    /* Add to virtqueue and kick host */
> +    vring_add_buf(vq, sg, 1, 2, 0, 0);
> +    vring_kick(GET_GLOBAL(vdrive_g->ioaddr), vq, 1);
> +
> +    /* Wait for reply */
> +    while (!vring_more_used(vq))
> +        udelay(5);
> +
> +    /* Reclaim virtqueue element */
> +    vring_get_buf(vq, NULL);
> +    return status == VIRTIO_BLK_S_OK ? DISK_RET_SUCCESS : DISK_RET_EBADTRACK;
> +}
> +
> +int
> +process_virtio_op(struct disk_op_s *op)
> +{
> +    switch (op->command) {
> +    case CMD_READ:
> +        return virtio_blk_read(op);
> +    case CMD_FORMAT:
> +    case CMD_WRITE:
> +        return DISK_RET_EWRITEPROTECT;
> +    case CMD_RESET:
> +    case CMD_ISREADY:
> +    case CMD_VERIFY:
> +    case CMD_SEEK:
> +        return DISK_RET_SUCCESS;
> +    default:
> +        op->count = 0;
> +        return DISK_RET_EPARAM;
> +    }
> +}
> +
> +void
> +virtio_blk_setup(void)
> +{
> +    ASSERT32FLAT();
> +    if (! CONFIG_VIRTIO_BLK)
> +        return;
> +
> +    dprintf(3, "init virtio-blk\n");
> +
> +    int bdf, max;
> +    u32 id = PCI_VENDOR_ID_REDHAT_QUMRANET | (PCI_DEVICE_ID_VIRTIO_BLK<<  16);
> +    foreachpci(bdf, max) {
> +        u32 v = pci_config_readl(bdf, PCI_VENDOR_ID);
> +        if (v != id)
> +            continue;
> +        dprintf(3, "found virtio-blk at %x:%x\n", pci_bdf_to_bus(bdf),
> +                pci_bdf_to_dev(bdf));
> +        char *desc = malloc_tmphigh(MAXDESCSIZE);
> +        struct virtiodrive_s *vdrive_g = malloc_fseg(sizeof(*vdrive_g));
> +        struct vring_virtqueue *vq = malloc_low(sizeof(*vq));
> +        if (!vdrive_g || !desc || !vq) {
> +            warn_noalloc();
> +            return;
> +        }
> +        memset(vdrive_g, 0, sizeof(*vdrive_g));
> +        vdrive_g->drive.type = DTYPE_VIRTIO;
> +        vdrive_g->drive.cntl_id = bdf;
> +        vdrive_g->vq = vq;
> +
> +        u16 ioaddr = pci_config_readl(bdf, PCI_BASE_ADDRESS_0)&
> +            PCI_BASE_ADDRESS_IO_MASK;
> +
> +        vdrive_g->ioaddr = ioaddr;
> +
> +        vp_reset(ioaddr);
> +        vp_set_status(ioaddr, VIRTIO_CONFIG_S_ACKNOWLEDGE |
> +                      VIRTIO_CONFIG_S_DRIVER );
> +
> +        if (vp_find_vq(ioaddr, 0, vdrive_g->vq)<  0 ) {
> +            free(vdrive_g);
> +            free(desc);
> +            free(vq);
> +            dprintf(1, "fail to find vq for virtio-blk %x:%x\n",
> +                    pci_bdf_to_bus (bdf), pci_bdf_to_dev(bdf));
> +            continue;
> +        }
> +
> +        struct virtio_blk_config cfg;
> +        vp_get(ioaddr, 0,&cfg, sizeof(cfg));
> +
> +        vdrive_g->drive.blksize = cfg.blk_size;
> +        vdrive_g->drive.sectors = cfg.capacity;
> +        dprintf(3, "virtio-blk %x:%x blksize=%d sectors=%u\n",
> +                pci_bdf_to_bus (bdf), pci_bdf_to_dev(bdf),
> +                vdrive_g->drive.blksize, (u32)vdrive_g->drive.sectors);
> +
> +        vdrive_g->drive.pchs.cylinders = cfg.cylinders;
> +        vdrive_g->drive.pchs.heads = cfg.heads;
> +        vdrive_g->drive.pchs.spt = cfg.sectors;
> +
> +        setup_translation(&vdrive_g->drive);
> +        add_bcv_internal(&vdrive_g->drive);
> +
> +        snprintf(desc, MAXDESCSIZE, "Virtio disk PCI:%x:%x",
> +                 pci_bdf_to_bus(bdf), pci_bdf_to_dev(bdf));
> +
> +        vdrive_g->drive.desc = desc;
> +
> +        vp_set_status(ioaddr, VIRTIO_CONFIG_S_ACKNOWLEDGE |
> +                      VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK);
> +    }
> +}
> +
> diff --git a/src/virtio-blk.h b/src/virtio-blk.h
> new file mode 100644
> index 0000000..3369ea4
> --- /dev/null
> +++ b/src/virtio-blk.h
> @@ -0,0 +1,40 @@
> +#ifndef _VIRTIO_BLK_H
> +#define _VIRTIO_BLK_H
> +
> +struct virtio_blk_config
> +{
> +    u64 capacity;
> +    u32 size_max;
> +    u32 seg_max;
> +    u16 cylinders;
> +    u8 heads;
> +    u8 sectors;
> +    u32 blk_size;
> +    u8 physical_block_exp;
> +    u8 alignment_offset;
> +    u16 min_io_size;
> +    u32 opt_io_size;
> +} __attribute__((packed));
> +
> +/* These two define direction. */
> +#define VIRTIO_BLK_T_IN		0
> +#define VIRTIO_BLK_T_OUT	1
> +
> +/* This is the first element of the read scatter-gather list. */
> +struct virtio_blk_outhdr {
> +    /* VIRTIO_BLK_T* */
> +    u32 type;
> +    /* io priority. */
> +    u32 ioprio;
> +    /* Sector (ie. 512 byte offset) */
> +    u64 sector;
> +};
> +
> +#define VIRTIO_BLK_S_OK		0
> +#define VIRTIO_BLK_S_IOERR	1
> +#define VIRTIO_BLK_S_UNSUPP	2
> +
> +int process_virtio_op(struct disk_op_s *op);
> +void virtio_blk_setup(void);
> +
> +#endif /* _VIRTIO_BLK_H */
> diff --git a/src/virtio-pci.c b/src/virtio-pci.c
> new file mode 100644
> index 0000000..e171ea3
> --- /dev/null
> +++ b/src/virtio-pci.c
> @@ -0,0 +1,67 @@
> +/* virtio-pci.c - pci interface for virtio interface
> + *
> + * (c) Copyright 2008 Bull S.A.S.
> + *
> + *  Author: Laurent Vivier<Laurent.Vivier@bull.net>
> + *
> + * some parts from Linux Virtio PCI driver
> + *
> + *  Copyright IBM Corp. 2007
> + *  Authors: Anthony Liguori<aliguori@us.ibm.com>
> + *
> + *  Adopted for Seabios: Gleb Natapov<gleb@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU LGPLv3
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "virtio-ring.h"
> +#include "virtio-pci.h"
> +
> +int vp_find_vq(unsigned int ioaddr, int queue_index,
> +               struct vring_virtqueue *vq)
> +{
> +   struct vring * vr =&vq->vring;
> +   u16 num;
> +
> +   ASSERT32FLAT();
> +   /* select the queue */
> +
> +   outw(queue_index, ioaddr + VIRTIO_PCI_QUEUE_SEL);
> +
> +   /* check if the queue is available */
> +
> +   num = inw(ioaddr + VIRTIO_PCI_QUEUE_NUM);
> +   if (!num) {
> +       dprintf(1, "ERROR: queue size is 0\n");
> +       return -1;
> +   }
> +
> +   if (num>  MAX_QUEUE_NUM) {
> +       dprintf(1, "ERROR: queue size %d>  %d\n", num, MAX_QUEUE_NUM);
> +       return -1;
> +   }
> +
> +   /* check if the queue is already active */
> +
> +   if (inl(ioaddr + VIRTIO_PCI_QUEUE_PFN)) {
> +       dprintf(1, "ERROR: queue already active\n");
> +       return -1;
> +   }
> +
> +   vq->queue_index = queue_index;
> +
> +   /* initialize the queue */
> +
> +   vring_init(vr, num, (unsigned char*)&vq->queue);
> +
> +   /* activate the queue
> +    *
> +    * NOTE: vr->desc is initialized by vring_init()
> +    */
> +
> +   outl((unsigned long)virt_to_phys(vr->desc)>>  PAGE_SHIFT,
> +        ioaddr + VIRTIO_PCI_QUEUE_PFN);
> +
> +   return num;
> +}
> diff --git a/src/virtio-pci.h b/src/virtio-pci.h
> new file mode 100644
> index 0000000..6932036
> --- /dev/null
> +++ b/src/virtio-pci.h
> @@ -0,0 +1,97 @@
> +#ifndef _VIRTIO_PCI_H
> +#define _VIRTIO_PCI_H
> +
> +/* A 32-bit r/o bitmask of the features supported by the host */
> +#define VIRTIO_PCI_HOST_FEATURES        0
> +
> +/* A 32-bit r/w bitmask of features activated by the guest */
> +#define VIRTIO_PCI_GUEST_FEATURES       4
> +
> +/* A 32-bit r/w PFN for the currently selected queue */
> +#define VIRTIO_PCI_QUEUE_PFN            8
> +
> +/* A 16-bit r/o queue size for the currently selected queue */
> +#define VIRTIO_PCI_QUEUE_NUM            12
> +
> +/* A 16-bit r/w queue selector */
> +#define VIRTIO_PCI_QUEUE_SEL            14
> +
> +/* A 16-bit r/w queue notifier */
> +#define VIRTIO_PCI_QUEUE_NOTIFY         16
> +
> +/* An 8-bit device status register.  */
> +#define VIRTIO_PCI_STATUS               18
> +
> +/* An 8-bit r/o interrupt status register.  Reading the value will return the
> + * current contents of the ISR and will also clear it.  This is effectively
> + * a read-and-acknowledge. */
> +#define VIRTIO_PCI_ISR                  19
> +
> +/* The bit of the ISR which indicates a device configuration change. */
> +#define VIRTIO_PCI_ISR_CONFIG           0x2
> +
> +/* The remaining space is defined by each driver as the per-driver
> + * configuration space */
> +#define VIRTIO_PCI_CONFIG               20
> +
> +/* Virtio ABI version, this must match exactly */
> +#define VIRTIO_PCI_ABI_VERSION          0
> +
> +static inline u32 vp_get_features(unsigned int ioaddr)
> +{
> +   return inl(ioaddr + VIRTIO_PCI_HOST_FEATURES);
> +}
> +
> +static inline void vp_set_features(unsigned int ioaddr, u32 features)
> +{
> +        outl(features, ioaddr + VIRTIO_PCI_GUEST_FEATURES);
> +}
> +
> +static inline void vp_get(unsigned int ioaddr, unsigned offset,
> +                     void *buf, unsigned len)
> +{
> +   u8 *ptr = buf;
> +   unsigned i;
> +
> +   for (i = 0; i<  len; i++)
> +           ptr[i] = inb(ioaddr + VIRTIO_PCI_CONFIG + offset + i);
> +}
> +
> +static inline u8 vp_get_status(unsigned int ioaddr)
> +{
> +   return inb(ioaddr + VIRTIO_PCI_STATUS);
> +}
> +
> +static inline void vp_set_status(unsigned int ioaddr, u8 status)
> +{
> +   if (status == 0)        /* reset */
> +           return;
> +   outb(status, ioaddr + VIRTIO_PCI_STATUS);
> +}
> +
> +
> +static inline void vp_reset(unsigned int ioaddr)
> +{
> +   outb(0, ioaddr + VIRTIO_PCI_STATUS);
> +   (void)inb(ioaddr + VIRTIO_PCI_ISR);
> +}
> +
> +static inline void vp_notify(unsigned int ioaddr, int queue_index)
> +{
> +   outw(queue_index, ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
> +}
> +
> +static inline void vp_del_vq(unsigned int ioaddr, int queue_index)
> +{
> +   /* select the queue */
> +
> +   outw(queue_index, ioaddr + VIRTIO_PCI_QUEUE_SEL);
> +
> +   /* deactivate the queue */
> +
> +   outl(0, ioaddr + VIRTIO_PCI_QUEUE_PFN);
> +}
> +
> +int vp_find_vq(unsigned int ioaddr, int queue_index,
> +               struct vring_virtqueue *vq);
> +#endif /* _VIRTIO_PCI_H_ */
> diff --git a/src/virtio-ring.c b/src/virtio-ring.c
> new file mode 100644
> index 0000000..f4a2efe
> --- /dev/null
> +++ b/src/virtio-ring.c
> @@ -0,0 +1,152 @@
> +/* virtio-pci.c - virtio ring management
> + *
> + * (c) Copyright 2008 Bull S.A.S.
> + *
> + *  Author: Laurent Vivier<Laurent.Vivier@bull.net>
> + *
> + *  some parts from Linux Virtio Ring
> + *
> + *  Copyright Rusty Russell IBM Corporation 2007
> + *
> + *  Adopted for Seabios: Gleb Natapov<gleb@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU LGPLv3
> + * See the COPYING file in the top-level directory.
> + *
> + *
> + */
> +
> +#include "virtio-ring.h"
> +#include "virtio-pci.h"
> +
> +#define BUG() do {                                      \
> +        dprintf(1, "BUG: failure at %s:%d/%s()!\n",     \
> +                __FILE__, __LINE__, __FUNCTION__);      \
> +                while(1);                               \
> +        } while (0)
> +#define BUG_ON(condition) do { if (condition) BUG(); } while (0)
> +
> +/*
> + * vring_more_used
> + *
> + * is there some used buffers ?
> + *
> + */
> +
> +int vring_more_used(struct vring_virtqueue *vq)
> +{
> +    struct vring_used *used = GET_FLATPTR(vq->vring.used);
> +    wmb();
> +    return GET_FLATPTR(vq->last_used_idx) != GET_FLATPTR(used->idx);
> +}
> +
> +/*
> + * vring_free
> + *
> + * put at the begin of the free list the current desc[head]
> + */
> +
> +void vring_detach(struct vring_virtqueue *vq, unsigned int head)
> +{
> +    struct vring *vr =&vq->vring;
> +    struct vring_desc *desc = GET_FLATPTR(vr->desc);
> +    unsigned int i;
> +
> +    /* find end of given descriptor */
> +
> +    i = head;
> +    while (GET_FLATPTR(desc[i].flags)&  VRING_DESC_F_NEXT)
> +        i = GET_FLATPTR(desc[i].next);
> +
> +    /* link it with free list and point to it */
> +
> +    SET_FLATPTR(desc[i].next, GET_FLATPTR(vq->free_head));
> +    wmb();
> +    SET_FLATPTR(vq->free_head, head);
> +}
> +
> +/*
> + * vring_get_buf
> + *
> + * get a buffer from the used list
> + *
> + */
> +
> +int vring_get_buf(struct vring_virtqueue *vq, unsigned int *len)
> +{
> +    struct vring *vr =&vq->vring;
> +    struct vring_used_elem *elem;
> +    struct vring_used *used = GET_FLATPTR(vq->vring.used);
> +    u32 id;
> +    int ret;
> +
> +//    BUG_ON(!vring_more_used(vq));
> +
> +    elem =&used->ring[GET_FLATPTR(vq->last_used_idx) % GET_FLATPTR(vr->num)];
> +    wmb();
> +    id = GET_FLATPTR(elem->id);
> +    if (len != NULL)
> +        *len = GET_FLATPTR(elem->len);
> +
> +    ret = GET_FLATPTR(vq->vdata[id]);
> +
> +    vring_detach(vq, id);
> +
> +    SET_FLATPTR(vq->last_used_idx, GET_FLATPTR(vq->last_used_idx) + 1);
> +
> +    return ret;
> +}
> +
> +void vring_add_buf(struct vring_virtqueue *vq,
> +                   struct vring_list list[],
> +                   unsigned int out, unsigned int in,
> +                   int index, int num_added)
> +{
> +    struct vring *vr =&vq->vring;
> +    int i, av, head, prev;
> +    struct vring_desc *desc = GET_FLATPTR(vr->desc);
> +    struct vring_avail *avail = GET_FLATPTR(vr->avail);
> +
> +    BUG_ON(out + in == 0);
> +
> +    prev = 0;
> +    head = GET_FLATPTR(vq->free_head);
> +    for (i = head; out; i = GET_FLATPTR(desc[i].next), out--) {
> +        SET_FLATPTR(desc[i].flags, VRING_DESC_F_NEXT);
> +        SET_FLATPTR(desc[i].addr, (u64)virt_to_phys(list->addr));
> +        SET_FLATPTR(desc[i].len, list->length);
> +        prev = i;
> +        list++;
> +    }
> +    for ( ; in; i = GET_FLATPTR(desc[i].next), in--) {
> +        SET_FLATPTR(desc[i].flags, VRING_DESC_F_NEXT|VRING_DESC_F_WRITE);
> +        SET_FLATPTR(desc[i].addr, (u64)virt_to_phys(list->addr));
> +        SET_FLATPTR(desc[i].len, list->length);
> +        prev = i;
> +        list++;
> +    }
> +    SET_FLATPTR(desc[prev].flags,
> +                GET_FLATPTR(desc[prev].flags)&  ~VRING_DESC_F_NEXT);
> +
> +    SET_FLATPTR(vq->free_head, i);
> +
> +    SET_FLATPTR(vq->vdata[head], index);
> +
> +    av = (GET_FLATPTR(avail->idx) + num_added) % GET_FLATPTR(vr->num);
> +    SET_FLATPTR(avail->ring[av], head);
> +    wmb();
> +}
> +
> +void vring_kick(unsigned int ioaddr, struct vring_virtqueue *vq, int num_added)
> +{
> +    struct vring *vr =&vq->vring;
> +    struct vring_avail *avail = GET_FLATPTR(vr->avail);
> +    struct vring_used *used = GET_FLATPTR(vq->vring.used);
> +
> +    wmb();
> +    SET_FLATPTR(avail->idx, GET_FLATPTR(avail->idx) + num_added);
> +
> +    mb();
> +    if (!(GET_FLATPTR(used->flags)&  VRING_USED_F_NO_NOTIFY))
> +        vp_notify(ioaddr, GET_FLATPTR(vq->queue_index));
> +}
> diff --git a/src/virtio-ring.h b/src/virtio-ring.h
> new file mode 100644
> index 0000000..b97d572
> --- /dev/null
> +++ b/src/virtio-ring.h
> @@ -0,0 +1,125 @@
> +#ifndef _VIRTIO_RING_H
> +#define _VIRTIO_RING_H
> +
> +#define PAGE_SHIFT 12
> +#define PAGE_MASK  (PAGE_SIZE-1)
> +
> +#define virt_to_phys(v) (unsigned long)(v)
> +#define phys_to_virt(p) (void*)(p)
> +#define wmb() barrier()
> +#define mb() barrier()
> +
> +/* Status byte for guest to report progress, and synchronize features. */
> +/* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */
> +#define VIRTIO_CONFIG_S_ACKNOWLEDGE     1
> +/* We have found a driver for the device. */
> +#define VIRTIO_CONFIG_S_DRIVER          2
> +/* Driver has used its parts of the config, and is happy */
> +#define VIRTIO_CONFIG_S_DRIVER_OK       4
> +/* We've given up on this device. */
> +#define VIRTIO_CONFIG_S_FAILED          0x80
> +
> +#define MAX_QUEUE_NUM      (128)
> +
> +#define VRING_DESC_F_NEXT  1
> +#define VRING_DESC_F_WRITE 2
> +
> +#define VRING_AVAIL_F_NO_INTERRUPT 1
> +
> +#define VRING_USED_F_NO_NOTIFY     1
> +
> +struct vring_desc
> +{
> +   u64 addr;
> +   u32 len;
> +   u16 flags;
> +   u16 next;
> +};
> +
> +struct vring_avail
> +{
> +   u16 flags;
> +   u16 idx;
> +   u16 ring[0];
> +};
> +
> +struct vring_used_elem
> +{
> +   u32 id;
> +   u32 len;
> +};
> +
> +struct vring_used
> +{
> +   u16 flags;
> +   u16 idx;
> +   struct vring_used_elem ring[];
> +};
> +
> +struct vring {
> +   unsigned int num;
> +   struct vring_desc *desc;
> +   struct vring_avail *avail;
> +   struct vring_used *used;
> +};
> +
> +#define vring_size(num) \
> +   (((((sizeof(struct vring_desc) * num) + \
> +      (sizeof(struct vring_avail) + sizeof(u16) * num)) \
> +         + PAGE_MASK)&  ~PAGE_MASK) + \
> +         (sizeof(struct vring_used) + sizeof(struct vring_used_elem) * num))
> +
> +typedef unsigned char virtio_queue_t[PAGE_MASK + vring_size(MAX_QUEUE_NUM)];
> +
> +struct vring_virtqueue {
> +   virtio_queue_t queue;
> +   struct vring vring;
> +   u16 free_head;
> +   u16 last_used_idx;
> +   u16 vdata[MAX_QUEUE_NUM];
> +   /* PCI */
> +   int queue_index;
> +};
> +
> +struct vring_list {
> +  char *addr;
> +  unsigned int length;
> +};
> +
> +static inline void vring_init(struct vring *vr,
> +                         unsigned int num, unsigned char *queue)
> +{
> +   unsigned int i;
> +   unsigned long pa;
> +
> +   ASSERT32FLAT();
> +   vr->num = num;
> +
> +   /* physical address of desc must be page aligned */
> +
> +   pa = virt_to_phys(queue);
> +   pa = (pa + PAGE_MASK)&  ~PAGE_MASK;
> +   vr->desc = phys_to_virt(pa);
> +
> +   vr->avail = (struct vring_avail *)&vr->desc[num];
> +
> +   /* physical address of used must be page aligned */
> +
> +   pa = virt_to_phys(&vr->avail->ring[num]);
> +   pa = (pa + PAGE_MASK)&  ~PAGE_MASK;
> +   vr->used = phys_to_virt(pa);
> +
> +   for (i = 0; i<  num - 1; i++)
> +           vr->desc[i].next = i + 1;
> +   vr->desc[i].next = 0;
> +}
> +
> +int vring_more_used(struct vring_virtqueue *vq);
> +void vring_detach(struct vring_virtqueue *vq, unsigned int head);
> +int vring_get_buf(struct vring_virtqueue *vq, unsigned int *len);
> +void vring_add_buf(struct vring_virtqueue *vq, struct vring_list list[],
> +                   unsigned int out, unsigned int in,
> +                   int index, int num_added);
> +void vring_kick(unsigned int ioaddr, struct vring_virtqueue *vq, int num_added);
> +
> +#endif /* _VIRTIO_RING_H_ */
>
> --
> 			Gleb.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Gleb Natapov - May 10, 2010, 3:54 p.m.
On Mon, May 10, 2010 at 10:48:42AM -0500, Anthony Liguori wrote:
> On 05/10/2010 03:11 AM, Gleb Natapov wrote:
> >This patch adds native support for booting from virtio disks to Seabios.
> >
> >Signed-off-by: Gleb Natapov<gleb@redhat.com>
> 
> A related problem that I think we need to think about how we solve
> is indicating to Seabios which device we want to boot from
> 
> With your patch, a user can select a virtio device explicitly or if
> they use only one virtio device, it will Just Work.
> 
> However, if a user uses IDE and virtio, or a user has multiple
> disks, they cannot select a device via -boot.
> 
Isn't this problem unrelated to this patch?  I mean, if I start qemu with
two IDE devices, can I specify from the qemu command line which one I want
to boot from?

> Is this something we need to address?  I don't think we'd break
> libvirt if we didn't.
> 
> Regards,
> 
> Anthony Liguori
> 
> >---
> >
> >Changelog:
> >  v1->v2:
> >   - free memory in case of vq initialization error.
> >   - change license of virtio ring/pci to LGPLv3 with permission
> >     of Laurent Vivier (aka the author).
> >
> >diff --git a/Makefile b/Makefile
> >index 327a1bf..d0b8881 100644
> >--- a/Makefile
> >+++ b/Makefile
> >@@ -14,7 +14,8 @@ OUT=out/
> >  SRCBOTH=misc.c pmm.c stacks.c output.c util.c block.c floppy.c ata.c mouse.c \
> >          kbd.c pci.c serial.c clock.c pic.c cdrom.c ps2port.c smp.c resume.c \
> >          pnpbios.c pirtable.c vgahooks.c ramdisk.c pcibios.c blockcmd.c \
> >-        usb.c usb-uhci.c usb-ohci.c usb-ehci.c usb-hid.c usb-msc.c
> >+        usb.c usb-uhci.c usb-ohci.c usb-ehci.c usb-hid.c usb-msc.c \
> >+        virtio-ring.c virtio-pci.c virtio-blk.c
> >  SRC16=$(SRCBOTH) system.c disk.c apm.c font.c
> >  SRC32FLAT=$(SRCBOTH) post.c shadow.c memmap.c coreboot.c boot.c \
> >        acpi.c smm.c mptable.c smbios.c pciinit.c optionroms.c mtrr.c \
> >diff --git a/src/block.c b/src/block.c
> >index ddf441f..b6b1902 100644
> >--- a/src/block.c
> >+++ b/src/block.c
> >@@ -11,6 +11,7 @@
> >  #include "util.h" // dprintf
> >  #include "ata.h" // process_ata_op
> >  #include "usb-msc.h" // process_usb_op
> >+#include "virtio-blk.h" // process_virtio_op
> >
> >  struct drives_s Drives VAR16VISIBLE;
> >
> >@@ -289,6 +290,8 @@ process_op(struct disk_op_s *op)
> >          return process_cdemu_op(op);
> >      case DTYPE_USB:
> >          return process_usb_op(op);
> >+    case DTYPE_VIRTIO:
> >+	return process_virtio_op(op);
> >      default:
> >          op->count = 0;
> >          return DISK_RET_EPARAM;
> >diff --git a/src/config.h b/src/config.h
> >index b101174..ad569c6 100644
> >--- a/src/config.h
> >+++ b/src/config.h
> >@@ -136,6 +136,9 @@
> >  #define CONFIG_SUBMODEL_ID   0x00
> >  #define CONFIG_BIOS_REVISION 0x01
> >
> >+// Support boot from virtio storage
> >+#define CONFIG_VIRTIO_BLK 1
> >+
> >  // Various memory addresses used by the code.
> >  #define BUILD_STACK_ADDR          0x7000
> >  #define BUILD_S3RESUME_STACK_ADDR 0x1000
> >diff --git a/src/disk.h b/src/disk.h
> >index 0cd1b74..9e5b083 100644
> >--- a/src/disk.h
> >+++ b/src/disk.h
> >@@ -197,6 +197,7 @@ struct drive_s {
> >  #define DTYPE_RAMDISK  0x04
> >  #define DTYPE_CDEMU    0x05
> >  #define DTYPE_USB      0x06
> >+#define DTYPE_VIRTIO   0x07
> >
> >  #define MAXDESCSIZE 80
> >
> >diff --git a/src/pci_ids.h b/src/pci_ids.h
> >index 1800f1d..e1cded2 100644
> >--- a/src/pci_ids.h
> >+++ b/src/pci_ids.h
> >@@ -2605,3 +2605,6 @@
> >  #define PCI_DEVICE_ID_RME_DIGI32	0x9896
> >  #define PCI_DEVICE_ID_RME_DIGI32_PRO	0x9897
> >  #define PCI_DEVICE_ID_RME_DIGI32_8	0x9898
> >+
> >+#define PCI_VENDOR_ID_REDHAT_QUMRANET	0x1af4
> >+#define PCI_DEVICE_ID_VIRTIO_BLK	0x1001
> >diff --git a/src/post.c b/src/post.c
> >index 638b0f7..25535e2 100644
> >--- a/src/post.c
> >+++ b/src/post.c
> >@@ -23,6 +23,7 @@
> >  #include "smbios.h" // smbios_init
> >  #include "paravirt.h" // qemu_cfg_port_probe
> >  #include "ps2port.h" // ps2port_setup
> >+#include "virtio-blk.h" // virtio_blk_setup
> >
> >  void
> >  __set_irq(int vector, void *loc)
> >@@ -184,6 +185,7 @@ init_hw(void)
> >      floppy_setup();
> >      ata_setup();
> >      ramdisk_setup();
> >+    virtio_blk_setup();
> >  }
> >
> >  // Main setup code.
> >diff --git a/src/virtio-blk.c b/src/virtio-blk.c
> >new file mode 100644
> >index 0000000..a41c336
> >--- /dev/null
> >+++ b/src/virtio-blk.c
> >@@ -0,0 +1,155 @@
> >+// Virtio blovl boot support.
> >+//
> >+// Copyright (C) 2010 Red Hat Inc.
> >+//
> >+// Authors:
> >+//  Gleb Natapov<gnatapov@redhat.com>
> >+//
> >+// This file may be distributed under the terms of the GNU LGPLv3 license.
> >+
> >+#include "util.h" // dprintf
> >+#include "pci.h" // foreachpci
> >+#include "config.h" // CONFIG_*
> >+#include "virtio-pci.h"
> >+#include "virtio-blk.h"
> >+#include "disk.h"
> >+
> >+struct virtiodrive_s {
> >+    struct drive_s drive;
> >+    struct vring_virtqueue *vq;
> >+    u16 ioaddr;
> >+};
> >+
> >+static int
> >+virtio_blk_read(struct disk_op_s *op)
> >+{
> >+    struct virtiodrive_s *vdrive_g =
> >+        container_of(op->drive_g, struct virtiodrive_s, drive);
> >+    struct vring_virtqueue *vq = GET_GLOBAL(vdrive_g->vq);
> >+    struct virtio_blk_outhdr hdr = {
> >+        .type = VIRTIO_BLK_T_IN,
> >+        .ioprio = 0,
> >+        .sector = op->lba,
> >+    };
> >+    u8 status = VIRTIO_BLK_S_UNSUPP;
> >+    struct vring_list sg[] = {
> >+        {
> >+            .addr	= MAKE_FLATPTR(GET_SEG(SS),&hdr),
> >+            .length	= sizeof(hdr),
> >+        },
> >+        {
> >+            .addr	= op->buf_fl,
> >+            .length	= GET_GLOBAL(vdrive_g->drive.blksize) * op->count,
> >+        },
> >+        {
> >+            .addr	= MAKE_FLATPTR(GET_SEG(SS),&status),
> >+            .length	= sizeof(status),
> >+        },
> >+    };
> >+
> >+    /* Add to virtqueue and kick host */
> >+    vring_add_buf(vq, sg, 1, 2, 0, 0);
> >+    vring_kick(GET_GLOBAL(vdrive_g->ioaddr), vq, 1);
> >+
> >+    /* Wait for reply */
> >+    while (!vring_more_used(vq))
> >+        udelay(5);
> >+
> >+    /* Reclaim virtqueue element */
> >+    vring_get_buf(vq, NULL);
> >+    return status == VIRTIO_BLK_S_OK ? DISK_RET_SUCCESS : DISK_RET_EBADTRACK;
> >+}
> >+
> >+int
> >+process_virtio_op(struct disk_op_s *op)
> >+{
> >+    switch (op->command) {
> >+    case CMD_READ:
> >+        return virtio_blk_read(op);
> >+    case CMD_FORMAT:
> >+    case CMD_WRITE:
> >+        return DISK_RET_EWRITEPROTECT;
> >+    case CMD_RESET:
> >+    case CMD_ISREADY:
> >+    case CMD_VERIFY:
> >+    case CMD_SEEK:
> >+        return DISK_RET_SUCCESS;
> >+    default:
> >+        op->count = 0;
> >+        return DISK_RET_EPARAM;
> >+    }
> >+}
> >+
> >+void
> >+virtio_blk_setup(void)
> >+{
> >+    ASSERT32FLAT();
> >+    if (! CONFIG_VIRTIO_BLK)
> >+        return;
> >+
> >+    dprintf(3, "init virtio-blk\n");
> >+
> >+    int bdf, max;
> >+    u32 id = PCI_VENDOR_ID_REDHAT_QUMRANET | (PCI_DEVICE_ID_VIRTIO_BLK<<  16);
> >+    foreachpci(bdf, max) {
> >+        u32 v = pci_config_readl(bdf, PCI_VENDOR_ID);
> >+        if (v != id)
> >+            continue;
> >+        dprintf(3, "found virtio-blk at %x:%x\n", pci_bdf_to_bus(bdf),
> >+                pci_bdf_to_dev(bdf));
> >+        char *desc = malloc_tmphigh(MAXDESCSIZE);
> >+        struct virtiodrive_s *vdrive_g = malloc_fseg(sizeof(*vdrive_g));
> >+        struct vring_virtqueue *vq = malloc_low(sizeof(*vq));
> >+        if (!vdrive_g || !desc || !vq) {
> >+            warn_noalloc();
> >+            return;
> >+        }
> >+        memset(vdrive_g, 0, sizeof(*vdrive_g));
> >+        vdrive_g->drive.type = DTYPE_VIRTIO;
> >+        vdrive_g->drive.cntl_id = bdf;
> >+        vdrive_g->vq = vq;
> >+
> >+        u16 ioaddr = pci_config_readl(bdf, PCI_BASE_ADDRESS_0)&
> >+            PCI_BASE_ADDRESS_IO_MASK;
> >+
> >+        vdrive_g->ioaddr = ioaddr;
> >+
> >+        vp_reset(ioaddr);
> >+        vp_set_status(ioaddr, VIRTIO_CONFIG_S_ACKNOWLEDGE |
> >+                      VIRTIO_CONFIG_S_DRIVER );
> >+
> >+        if (vp_find_vq(ioaddr, 0, vdrive_g->vq)<  0 ) {
> >+            free(vdrive_g);
> >+            free(desc);
> >+            free(vq);
> >+            dprintf(1, "fail to find vq for virtio-blk %x:%x\n",
> >+                    pci_bdf_to_bus (bdf), pci_bdf_to_dev(bdf));
> >+            continue;
> >+        }
> >+
> >+        struct virtio_blk_config cfg;
> >+        vp_get(ioaddr, 0,&cfg, sizeof(cfg));
> >+
> >+        vdrive_g->drive.blksize = cfg.blk_size;
> >+        vdrive_g->drive.sectors = cfg.capacity;
> >+        dprintf(3, "virtio-blk %x:%x blksize=%d sectors=%u\n",
> >+                pci_bdf_to_bus (bdf), pci_bdf_to_dev(bdf),
> >+                vdrive_g->drive.blksize, (u32)vdrive_g->drive.sectors);
> >+
> >+        vdrive_g->drive.pchs.cylinders = cfg.cylinders;
> >+        vdrive_g->drive.pchs.heads = cfg.heads;
> >+        vdrive_g->drive.pchs.spt = cfg.sectors;
> >+
> >+        setup_translation(&vdrive_g->drive);
> >+        add_bcv_internal(&vdrive_g->drive);
> >+
> >+        snprintf(desc, MAXDESCSIZE, "Virtio disk PCI:%x:%x",
> >+                 pci_bdf_to_bus(bdf), pci_bdf_to_dev(bdf));
> >+
> >+        vdrive_g->drive.desc = desc;
> >+
> >+        vp_set_status(ioaddr, VIRTIO_CONFIG_S_ACKNOWLEDGE |
> >+                      VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK);
> >+    }
> >+}
> >+
> >diff --git a/src/virtio-blk.h b/src/virtio-blk.h
> >new file mode 100644
> >index 0000000..3369ea4
> >--- /dev/null
> >+++ b/src/virtio-blk.h
> >@@ -0,0 +1,40 @@
> >+#ifndef _VIRTIO_BLK_H
> >+#define _VIRTIO_BLK_H
> >+
> >+struct virtio_blk_config
> >+{
> >+    u64 capacity;
> >+    u32 size_max;
> >+    u32 seg_max;
> >+    u16 cylinders;
> >+    u8 heads;
> >+    u8 sectors;
> >+    u32 blk_size;
> >+    u8 physical_block_exp;
> >+    u8 alignment_offset;
> >+    u16 min_io_size;
> >+    u32 opt_io_size;
> >+} __attribute__((packed));
> >+
> >+/* These two define direction. */
> >+#define VIRTIO_BLK_T_IN		0
> >+#define VIRTIO_BLK_T_OUT	1
> >+
> >+/* This is the first element of the read scatter-gather list. */
> >+struct virtio_blk_outhdr {
> >+    /* VIRTIO_BLK_T* */
> >+    u32 type;
> >+    /* io priority. */
> >+    u32 ioprio;
> >+    /* Sector (ie. 512 byte offset) */
> >+    u64 sector;
> >+};
> >+
> >+#define VIRTIO_BLK_S_OK		0
> >+#define VIRTIO_BLK_S_IOERR	1
> >+#define VIRTIO_BLK_S_UNSUPP	2
> >+
> >+int process_virtio_op(struct disk_op_s *op);
> >+void virtio_blk_setup(void);
> >+
> >+#endif /* _VIRTIO_BLK_H */
> >diff --git a/src/virtio-pci.c b/src/virtio-pci.c
> >new file mode 100644
> >index 0000000..e171ea3
> >--- /dev/null
> >+++ b/src/virtio-pci.c
> >@@ -0,0 +1,67 @@
> >+/* virtio-pci.c - pci interface for virtio interface
> >+ *
> >+ * (c) Copyright 2008 Bull S.A.S.
> >+ *
> >+ *  Author: Laurent Vivier<Laurent.Vivier@bull.net>
> >+ *
> >+ * some parts from Linux Virtio PCI driver
> >+ *
> >+ *  Copyright IBM Corp. 2007
> >+ *  Authors: Anthony Liguori<aliguori@us.ibm.com>
> >+ *
> >+ *  Adopted for Seabios: Gleb Natapov<gleb@redhat.com>
> >+ *
> >+ * This work is licensed under the terms of the GNU LGPLv3
> >+ * See the COPYING file in the top-level directory.
> >+ */
> >+
> >+#include "virtio-ring.h"
> >+#include "virtio-pci.h"
> >+
> >+int vp_find_vq(unsigned int ioaddr, int queue_index,
> >+               struct vring_virtqueue *vq)
> >+{
> >+   struct vring * vr =&vq->vring;
> >+   u16 num;
> >+
> >+   ASSERT32FLAT();
> >+   /* select the queue */
> >+
> >+   outw(queue_index, ioaddr + VIRTIO_PCI_QUEUE_SEL);
> >+
> >+   /* check if the queue is available */
> >+
> >+   num = inw(ioaddr + VIRTIO_PCI_QUEUE_NUM);
> >+   if (!num) {
> >+       dprintf(1, "ERROR: queue size is 0\n");
> >+       return -1;
> >+   }
> >+
> >+   if (num>  MAX_QUEUE_NUM) {
> >+       dprintf(1, "ERROR: queue size %d>  %d\n", num, MAX_QUEUE_NUM);
> >+       return -1;
> >+   }
> >+
> >+   /* check if the queue is already active */
> >+
> >+   if (inl(ioaddr + VIRTIO_PCI_QUEUE_PFN)) {
> >+       dprintf(1, "ERROR: queue already active\n");
> >+       return -1;
> >+   }
> >+
> >+   vq->queue_index = queue_index;
> >+
> >+   /* initialize the queue */
> >+
> >+   vring_init(vr, num, (unsigned char*)&vq->queue);
> >+
> >+   /* activate the queue
> >+    *
> >+    * NOTE: vr->desc is initialized by vring_init()
> >+    */
> >+
> >+   outl((unsigned long)virt_to_phys(vr->desc)>>  PAGE_SHIFT,
> >+        ioaddr + VIRTIO_PCI_QUEUE_PFN);
> >+
> >+   return num;
> >+}
> >diff --git a/src/virtio-pci.h b/src/virtio-pci.h
> >new file mode 100644
> >index 0000000..6932036
> >--- /dev/null
> >+++ b/src/virtio-pci.h
> >@@ -0,0 +1,97 @@
> >+#ifndef _VIRTIO_PCI_H
> >+#define _VIRTIO_PCI_H
> >+
> >+/* A 32-bit r/o bitmask of the features supported by the host */
> >+#define VIRTIO_PCI_HOST_FEATURES        0
> >+
> >+/* A 32-bit r/w bitmask of features activated by the guest */
> >+#define VIRTIO_PCI_GUEST_FEATURES       4
> >+
> >+/* A 32-bit r/w PFN for the currently selected queue */
> >+#define VIRTIO_PCI_QUEUE_PFN            8
> >+
> >+/* A 16-bit r/o queue size for the currently selected queue */
> >+#define VIRTIO_PCI_QUEUE_NUM            12
> >+
> >+/* A 16-bit r/w queue selector */
> >+#define VIRTIO_PCI_QUEUE_SEL            14
> >+
> >+/* A 16-bit r/w queue notifier */
> >+#define VIRTIO_PCI_QUEUE_NOTIFY         16
> >+
> >+/* An 8-bit device status register.  */
> >+#define VIRTIO_PCI_STATUS               18
> >+
> >+/* An 8-bit r/o interrupt status register.  Reading the value will return the
> >+ * current contents of the ISR and will also clear it.  This is effectively
> >+ * a read-and-acknowledge. */
> >+#define VIRTIO_PCI_ISR                  19
> >+
> >+/* The bit of the ISR which indicates a device configuration change. */
> >+#define VIRTIO_PCI_ISR_CONFIG           0x2
> >+
> >+/* The remaining space is defined by each driver as the per-driver
> >+ * configuration space */
> >+#define VIRTIO_PCI_CONFIG               20
> >+
> >+/* Virtio ABI version, this must match exactly */
> >+#define VIRTIO_PCI_ABI_VERSION          0
> >+
> >+static inline u32 vp_get_features(unsigned int ioaddr)
> >+{
> >+   return inl(ioaddr + VIRTIO_PCI_HOST_FEATURES);
> >+}
> >+
> >+static inline void vp_set_features(unsigned int ioaddr, u32 features)
> >+{
> >+        outl(features, ioaddr + VIRTIO_PCI_GUEST_FEATURES);
> >+}
> >+
> >+static inline void vp_get(unsigned int ioaddr, unsigned offset,
> >+                     void *buf, unsigned len)
> >+{
> >+   u8 *ptr = buf;
> >+   unsigned i;
> >+
> >+   for (i = 0; i<  len; i++)
> >+           ptr[i] = inb(ioaddr + VIRTIO_PCI_CONFIG + offset + i);
> >+}
> >+
> >+static inline u8 vp_get_status(unsigned int ioaddr)
> >+{
> >+   return inb(ioaddr + VIRTIO_PCI_STATUS);
> >+}
> >+
> >+static inline void vp_set_status(unsigned int ioaddr, u8 status)
> >+{
> >+   if (status == 0)        /* reset */
> >+           return;
> >+   outb(status, ioaddr + VIRTIO_PCI_STATUS);
> >+}
> >+
> >+
> >+static inline void vp_reset(unsigned int ioaddr)
> >+{
> >+   outb(0, ioaddr + VIRTIO_PCI_STATUS);
> >+   (void)inb(ioaddr + VIRTIO_PCI_ISR);
> >+}
> >+
> >+static inline void vp_notify(unsigned int ioaddr, int queue_index)
> >+{
> >+   outw(queue_index, ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
> >+}
> >+
> >+static inline void vp_del_vq(unsigned int ioaddr, int queue_index)
> >+{
> >+   /* select the queue */
> >+
> >+   outw(queue_index, ioaddr + VIRTIO_PCI_QUEUE_SEL);
> >+
> >+   /* deactivate the queue */
> >+
> >+   outl(0, ioaddr + VIRTIO_PCI_QUEUE_PFN);
> >+}
> >+
> >+int vp_find_vq(unsigned int ioaddr, int queue_index,
> >+               struct vring_virtqueue *vq);
> >+#endif /* _VIRTIO_PCI_H_ */
> >diff --git a/src/virtio-ring.c b/src/virtio-ring.c
> >new file mode 100644
> >index 0000000..f4a2efe
> >--- /dev/null
> >+++ b/src/virtio-ring.c
> >@@ -0,0 +1,152 @@
> >+/* virtio-pci.c - virtio ring management
> >+ *
> >+ * (c) Copyright 2008 Bull S.A.S.
> >+ *
> >+ *  Author: Laurent Vivier<Laurent.Vivier@bull.net>
> >+ *
> >+ *  some parts from Linux Virtio Ring
> >+ *
> >+ *  Copyright Rusty Russell IBM Corporation 2007
> >+ *
> >+ *  Adopted for Seabios: Gleb Natapov<gleb@redhat.com>
> >+ *
> >+ * This work is licensed under the terms of the GNU LGPLv3
> >+ * See the COPYING file in the top-level directory.
> >+ *
> >+ *
> >+ */
> >+
> >+#include "virtio-ring.h"
> >+#include "virtio-pci.h"
> >+
> >+#define BUG() do {                                      \
> >+        dprintf(1, "BUG: failure at %s:%d/%s()!\n",     \
> >+                __FILE__, __LINE__, __FUNCTION__);      \
> >+                while(1);                               \
> >+        } while (0)
> >+#define BUG_ON(condition) do { if (condition) BUG(); } while (0)
> >+
> >+/*
> >+ * vring_more_used
> >+ *
> >+ * is there some used buffers ?
> >+ *
> >+ */
> >+
> >+int vring_more_used(struct vring_virtqueue *vq)
> >+{
> >+    struct vring_used *used = GET_FLATPTR(vq->vring.used);
> >+    wmb();
> >+    return GET_FLATPTR(vq->last_used_idx) != GET_FLATPTR(used->idx);
> >+}
> >+
> >+/*
> >+ * vring_free
> >+ *
> >+ * put at the begin of the free list the current desc[head]
> >+ */
> >+
> >+void vring_detach(struct vring_virtqueue *vq, unsigned int head)
> >+{
> >+    struct vring *vr =&vq->vring;
> >+    struct vring_desc *desc = GET_FLATPTR(vr->desc);
> >+    unsigned int i;
> >+
> >+    /* find end of given descriptor */
> >+
> >+    i = head;
> >+    while (GET_FLATPTR(desc[i].flags)&  VRING_DESC_F_NEXT)
> >+        i = GET_FLATPTR(desc[i].next);
> >+
> >+    /* link it with free list and point to it */
> >+
> >+    SET_FLATPTR(desc[i].next, GET_FLATPTR(vq->free_head));
> >+    wmb();
> >+    SET_FLATPTR(vq->free_head, head);
> >+}
> >+
> >+/*
> >+ * vring_get_buf
> >+ *
> >+ * get a buffer from the used list
> >+ *
> >+ */
> >+
> >+int vring_get_buf(struct vring_virtqueue *vq, unsigned int *len)
> >+{
> >+    struct vring *vr =&vq->vring;
> >+    struct vring_used_elem *elem;
> >+    struct vring_used *used = GET_FLATPTR(vq->vring.used);
> >+    u32 id;
> >+    int ret;
> >+
> >+//    BUG_ON(!vring_more_used(vq));
> >+
> >+    elem =&used->ring[GET_FLATPTR(vq->last_used_idx) % GET_FLATPTR(vr->num)];
> >+    wmb();
> >+    id = GET_FLATPTR(elem->id);
> >+    if (len != NULL)
> >+        *len = GET_FLATPTR(elem->len);
> >+
> >+    ret = GET_FLATPTR(vq->vdata[id]);
> >+
> >+    vring_detach(vq, id);
> >+
> >+    SET_FLATPTR(vq->last_used_idx, GET_FLATPTR(vq->last_used_idx) + 1);
> >+
> >+    return ret;
> >+}
> >+
> >+void vring_add_buf(struct vring_virtqueue *vq,
> >+                   struct vring_list list[],
> >+                   unsigned int out, unsigned int in,
> >+                   int index, int num_added)
> >+{
> >+    struct vring *vr =&vq->vring;
> >+    int i, av, head, prev;
> >+    struct vring_desc *desc = GET_FLATPTR(vr->desc);
> >+    struct vring_avail *avail = GET_FLATPTR(vr->avail);
> >+
> >+    BUG_ON(out + in == 0);
> >+
> >+    prev = 0;
> >+    head = GET_FLATPTR(vq->free_head);
> >+    for (i = head; out; i = GET_FLATPTR(desc[i].next), out--) {
> >+        SET_FLATPTR(desc[i].flags, VRING_DESC_F_NEXT);
> >+        SET_FLATPTR(desc[i].addr, (u64)virt_to_phys(list->addr));
> >+        SET_FLATPTR(desc[i].len, list->length);
> >+        prev = i;
> >+        list++;
> >+    }
> >+    for ( ; in; i = GET_FLATPTR(desc[i].next), in--) {
> >+        SET_FLATPTR(desc[i].flags, VRING_DESC_F_NEXT|VRING_DESC_F_WRITE);
> >+        SET_FLATPTR(desc[i].addr, (u64)virt_to_phys(list->addr));
> >+        SET_FLATPTR(desc[i].len, list->length);
> >+        prev = i;
> >+        list++;
> >+    }
> >+    SET_FLATPTR(desc[prev].flags,
> >+                GET_FLATPTR(desc[prev].flags)&  ~VRING_DESC_F_NEXT);
> >+
> >+    SET_FLATPTR(vq->free_head, i);
> >+
> >+    SET_FLATPTR(vq->vdata[head], index);
> >+
> >+    av = (GET_FLATPTR(avail->idx) + num_added) % GET_FLATPTR(vr->num);
> >+    SET_FLATPTR(avail->ring[av], head);
> >+    wmb();
> >+}
> >+
> >+void vring_kick(unsigned int ioaddr, struct vring_virtqueue *vq, int num_added)
> >+{
> >+    struct vring *vr =&vq->vring;
> >+    struct vring_avail *avail = GET_FLATPTR(vr->avail);
> >+    struct vring_used *used = GET_FLATPTR(vq->vring.used);
> >+
> >+    wmb();
> >+    SET_FLATPTR(avail->idx, GET_FLATPTR(avail->idx) + num_added);
> >+
> >+    mb();
> >+    if (!(GET_FLATPTR(used->flags)&  VRING_USED_F_NO_NOTIFY))
> >+        vp_notify(ioaddr, GET_FLATPTR(vq->queue_index));
> >+}
> >diff --git a/src/virtio-ring.h b/src/virtio-ring.h
> >new file mode 100644
> >index 0000000..b97d572
> >--- /dev/null
> >+++ b/src/virtio-ring.h
> >@@ -0,0 +1,125 @@
> >+#ifndef _VIRTIO_RING_H
> >+#define _VIRTIO_RING_H
> >+
> >+#define PAGE_SHIFT 12
> >+#define PAGE_MASK  (PAGE_SIZE-1)
> >+
> >+#define virt_to_phys(v) (unsigned long)(v)
> >+#define phys_to_virt(p) (void*)(p)
> >+#define wmb() barrier()
> >+#define mb() barrier()
> >+
> >+/* Status byte for guest to report progress, and synchronize features. */
> >+/* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */
> >+#define VIRTIO_CONFIG_S_ACKNOWLEDGE     1
> >+/* We have found a driver for the device. */
> >+#define VIRTIO_CONFIG_S_DRIVER          2
> >+/* Driver has used its parts of the config, and is happy */
> >+#define VIRTIO_CONFIG_S_DRIVER_OK       4
> >+/* We've given up on this device. */
> >+#define VIRTIO_CONFIG_S_FAILED          0x80
> >+
> >+#define MAX_QUEUE_NUM      (128)
> >+
> >+#define VRING_DESC_F_NEXT  1
> >+#define VRING_DESC_F_WRITE 2
> >+
> >+#define VRING_AVAIL_F_NO_INTERRUPT 1
> >+
> >+#define VRING_USED_F_NO_NOTIFY     1
> >+
> >+struct vring_desc
> >+{
> >+   u64 addr;
> >+   u32 len;
> >+   u16 flags;
> >+   u16 next;
> >+};
> >+
> >+struct vring_avail
> >+{
> >+   u16 flags;
> >+   u16 idx;
> >+   u16 ring[0];
> >+};
> >+
> >+struct vring_used_elem
> >+{
> >+   u32 id;
> >+   u32 len;
> >+};
> >+
> >+struct vring_used
> >+{
> >+   u16 flags;
> >+   u16 idx;
> >+   struct vring_used_elem ring[];
> >+};
> >+
> >+struct vring {
> >+   unsigned int num;
> >+   struct vring_desc *desc;
> >+   struct vring_avail *avail;
> >+   struct vring_used *used;
> >+};
> >+
> >+#define vring_size(num) \
> >+   (((((sizeof(struct vring_desc) * num) + \
> >+      (sizeof(struct vring_avail) + sizeof(u16) * num)) \
> >+         + PAGE_MASK)&  ~PAGE_MASK) + \
> >+         (sizeof(struct vring_used) + sizeof(struct vring_used_elem) * num))
> >+
> >+typedef unsigned char virtio_queue_t[PAGE_MASK + vring_size(MAX_QUEUE_NUM)];
> >+
> >+struct vring_virtqueue {
> >+   virtio_queue_t queue;
> >+   struct vring vring;
> >+   u16 free_head;
> >+   u16 last_used_idx;
> >+   u16 vdata[MAX_QUEUE_NUM];
> >+   /* PCI */
> >+   int queue_index;
> >+};
> >+
> >+struct vring_list {
> >+  char *addr;
> >+  unsigned int length;
> >+};
> >+
> >+static inline void vring_init(struct vring *vr,
> >+                         unsigned int num, unsigned char *queue)
> >+{
> >+   unsigned int i;
> >+   unsigned long pa;
> >+
> >+   ASSERT32FLAT();
> >+   vr->num = num;
> >+
> >+   /* physical address of desc must be page aligned */
> >+
> >+   pa = virt_to_phys(queue);
> >+   pa = (pa + PAGE_MASK)&  ~PAGE_MASK;
> >+   vr->desc = phys_to_virt(pa);
> >+
> >+   vr->avail = (struct vring_avail *)&vr->desc[num];
> >+
> >+   /* physical address of used must be page aligned */
> >+
> >+   pa = virt_to_phys(&vr->avail->ring[num]);
> >+   pa = (pa + PAGE_MASK)&  ~PAGE_MASK;
> >+   vr->used = phys_to_virt(pa);
> >+
> >+   for (i = 0; i<  num - 1; i++)
> >+           vr->desc[i].next = i + 1;
> >+   vr->desc[i].next = 0;
> >+}
> >+
> >+int vring_more_used(struct vring_virtqueue *vq);
> >+void vring_detach(struct vring_virtqueue *vq, unsigned int head);
> >+int vring_get_buf(struct vring_virtqueue *vq, unsigned int *len);
> >+void vring_add_buf(struct vring_virtqueue *vq, struct vring_list list[],
> >+                   unsigned int out, unsigned int in,
> >+                   int index, int num_added);
> >+void vring_kick(unsigned int ioaddr, struct vring_virtqueue *vq, int num_added);
> >+
> >+#endif /* _VIRTIO_RING_H_ */
> >
> >--
> >			Gleb.
> >--
> >To unsubscribe from this list: send the line "unsubscribe kvm" in
> >the body of a message to majordomo@vger.kernel.org
> >More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
			Gleb.
Anthony Liguori - May 10, 2010, 3:58 p.m.
On 05/10/2010 10:54 AM, Gleb Natapov wrote:
> On Mon, May 10, 2010 at 10:48:42AM -0500, Anthony Liguori wrote:
>    
>> On 05/10/2010 03:11 AM, Gleb Natapov wrote:
>>      
>>> This patch adds native support for booting from virtio disks to Seabios.
>>>
>>> Signed-off-by: Gleb Natapov<gleb@redhat.com>
>>>        
>> A related problem that I think we need to think about how we solve
>> is indicating to Seabios which device we want to boot from
>>
>> With your patch, a user can select a virtio device explicitly or if
>> they use only one virtio device, it will Just Work.
>>
>> However, if a user uses IDE and virtio, or a user has multiple
>> disks, they cannot select a device via -boot.
>>
>>      
> Isn't this problem unrelated to this patch?  I mean if I start qemu with
> two ide devices can I specify from qemu command line which one I want to
> boot from?
>    

That's sort of what I'm asking.  If you compare this approach to 
extboot, extboot provided a capability to select a disk.  I think it can 
be argued though that this isn't a necessary feature to carry over and 
I'm looking for additional opinions on that.

Regards,

Anthony Liguori
Gleb Natapov - May 10, 2010, 4:09 p.m.
On Mon, May 10, 2010 at 10:58:45AM -0500, Anthony Liguori wrote:
> On 05/10/2010 10:54 AM, Gleb Natapov wrote:
> >On Mon, May 10, 2010 at 10:48:42AM -0500, Anthony Liguori wrote:
> >>On 05/10/2010 03:11 AM, Gleb Natapov wrote:
> >>>This patch adds native support for booting from virtio disks to Seabios.
> >>>
> >>>Signed-off-by: Gleb Natapov<gleb@redhat.com>
> >>A related problem that I think we need to think about how we solve
> >>is indicating to Seabios which device we want to boot from
> >>
> >>With your patch, a user can select a virtio device explicitly or if
> >>they use only one virtio device, it will Just Work.
> >>
> >>However, if a user uses IDE and virtio, or a user has multiple
> >>disks, they cannot select a device via -boot.
> >>
> >Isn't this problem unrelated to this patch?  I mean if I start qemu with
> >two ide devices can I specify from qemu command line which one I want to
> >boot from?
> 
> That's sort of what I'm asking.  If you compare this approach to
> extboot, extboot provided a capability to select a disk.  I think it
> can be argued though that this isn't a necessary feature to carry
> over and I'm looking for additional opinions on that.
> 
Well, extboot is just a hack and shouldn't be used with ide disks at
all. With extboot it is not possible to switch to another disk from
the F12 menu, for instance (is it actually possible to read more than
one disk with bios int13 when extboot is in use?). As for specifying the
boot disk from the qemu command line, I think it will be very useful. It
is not clear how to pass the default boot device into Seabios though. We
should pass the bus the boot device is attached to (ide/virtio) and a
unique id of the device on the bus.

--
			Gleb.
Avi Kivity - May 11, 2010, 8:19 a.m.
On 05/10/2010 06:48 PM, Anthony Liguori wrote:
> On 05/10/2010 03:11 AM, Gleb Natapov wrote:
>> This patch adds native support for booting from virtio disks to Seabios.
>>
>> Signed-off-by: Gleb Natapov<gleb@redhat.com>
>
> A related problem that I think we need to think about how we solve is 
> indicating to Seabios which device we want to boot from
>
> With your patch, a user can select a virtio device explicitly or if 
> they use only one virtio device, it will Just Work.
>
> However, if a user uses IDE and virtio, or a user has multiple disks, 
> they cannot select a device via -boot.
>
> Is this something we need to address?  I don't think we'd break 
> libvirt if we didn't.

BIOSes traditionally address this by storing the boot order in RTC 
non-volatile memory, and allow the user to configure the order via a 
menu.  We could do the same (storing the RTC memory in a small disk image).

Alternatively we can seed the order from the command line (-boot 
id1,id2,id3 where id* are some qdev property attached to disks, this is 
more flexible than the current syntax I think).
Stefan Hajnoczi - May 11, 2010, 9:04 a.m.
From what I can tell SeaBIOS is reading CMOS_BIOS_BOOTFLAG1 and
CMOS_BIOS_BOOTFLAG2 from non-volatile memory.  The values index into
bev[], which contains IPL entries (the drives).

Is the order of bev[] entries well-defined?  Is there a way for QEMU
command-line to know that the first virtio-blk device corresponds to x
and the IDE CD-ROM corresponds to y?

Stefan
Gleb Natapov - May 11, 2010, 12:31 p.m.
On Tue, May 11, 2010 at 11:19:07AM +0300, Avi Kivity wrote:
> On 05/10/2010 06:48 PM, Anthony Liguori wrote:
> >On 05/10/2010 03:11 AM, Gleb Natapov wrote:
> >>This patch adds native support for booting from virtio disks to Seabios.
> >>
> >>Signed-off-by: Gleb Natapov<gleb@redhat.com>
> >
> >A related problem that I think we need to think about how we solve
> >is indicating to Seabios which device we want to boot from
> >
> >With your patch, a user can select a virtio device explicitly or
> >if they use only one virtio device, it will Just Work.
> >
> >However, if a user uses IDE and virtio, or a user has multiple
> >disks, they cannot select a device via -boot.
> >
> >Is this something we need to address?  I don't think we'd break
> >libvirt if we didn't.
> 
> BIOSes traditionally address this by storing the boot order in RTC
> non-volatile memory, and allow the user to configure the order via a
> menu.  We could do the same (storing the RTC memory in a small disk
> image).
> 
A real BIOS can do that because it enumerates all bootable devices,
attaches a name to each one of them and then asks the user to configure
the boot order using the names it attached to the devices. In our case we
want to provide the boot order on the qemu command line before the BIOS
has enumerated devices, so qemu should be able to pass enough information
about the boot device that the BIOS can uniquely identify it after it has
discovered all bootable devices. A bus/device pair can be such a thing.

> Alternatively we can seed the order from the command line (-boot
> id1,id2,id3 where id* are some qdev property attached to disks, this
> is more flexible than the current syntax I think).
> 
The problem is how to communicate this order to Seabios.

--
			Gleb.
Gleb Natapov - May 11, 2010, 12:33 p.m.
On Tue, May 11, 2010 at 10:04:25AM +0100, Stefan Hajnoczi wrote:
> >From what I can tell SeaBIOS is reading CMOS_BIOS_BOOTFLAG1 and
> CMOS_BIOS_BOOTFLAG2 from non-volatile memory.  The values index into
> bev[], which contains IPL entries (the drives).
> 
> Is the order of bev[] entries well-defined?  Is there a way for QEMU
> command-line to know that the first virtio-blk device corresponds to x
> and the IDE CD-ROM corresponds to y?
> 
The order of bev[] is the order in which bootable devices were discovered
and can change from one version of Seabios to another.

--
			Gleb.
Kevin O'Connor - May 11, 2010, 12:45 p.m.
On Tue, May 11, 2010 at 10:04:25AM +0100, Stefan Hajnoczi wrote:
> From what I can tell SeaBIOS is reading CMOS_BIOS_BOOTFLAG1 and
> CMOS_BIOS_BOOTFLAG2 from non-volatile memory.  The values index into
> bev[], which contains IPL entries (the drives).
> 
> Is the order of bev[] entries well-defined?  Is there a way for QEMU
> command-line to know that the first virtio-blk device corresponds to x
> and the IDE CD-ROM corresponds to y?

SeaBIOS arranges for bev[0] = floppy, bev[1] = hd, bev[2] = cdrom, and
bev[3] to be the first network card - it does this so that the boot
order can be read from qemu.  However, it's a pain to force this
order.

-Kevin
Gleb Natapov - May 11, 2010, 12:47 p.m.
On Tue, May 11, 2010 at 08:45:29AM -0400, Kevin O'Connor wrote:
> On Tue, May 11, 2010 at 10:04:25AM +0100, Stefan Hajnoczi wrote:
> > From what I can tell SeaBIOS is reading CMOS_BIOS_BOOTFLAG1 and
> > CMOS_BIOS_BOOTFLAG2 from non-volatile memory.  The values index into
> > bev[], which contains IPL entries (the drives).
> > 
> > Is the order of bev[] entries well-defined?  Is there a way for QEMU
> > command-line to know that the first virtio-blk device corresponds to x
> > and the IDE CD-ROM corresponds to y?
> 
> SeaBIOS arranges for bev[0] = floppy, bev[1] = hd, bev[2] = cdrom, and
> bev[3] to be the first network card - it does this so that the boot
> order can be read from qemu.  However, it's a pain to force this
> order.
> 
What if there is more than one disk?

--
			Gleb.
Kevin O'Connor - May 12, 2010, 12:44 a.m.
On Tue, May 11, 2010 at 03:47:40PM +0300, Gleb Natapov wrote:
> On Tue, May 11, 2010 at 08:45:29AM -0400, Kevin O'Connor wrote:
> > On Tue, May 11, 2010 at 10:04:25AM +0100, Stefan Hajnoczi wrote:
> > > From what I can tell SeaBIOS is reading CMOS_BIOS_BOOTFLAG1 and
> > > CMOS_BIOS_BOOTFLAG2 from non-volatile memory.  The values index into
> > > bev[], which contains IPL entries (the drives).
> > > 
> > > Is the order of bev[] entries well-defined?  Is there a way for QEMU
> > > command-line to know that the first virtio-blk device corresponds to x
> > > and the IDE CD-ROM corresponds to y?
> > 
> > SeaBIOS arranges for bev[0] = floppy, bev[1] = hd, bev[2] = cdrom, and
> > bev[3] to be the first network card - it does this so that the boot
> > order can be read from qemu.  However, it's a pain to force this
> > order.
> > 
> What if there are more then one disk?

It's possible to boot from the A drive (floppy) or the C drive (hd).
There's no standard way to boot from the D drive.  So, when booting
from the second hard drive, SeaBIOS arranges for that drive to be
mapped as the C drive.

The boot order (eg, floppy, hd, cdroms, network cards) is determined
by the BEV (Boot Execution Vector) list.  The harddrive registration
order (eg, C, D, E) is determined by the BCV (Boot Connection Vector)
list.

When one selects a hard drive in SeaBIOS' boot menu, SeaBIOS actually
does two things - it makes hd booting the first entry in the BEV list
and it makes the selected hd the first entry in the BCV list.

It's a mess - but that's what the BIOS Boot Specification (BBS)
defines.  Both option roms and bootloaders depend on this behavior.

-Kevin
Avi Kivity - May 12, 2010, 7:22 a.m.
On 05/11/2010 03:31 PM, Gleb Natapov wrote:
> On Tue, May 11, 2010 at 11:19:07AM +0300, Avi Kivity wrote:
>    
>> On 05/10/2010 06:48 PM, Anthony Liguori wrote:
>>      
>>> On 05/10/2010 03:11 AM, Gleb Natapov wrote:
>>>        
>>>> This patch adds native support for booting from virtio disks to Seabios.
>>>>
>>>> Signed-off-by: Gleb Natapov<gleb@redhat.com>
>>>>          
>>> A related problem that I think we need to think about how we solve
>>> is indicating to Seabios which device we want to boot from
>>>
>>> With your patch, a user can select a virtio device explicitly or
>>> if they use only one virtio device, it will Just Work.
>>>
>>> However, if a user uses IDE and virtio, or a user has multiple
>>> disks, they cannot select a device via -boot.
>>>
>>> Is this something we need to address?  I don't think we'd break
>>> libvirt if we didn't.
>>>        
>> BIOSes traditionally address this by storing the boot order in RTC
>> non-volatile memory, and allow the user to configure the order via a
>> menu.  We could do the same (storing the RTC memory in a small disk
>> image).
>>
>>      
> Real BIOS can do that because it enumerates all bootable devices,
> attach name for each one of them and then asks user to configure
> boot order using names it attached to devices. In our case we
> want to provide boot order on qemu command line before BIOS
> enumerated devices, so qemu should be able to pass enough information
> about boot device so that BIOS can uniquely identify it after it will
> discover all bootable devices. bus/device pair can be such thing.
>    

Having a BIOS menu is also useful, you don't have to drop to the 
management tool, instead you do everything from the console.

>    
>> Alternatively we can seed the order from the command line (-boot
>> id1,id2,id3 where id* are some qdev property attached to disks, this
>> is more flexible than the current syntax I think).
>>
>>      
> The problem is how to communicate this order to Seabios.
>    

Topology (bus/device/lun).
Gleb Natapov - May 12, 2010, 7:55 a.m.
On Wed, May 12, 2010 at 10:22:59AM +0300, Avi Kivity wrote:
> On 05/11/2010 03:31 PM, Gleb Natapov wrote:
> >On Tue, May 11, 2010 at 11:19:07AM +0300, Avi Kivity wrote:
> >>On 05/10/2010 06:48 PM, Anthony Liguori wrote:
> >>>On 05/10/2010 03:11 AM, Gleb Natapov wrote:
> >>>>This patch adds native support for booting from virtio disks to Seabios.
> >>>>
> >>>>Signed-off-by: Gleb Natapov<gleb@redhat.com>
> >>>A related problem that I think we need to think about how we solve
> >>>is indicating to Seabios which device we want to boot from
> >>>
> >>>With your patch, a user can select a virtio device explicitly or
> >>>if they use only one virtio device, it will Just Work.
> >>>
> >>>However, if a user uses IDE and virtio, or a user has multiple
> >>>disks, they cannot select a device via -boot.
> >>>
> >>>Is this something we need to address?  I don't think we'd break
> >>>libvirt if we didn't.
> >>BIOSes traditionally address this by storing the boot order in RTC
> >>non-volatile memory, and allow the user to configure the order via a
> >>menu.  We could do the same (storing the RTC memory in a small disk
> >>image).
> >>
> >Real BIOS can do that because it enumerates all bootable devices,
> >attach name for each one of them and then asks user to configure
> >boot order using names it attached to devices. In our case we
> >want to provide boot order on qemu command line before BIOS
> >enumerated devices, so qemu should be able to pass enough information
> >about boot device so that BIOS can uniquely identify it after it will
> >discover all bootable devices. bus/device pair can be such thing.
> 
> Having a BIOS menu is also useful, you don't have to drop to the
> management tool, instead you do everything from the console.
> 
In Seabios we have a functional boot menu. But it is management that
controls which disk is plugged in where.

> >>Alternatively we can seed the order from the command line (-boot
> >>id1,id2,id3 where id* are some qdev property attached to disks, this
> >>is more flexible than the current syntax I think).
> >>
> >The problem is how to communicate this order to Seabios.
> 
> Topology (bus/device/lun).
> 
Yeah, that's what I proposed too, actually.

--
			Gleb.
Kevin O'Connor - May 12, 2010, 12:57 p.m.
On Wed, May 12, 2010 at 10:22:59AM +0300, Avi Kivity wrote:
> On 05/11/2010 03:31 PM, Gleb Natapov wrote:
> >Real BIOS can do that because it enumerates all bootable devices,
> >attach name for each one of them and then asks user to configure
> >boot order using names it attached to devices. In our case we
> >want to provide boot order on qemu command line before BIOS
> >enumerated devices, so qemu should be able to pass enough information
> >about boot device so that BIOS can uniquely identify it after it will
> >discover all bootable devices. bus/device pair can be such thing.
> 
> Having a BIOS menu is also useful, you don't have to drop to the
> management tool, instead you do everything from the console.

Having a "setup menu" is something real hardware could use as well.  I
don't think the setup menu should be in SeaBIOS - instead, SeaBIOS
could launch another program (stored in flash or qemu_fw) dedicated to
doing setup.

> >>Alternatively we can seed the order from the command line (-boot
> >>id1,id2,id3 where id* are some qdev property attached to disks, this
> >>is more flexible than the current syntax I think).
> >>
> >The problem is how to communicate this order to Seabios.
> 
> Topology (bus/device/lun).

USB is a pain here.  It's possible with BDF (Bus/Dev/Fn) and port
number (which accounts for hubs having ports as well).

-Kevin
Avi Kivity - May 13, 2010, 4:49 p.m.
On 05/10/2010 06:58 PM, Anthony Liguori wrote:
>> Isn't this problem unrelated to this patch?  I mean if I start qemu with
>> two ide devices can I specify from qemu command line which one I want to
>> boot from?
>
> That's sort of what I'm asking.  If you compare this approach to 
> extboot, extboot provided a capability to select a disk.  I think it 
> can be argued though that this isn't a necessary feature to carry over 
> and I'm looking for additional opinions on that.

I'd say it's a necessary feature, but not one to carry over from the 
extboot implementation.  We have the seabios boot menu (how to reach 
it?), we need to store the nvram persistently,  and we need to extend 
the selection menu to qemu, but that's unrelated to this patch.
Gleb Natapov - May 16, 2010, 8:28 a.m.
On Thu, May 13, 2010 at 07:49:40PM +0300, Avi Kivity wrote:
> On 05/10/2010 06:58 PM, Anthony Liguori wrote:
> >>Isn't this problem unrelated to this patch?  I mean if I start qemu with
> >>two ide devices can I specify from qemu command line which one I want to
> >>boot from?
> >
> >That's sort of what I'm asking.  If you compare this approach to
> >extboot, extboot provided a capability to select a disk.  I think
> >it can be argued though that this isn't a necessary feature to
> >carry over and I'm looking for additional opinions on that.
> 
> I'd say it's a necessary feature, but not one to carry over from the
> extboot implementation.  We have the seabios boot menu (how to reach
> it?), we need to store the nvram persistently,  and we need to
> extend the selection menu to qemu, but that's unrelated to this
> patch.
> 
To reach seabios boot menu run qemu with "-boot menu=on" option and press
f12 when prompted.

--
			Gleb.

Patch

diff --git a/Makefile b/Makefile
index 327a1bf..d0b8881 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,8 @@  OUT=out/
 SRCBOTH=misc.c pmm.c stacks.c output.c util.c block.c floppy.c ata.c mouse.c \
         kbd.c pci.c serial.c clock.c pic.c cdrom.c ps2port.c smp.c resume.c \
         pnpbios.c pirtable.c vgahooks.c ramdisk.c pcibios.c blockcmd.c \
-        usb.c usb-uhci.c usb-ohci.c usb-ehci.c usb-hid.c usb-msc.c
+        usb.c usb-uhci.c usb-ohci.c usb-ehci.c usb-hid.c usb-msc.c \
+        virtio-ring.c virtio-pci.c virtio-blk.c
 SRC16=$(SRCBOTH) system.c disk.c apm.c font.c
 SRC32FLAT=$(SRCBOTH) post.c shadow.c memmap.c coreboot.c boot.c \
       acpi.c smm.c mptable.c smbios.c pciinit.c optionroms.c mtrr.c \
diff --git a/src/block.c b/src/block.c
index ddf441f..b6b1902 100644
--- a/src/block.c
+++ b/src/block.c
@@ -11,6 +11,7 @@ 
 #include "util.h" // dprintf
 #include "ata.h" // process_ata_op
 #include "usb-msc.h" // process_usb_op
+#include "virtio-blk.h" // process_virtio_op
 
 struct drives_s Drives VAR16VISIBLE;
 
@@ -289,6 +290,8 @@  process_op(struct disk_op_s *op)
         return process_cdemu_op(op);
     case DTYPE_USB:
         return process_usb_op(op);
+    case DTYPE_VIRTIO:
+	return process_virtio_op(op);
     default:
         op->count = 0;
         return DISK_RET_EPARAM;
diff --git a/src/config.h b/src/config.h
index b101174..ad569c6 100644
--- a/src/config.h
+++ b/src/config.h
@@ -136,6 +136,9 @@ 
 #define CONFIG_SUBMODEL_ID   0x00
 #define CONFIG_BIOS_REVISION 0x01
 
+// Support boot from virtio storage
+#define CONFIG_VIRTIO_BLK 1
+
 // Various memory addresses used by the code.
 #define BUILD_STACK_ADDR          0x7000
 #define BUILD_S3RESUME_STACK_ADDR 0x1000
diff --git a/src/disk.h b/src/disk.h
index 0cd1b74..9e5b083 100644
--- a/src/disk.h
+++ b/src/disk.h
@@ -197,6 +197,7 @@  struct drive_s {
 #define DTYPE_RAMDISK  0x04
 #define DTYPE_CDEMU    0x05
 #define DTYPE_USB      0x06
+#define DTYPE_VIRTIO   0x07
 
 #define MAXDESCSIZE 80
 
diff --git a/src/pci_ids.h b/src/pci_ids.h
index 1800f1d..e1cded2 100644
--- a/src/pci_ids.h
+++ b/src/pci_ids.h
@@ -2605,3 +2605,6 @@ 
 #define PCI_DEVICE_ID_RME_DIGI32	0x9896
 #define PCI_DEVICE_ID_RME_DIGI32_PRO	0x9897
 #define PCI_DEVICE_ID_RME_DIGI32_8	0x9898
+
+#define PCI_VENDOR_ID_REDHAT_QUMRANET	0x1af4
+#define PCI_DEVICE_ID_VIRTIO_BLK	0x1001
diff --git a/src/post.c b/src/post.c
index 638b0f7..25535e2 100644
--- a/src/post.c
+++ b/src/post.c
@@ -23,6 +23,7 @@ 
 #include "smbios.h" // smbios_init
 #include "paravirt.h" // qemu_cfg_port_probe
 #include "ps2port.h" // ps2port_setup
+#include "virtio-blk.h" // virtio_blk_setup
 
 void
 __set_irq(int vector, void *loc)
@@ -184,6 +185,7 @@  init_hw(void)
     floppy_setup();
     ata_setup();
     ramdisk_setup();
+    virtio_blk_setup();
 }
 
 // Main setup code.
diff --git a/src/virtio-blk.c b/src/virtio-blk.c
new file mode 100644
index 0000000..a41c336
--- /dev/null
+++ b/src/virtio-blk.c
@@ -0,0 +1,155 @@ 
+// Virtio block boot support.
+//
+// Copyright (C) 2010 Red Hat Inc.
+//
+// Authors:
+//  Gleb Natapov <gnatapov@redhat.com>
+//
+// This file may be distributed under the terms of the GNU LGPLv3 license.
+
+#include "util.h" // dprintf
+#include "pci.h" // foreachpci
+#include "config.h" // CONFIG_*
+#include "virtio-pci.h"
+#include "virtio-blk.h"
+#include "disk.h"
+
+struct virtiodrive_s {          // state kept for one virtio-blk disk
+    struct drive_s drive;       // generic drive record; recovered via container_of()
+    struct vring_virtqueue *vq; // request queue handed to vp_find_vq()
+    u16 ioaddr;                 // I/O port base read from PCI_BASE_ADDRESS_0
+};
+// Synchronously read op->count blocks starting at op->lba into op->buf_fl.
+static int
+virtio_blk_read(struct disk_op_s *op)
+{
+    struct virtiodrive_s *vdrive_g =
+        container_of(op->drive_g, struct virtiodrive_s, drive);
+    struct vring_virtqueue *vq = GET_GLOBAL(vdrive_g->vq);
+    struct virtio_blk_outhdr hdr = {
+        .type = VIRTIO_BLK_T_IN,
+        .ioprio = 0,
+        .sector = op->lba,
+    };
+    u8 status = VIRTIO_BLK_S_UNSUPP; // result if the device never fills it in
+    struct vring_list sg[] = {
+        {
+            .addr	= MAKE_FLATPTR(GET_SEG(SS), &hdr),
+            .length	= sizeof(hdr),
+        },
+        {
+            .addr	= op->buf_fl,
+            .length	= GET_GLOBAL(vdrive_g->drive.blksize) * op->count,
+        },
+        {
+            .addr	= MAKE_FLATPTR(GET_SEG(SS), &status),
+            .length	= sizeof(status),
+        },
+    };
+    /* 1 "out" entry (hdr) followed by 2 "in" entries (data buffer, status) */
+    /* Add to virtqueue and kick host */
+    vring_add_buf(vq, sg, 1, 2, 0, 0);
+    vring_kick(GET_GLOBAL(vdrive_g->ioaddr), vq, 1);
+
+    /* Wait for reply: polls with udelay(5) and has no timeout */
+    while (!vring_more_used(vq))
+        udelay(5);
+
+    /* Reclaim virtqueue element */
+    vring_get_buf(vq, NULL);
+    return status == VIRTIO_BLK_S_OK ? DISK_RET_SUCCESS : DISK_RET_EBADTRACK;
+}
+
+int
+process_virtio_op(struct disk_op_s *op)
+{
+    // Route a disk request; CMD_READ is the only one sent to the device.
+    if (op->command == CMD_READ)
+        return virtio_blk_read(op);
+
+    // Writing is not implemented, so report the disk as write protected.
+    if (op->command == CMD_FORMAT || op->command == CMD_WRITE)
+        return DISK_RET_EWRITEPROTECT;
+
+    // These requests need no work here and simply succeed.
+    if (op->command == CMD_RESET || op->command == CMD_ISREADY
+        || op->command == CMD_VERIFY || op->command == CMD_SEEK)
+        return DISK_RET_SUCCESS;
+    // Anything else is rejected with a zero transfer count.
+    op->count = 0;
+    return DISK_RET_EPARAM;
+}
+
+// Find each virtio-blk PCI device and register it as a bootable drive.
+void
+virtio_blk_setup(void)
+{
+    ASSERT32FLAT();
+    if (! CONFIG_VIRTIO_BLK)
+        return;
+
+    dprintf(3, "init virtio-blk\n");
+
+    int bdf, max;
+    u32 id = PCI_VENDOR_ID_REDHAT_QUMRANET | (PCI_DEVICE_ID_VIRTIO_BLK << 16);
+    foreachpci(bdf, max) {
+        u32 v = pci_config_readl(bdf, PCI_VENDOR_ID);
+        if (v != id)
+            continue;
+        dprintf(3, "found virtio-blk at %x:%x\n", pci_bdf_to_bus(bdf),
+                pci_bdf_to_dev(bdf));
+        char *desc = malloc_tmphigh(MAXDESCSIZE);
+        struct virtiodrive_s *vdrive_g = malloc_fseg(sizeof(*vdrive_g));
+        struct vring_virtqueue *vq = malloc_low(sizeof(*vq));
+        if (!vdrive_g || !desc || !vq) {
+            // Give back whichever allocations succeeded before bailing out.
+            // NOTE(review): assumes free() accepts NULL - confirm in pmm code.
+            free(vdrive_g);
+            free(desc);
+            free(vq);
+            warn_noalloc();
+            return;
+        }
+        memset(vdrive_g, 0, sizeof(*vdrive_g));
+        // vq's indexes are read before being written: start from zeroed memory.
+        memset(vq, 0, sizeof(*vq));
+        vdrive_g->drive.type = DTYPE_VIRTIO;
+        vdrive_g->drive.cntl_id = bdf;
+        vdrive_g->vq = vq;
+
+        u16 ioaddr = pci_config_readl(bdf, PCI_BASE_ADDRESS_0) &
+            PCI_BASE_ADDRESS_IO_MASK;
+        vdrive_g->ioaddr = ioaddr;
+        vp_reset(ioaddr);
+        vp_set_status(ioaddr, VIRTIO_CONFIG_S_ACKNOWLEDGE |
+                      VIRTIO_CONFIG_S_DRIVER );
+        if (vp_find_vq(ioaddr, 0, vdrive_g->vq) < 0 ) {
+            free(vdrive_g);
+            free(desc);
+            free(vq);
+            dprintf(1, "fail to find vq for virtio-blk %x:%x\n",
+                    pci_bdf_to_bus (bdf), pci_bdf_to_dev(bdf));
+            continue;
+        }
+        // Block size, capacity and geometry come from device config space.
+        struct virtio_blk_config cfg;
+        vp_get(ioaddr, 0, &cfg, sizeof(cfg));
+        vdrive_g->drive.blksize = cfg.blk_size;
+        vdrive_g->drive.sectors = cfg.capacity;
+        dprintf(3, "virtio-blk %x:%x blksize=%d sectors=%u\n",
+                pci_bdf_to_bus (bdf), pci_bdf_to_dev(bdf),
+                vdrive_g->drive.blksize, (u32)vdrive_g->drive.sectors);
+        vdrive_g->drive.pchs.cylinders = cfg.cylinders;
+        vdrive_g->drive.pchs.heads = cfg.heads;
+        vdrive_g->drive.pchs.spt = cfg.sectors;
+        setup_translation(&vdrive_g->drive);
+        add_bcv_internal(&vdrive_g->drive);
+        snprintf(desc, MAXDESCSIZE, "Virtio disk PCI:%x:%x",
+                 pci_bdf_to_bus(bdf), pci_bdf_to_dev(bdf));
+        vdrive_g->drive.desc = desc;
+
+        vp_set_status(ioaddr, VIRTIO_CONFIG_S_ACKNOWLEDGE |
+                      VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK);
+    }
+}
+
diff --git a/src/virtio-blk.h b/src/virtio-blk.h
new file mode 100644
index 0000000..3369ea4
--- /dev/null
+++ b/src/virtio-blk.h
@@ -0,0 +1,40 @@ 
+#ifndef _VIRTIO_BLK_H
+#define _VIRTIO_BLK_H
+
+struct virtio_blk_config
+{
+    u64 capacity;
+    u32 size_max;
+    u32 seg_max;
+    u16 cylinders;
+    u8 heads;
+    u8 sectors;
+    u32 blk_size;
+    u8 physical_block_exp;
+    u8 alignment_offset;
+    u16 min_io_size;
+    u32 opt_io_size;
+} __attribute__((packed));
+
+/* These two define direction. */
+#define VIRTIO_BLK_T_IN		0
+#define VIRTIO_BLK_T_OUT	1
+
+/* This is the first element of the read scatter-gather list. */
+struct virtio_blk_outhdr {
+    /* VIRTIO_BLK_T* */
+    u32 type;
+    /* io priority. */
+    u32 ioprio;
+    /* Sector (ie. 512 byte offset) */
+    u64 sector;
+};
+
+#define VIRTIO_BLK_S_OK		0
+#define VIRTIO_BLK_S_IOERR	1
+#define VIRTIO_BLK_S_UNSUPP	2
+
+int process_virtio_op(struct disk_op_s *op);
+void virtio_blk_setup(void);
+
+#endif /* _VIRTIO_BLK_H */
diff --git a/src/virtio-pci.c b/src/virtio-pci.c
new file mode 100644
index 0000000..e171ea3
--- /dev/null
+++ b/src/virtio-pci.c
@@ -0,0 +1,67 @@ 
+/* virtio-pci.c - pci interface for virtio interface
+ *
+ * (c) Copyright 2008 Bull S.A.S.
+ *
+ *  Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ * some parts from Linux Virtio PCI driver
+ *
+ *  Copyright IBM Corp. 2007
+ *  Authors: Anthony Liguori  <aliguori@us.ibm.com>
+ *
+ *  Adopted for Seabios: Gleb Natapov <gleb@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPLv3
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "virtio-ring.h"
+#include "virtio-pci.h"
+/* Set up queue queue_index of the device at ioaddr; returns its size or -1. */
+int vp_find_vq(unsigned int ioaddr, int queue_index,
+               struct vring_virtqueue *vq)
+{
+   struct vring * vr = &vq->vring;
+   u16 num;
+
+   ASSERT32FLAT();
+   /* select the queue */
+
+   outw(queue_index, ioaddr + VIRTIO_PCI_QUEUE_SEL);
+
+   /* check if the queue is available */
+
+   num = inw(ioaddr + VIRTIO_PCI_QUEUE_NUM);
+   if (!num) {
+       dprintf(1, "ERROR: queue size is 0\n");
+       return -1;
+   }
+
+   if (num > MAX_QUEUE_NUM) {
+       dprintf(1, "ERROR: queue size %d > %d\n", num, MAX_QUEUE_NUM);
+       return -1;
+   }
+
+   /* check if the queue is already active */
+
+   if (inl(ioaddr + VIRTIO_PCI_QUEUE_PFN)) {
+       dprintf(1, "ERROR: queue already active\n");
+       return -1;
+   }
+
+   vq->queue_index = queue_index;
+
+   /* initialize the queue */
+
+   vring_init(vr, num, (unsigned char*)&vq->queue);
+
+   /* activate the queue
+    * The device is told the ring's page frame number (address >> PAGE_SHIFT).
+    * NOTE: vr->desc is initialized by vring_init()
+    */
+
+   outl((unsigned long)virt_to_phys(vr->desc) >> PAGE_SHIFT,
+        ioaddr + VIRTIO_PCI_QUEUE_PFN);
+
+   return num;
+}
diff --git a/src/virtio-pci.h b/src/virtio-pci.h
new file mode 100644
index 0000000..6932036
--- /dev/null
+++ b/src/virtio-pci.h
@@ -0,0 +1,97 @@ 
+#ifndef _VIRTIO_PCI_H
+#define _VIRTIO_PCI_H
+
+/* A 32-bit r/o bitmask of the features supported by the host */
+#define VIRTIO_PCI_HOST_FEATURES        0
+
+/* A 32-bit r/w bitmask of features activated by the guest */
+#define VIRTIO_PCI_GUEST_FEATURES       4
+
+/* A 32-bit r/w PFN for the currently selected queue */
+#define VIRTIO_PCI_QUEUE_PFN            8
+
+/* A 16-bit r/o queue size for the currently selected queue */
+#define VIRTIO_PCI_QUEUE_NUM            12
+
+/* A 16-bit r/w queue selector */
+#define VIRTIO_PCI_QUEUE_SEL            14
+
+/* A 16-bit r/w queue notifier */
+#define VIRTIO_PCI_QUEUE_NOTIFY         16
+
+/* An 8-bit device status register.  */
+#define VIRTIO_PCI_STATUS               18
+
+/* An 8-bit r/o interrupt status register.  Reading the value will return the
+ * current contents of the ISR and will also clear it.  This is effectively
+ * a read-and-acknowledge. */
+#define VIRTIO_PCI_ISR                  19
+
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG           0x2
+
+/* The remaining space is defined by each driver as the per-driver
+ * configuration space */
+#define VIRTIO_PCI_CONFIG               20
+
+/* Virtio ABI version, this must match exactly */
+#define VIRTIO_PCI_ABI_VERSION          0
+
+static inline u32 vp_get_features(unsigned int ioaddr)
+{
+   return inl(ioaddr + VIRTIO_PCI_HOST_FEATURES);
+}
+
+static inline void vp_set_features(unsigned int ioaddr, u32 features)
+{
+        outl(features, ioaddr + VIRTIO_PCI_GUEST_FEATURES);
+}
+
+static inline void vp_get(unsigned int ioaddr, unsigned offset,
+                     void *buf, unsigned len)
+{
+   /* Copy len bytes of the per-driver config space, one port read per byte. */
+   unsigned port = ioaddr + VIRTIO_PCI_CONFIG + offset;
+   u8 *dst = buf, *end = dst + len;
+   while (dst < end)
+           *dst++ = inb(port++);
+}
+
+static inline u8 vp_get_status(unsigned int ioaddr)
+{
+   return inb(ioaddr + VIRTIO_PCI_STATUS);
+}
+
+static inline void vp_set_status(unsigned int ioaddr, u8 status)
+{
+   if (status == 0)        /* 0 means reset: ignored here, vp_reset() writes it */
+           return;
+   outb(status, ioaddr + VIRTIO_PCI_STATUS);
+}
+
+/* Write 0 to the status register, then read the ISR (the read clears it). */
+static inline void vp_reset(unsigned int ioaddr)
+{
+   outb(0, ioaddr + VIRTIO_PCI_STATUS);
+   (void)inb(ioaddr + VIRTIO_PCI_ISR);
+}
+
+static inline void vp_notify(unsigned int ioaddr, int queue_index)
+{
+   outw(queue_index, ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+}
+
+static inline void vp_del_vq(unsigned int ioaddr, int queue_index)
+{
+   /* select the queue */
+
+   outw(queue_index, ioaddr + VIRTIO_PCI_QUEUE_SEL);
+
+   /* deactivate the queue */
+
+   outl(0, ioaddr + VIRTIO_PCI_QUEUE_PFN);
+}
+
+int vp_find_vq(unsigned int ioaddr, int queue_index,
+               struct vring_virtqueue *vq);
+#endif /* _VIRTIO_PCI_H */
diff --git a/src/virtio-ring.c b/src/virtio-ring.c
new file mode 100644
index 0000000..f4a2efe
--- /dev/null
+++ b/src/virtio-ring.c
@@ -0,0 +1,152 @@ 
+/* virtio-ring.c - virtio ring management
+ *
+ * (c) Copyright 2008 Bull S.A.S.
+ *
+ *  Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ *  some parts from Linux Virtio Ring
+ *
+ *  Copyright Rusty Russell IBM Corporation 2007
+ *
+ *  Adopted for Seabios: Gleb Natapov <gleb@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPLv3
+ * See the COPYING file in the top-level directory.
+ *
+ *
+ */
+
+#include "virtio-ring.h"
+#include "virtio-pci.h"
+
+#define BUG() do {                                      \
+        dprintf(1, "BUG: failure at %s:%d/%s()!\n",     \
+                __FILE__, __LINE__, __FUNCTION__);      \
+                while(1);                               \
+        } while (0)
+#define BUG_ON(condition) do { if (condition) BUG(); } while (0)
+
+/*
+ * vring_more_used
+ *
+ * Are there used buffers that have not been collected yet?
+ * Returns non-zero when used->idx differs from vq->last_used_idx.
+ */
+
+int vring_more_used(struct vring_virtqueue *vq)
+{
+    struct vring_used *used = GET_FLATPTR(vq->vring.used);
+    wmb(); /* wmb() is defined as barrier() in virtio-ring.h */
+    return GET_FLATPTR(vq->last_used_idx) != GET_FLATPTR(used->idx);
+}
+
+/*
+ * vring_detach
+ *
+ * put the descriptor chain starting at desc[head] at the front of the free list
+ */
+
+void vring_detach(struct vring_virtqueue *vq, unsigned int head)
+{
+    struct vring *vr = &vq->vring;
+    struct vring_desc *desc = GET_FLATPTR(vr->desc);
+    unsigned int i;
+
+    /* find end of given descriptor */
+
+    i = head;
+    while (GET_FLATPTR(desc[i].flags) & VRING_DESC_F_NEXT)
+        i = GET_FLATPTR(desc[i].next);
+
+    /* link it with free list and point to it */
+
+    SET_FLATPTR(desc[i].next, GET_FLATPTR(vq->free_head));
+    wmb();
+    SET_FLATPTR(vq->free_head, head);
+}
+
+/*
+ * vring_get_buf
+ *
+ * Take the next entry off the used ring and free its descriptor chain.
+ * Returns the vdata[] tag for that entry; *len gets elem->len if len != NULL.
+ */
+
+int vring_get_buf(struct vring_virtqueue *vq, unsigned int *len)
+{
+    struct vring *vr = &vq->vring;
+    struct vring_used_elem *elem;
+    struct vring_used *used = GET_FLATPTR(vq->vring.used);
+    u32 id;
+    int ret;
+    /* NOTE(review): check below is disabled; callers presumably poll vring_more_used() */
+//    BUG_ON(!vring_more_used(vq));
+
+    elem = &used->ring[GET_FLATPTR(vq->last_used_idx) % GET_FLATPTR(vr->num)];
+    wmb();
+    id = GET_FLATPTR(elem->id);
+    if (len != NULL)
+        *len = GET_FLATPTR(elem->len);
+    /* vdata[] is filled by vring_add_buf() with its 'index' argument */
+    ret = GET_FLATPTR(vq->vdata[id]);
+    /* return the chain's descriptors to the free list */
+    vring_detach(vq, id);
+
+    SET_FLATPTR(vq->last_used_idx, GET_FLATPTR(vq->last_used_idx) + 1);
+
+    return ret;
+}
+/* Queue list[] as one descriptor chain: out entries first, then in (WRITE). */
+void vring_add_buf(struct vring_virtqueue *vq,
+                   struct vring_list list[],
+                   unsigned int out, unsigned int in,
+                   int index, int num_added)
+{
+    struct vring *vr = &vq->vring;
+    int i, av, head, prev;
+    struct vring_desc *desc = GET_FLATPTR(vr->desc);
+    struct vring_avail *avail = GET_FLATPTR(vr->avail);
+    /* num_added offsets the avail slot from avail->idx; index is saved in vdata[] */
+    BUG_ON(out + in == 0);
+    /* walk the free list, filling one descriptor per list[] entry */
+    prev = 0;
+    head = GET_FLATPTR(vq->free_head);
+    for (i = head; out; i = GET_FLATPTR(desc[i].next), out--) {
+        SET_FLATPTR(desc[i].flags, VRING_DESC_F_NEXT);
+        SET_FLATPTR(desc[i].addr, (u64)virt_to_phys(list->addr));
+        SET_FLATPTR(desc[i].len, list->length);
+        prev = i;
+        list++;
+    }
+    for ( ; in; i = GET_FLATPTR(desc[i].next), in--) {
+        SET_FLATPTR(desc[i].flags, VRING_DESC_F_NEXT|VRING_DESC_F_WRITE);
+        SET_FLATPTR(desc[i].addr, (u64)virt_to_phys(list->addr));
+        SET_FLATPTR(desc[i].len, list->length);
+        prev = i;
+        list++;
+    }
+    SET_FLATPTR(desc[prev].flags,
+                GET_FLATPTR(desc[prev].flags) & ~VRING_DESC_F_NEXT);
+    /* i is the first descriptor not consumed: it becomes the free-list head */
+    SET_FLATPTR(vq->free_head, i);
+    /* remember the caller's tag so vring_get_buf() can return it */
+    SET_FLATPTR(vq->vdata[head], index);
+    /* fill the avail slot only; avail->idx is advanced by vring_kick() */
+    av = (GET_FLATPTR(avail->idx) + num_added) % GET_FLATPTR(vr->num);
+    SET_FLATPTR(avail->ring[av], head);
+    wmb();
+}
+/* Advance avail->idx by num_added and notify the device unless it opted out. */
+void vring_kick(unsigned int ioaddr, struct vring_virtqueue *vq, int num_added)
+{
+    struct vring *vr = &vq->vring;
+    struct vring_avail *avail = GET_FLATPTR(vr->avail);
+    struct vring_used *used = GET_FLATPTR(vq->vring.used);
+    /* ring slots are filled by vring_add_buf() before the index is moved */
+    wmb();
+    SET_FLATPTR(avail->idx, GET_FLATPTR(avail->idx) + num_added);
+    /* no notification when used->flags has VRING_USED_F_NO_NOTIFY set */
+    mb();
+    if (!(GET_FLATPTR(used->flags) & VRING_USED_F_NO_NOTIFY))
+        vp_notify(ioaddr, GET_FLATPTR(vq->queue_index));
+}
diff --git a/src/virtio-ring.h b/src/virtio-ring.h
new file mode 100644
index 0000000..b97d572
--- /dev/null
+++ b/src/virtio-ring.h
@@ -0,0 +1,125 @@ 
+#ifndef _VIRTIO_RING_H
+#define _VIRTIO_RING_H
+
+#define PAGE_SHIFT 12
+#define PAGE_MASK  (PAGE_SIZE-1)
+
+#define virt_to_phys(v) (unsigned long)(v)
+#define phys_to_virt(p) (void*)(p)
+#define wmb() barrier()
+#define mb() barrier()
+
+/* Status byte for guest to report progress, and synchronize features. */
+/* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */
+#define VIRTIO_CONFIG_S_ACKNOWLEDGE     1
+/* We have found a driver for the device. */
+#define VIRTIO_CONFIG_S_DRIVER          2
+/* Driver has used its parts of the config, and is happy */
+#define VIRTIO_CONFIG_S_DRIVER_OK       4
+/* We've given up on this device. */
+#define VIRTIO_CONFIG_S_FAILED          0x80
+
+#define MAX_QUEUE_NUM      (128)
+
+#define VRING_DESC_F_NEXT  1
+#define VRING_DESC_F_WRITE 2
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+#define VRING_USED_F_NO_NOTIFY     1
+
+struct vring_desc
+{
+   u64 addr;
+   u32 len;
+   u16 flags;
+   u16 next;
+};
+
+struct vring_avail      /* ring of chains made available to the device */
+{
+   u16 flags;
+   u16 idx;             /* advanced by vring_kick() */
+   u16 ring[];          /* chain heads; flexible array member like vring_used.ring */
+};
+
+struct vring_used_elem
+{
+   u32 id;
+   u32 len;
+};
+
+struct vring_used
+{
+   u16 flags;
+   u16 idx;
+   struct vring_used_elem ring[];
+};
+
+struct vring {
+   unsigned int num;
+   struct vring_desc *desc;
+   struct vring_avail *avail;
+   struct vring_used *used;
+};
+/* Bytes for a ring of num entries: desc + avail padded to a page, then used. */
+#define vring_size(num) \
+   (((((sizeof(struct vring_desc) * (num)) + \
+      (sizeof(struct vring_avail) + sizeof(u16) * (num))) \
+         + PAGE_MASK) & ~PAGE_MASK) + \
+         (sizeof(struct vring_used) + sizeof(struct vring_used_elem) * (num)))
+
+typedef unsigned char virtio_queue_t[PAGE_MASK + vring_size(MAX_QUEUE_NUM)];
+
+struct vring_virtqueue {
+   virtio_queue_t queue;
+   struct vring vring;
+   u16 free_head;
+   u16 last_used_idx;
+   u16 vdata[MAX_QUEUE_NUM];
+   /* PCI */
+   int queue_index;
+};
+
+struct vring_list {
+  char *addr;
+  unsigned int length;
+};
+/* Lay out desc, avail and used inside queue and chain all descriptors together. */
+static inline void vring_init(struct vring *vr,
+                         unsigned int num, unsigned char *queue)
+{
+   unsigned int i;
+   unsigned long pa;
+   /* num must be non-zero (num - 1 below is unsigned); vp_find_vq() rejects 0 */
+   ASSERT32FLAT();
+   vr->num = num;
+
+   /* physical address of desc must be page aligned */
+
+   pa = virt_to_phys(queue);
+   pa = (pa + PAGE_MASK) & ~PAGE_MASK;
+   vr->desc = phys_to_virt(pa);
+   /* avail starts right after the num descriptors */
+   vr->avail = (struct vring_avail *)&vr->desc[num];
+
+   /* physical address of used must be page aligned */
+
+   pa = virt_to_phys(&vr->avail->ring[num]);
+   pa = (pa + PAGE_MASK) & ~PAGE_MASK;
+   vr->used = phys_to_virt(pa);
+   /* link each descriptor to the next; the last one gets next = 0 */
+   for (i = 0; i < num - 1; i++)
+           vr->desc[i].next = i + 1;
+   vr->desc[i].next = 0;
+}
+
+int vring_more_used(struct vring_virtqueue *vq);
+void vring_detach(struct vring_virtqueue *vq, unsigned int head);
+int vring_get_buf(struct vring_virtqueue *vq, unsigned int *len);
+void vring_add_buf(struct vring_virtqueue *vq, struct vring_list list[],
+                   unsigned int out, unsigned int in,
+                   int index, int num_added);
+void vring_kick(unsigned int ioaddr, struct vring_virtqueue *vq, int num_added);
+
+#endif /* _VIRTIO_RING_H */