diff mbox

[v6,2/2] block: Support GlusterFS as a QEMU block backend

Message ID 20120809130216.GC7960@in.ibm.com
State New
Headers show

Commit Message

Bharata B Rao Aug. 9, 2012, 1:02 p.m. UTC
block: Support GlusterFS as a QEMU block backend.

From: Bharata B Rao <bharata@linux.vnet.ibm.com>

This patch adds gluster as the new block backend in QEMU. This gives
QEMU the ability to boot VM images from gluster volumes. Its already
possible to boot from VM images on gluster volumes using FUSE mount, but
this patchset provides the ability to boot VM images from gluster volumes
by by-passing the FUSE layer in gluster. This is made possible by
using libgfapi routines to perform IO on gluster volumes directly.

VM Image on gluster volume is specified like this:

file=gluster://server:[port]/volname/image[?transport=socket]

'gluster' is the protocol.

'server' specifies the server where the volume file specification for
the given volume resides. This can be either hostname or ipv4 address
or ipv6 address. ipv6 address needs to be with in square brackets [ ].

port' is the port number on which gluster management daemon (glusterd) is
listening. This is optional and if not specified, QEMU will send 0 which
will make libgfapi to use the default port.

'volname' is the name of the gluster volume which contains the VM image.

'image' is the path to the actual VM image in the gluster volume.

'transport' specifies the transport used to connect to glusterd. This is
optional and if not specified, socket transport is used.

Examples:

file=gluster://1.2.3.4/testvol/a.img
file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
file=gluster://server.domain.com:5000/testvol/dir/a.img

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---

 block/Makefile.objs |    1 
 block/gluster.c     |  623 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 624 insertions(+), 0 deletions(-)
 create mode 100644 block/gluster.c

Comments

Kevin Wolf Aug. 13, 2012, 12:50 p.m. UTC | #1
Am 09.08.2012 15:02, schrieb Bharata B Rao:
> block: Support GlusterFS as a QEMU block backend.
> 
> From: Bharata B Rao <bharata@linux.vnet.ibm.com>
> 
> This patch adds gluster as the new block backend in QEMU. This gives
> QEMU the ability to boot VM images from gluster volumes. Its already
> possible to boot from VM images on gluster volumes using FUSE mount, but
> this patchset provides the ability to boot VM images from gluster volumes
> by by-passing the FUSE layer in gluster. This is made possible by
> using libgfapi routines to perform IO on gluster volumes directly.
> 
> VM Image on gluster volume is specified like this:
> 
> file=gluster://server:[port]/volname/image[?transport=socket]
> 
> 'gluster' is the protocol.
> 
> 'server' specifies the server where the volume file specification for
> the given volume resides. This can be either hostname or ipv4 address
> or ipv6 address. ipv6 address needs to be with in square brackets [ ].
> 
> port' is the port number on which gluster management daemon (glusterd) is
> listening. This is optional and if not specified, QEMU will send 0 which
> will make libgfapi to use the default port.
> 
> 'volname' is the name of the gluster volume which contains the VM image.
> 
> 'image' is the path to the actual VM image in the gluster volume.
> 
> 'transport' specifies the transport used to connect to glusterd. This is
> optional and if not specified, socket transport is used.
> 
> Examples:
> 
> file=gluster://1.2.3.4/testvol/a.img
> file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
> file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
> file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
> file=gluster://server.domain.com:5000/testvol/dir/a.img
> 
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
> 
>  block/Makefile.objs |    1 
>  block/gluster.c     |  623 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 624 insertions(+), 0 deletions(-)
>  create mode 100644 block/gluster.c
> 
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index b5754d3..a1ae67f 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -9,3 +9,4 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o
>  block-obj-$(CONFIG_LIBISCSI) += iscsi.o
>  block-obj-$(CONFIG_CURL) += curl.o
>  block-obj-$(CONFIG_RBD) += rbd.o
> +block-obj-$(CONFIG_GLUSTERFS) += gluster.o
> diff --git a/block/gluster.c b/block/gluster.c
> new file mode 100644
> index 0000000..bbbaea8
> --- /dev/null
> +++ b/block/gluster.c
> @@ -0,0 +1,623 @@
> +/*
> + * GlusterFS backend for QEMU
> + *
> + * (AIO implementation is derived from block/rbd.c)

Most parts of block/rbd.c are GPLv2 only. You're also missing the
copyright statements from there.

> + *
> + * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * (at your option) any later version. See the COPYING file in the top-level
> + * directory.
> + */
> +#include <glusterfs/api/glfs.h>
> +#include "block_int.h"
> +
> +typedef struct GlusterAIOCB {
> +    BlockDriverAIOCB common;
> +    bool canceled;
> +    int64_t size;
> +    int ret;
> +} GlusterAIOCB;
> +
> +typedef struct BDRVGlusterState {
> +    struct glfs *glfs;
> +    int fds[2];
> +    struct glfs_fd *fd;
> +    int qemu_aio_count;
> +} BDRVGlusterState;
> +
> +#define GLUSTER_FD_READ 0
> +#define GLUSTER_FD_WRITE 1
> +
> +typedef struct GlusterURI {
> +    char *server;
> +    int port;
> +    char *volname;
> +    char *image;
> +    char *transport;
> +} GlusterURI;
> +
> +static void qemu_gluster_uri_free(GlusterURI *uri)
> +{
> +    g_free(uri->server);
> +    g_free(uri->volname);
> +    g_free(uri->image);
> +    g_free(uri->transport);
> +    g_free(uri);
> +}
> +
> +/*
> + * We don't validate the transport option obtained here but
> + * instead depend on gluster to flag an error.
> + */
> +static int parse_transport(GlusterURI *uri, char *transport)
> +{
> +    char *token, *saveptr;
> +    int ret = -EINVAL;
> +
> +    if (!transport) {
> +        uri->transport = g_strdup("socket");
> +        ret = 0;
> +        goto out;

There is no cleanup code after out:, so please use a direct return 0.

> +    }
> +
> +    token = strtok_r(transport, "=", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +    if (strcmp(token, "transport")) {
> +        goto out;
> +    }

if (!token || strcmp(...)) {
    return -EINVAL;
}

Many more unnecessary "goto out" follow, I won't comment on each single one.

> +    token = strtok_r(NULL, "=", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +    uri->transport = g_strdup(token);
> +    ret = 0;
> +out:
> +    return ret;
> +}
> +
> +static int parse_server(GlusterURI *uri, char *server)
> +{

It looks to me as if there are two options:

Either this functionality already exists and is used in the various
places where qemu deals with network addresses. Then you should reuse
that code.

Or it actually isn't implemented properly in other places, then this
should be added as a globally available helper functions and other
places that deal with network addresses should be changed to use it.

Having a one-off implementation in block/gluster.c is most certainly not
the right thing.

> +    int ret = -EINVAL;
> +    char *token, *saveptr;
> +    char *p, *q = server;
> +
> +    p = strchr(server, '[');
> +    if (p) {
> +        /* [ipv6] */
> +        if (p != server) {
> +            /* [ not in the beginning */
> +            goto out;
> +        }
> +        q++;
> +        p = strrchr(p, ']');
> +        if (!p) {
> +            /* No matching ] */
> +            goto out;
> +        }
> +        *p++ = '\0';
> +        uri->server = g_strdup(q);
> +
> +        if (*p) {
> +            if (*p != ':') {
> +                /* [ipv6] followed by something other than : */
> +                goto out;
> +            }
> +            uri->port = strtoul(++p, NULL, 0);
> +            if (uri->port < 0) {
> +                goto out;
> +            }

This accepts inputs where the colon isn't followed by any number.

> +        } else {
> +            /* port not specified, use default */
> +            uri->port = 0;
> +        }
> +
> +    } else {
> +        /* ipv4 or hostname */
> +        if (*server == ':') {
> +            /* port specified w/o a server */
> +            goto out;

Other places in qemu interpret this as localhost instead of failing.

> +        }
> +        token = strtok_r(server, ":", &saveptr);
> +        if (!token) {
> +            goto out;
> +        }
> +        uri->server = g_strdup(token);
> +        token = strtok_r(NULL, ":", &saveptr);
> +        if (token) {
> +            uri->port = strtoul(token, NULL, 0);
> +            if (uri->port < 0) {
> +                goto out;
> +            }
> +        } else {
> +            uri->port = 0;
> +        }

The port parsing code is duplicated in IPv4 and IPv6, even though it's
really the same.

> +    }
> +    ret = 0;
> +out:
> +    return ret;
> +}
> +
> +/*
> + * file=gluster://server:[port]/volname/image[?transport=socket]

I think you mean "server[:port]" (the colon is part of the optional port
specification).

> + *
> + * 'gluster' is the protocol.
> + *
> + * 'server' specifies the server where the volume file specification for
> + * the given volume resides. This can be either hostname or ipv4 address
> + * or ipv6 address. ipv6 address needs to be with in square brackets [ ].
> + *
> + * 'port' is the port number on which gluster management daemon (glusterd) is
> + * listening. This is optional and if not specified, QEMU will send 0 which
> + * will make libgfapi to use the default port.
> + *
> + * 'volname' is the name of the gluster volume which contains the VM image.
> + *
> + * 'image' is the path to the actual VM image in the gluster volume.
> + *
> + * 'transport' specifies the transport used to connect to glusterd. This is
> + * optional and if not specified, socket transport is used.
> + *
> + * Examples:
> + *
> + * file=gluster://1.2.3.4/testvol/a.img
> + * file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
> + * file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
> + * file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
> + * file=gluster://server.domain.com:5000/testvol/dir/a.img
> + *
> + * We just do minimal checking of the gluster options and mostly ensure
> + * that all the expected elements of the URI are present. We depend on libgfapi
> + * APIs to return appropriate errors in case of invalid arguments.
> + */
> +static int qemu_gluster_parseuri(GlusterURI *uri, const char *filename)
> +{
> +    char *token, *saveptr;
> +    char *p, *r;
> +    int ret = -EINVAL;
> +
> +    p = r = g_strdup(filename);
> +    if (strncmp(p, "gluster://", 10)) {
> +        goto out;
> +    }
> +
> +    /* Discard the protocol */
> +    p += 10;

strstart() from cutils.c

> +
> +    /* server */
> +    token = strtok_r(p, "/", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +
> +    ret = parse_server(uri, token);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* volname */
> +    token = strtok_r(NULL, "/", &saveptr);
> +    if (!token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    uri->volname = g_strdup(token);
> +
> +    /* image */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    if (!token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    uri->image = g_strdup(token);

Image file whose names contain a question mark cannot be used. Probably
tolerable.

> +
> +    /* transport */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    ret = parse_transport(uri, token);
> +    if (ret < 0) {
> +        goto out;
> +     }
> +
> +    /* Flag error for extra options */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    if (token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    ret = 0;
> +out:
> +    g_free(r);
> +    return ret;
> +}
> +
> +static struct glfs *qemu_gluster_init(GlusterURI *uri, const char *filename)
> +{
> +    struct glfs *glfs = NULL;
> +    int ret;
> +
> +    ret = qemu_gluster_parseuri(uri, filename);
> +    if (ret < 0) {
> +        error_report("Usage: file=gluster://server[:port]/volname/image"
> +            "[?transport=socket]");

Is 'socket' really the only valid transport and will it stay like this
without changes to qemu?

> +        errno = -ret;
> +        goto out;
> +    }
> +
> +    glfs = glfs_new(uri->volname);
> +    if (!glfs) {
> +        goto out;
> +    }
> +
> +    ret = glfs_set_volfile_server(glfs, uri->transport, uri->server,
> +        uri->port);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /*
> +     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
> +     * GlusterFS exports it in a header.
> +     */
> +    ret = glfs_set_logging(glfs, "-", 4);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    ret = glfs_init(glfs);
> +    if (ret) {
> +        error_report("Gluster connection failed for server=%s port=%d "
> +             "volume=%s image=%s transport=%s\n", uri->server, uri->port,
> +             uri->volname, uri->image, uri->transport);
> +        goto out;
> +    }
> +    return glfs;
> +
> +out:
> +    if (glfs) {
> +        glfs_fini(glfs);
> +    }
> +    return NULL;
> +}
> +
> +static void qemu_gluster_complete_aio(GlusterAIOCB *acb)
> +{
> +    int ret;
> +
> +    if (acb->canceled) {
> +        qemu_aio_release(acb);
> +        return;
> +    }
> +
> +    if (acb->ret == acb->size) {
> +        ret = 0; /* Success */
> +    } else if (acb->ret < 0) {
> +        ret = acb->ret; /* Read/Write failed */
> +    } else {
> +        ret = -EIO; /* Partial read/write - fail it */
> +    }
> +    acb->common.cb(acb->common.opaque, ret);
> +    qemu_aio_release(acb);
> +}
> +
> +static void qemu_gluster_aio_event_reader(void *opaque)
> +{
> +    BDRVGlusterState *s = opaque;
> +    GlusterAIOCB *event_acb;
> +    int event_reader_pos = 0;
> +    ssize_t ret;
> +
> +    do {
> +        char *p = (char *)&event_acb;
> +
> +        ret = read(s->fds[GLUSTER_FD_READ], p + event_reader_pos,
> +                   sizeof(event_acb) - event_reader_pos);

So you're reading in a pointer address from a pipe? This is fun.

> +        if (ret > 0) {
> +            event_reader_pos += ret;
> +            if (event_reader_pos == sizeof(event_acb)) {
> +                event_reader_pos = 0;
> +                qemu_gluster_complete_aio(event_acb);
> +                s->qemu_aio_count--;
> +            }
> +        }
> +    } while (ret < 0 && errno == EINTR);

In case of a short read the read data is just discarded? Maybe
event_reader_pos was supposed to static?

> +}
> +
> +static int qemu_gluster_aio_flush_cb(void *opaque)
> +{
> +    BDRVGlusterState *s = opaque;
> +
> +    return (s->qemu_aio_count > 0);
> +}
> +
> +static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
> +    int bdrv_flags)
> +{
> +    BDRVGlusterState *s = bs->opaque;
> +    int open_flags = 0;
> +    int ret = 0;
> +    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));
> +
> +    s->glfs = qemu_gluster_init(uri, filename);
> +    if (!s->glfs) {
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    open_flags |=  O_BINARY;
> +    open_flags &= ~O_ACCMODE;
> +    if (bdrv_flags & BDRV_O_RDWR) {
> +        open_flags |= O_RDWR;
> +    } else {
> +        open_flags |= O_RDONLY;
> +    }
> +
> +    if ((bdrv_flags & BDRV_O_NOCACHE)) {
> +        open_flags |= O_DIRECT;
> +    }
> +
> +    s->fd = glfs_open(s->glfs, uri->image, open_flags);
> +    if (!s->fd) {
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    ret = qemu_pipe(s->fds);
> +    if (ret < 0) {

ret = -errno;

> +        goto out;
> +    }
> +    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
> +    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
> +    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
> +        qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
> +
> +out:
> +    qemu_gluster_uri_free(uri);
> +    if (!ret) {
> +        return ret;
> +    }
> +    if (s->fd) {
> +        glfs_close(s->fd);
> +    }
> +    if (s->glfs) {
> +        glfs_fini(s->glfs);
> +    }
> +    return ret;
> +}
> +
> +static int qemu_gluster_create(const char *filename,
> +        QEMUOptionParameter *options)
> +{
> +    struct glfs *glfs;
> +    struct glfs_fd *fd;
> +    int ret = 0;
> +    int64_t total_size = 0;
> +    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));
> +
> +    glfs = qemu_gluster_init(uri, filename);
> +    if (!glfs) {
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            total_size = options->value.n / BDRV_SECTOR_SIZE;
> +        }
> +        options++;
> +    }
> +
> +    fd = glfs_creat(glfs, uri->image,
> +        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
> +    if (!fd) {
> +        ret = -errno;
> +    } else {
> +        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
> +            ret = -errno;
> +        }
> +        if (glfs_close(fd) != 0) {
> +            ret = -errno;
> +        }
> +    }
> +out:
> +    qemu_gluster_uri_free(uri);
> +    if (glfs) {
> +        glfs_fini(glfs);
> +    }
> +    return ret;
> +}
> +
> +static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
> +{
> +    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
> +
> +    acb->common.cb(acb->common.opaque, -ECANCELED);
> +    acb->canceled = true;
> +}

After having called acb->common.cb you must not write any longer to the
memory pointed at by the qiov. Either you can really cancel the request,
or you need to wait until it completes.

> +
> +static AIOPool gluster_aio_pool = {
> +    .aiocb_size = sizeof(GlusterAIOCB),
> +    .cancel = qemu_gluster_aio_cancel,
> +};
> +
> +static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb)
> +{
> +    int ret = 0;
> +    while (1) {
> +        fd_set wfd;
> +        int fd = s->fds[GLUSTER_FD_WRITE];
> +
> +        ret = write(fd, (void *)&acb, sizeof(acb));
> +        if (ret >= 0) {
> +            break;
> +        }
> +        if (errno == EINTR) {
> +            continue;
> +        }
> +        if (errno != EAGAIN) {
> +            break;
> +        }
> +
> +        FD_ZERO(&wfd);
> +        FD_SET(fd, &wfd);
> +        do {
> +            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
> +        } while (ret < 0 && errno == EINTR);

What's the idea behind this? While we're hanging in this loop noone will
read anything from the pipe, so it's unlikely that it magically becomes
ready.

> +    }
> +    return ret;
> +}
> +
> +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
> +{
> +    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
> +    BDRVGlusterState *s = acb->common.bs->opaque;
> +
> +    acb->ret = ret;
> +    if (qemu_gluster_send_pipe(s, acb) < 0) {
> +        /*
> +         * Gluster AIO callback thread failed to notify the waiting
> +         * QEMU thread about IO completion. Nothing much can be done
> +         * here but to abruptly abort.
> +         *
> +         * FIXME: Check if the read side of the fd handler can somehow
> +         * be notified of this failure paving the way for a graceful exit.
> +         */
> +        error_report("Gluster failed to notify QEMU about IO completion");
> +        abort();

In the extreme case you may choose to make this disk inaccessible
(something like bs->drv = NULL), but abort() kills the whole VM and
should only be called when there is a bug.

> +    }
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
> +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
> +        BlockDriverCompletionFunc *cb, void *opaque, int write)
> +{
> +    int ret;
> +    GlusterAIOCB *acb;
> +    BDRVGlusterState *s = bs->opaque;
> +    size_t size;
> +    off_t offset;
> +
> +    offset = sector_num * BDRV_SECTOR_SIZE;
> +    size = nb_sectors * BDRV_SECTOR_SIZE;
> +    s->qemu_aio_count++;
> +
> +    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
> +    acb->size = size;
> +    acb->ret = 0;
> +    acb->canceled = false;
> +
> +    if (write) {
> +        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
> +            &gluster_finish_aiocb, acb);
> +    } else {
> +        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
> +            &gluster_finish_aiocb, acb);
> +    }
> +
> +    if (ret < 0) {
> +        goto out;
> +    }
> +    return &acb->common;
> +
> +out:
> +    s->qemu_aio_count--;
> +    qemu_aio_release(acb);
> +    return NULL;
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
> +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
> +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    int ret;
> +    GlusterAIOCB *acb;
> +    BDRVGlusterState *s = bs->opaque;
> +
> +    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
> +    acb->size = 0;
> +    acb->ret = 0;
> +    acb->canceled = false;
> +    s->qemu_aio_count++;
> +
> +    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +    return &acb->common;
> +
> +out:
> +    s->qemu_aio_count--;
> +    qemu_aio_release(acb);
> +    return NULL;
> +}
> +
> +static int64_t qemu_gluster_getlength(BlockDriverState *bs)
> +{
> +    BDRVGlusterState *s = bs->opaque;
> +    struct stat st;
> +    int ret;
> +
> +    ret = glfs_fstat(s->fd, &st);
> +    if (ret < 0) {
> +        return -errno;
> +    } else {
> +        return st.st_size;
> +    }
> +}
> +
> +static void qemu_gluster_close(BlockDriverState *bs)
> +{
> +    BDRVGlusterState *s = bs->opaque;
> +
> +    if (s->fd) {
> +        glfs_close(s->fd);
> +        s->fd = NULL;
> +    }
> +    glfs_fini(s->glfs);
> +}
> +
> +static QEMUOptionParameter qemu_gluster_create_options[] = {
> +    {
> +        .name = BLOCK_OPT_SIZE,
> +        .type = OPT_SIZE,
> +        .help = "Virtual disk size"
> +    },
> +    { NULL }
> +};
> +
> +static BlockDriver bdrv_gluster = {
> +    .format_name = "gluster",
> +    .protocol_name = "gluster",
> +    .instance_size = sizeof(BDRVGlusterState),
> +    .bdrv_file_open = qemu_gluster_open,
> +    .bdrv_close = qemu_gluster_close,
> +    .bdrv_create = qemu_gluster_create,
> +    .bdrv_getlength = qemu_gluster_getlength,
> +
> +    .bdrv_aio_readv = qemu_gluster_aio_readv,
> +    .bdrv_aio_writev = qemu_gluster_aio_writev,
> +    .bdrv_aio_flush = qemu_gluster_aio_flush,
> +
> +    .create_options = qemu_gluster_create_options,

Please align the = on the same column.

> +};
> +
> +static void bdrv_gluster_init(void)
> +{
> +    bdrv_register(&bdrv_gluster);
> +}
> +
> +block_init(bdrv_gluster_init);

Kevin
Bharata B Rao Aug. 14, 2012, 4:38 a.m. UTC | #2
Kevin, Thanks for your review. I will address all of your comments
in the next iteration, but have a few questions/comments on the others...

On Mon, Aug 13, 2012 at 02:50:29PM +0200, Kevin Wolf wrote:
> > +static int parse_server(GlusterURI *uri, char *server)
> > +{
> > +    int ret = -EINVAL;
> > +    char *token, *saveptr;
> > +    char *p, *q = server;
> > +
> > +    p = strchr(server, '[');
> > +    if (p) {
> > +        /* [ipv6] */
> > +        if (p != server) {
> > +            /* [ not in the beginning */
> > +            goto out;
> > +        }
> > +        q++;
> > +        p = strrchr(p, ']');
> > +        if (!p) {
> > +            /* No matching ] */
> > +            goto out;
> > +        }
> > +        *p++ = '\0';
> > +        uri->server = g_strdup(q);
> > +
> > +        if (*p) {
> > +            if (*p != ':') {
> > +                /* [ipv6] followed by something other than : */
> > +                goto out;
> > +            }
> > +            uri->port = strtoul(++p, NULL, 0);
> > +            if (uri->port < 0) {
> > +                goto out;
> > +            }
> 
> This accepts inputs where the colon isn't followed by any number.

Yes, and that will result in port=0, which is default. So this is to
cater for cases like gluster://[1:2:3:4:5]:/volname/image

In any case, let me see if I can get rid of this altogether and reuse
qemu-sockets.c:inet_parse().

> > +        if (token) {
> > +            uri->port = strtoul(token, NULL, 0);
> > +            if (uri->port < 0) {
> > +                goto out;
> > +            }
> > +        } else {
> > +            uri->port = 0;
> > +        }
> 
> The port parsing code is duplicated in IPv4 and IPv6, even though it's
> really the same.

Being such a small piece of code, I didn't think it deserves to be made a
function on its own and re-used. But if you really want it that way, I can do.

> > +static struct glfs *qemu_gluster_init(GlusterURI *uri, const char *filename)
> > +{
> > +    struct glfs *glfs = NULL;
> > +    int ret;
> > +
> > +    ret = qemu_gluster_parseuri(uri, filename);
> > +    if (ret < 0) {
> > +        error_report("Usage: file=gluster://server[:port]/volname/image"
> > +            "[?transport=socket]");
> 
> Is 'socket' really the only valid transport and will it stay like this
> without changes to qemu?

There are others like 'unix' and 'rdma'. I will fix this error message to
reflect that.

However QEMU needn't change for such transport types because I am not
interpreting the transport type in QEMU but instead passing it on directly
to GlusterFS.

> > +
> > +static void qemu_gluster_aio_event_reader(void *opaque)
> > +{
> > +    BDRVGlusterState *s = opaque;
> > +    GlusterAIOCB *event_acb;
> > +    int event_reader_pos = 0;
> > +    ssize_t ret;
> > +
> > +    do {
> > +        char *p = (char *)&event_acb;
> > +
> > +        ret = read(s->fds[GLUSTER_FD_READ], p + event_reader_pos,
> > +                   sizeof(event_acb) - event_reader_pos);
> 
> So you're reading in a pointer address from a pipe? This is fun.
> 
> > +        if (ret > 0) {
> > +            event_reader_pos += ret;
> > +            if (event_reader_pos == sizeof(event_acb)) {
> > +                event_reader_pos = 0;
> > +                qemu_gluster_complete_aio(event_acb);
> > +                s->qemu_aio_count--;
> > +            }
> > +        }
> > +    } while (ret < 0 && errno == EINTR);
> 
> In case of a short read the read data is just discarded? Maybe
> event_reader_pos was supposed to static?

In earlier versions event_reader_pos was part of BDRVGlusterState and I made
it local in subsequent versions and that is causing this problem. Will fix.

> > +
> > +static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
> > +{
> > +    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
> > +
> > +    acb->common.cb(acb->common.opaque, -ECANCELED);
> > +    acb->canceled = true;
> > +}
> 
> After having called acb->common.cb you must not write any longer to the
> memory pointed at by the qiov. Either you can really cancel the request,
> or you need to wait until it completes.

I don't think I can cancel the request that has already been sent to
gluster server. block/qed.c introduces acb->finished and waits on it in
qed_aio_canel. Do you suggest I follow that model ?

> 
> > +
> > +static AIOPool gluster_aio_pool = {
> > +    .aiocb_size = sizeof(GlusterAIOCB),
> > +    .cancel = qemu_gluster_aio_cancel,
> > +};
> > +
> > +static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb)
> > +{
> > +    int ret = 0;
> > +    while (1) {
> > +        fd_set wfd;
> > +        int fd = s->fds[GLUSTER_FD_WRITE];
> > +
> > +        ret = write(fd, (void *)&acb, sizeof(acb));
> > +        if (ret >= 0) {
> > +            break;
> > +        }
> > +        if (errno == EINTR) {
> > +            continue;
> > +        }
> > +        if (errno != EAGAIN) {
> > +            break;
> > +        }
> > +
> > +        FD_ZERO(&wfd);
> > +        FD_SET(fd, &wfd);
> > +        do {
> > +            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
> > +        } while (ret < 0 && errno == EINTR);
> 
> What's the idea behind this? While we're hanging in this loop noone will
> read anything from the pipe, so it's unlikely that it magically becomes
> ready.

I write to the pipe and wait for the reader to read it. The reader
(qemu_gluster_aio_event_reader) is already waiting on the other end of the
pipe. 

> 
> > +    }
> > +    return ret;
> > +}
> > +
> > +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
> > +{
> > +    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
> > +    BDRVGlusterState *s = acb->common.bs->opaque;
> > +
> > +    acb->ret = ret;
> > +    if (qemu_gluster_send_pipe(s, acb) < 0) {
> > +        /*
> > +         * Gluster AIO callback thread failed to notify the waiting
> > +         * QEMU thread about IO completion. Nothing much can be done
> > +         * here but to abruptly abort.
> > +         *
> > +         * FIXME: Check if the read side of the fd handler can somehow
> > +         * be notified of this failure paving the way for a graceful exit.
> > +         */
> > +        error_report("Gluster failed to notify QEMU about IO completion");
> > +        abort();
> 
> In the extreme case you may choose to make this disk inaccessible
> (something like bs->drv = NULL), but abort() kills the whole VM and
> should only be called when there is a bug.

There have been concerns raised about this earlier too. I settled for this
since I couldn't see a better way out and I could see the precedence
for this in posix-aio-compat.c

So I could just do the necessary cleanup, set bs->drv to NULL and return from
here ? But how do I wake up the QEMU thread that is waiting on the read side
of the pipe ? W/o that, the QEMU thread that waits on the read side of the
pipe is still hung.

Regards,
Bharata.
Kevin Wolf Aug. 14, 2012, 8:29 a.m. UTC | #3
Am 14.08.2012 06:38, schrieb Bharata B Rao:
> Kevin, Thanks for your review. I will address all of your comments
> in the next iteration, but have a few questions/comments on the others...
> 
> On Mon, Aug 13, 2012 at 02:50:29PM +0200, Kevin Wolf wrote:
>>> +static int parse_server(GlusterURI *uri, char *server)
>>> +{
>>> +    int ret = -EINVAL;
>>> +    char *token, *saveptr;
>>> +    char *p, *q = server;
>>> +
>>> +    p = strchr(server, '[');
>>> +    if (p) {
>>> +        /* [ipv6] */
>>> +        if (p != server) {
>>> +            /* [ not in the beginning */
>>> +            goto out;
>>> +        }
>>> +        q++;
>>> +        p = strrchr(p, ']');
>>> +        if (!p) {
>>> +            /* No matching ] */
>>> +            goto out;
>>> +        }
>>> +        *p++ = '\0';
>>> +        uri->server = g_strdup(q);
>>> +
>>> +        if (*p) {
>>> +            if (*p != ':') {
>>> +                /* [ipv6] followed by something other than : */
>>> +                goto out;
>>> +            }
>>> +            uri->port = strtoul(++p, NULL, 0);
>>> +            if (uri->port < 0) {
>>> +                goto out;
>>> +            }
>>
>> This accepts inputs where the colon isn't followed by any number.
> 
> Yes, and that will result in port=0, which is default. So this is to
> cater for cases like gluster://[1:2:3:4:5]:/volname/image

So you consider this a valid URL? I would have expected it to invalid.
But let me see, there must be some official definition of an URL...

Alright, so RFC 2234 says that having no digits after the colon is
valid. It also says that you shouldn't generate such URLs. And it
doesn't say what it means when it's there... Common interpretation seems
to be that it's treated as if it wasn't specified, i.e. the default port
for the schema is used.

So if 0 is the default port for glusterfs, your code looks okay. But it
doesn't seem to be a very useful default port number.

> In any case, let me see if I can get rid of this altogether and reuse
> qemu-sockets.c:inet_parse().

Cool, thanks!

>>> +        if (token) {
>>> +            uri->port = strtoul(token, NULL, 0);
>>> +            if (uri->port < 0) {
>>> +                goto out;
>>> +            }
>>> +        } else {
>>> +            uri->port = 0;
>>> +        }
>>
>> The port parsing code is duplicated in IPv4 and IPv6, even though it's
>> really the same.
> 
> Being such a small piece of code, I didn't think it deserves to be made a
> function on its own and re-used. But if you really want it that way, I can do.

It's not worth a separate function, but it can just be code outside the
if statement.

>>> +static struct glfs *qemu_gluster_init(GlusterURI *uri, const char *filename)
>>> +{
>>> +    struct glfs *glfs = NULL;
>>> +    int ret;
>>> +
>>> +    ret = qemu_gluster_parseuri(uri, filename);
>>> +    if (ret < 0) {
>>> +        error_report("Usage: file=gluster://server[:port]/volname/image"
>>> +            "[?transport=socket]");
>>
>> Is 'socket' really the only valid transport and will it stay like this
>> without changes to qemu?
> 
> There are others like 'unix' and 'rdma'. I will fix this error message to
> reflect that.
> 
> However QEMU needn't change for such transport types because I am not
> interpreting the transport type in QEMU but instead passing it on directly
> to GlusterFS.

Maybe then just specify "[?transport=...]" instead of giving a specific
option value?

>>> +
>>> +static void qemu_gluster_aio_event_reader(void *opaque)
>>> +{
>>> +    BDRVGlusterState *s = opaque;
>>> +    GlusterAIOCB *event_acb;
>>> +    int event_reader_pos = 0;
>>> +    ssize_t ret;
>>> +
>>> +    do {
>>> +        char *p = (char *)&event_acb;
>>> +
>>> +        ret = read(s->fds[GLUSTER_FD_READ], p + event_reader_pos,
>>> +                   sizeof(event_acb) - event_reader_pos);
>>
>> So you're reading in a pointer address from a pipe? This is fun.
>>
>>> +        if (ret > 0) {
>>> +            event_reader_pos += ret;
>>> +            if (event_reader_pos == sizeof(event_acb)) {
>>> +                event_reader_pos = 0;
>>> +                qemu_gluster_complete_aio(event_acb);
>>> +                s->qemu_aio_count--;
>>> +            }
>>> +        }
>>> +    } while (ret < 0 && errno == EINTR);
>>
>> In case of a short read the read data is just discarded? Maybe
>> event_reader_pos was supposed to static?
> 
> In earlier versions event_reader_pos was part of BDRVGlusterState and I made
> it local in subsequent versions and that is causing this problem. Will fix.

Ah, yes, it needs to be in BDRVGlusterState, static doesn't work when
you have multiple gluster drives.

>>> +
>>> +static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
>>> +{
>>> +    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
>>> +
>>> +    acb->common.cb(acb->common.opaque, -ECANCELED);
>>> +    acb->canceled = true;
>>> +}
>>
>> After having called acb->common.cb you must not write any longer to the
>> memory pointed at by the qiov. Either you can really cancel the request,
>> or you need to wait until it completes.
> 
> I don't think I can cancel the request that has already been sent to
> gluster server. block/qed.c introduces acb->finished and waits on it in
> qed_aio_canel. Do you suggest I follow that model ?

This is a possible solution, yes.

>>> +
>>> +static AIOPool gluster_aio_pool = {
>>> +    .aiocb_size = sizeof(GlusterAIOCB),
>>> +    .cancel = qemu_gluster_aio_cancel,
>>> +};
>>> +
>>> +static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb)
>>> +{
>>> +    int ret = 0;
>>> +    while (1) {
>>> +        fd_set wfd;
>>> +        int fd = s->fds[GLUSTER_FD_WRITE];
>>> +
>>> +        ret = write(fd, (void *)&acb, sizeof(acb));
>>> +        if (ret >= 0) {
>>> +            break;
>>> +        }
>>> +        if (errno == EINTR) {
>>> +            continue;
>>> +        }
>>> +        if (errno != EAGAIN) {
>>> +            break;
>>> +        }
>>> +
>>> +        FD_ZERO(&wfd);
>>> +        FD_SET(fd, &wfd);
>>> +        do {
>>> +            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
>>> +        } while (ret < 0 && errno == EINTR);
>>
>> What's the idea behind this? While we're hanging in this loop noone will
>> read anything from the pipe, so it's unlikely that it magically becomes
>> ready.
> 
> I write to the pipe and wait for the reader to read it. The reader
> (qemu_gluster_aio_event_reader) is already waiting on the other end of the
> pipe.

qemu_gluster_aio_even_reader() isn't called while we're looping here. It
will only be called from the main loop, after this function has returned.

>>
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>> +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
>>> +{
>>> +    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
>>> +    BDRVGlusterState *s = acb->common.bs->opaque;
>>> +
>>> +    acb->ret = ret;
>>> +    if (qemu_gluster_send_pipe(s, acb) < 0) {
>>> +        /*
>>> +         * Gluster AIO callback thread failed to notify the waiting
>>> +         * QEMU thread about IO completion. Nothing much can be done
>>> +         * here but to abruptly abort.
>>> +         *
>>> +         * FIXME: Check if the read side of the fd handler can somehow
>>> +         * be notified of this failure paving the way for a graceful exit.
>>> +         */
>>> +        error_report("Gluster failed to notify QEMU about IO completion");
>>> +        abort();
>>
>> In the extreme case you may choose to make this disk inaccessible
>> (something like bs->drv = NULL), but abort() kills the whole VM and
>> should only be called when there is a bug.
> 
> There have been concerns raised about this earlier too. I settled for this
> since I couldn't see a better way out and I could see the precedence
> for this in posix-aio-compat.c
> 
> So I could just do the necessary cleanup, set bs->drv to NULL and return from
> here ? But how do I wake up the QEMU thread that is waiting on the read side
> of the pipe ? W/o that, the QEMU thread that waits on the read side of the
> pipe is still hung.

There is no other thread. But you're right, you should probably
unregister the aio_fd_handler and any other pending callbacks.

Kevin
Bharata B Rao Aug. 14, 2012, 9:34 a.m. UTC | #4
On Tue, Aug 14, 2012 at 10:29:26AM +0200, Kevin Wolf wrote:
> > 
> > Yes, and that will result in port=0, which is default. So this is to
> > cater for cases like gluster://[1:2:3:4:5]:/volname/image
> 
> So you consider this a valid URL? I would have expected it to invalid.
> But let me see, there must be some official definition of an URL...
> 
> Alright, so RFC 2234 says that having no digits after the colon is
> valid. It also says that you shouldn't generate such URLs. And it
> doesn't say what it means when it's there... Common interpretation seems
> to be that it's treated as if it wasn't specified, i.e. the default port
> for the schema is used.
> 
> So if 0 is the default port for glusterfs, your code looks okay. But it
> doesn't seem to be a very useful default port number.

I know, but gluster prefers to be called with port=0 which will be interpreted
as "default" by it.

While we are at this, let me bring out another issue. Gluster supports 3
transport types:

- socket in which case the server will be hostname, ipv4 or ipv4 address.
- rdma in which case server will be interpreted similar to socket.
- unix in which case server will be a path to unix domain socket and this
  will look like any other filesystem path. (Eg. /tmp/glusterd.socket)

I don't think we can fit 'unix' within the standard URI scheme (RFC 3986)
easily, but I am planning to specify the 'unix' transport as below:

gluster://[/path/to/unix/domain/socket]/volname/image?transport=unix

i,e., I am asking the user to put the unix domain socket path within
square brackets when transport type is unix.

Do you think this is fine ?

> >> Is 'socket' really the only valid transport and will it stay like this
> >> without changes to qemu?
> > 
> > There are others like 'unix' and 'rdma'. I will fix this error message to
> > reflect that.
> > 
> > However QEMU needn't change for such transport types because I am not
> > interpreting the transport type in QEMU but instead passing it on directly
> > to GlusterFS.
> 
> Maybe then just specify "[?transport=...]" instead of giving a specific
> option value?

Sure, but as I noted above, let me finalize on how to specify 'unix'
transport type.

> >>> +static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb)
> >>> +{
> >>> +    int ret = 0;
> >>> +    while (1) {
> >>> +        fd_set wfd;
> >>> +        int fd = s->fds[GLUSTER_FD_WRITE];
> >>> +
> >>> +        ret = write(fd, (void *)&acb, sizeof(acb));
> >>> +        if (ret >= 0) {
> >>> +            break;
> >>> +        }
> >>> +        if (errno == EINTR) {
> >>> +            continue;
> >>> +        }
> >>> +        if (errno != EAGAIN) {
> >>> +            break;
> >>> +        }
> >>> +
> >>> +        FD_ZERO(&wfd);
> >>> +        FD_SET(fd, &wfd);
> >>> +        do {
> >>> +            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
> >>> +        } while (ret < 0 && errno == EINTR);
> >>
> >> What's the idea behind this? While we're hanging in this loop noone will
> >> read anything from the pipe, so it's unlikely that it magically becomes
> >> ready.
> > 
> > I write to the pipe and wait for the reader to read it. The reader
> > (qemu_gluster_aio_event_reader) is already waiting on the other end of the
> > pipe.
> 
> qemu_gluster_aio_even_reader() isn't called while we're looping here. It
> will only be called from the main loop, after this function has returned.

May be I am not understanding you correctly here. Let me be a bit verbose.

This routine is called by aio callback routine that we registered to be
called by gluster after aio completion. Hence this routine will be called
in the context of a separate (gluster) thread. This routine will write to
the pipe and wait until the data is read from the read-end of it.

As soon as the data is available to be read from this pipe, I think the
routine registered for reading (qemu_gluster_aio_event_reader) would be
called which will further handle the AIO completion. As per my understanding,
the original co-routine thread that initiated the aio read or write request
would be blocked on qemu_aio_wait(). That thread would wake up and run
qemu_gluster_aio_event_reader().

So I am not clear why qemu_gluster_aio_even_reader() won't be called until
this routine (qemu_gluster_send_pipe) returns.

Regards,
Bharata.
Kevin Wolf Aug. 14, 2012, 9:58 a.m. UTC | #5
Am 14.08.2012 11:34, schrieb Bharata B Rao:
> On Tue, Aug 14, 2012 at 10:29:26AM +0200, Kevin Wolf wrote:
>>>
>>> Yes, and that will result in port=0, which is default. So this is to
>>> cater for cases like gluster://[1:2:3:4:5]:/volname/image
>>
>> So you consider this a valid URL? I would have expected it to invalid.
>> But let me see, there must be some official definition of an URL...
>>
>> Alright, so RFC 2234 says that having no digits after the colon is
>> valid. It also says that you shouldn't generate such URLs. And it
>> doesn't say what it means when it's there... Common interpretation seems
>> to be that it's treated as if it wasn't specified, i.e. the default port
>> for the schema is used.
>>
>> So if 0 is the default port for glusterfs, your code looks okay. But it
>> doesn't seem to be a very useful default port number.
> 
> I know, but gluster prefers to be called with port=0 which will be interpreted
> as "default" by it.

Ok, that makes sense.

> While we are at this, let me bring out another issue. Gluster supports 3
> transport types:
> 
> - socket in which case the server will be hostname, ipv4 or ipv4 address.
> - rdma in which case server will be interpreted similar to socket.
> - unix in which case server will be a path to unix domain socket and this
>   will look like any other filesystem path. (Eg. /tmp/glusterd.socket)
> 
> I don't think we can fit 'unix' within the standard URI scheme (RFC 3986)
> easily, but I am planning to specify the 'unix' transport as below:
> 
> gluster://[/path/to/unix/domain/socket]/volname/image?transport=unix
> 
> i,e., I am asking the user to put the unix domain socket path within
> square brackets when transport type is unix.
> 
> Do you think this is fine ?

Never saw something like this before, but it does seem reasonable to me.
Excludes ] from the valid characters in the file name of the socket, but
that shouldn't be a problem in practice.

>>>>> +static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb)
>>>>> +{
>>>>> +    int ret = 0;
>>>>> +    while (1) {
>>>>> +        fd_set wfd;
>>>>> +        int fd = s->fds[GLUSTER_FD_WRITE];
>>>>> +
>>>>> +        ret = write(fd, (void *)&acb, sizeof(acb));
>>>>> +        if (ret >= 0) {
>>>>> +            break;
>>>>> +        }
>>>>> +        if (errno == EINTR) {
>>>>> +            continue;
>>>>> +        }
>>>>> +        if (errno != EAGAIN) {
>>>>> +            break;
>>>>> +        }
>>>>> +
>>>>> +        FD_ZERO(&wfd);
>>>>> +        FD_SET(fd, &wfd);
>>>>> +        do {
>>>>> +            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
>>>>> +        } while (ret < 0 && errno == EINTR);
>>>>
>>>> What's the idea behind this? While we're hanging in this loop noone will
>>>> read anything from the pipe, so it's unlikely that it magically becomes
>>>> ready.
>>>
>>> I write to the pipe and wait for the reader to read it. The reader
>>> (qemu_gluster_aio_event_reader) is already waiting on the other end of the
>>> pipe.
>>
>> qemu_gluster_aio_even_reader() isn't called while we're looping here. It
>> will only be called from the main loop, after this function has returned.
> 
> May be I am not understanding you correctly here. Let me be a bit verbose.
> [...]

Sorry, my mistake. I was assuming that this code runs in a thread
created by qemu, which isn't true. Your explanation makes perfect sense.

Kevin
Bharata B Rao Aug. 15, 2012, 5:21 a.m. UTC | #6
On Tue, Aug 14, 2012 at 10:29:26AM +0200, Kevin Wolf wrote:
> >>> +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
> >>> +{
> >>> +    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
> >>> +    BDRVGlusterState *s = acb->common.bs->opaque;
> >>> +
> >>> +    acb->ret = ret;
> >>> +    if (qemu_gluster_send_pipe(s, acb) < 0) {
> >>> +        /*
> >>> +         * Gluster AIO callback thread failed to notify the waiting
> >>> +         * QEMU thread about IO completion. Nothing much can be done
> >>> +         * here but to abruptly abort.
> >>> +         *
> >>> +         * FIXME: Check if the read side of the fd handler can somehow
> >>> +         * be notified of this failure paving the way for a graceful exit.
> >>> +         */
> >>> +        error_report("Gluster failed to notify QEMU about IO completion");
> >>> +        abort();
> >>
> >> In the extreme case you may choose to make this disk inaccessible
> >> (something like bs->drv = NULL), but abort() kills the whole VM and
> >> should only be called when there is a bug.
> > 
> > There have been concerns raised about this earlier too. I settled for this
> > since I couldn't see a better way out and I could see the precedence
> > for this in posix-aio-compat.c
> > 
> > So I could just do the necessary cleanup, set bs->drv to NULL and return from
> > here ? But how do I wake up the QEMU thread that is waiting on the read side
> > of the pipe ? W/o that, the QEMU thread that waits on the read side of the
> > pipe is still hung.
> 
> There is no other thread. But you're right, you should probably
> unregister the aio_fd_handler and any other pending callbacks.

As I clarified in the other mail, this (gluster_finish_aiocb) is called
from gluster thread context and hence QEMU thread that raised the original
read/write request is still blocked on qemu_aio_wait().

I tried the following cleanup instead of abrupt abort:

close(read_fd); /* This will wake up the QEMU thread blocked on select(read_fd...) */
close(write_fd);
qemu_aio_set_fd_handler(read_fd, NULL, NULL, NULL, NULL);
qemu_aio_release(acb);
s->qemu_aio_count--;
bs->drv = NULL;

I tested this by manually injecting faults into qemu_gluster_send_pipe().
With the above cleanup, the guest kernel crashes with IO errors.

Is there anything else that I need to do or do differently to retain the
VM running w/o disk access ?

I thought of completing the aio callback by doing
acb->common.cb(acb->common.opaque, -EIO);
but that would do a coroutine enter from gluster thread, which I don't think
should be done.

Regards,
Bharata.
Kevin Wolf Aug. 15, 2012, 8 a.m. UTC | #7
Am 15.08.2012 07:21, schrieb Bharata B Rao:
> On Tue, Aug 14, 2012 at 10:29:26AM +0200, Kevin Wolf wrote:
>>>>> +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
>>>>> +{
>>>>> +    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
>>>>> +    BDRVGlusterState *s = acb->common.bs->opaque;
>>>>> +
>>>>> +    acb->ret = ret;
>>>>> +    if (qemu_gluster_send_pipe(s, acb) < 0) {
>>>>> +        /*
>>>>> +         * Gluster AIO callback thread failed to notify the waiting
>>>>> +         * QEMU thread about IO completion. Nothing much can be done
>>>>> +         * here but to abruptly abort.
>>>>> +         *
>>>>> +         * FIXME: Check if the read side of the fd handler can somehow
>>>>> +         * be notified of this failure paving the way for a graceful exit.
>>>>> +         */
>>>>> +        error_report("Gluster failed to notify QEMU about IO completion");
>>>>> +        abort();
>>>>
>>>> In the extreme case you may choose to make this disk inaccessible
>>>> (something like bs->drv = NULL), but abort() kills the whole VM and
>>>> should only be called when there is a bug.
>>>
>>> There have been concerns raised about this earlier too. I settled for this
>>> since I couldn't see a better way out and I could see the precedence
>>> for this in posix-aio-compat.c
>>>
>>> So I could just do the necessary cleanup, set bs->drv to NULL and return from
>>> here ? But how do I wake up the QEMU thread that is waiting on the read side
>>> of the pipe ? W/o that, the QEMU thread that waits on the read side of the
>>> pipe is still hung.
>>
>> There is no other thread. But you're right, you should probably
>> unregister the aio_fd_handler and any other pending callbacks.
> 
> As I clarified in the other mail, this (gluster_finish_aiocb) is called
> from gluster thread context and hence QEMU thread that raised the original
> read/write request is still blocked on qemu_aio_wait().
> 
> I tried the following cleanup instead of abrupt abort:
> 
> close(read_fd); /* This will wake up the QEMU thread blocked on select(read_fd...) */
> close(write_fd);
> qemu_aio_set_fd_handler(read_fd, NULL, NULL, NULL, NULL);
> qemu_aio_release(acb);
> s->qemu_aio_count--;
> bs->drv = NULL;
> 
> I tested this by manually injecting faults into qemu_gluster_send_pipe().
> With the above cleanup, the guest kernel crashes with IO errors.

What does "crash" really mean? IO errors certainly shouldn't cause a
kernel to crash?

> Is there anything else that I need to do or do differently to retain the
> VM running w/o disk access ?
> 
> I thought of completing the aio callback by doing
> acb->common.cb(acb->common.opaque, -EIO);
> but that would do a coroutine enter from gluster thread, which I don't think
> should be done.

You would have to take the global qemu mutex at least. I agree it's not
a good thing to do.

Kevin
Bharata B Rao Aug. 15, 2012, 8:51 a.m. UTC | #8
On Tue, Aug 14, 2012 at 10:29:26AM +0200, Kevin Wolf wrote:
> Am 14.08.2012 06:38, schrieb Bharata B Rao:
> > Kevin, Thanks for your review. I will address all of your comments
> > in the next iteration, but have a few questions/comments on the others...
> > 
> > On Mon, Aug 13, 2012 at 02:50:29PM +0200, Kevin Wolf wrote:
> >>> +static int parse_server(GlusterURI *uri, char *server)
> >>> +{
> >>> +    int ret = -EINVAL;
> >>> +    char *token, *saveptr;
> >>> +    char *p, *q = server;
> >>> +
> >>> +    p = strchr(server, '[');
> >>> +    if (p) {
> >>> +        /* [ipv6] */
> >>> +        if (p != server) {
> >>> +            /* [ not in the beginning */
> >>> +            goto out;
> >>> +        }
> >>> +        q++;
> >>> +        p = strrchr(p, ']');
> >>> +        if (!p) {
> >>> +            /* No matching ] */
> >>> +            goto out;
> >>> +        }
> >>> +        *p++ = '\0';
> >>> +        uri->server = g_strdup(q);
> >>> +
> >>> +        if (*p) {
> >>> +            if (*p != ':') {
> >>> +                /* [ipv6] followed by something other than : */
> >>> +                goto out;
> >>> +            }
> >>> +            uri->port = strtoul(++p, NULL, 0);
> >>> +            if (uri->port < 0) {
> >>> +                goto out;
> >>> +            }
> >>
> 
> > In any case, let me see if I can get rid of this altogether and reuse
> > qemu-sockets.c:inet_parse().

Using inet_parse() outside of qemu-sockets.c needs some work like
making it non-static, defining QemuOptsList for inet opts and fixing
some issues with inet_parse(like making it successfully parse 'hostname' also
instead of just 'hostname:port').

I will give this a try and send a patch.

Regards,
Bharata.
Bharata B Rao Aug. 15, 2012, 9:22 a.m. UTC | #9
On Wed, Aug 15, 2012 at 10:00:27AM +0200, Kevin Wolf wrote:
> Am 15.08.2012 07:21, schrieb Bharata B Rao:
> > On Tue, Aug 14, 2012 at 10:29:26AM +0200, Kevin Wolf wrote:
> >>>>> +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
> >>>>> +{
> >>>>> +    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
> >>>>> +    BDRVGlusterState *s = acb->common.bs->opaque;
> >>>>> +
> >>>>> +    acb->ret = ret;
> >>>>> +    if (qemu_gluster_send_pipe(s, acb) < 0) {
> >>>>> +        /*
> >>>>> +         * Gluster AIO callback thread failed to notify the waiting
> >>>>> +         * QEMU thread about IO completion. Nothing much can be done
> >>>>> +         * here but to abruptly abort.
> >>>>> +         *
> >>>>> +         * FIXME: Check if the read side of the fd handler can somehow
> >>>>> +         * be notified of this failure paving the way for a graceful exit.
> >>>>> +         */
> >>>>> +        error_report("Gluster failed to notify QEMU about IO completion");
> >>>>> +        abort();
> >>>>
> >>>> In the extreme case you may choose to make this disk inaccessible
> >>>> (something like bs->drv = NULL), but abort() kills the whole VM and
> >>>> should only be called when there is a bug.
> >>>
> >>> There have been concerns raised about this earlier too. I settled for this
> >>> since I couldn't see a better way out and I could see the precedence
> >>> for this in posix-aio-compat.c
> >>>
> >>> So I could just do the necessary cleanup, set bs->drv to NULL and return from
> >>> here ? But how do I wake up the QEMU thread that is waiting on the read side
> >>> of the pipe ? W/o that, the QEMU thread that waits on the read side of the
> >>> pipe is still hung.
> >>
> >> There is no other thread. But you're right, you should probably
> >> unregister the aio_fd_handler and any other pending callbacks.
> > 
> > As I clarified in the other mail, this (gluster_finish_aiocb) is called
> > from gluster thread context and hence QEMU thread that raised the original
> > read/write request is still blocked on qemu_aio_wait().
> > 
> > I tried the following cleanup instead of abrupt abort:
> > 
> > close(read_fd); /* This will wake up the QEMU thread blocked on select(read_fd...) */
> > close(write_fd);
> > qemu_aio_set_fd_handler(read_fd, NULL, NULL, NULL, NULL);
> > qemu_aio_release(acb);
> > s->qemu_aio_count--;
> > bs->drv = NULL;
> > 
> > I tested this by manually injecting faults into qemu_gluster_send_pipe().
> > With the above cleanup, the guest kernel crashes with IO errors.
> 
> What does "crash" really mean? IO errors certainly shouldn't cause a
> kernel to crash?

Since an IO failed, it resulted in root file system corruption which
subsequently led to a panic.

[    1.529042] dracut: Switching root
qemu-system-x86_64: Gluster failed to notify QEMU about IO completion
qemu-system-x86_64: Gluster failed to notify QEMU about IO completion
qemu-system-x86_64: Gluster failed to notify QEMU about IO completion
qemu-system-x86_64: Gluster failed to notify QEMU about IO completion
[    1.584130] end_request: I/O error, dev vda, sector 13615224
[    1.585119] end_request: I/O error, dev vda, sector 13615344
[    1.585119] end_request: I/O error, dev vda, sector 13615352
[    1.585119] end_request: I/O error, dev vda, sector 13615360
[    1.593188] end_request: I/O error, dev vda, sector 1030144
[    1.594169] Buffer I/O error on device vda3, logical block 0
[    1.594169] lost page write due to I/O error on vda3
[    1.594169] EXT4-fs error (device vda3): __ext4_get_inode_loc:3539: inode #392441: block 1573135: comm systemd: unable to read itable block
[...]
[    1.620064] EXT4-fs error (device vda3): __ext4_get_inode_loc:3539: inode #392441: block 1573135: comm systemd: unable to read itable block
/usr/lib/systemd/systemd: error while loading shared libraries: libselinux.so.1: cannot open shared object file: Input/output error
[    1.626193] Kernel panic - not syncing: Attempted to kill init!
[    1.627789] Pid: 1, comm: systemd Not tainted 3.3.4-5.fc17.x86_64 #1
[    1.630063] Call Trace:
[    1.631120]  [<ffffffff815e21eb>] panic+0xba/0x1c6
[    1.632477]  [<ffffffff8105aff1>] do_exit+0x8b1/0x8c0
[    1.633851]  [<ffffffff8105b34f>] do_group_exit+0x3f/0xa0
[    1.635258]  [<ffffffff8105b3c7>] sys_exit_group+0x17/0x20
[    1.636619]  [<ffffffff815f38e9>] system_call_fastpath+0x16/0x1b

> 
> > Is there anything else that I need to do or do differently to retain the
> > VM running w/o disk access ?
> > 
> > I thought of completing the aio callback by doing
> > acb->common.cb(acb->common.opaque, -EIO);
> > but that would do a coroutine enter from gluster thread, which I don't think
> > should be done.
> 
> You would have to take the global qemu mutex at least. I agree it's not
> a good thing to do.

So is it really worth doing all this to handle this unlikely error ? The
chances of this error happening is quite remote I believe.

Regards,
Bharata.
Bharata B Rao Sept. 5, 2012, 7:41 a.m. UTC | #10
On Thu, Aug 09, 2012 at 06:32:16PM +0530, Bharata B Rao wrote:
> +static void qemu_gluster_complete_aio(GlusterAIOCB *acb)
> +{
> +    int ret;
> +
> +    if (acb->canceled) {
> +        qemu_aio_release(acb);
> +        return;
> +    }
> +
> +    if (acb->ret == acb->size) {
> +        ret = 0; /* Success */
> +    } else if (acb->ret < 0) {
> +        ret = acb->ret; /* Read/Write failed */
> +    } else {
> +        ret = -EIO; /* Partial read/write - fail it */
> +    }
> +    acb->common.cb(acb->common.opaque, ret);

The .cb() here is bdrv_co_io_em_complete(). It does qemu_coroutine_enter(),
handles the return value and comes back here.

But if the bdrv_read or bdrv_write or bdrv_flush was called from a
coroutine context (as against they themselves creating a new coroutine),
the above .cb() call above doesn't return to this point. Hence I won't
be able to release the acb and decrement the qemu_aio_count.

What could be the issue here ? In general, how do I ensure that my
aio calls get completed correctly in such scenarios where bdrv_read etc
are called from coroutine context rather than from main thread context ?

Creating qcow2 image would lead to this scenario where
->bdrv_create (=qcow2_create) will create a coroutine and subsequently
read and write are called within qcow2_create in coroutine context itself.

> +    qemu_aio_release(acb);
> +}
> +
> +static void qemu_gluster_aio_event_reader(void *opaque)
> +{
> +    BDRVGlusterState *s = opaque;
> +    GlusterAIOCB *event_acb;
> +    int event_reader_pos = 0;
> +    ssize_t ret;
> +
> +    do {
> +        char *p = (char *)&event_acb;
> +
> +        ret = read(s->fds[GLUSTER_FD_READ], p + event_reader_pos,
> +                   sizeof(event_acb) - event_reader_pos);
> +        if (ret > 0) {
> +            event_reader_pos += ret;
> +            if (event_reader_pos == sizeof(event_acb)) {
> +                event_reader_pos = 0;
> +                qemu_gluster_complete_aio(event_acb);
> +                s->qemu_aio_count--;
> +            }
> +        }
> +    } while (ret < 0 && errno == EINTR);
> +}
> +
Bharata B Rao Sept. 5, 2012, 9:57 a.m. UTC | #11
On Wed, Sep 05, 2012 at 01:11:06PM +0530, Bharata B Rao wrote:
> On Thu, Aug 09, 2012 at 06:32:16PM +0530, Bharata B Rao wrote:
> > +static void qemu_gluster_complete_aio(GlusterAIOCB *acb)
> > +{
> > +    int ret;
> > +
> > +    if (acb->canceled) {
> > +        qemu_aio_release(acb);
> > +        return;
> > +    }
> > +
> > +    if (acb->ret == acb->size) {
> > +        ret = 0; /* Success */
> > +    } else if (acb->ret < 0) {
> > +        ret = acb->ret; /* Read/Write failed */
> > +    } else {
> > +        ret = -EIO; /* Partial read/write - fail it */
> > +    }
> > +    acb->common.cb(acb->common.opaque, ret);
> 
> The .cb() here is bdrv_co_io_em_complete(). It does qemu_coroutine_enter(),
> handles the return value and comes back here.
> 
> But if the bdrv_read or bdrv_write or bdrv_flush was called from a
> coroutine context (as against they themselves creating a new coroutine),
> the above .cb() call above doesn't return to this point. Hence I won't
> be able to release the acb and decrement the qemu_aio_count.
> 
> What could be the issue here ? In general, how do I ensure that my
> aio calls get completed correctly in such scenarios where bdrv_read etc
> are called from coroutine context rather than from main thread context ?

One way to handle this is not to do completion from gluster thread but
instead schedule a BH that does the completion. In fact I had this approach
in the earlier versions, but resorted to directly calling completion from
gluster thread as I didn't see the value of using a BH for completion.
But I guess its indeed needed to support such scenarios (qcow2 image creation
on gluster backend).

Regards,
Bharata.
Kevin Wolf Sept. 5, 2012, 10:01 a.m. UTC | #12
Am 05.09.2012 09:41, schrieb Bharata B Rao:
> On Thu, Aug 09, 2012 at 06:32:16PM +0530, Bharata B Rao wrote:
>> +static void qemu_gluster_complete_aio(GlusterAIOCB *acb)
>> +{
>> +    int ret;
>> +
>> +    if (acb->canceled) {
>> +        qemu_aio_release(acb);
>> +        return;
>> +    }
>> +
>> +    if (acb->ret == acb->size) {
>> +        ret = 0; /* Success */
>> +    } else if (acb->ret < 0) {
>> +        ret = acb->ret; /* Read/Write failed */
>> +    } else {
>> +        ret = -EIO; /* Partial read/write - fail it */
>> +    }
>> +    acb->common.cb(acb->common.opaque, ret);
> 
> The .cb() here is bdrv_co_io_em_complete(). It does qemu_coroutine_enter(),
> handles the return value and comes back here.

Right.

.cb is set by qemu_gluster_aio_rw/flush(), and the only way these can be
called is through bdrv_co_io_em() and bdrv_co_flush(), which both set
bdrv_co_io_em_complete as the callback.

> But if the bdrv_read or bdrv_write or bdrv_flush was called from a
> coroutine context (as against they themselves creating a new coroutine),
> the above .cb() call above doesn't return to this point.

Why?

A coroutine that yields before it's completed must be reentered, no
matter whether it's been created for a single request or if it already
existed.

Conversely, a coroutine that you enter, always yields at some point and
then you return from the qemu_coroutine_enter() and get back to this
line of code.

If you never come back to this point, there's a bug somewhere.

> Hence I won't
> be able to release the acb and decrement the qemu_aio_count.
> 
> What could be the issue here ? In general, how do I ensure that my
> aio calls get completed correctly in such scenarios where bdrv_read etc
> are called from coroutine context rather than from main thread context ?
> 
> Creating qcow2 image would lead to this scenario where
> ->bdrv_create (=qcow2_create) will create a coroutine and subsequently
> read and write are called within qcow2_create in coroutine context itself.

Can you describe in more detail what code paths it's taking and at which
point you're thinking it's wrong?

Kevin
Bharata B Rao Sept. 5, 2012, 10:43 a.m. UTC | #13
On Wed, Sep 05, 2012 at 12:01:58PM +0200, Kevin Wolf wrote:
> Am 05.09.2012 09:41, schrieb Bharata B Rao:
> > On Thu, Aug 09, 2012 at 06:32:16PM +0530, Bharata B Rao wrote:
> >> +static void qemu_gluster_complete_aio(GlusterAIOCB *acb)
> >> +{
> >> +    int ret;
> >> +
> >> +    if (acb->canceled) {
> >> +        qemu_aio_release(acb);
> >> +        return;
> >> +    }
> >> +
> >> +    if (acb->ret == acb->size) {
> >> +        ret = 0; /* Success */
> >> +    } else if (acb->ret < 0) {
> >> +        ret = acb->ret; /* Read/Write failed */
> >> +    } else {
> >> +        ret = -EIO; /* Partial read/write - fail it */
> >> +    }
> >> +    acb->common.cb(acb->common.opaque, ret);
> > 
> > The .cb() here is bdrv_co_io_em_complete(). It does qemu_coroutine_enter(),
> > handles the return value and comes back here.
> 
> Right.
> 
> .cb is set by qemu_gluster_aio_rw/flush(), and the only way these can be
> called is through bdrv_co_io_em() and bdrv_co_flush(), which both set
> bdrv_co_io_em_complete as the callback.

Right.

> 
> > But if the bdrv_read or bdrv_write or bdrv_flush was called from a
> > coroutine context (as against they themselves creating a new coroutine),
> > the above .cb() call above doesn't return to this point.
> 
> Why?

Note that in this particular scenario (qemu-img create -f qcow2), bdrv_read
and bdrv_write are called from the coroutine thread that is running
qcow2_create(). So bdrv_read will find itself running in coroutine context
and hence will continue to use the same coroutine thread.

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    }

The path taken is.

bdrv_rw_co_entry -> bdrv_co_do_readv -> bdrv_co_readv_em -> bdrv_co_io_em
-> qemu_gluster_aio_readv

bdrv_co_io_em does qemu_coroutine_yield() next.

When the AIO is completed, qemu_gluster_complete_aio() is run as the read end
of the pipe becomes ready, so I assume it is in non-coroutine context to start
with. When it does acb->common.cb(), it enters the co-routine which was yielded
by bdrv_co_io_em.

Now the read call returns back and we ultimately end up in bdrv_rw_co_entry
which takes us back to bdrv_read and back to bdrv_pwrite where all this
originated (Note that qcow2_create2 called bdrv_pwrite in the first place).

So I never come back to the next statement in qemu_gluster_complete_aio()
after acb->common.cb(acb->common.opaque, ret). So the coroutine didn't
end and continued futher by issuing another bdrv_write call.

The effect of this is seen next when qcow2_create calls bdrv_close which does
bdrv_drain_all which calls qemu_aio_wait and I never come out of it.
In qemu_aio_wait, node->io_flush(node->opaque) returns a non-zero value
always, because node->io_flush which is qemu_gluster_aio_flush_cb() returns
non zero always. This is happening since I never got a chance to decrement
s->qemu_aio_count which was supposed to happen after qemu_gluster_complete_aio
came back from .cb() call.

So this is what I think is happening, hoping that I got it right.
Note that when I schedule a BH in qemu_gluster_complete_aio(), then
things work fine apparently because I am able to continue and decrement
s->qemu_aio_count.

Regards,
Bharata.
Paolo Bonzini Sept. 6, 2012, 7:23 a.m. UTC | #14
Il 05/09/2012 11:57, Bharata B Rao ha scritto:
>> > What could be the issue here ? In general, how do I ensure that my
>> > aio calls get completed correctly in such scenarios where bdrv_read etc
>> > are called from coroutine context rather than from main thread context ?
> One way to handle this is not to do completion from gluster thread but
> instead schedule a BH that does the completion. In fact I had this approach
> in the earlier versions, but resorted to directly calling completion from
> gluster thread as I didn't see the value of using a BH for completion.
> But I guess its indeed needed to support such scenarios (qcow2 image creation
> on gluster backend).

I think the problem is that we're calling bdrv_drain_all() from
coroutine context.  This is problematic because then the current
coroutine won't yield and won't give other coroutines an occasion to run.

This could be fixed by checking whether we're in coroutine context in
bdrv_drain_all().  If so, instead of draining the queues directly,
schedule a bottom half that does bdrv_drain_all() followed by
qemu_coroutine_enter(), and yield.

If it works, I think this change would be preferrable to using a "magic"
BH in every driver.

Paolo
Paolo Bonzini Sept. 6, 2012, 7:35 a.m. UTC | #15
Il 09/08/2012 15:02, Bharata B Rao ha scritto:
> block: Support GlusterFS as a QEMU block backend.
> 
> From: Bharata B Rao <bharata@linux.vnet.ibm.com>
> 
> This patch adds gluster as the new block backend in QEMU. This gives
> QEMU the ability to boot VM images from gluster volumes. Its already
> possible to boot from VM images on gluster volumes using FUSE mount, but
> this patchset provides the ability to boot VM images from gluster volumes
> by by-passing the FUSE layer in gluster. This is made possible by
> using libgfapi routines to perform IO on gluster volumes directly.
> 
> VM Image on gluster volume is specified like this:
> 
> file=gluster://server:[port]/volname/image[?transport=socket]
> 
> 'gluster' is the protocol.
> 
> 'server' specifies the server where the volume file specification for
> the given volume resides. This can be either hostname or ipv4 address
> or ipv6 address. ipv6 address needs to be with in square brackets [ ].
> 
> port' is the port number on which gluster management daemon (glusterd) is
> listening. This is optional and if not specified, QEMU will send 0 which
> will make libgfapi to use the default port.
> 
> 'volname' is the name of the gluster volume which contains the VM image.
> 
> 'image' is the path to the actual VM image in the gluster volume.
> 
> 'transport' specifies the transport used to connect to glusterd. This is
> optional and if not specified, socket transport is used.
> 
> Examples:
> 
> file=gluster://1.2.3.4/testvol/a.img
> file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
> file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
> file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
> file=gluster://server.domain.com:5000/testvol/dir/a.img
> 
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
> 
>  block/Makefile.objs |    1 
>  block/gluster.c     |  623 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 624 insertions(+), 0 deletions(-)
>  create mode 100644 block/gluster.c
> 
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index b5754d3..a1ae67f 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -9,3 +9,4 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o
>  block-obj-$(CONFIG_LIBISCSI) += iscsi.o
>  block-obj-$(CONFIG_CURL) += curl.o
>  block-obj-$(CONFIG_RBD) += rbd.o
> +block-obj-$(CONFIG_GLUSTERFS) += gluster.o
> diff --git a/block/gluster.c b/block/gluster.c
> new file mode 100644
> index 0000000..bbbaea8
> --- /dev/null
> +++ b/block/gluster.c
> @@ -0,0 +1,623 @@
> +/*
> + * GlusterFS backend for QEMU
> + *
> + * (AIO implementation is derived from block/rbd.c)
> + *
> + * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * (at your option) any later version. See the COPYING file in the top-level
> + * directory.
> + */
> +#include <glusterfs/api/glfs.h>
> +#include "block_int.h"
> +
> +typedef struct GlusterAIOCB {
> +    BlockDriverAIOCB common;
> +    bool canceled;
> +    int64_t size;
> +    int ret;
> +} GlusterAIOCB;
> +
> +typedef struct BDRVGlusterState {
> +    struct glfs *glfs;
> +    int fds[2];
> +    struct glfs_fd *fd;
> +    int qemu_aio_count;
> +} BDRVGlusterState;
> +
> +#define GLUSTER_FD_READ 0
> +#define GLUSTER_FD_WRITE 1
> +
> +typedef struct GlusterURI {
> +    char *server;
> +    int port;
> +    char *volname;
> +    char *image;
> +    char *transport;
> +} GlusterURI;
> +
> +static void qemu_gluster_uri_free(GlusterURI *uri)
> +{
> +    g_free(uri->server);
> +    g_free(uri->volname);
> +    g_free(uri->image);
> +    g_free(uri->transport);
> +    g_free(uri);
> +}
> +
> +/*
> + * We don't validate the transport option obtained here but
> + * instead depend on gluster to flag an error.
> + */
> +static int parse_transport(GlusterURI *uri, char *transport)
> +{
> +    char *token, *saveptr;
> +    int ret = -EINVAL;
> +
> +    if (!transport) {
> +        uri->transport = g_strdup("socket");
> +        ret = 0;
> +        goto out;
> +    }
> +
> +    token = strtok_r(transport, "=", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +    if (strcmp(token, "transport")) {
> +        goto out;
> +    }
> +    token = strtok_r(NULL, "=", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +    uri->transport = g_strdup(token);
> +    ret = 0;
> +out:
> +    return ret;
> +}
> +
> +static int parse_server(GlusterURI *uri, char *server)
> +{
> +    int ret = -EINVAL;
> +    char *token, *saveptr;
> +    char *p, *q = server;
> +
> +    p = strchr(server, '[');
> +    if (p) {
> +        /* [ipv6] */
> +        if (p != server) {
> +            /* [ not in the beginning */
> +            goto out;
> +        }
> +        q++;
> +        p = strrchr(p, ']');
> +        if (!p) {
> +            /* No matching ] */
> +            goto out;
> +        }
> +        *p++ = '\0';
> +        uri->server = g_strdup(q);
> +
> +        if (*p) {
> +            if (*p != ':') {
> +                /* [ipv6] followed by something other than : */
> +                goto out;
> +            }
> +            uri->port = strtoul(++p, NULL, 0);
> +            if (uri->port < 0) {
> +                goto out;
> +            }
> +        } else {
> +            /* port not specified, use default */
> +            uri->port = 0;
> +        }
> +
> +    } else {
> +        /* ipv4 or hostname */
> +        if (*server == ':') {
> +            /* port specified w/o a server */
> +            goto out;
> +        }
> +        token = strtok_r(server, ":", &saveptr);
> +        if (!token) {
> +            goto out;
> +        }
> +        uri->server = g_strdup(token);
> +        token = strtok_r(NULL, ":", &saveptr);
> +        if (token) {
> +            uri->port = strtoul(token, NULL, 0);
> +            if (uri->port < 0) {
> +                goto out;
> +            }
> +        } else {
> +            uri->port = 0;
> +        }
> +    }
> +    ret = 0;
> +out:
> +    return ret;
> +}
> +
> +/*
> + * file=gluster://server:[port]/volname/image[?transport=socket]
> + *
> + * 'gluster' is the protocol.
> + *
> + * 'server' specifies the server where the volume file specification for
> + * the given volume resides. This can be either hostname or ipv4 address
> + * or ipv6 address. ipv6 address needs to be with in square brackets [ ].
> + *
> + * 'port' is the port number on which gluster management daemon (glusterd) is
> + * listening. This is optional and if not specified, QEMU will send 0 which
> + * will make libgfapi to use the default port.
> + *
> + * 'volname' is the name of the gluster volume which contains the VM image.
> + *
> + * 'image' is the path to the actual VM image in the gluster volume.
> + *
> + * 'transport' specifies the transport used to connect to glusterd. This is
> + * optional and if not specified, socket transport is used.
> + *
> + * Examples:
> + *
> + * file=gluster://1.2.3.4/testvol/a.img
> + * file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
> + * file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
> + * file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
> + * file=gluster://server.domain.com:5000/testvol/dir/a.img
> + *
> + * We just do minimal checking of the gluster options and mostly ensure
> + * that all the expected elements of the URI are present. We depend on libgfapi
> + * APIs to return appropriate errors in case of invalid arguments.
> + */
> +static int qemu_gluster_parseuri(GlusterURI *uri, const char *filename)
> +{
> +    char *token, *saveptr;
> +    char *p, *r;
> +    int ret = -EINVAL;
> +
> +    p = r = g_strdup(filename);
> +    if (strncmp(p, "gluster://", 10)) {
> +        goto out;
> +    }
> +
> +    /* Discard the protocol */
> +    p += 10;
> +
> +    /* server */
> +    token = strtok_r(p, "/", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +
> +    ret = parse_server(uri, token);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* volname */
> +    token = strtok_r(NULL, "/", &saveptr);
> +    if (!token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    uri->volname = g_strdup(token);
> +
> +    /* image */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    if (!token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    uri->image = g_strdup(token);
> +
> +    /* transport */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    ret = parse_transport(uri, token);
> +    if (ret < 0) {
> +        goto out;
> +     }
> +
> +    /* Flag error for extra options */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    if (token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    ret = 0;
> +out:
> +    g_free(r);
> +    return ret;
> +}
> +
> +static struct glfs *qemu_gluster_init(GlusterURI *uri, const char *filename)
> +{
> +    struct glfs *glfs = NULL;
> +    int ret;
> +
> +    ret = qemu_gluster_parseuri(uri, filename);
> +    if (ret < 0) {
> +        error_report("Usage: file=gluster://server[:port]/volname/image"
> +            "[?transport=socket]");
> +        errno = -ret;
> +        goto out;
> +    }
> +
> +    glfs = glfs_new(uri->volname);
> +    if (!glfs) {
> +        goto out;
> +    }
> +
> +    ret = glfs_set_volfile_server(glfs, uri->transport, uri->server,
> +        uri->port);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /*
> +     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
> +     * GlusterFS exports it in a header.
> +     */
> +    ret = glfs_set_logging(glfs, "-", 4);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    ret = glfs_init(glfs);
> +    if (ret) {
> +        error_report("Gluster connection failed for server=%s port=%d "
> +             "volume=%s image=%s transport=%s\n", uri->server, uri->port,
> +             uri->volname, uri->image, uri->transport);
> +        goto out;
> +    }
> +    return glfs;
> +
> +out:
> +    if (glfs) {
> +        glfs_fini(glfs);
> +    }
> +    return NULL;
> +}
> +
> +static void qemu_gluster_complete_aio(GlusterAIOCB *acb)
> +{
> +    int ret;
> +
> +    if (acb->canceled) {
> +        qemu_aio_release(acb);
> +        return;
> +    }
> +
> +    if (acb->ret == acb->size) {
> +        ret = 0; /* Success */
> +    } else if (acb->ret < 0) {
> +        ret = acb->ret; /* Read/Write failed */
> +    } else {
> +        ret = -EIO; /* Partial read/write - fail it */
> +    }
> +    acb->common.cb(acb->common.opaque, ret);
> +    qemu_aio_release(acb);
> +}
> +
> +static void qemu_gluster_aio_event_reader(void *opaque)
> +{
> +    BDRVGlusterState *s = opaque;
> +    GlusterAIOCB *event_acb;
> +    int event_reader_pos = 0;
> +    ssize_t ret;
> +
> +    do {
> +        char *p = (char *)&event_acb;
> +
> +        ret = read(s->fds[GLUSTER_FD_READ], p + event_reader_pos,
> +                   sizeof(event_acb) - event_reader_pos);
> +        if (ret > 0) {
> +            event_reader_pos += ret;
> +            if (event_reader_pos == sizeof(event_acb)) {
> +                event_reader_pos = 0;
> +                qemu_gluster_complete_aio(event_acb);
> +                s->qemu_aio_count--;
> +            }
> +        }
> +    } while (ret < 0 && errno == EINTR);
> +}
> +
> +static int qemu_gluster_aio_flush_cb(void *opaque)
> +{
> +    BDRVGlusterState *s = opaque;
> +
> +    return (s->qemu_aio_count > 0);
> +}
> +
> +static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
> +    int bdrv_flags)
> +{
> +    BDRVGlusterState *s = bs->opaque;
> +    int open_flags = 0;
> +    int ret = 0;
> +    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));
> +
> +    s->glfs = qemu_gluster_init(uri, filename);
> +    if (!s->glfs) {
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    open_flags |=  O_BINARY;
> +    open_flags &= ~O_ACCMODE;
> +    if (bdrv_flags & BDRV_O_RDWR) {
> +        open_flags |= O_RDWR;
> +    } else {
> +        open_flags |= O_RDONLY;
> +    }
> +
> +    if ((bdrv_flags & BDRV_O_NOCACHE)) {
> +        open_flags |= O_DIRECT;
> +    }
> +
> +    s->fd = glfs_open(s->glfs, uri->image, open_flags);
> +    if (!s->fd) {
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    ret = qemu_pipe(s->fds);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
> +    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);

A small thing I noticed while reviewing: since the write end of the pipe
is used from the gluster thread, you do not need to make this nonblocking.

Also, please use GLUSTER_FD_{READ,WRITE} instead.

> +    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
> +        qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
> +
> +out:
> +    qemu_gluster_uri_free(uri);
> +    if (!ret) {
> +        return ret;
> +    }
> +    if (s->fd) {
> +        glfs_close(s->fd);
> +    }
> +    if (s->glfs) {
> +        glfs_fini(s->glfs);
> +    }
> +    return ret;
> +}
> +
> +static int qemu_gluster_create(const char *filename,
> +        QEMUOptionParameter *options)
> +{
> +    struct glfs *glfs;
> +    struct glfs_fd *fd;
> +    int ret = 0;
> +    int64_t total_size = 0;
> +    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));
> +
> +    glfs = qemu_gluster_init(uri, filename);
> +    if (!glfs) {
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            total_size = options->value.n / BDRV_SECTOR_SIZE;
> +        }
> +        options++;
> +    }
> +
> +    fd = glfs_creat(glfs, uri->image,
> +        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
> +    if (!fd) {
> +        ret = -errno;
> +    } else {
> +        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
> +            ret = -errno;
> +        }
> +        if (glfs_close(fd) != 0) {
> +            ret = -errno;
> +        }
> +    }
> +out:
> +    qemu_gluster_uri_free(uri);
> +    if (glfs) {
> +        glfs_fini(glfs);
> +    }
> +    return ret;
> +}
> +
> +static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
> +{
> +    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
> +
> +    acb->common.cb(acb->common.opaque, -ECANCELED);
> +    acb->canceled = true;

I think this is wrong, because the write could still complete later and
undo the effects of a second write that is done by the guest.  That is:

   gluster                  QEMU                      guest
----------------------------------------------------------
                                             <---     write #1
                     <---   write #1
                                             <---     cancel write #1
                            write #1 canceled --->
                                             <---     write #2
                     <---   write #2
   write #2 completed --->
                            write #2 completed -->
   write #1 completed --->

Now, the persistent storage recorded the effect of write #1, but the
guest thinks that it recorded the effect of write #2 instead.

You can simply do qemu_aio_flush() here.

> +}
> +
> +static AIOPool gluster_aio_pool = {
> +    .aiocb_size = sizeof(GlusterAIOCB),
> +    .cancel = qemu_gluster_aio_cancel,
> +};
> +
> +static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb)
> +{
> +    int ret = 0;
> +    while (1) {
> +        fd_set wfd;
> +        int fd = s->fds[GLUSTER_FD_WRITE];
> +
> +        ret = write(fd, (void *)&acb, sizeof(acb));
> +        if (ret >= 0) {
> +            break;
> +        }
> +        if (errno == EINTR) {
> +            continue;
> +        }
> +        if (errno != EAGAIN) {
> +            break;
> +        }
> +
> +        FD_ZERO(&wfd);
> +        FD_SET(fd, &wfd);
> +        do {
> +            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
> +        } while (ret < 0 && errno == EINTR);

If you make the fd non-blocking, you can avoid the select here.

> +    }
> +    return ret;
> +}
> +
> +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
> +{
> +    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
> +    BDRVGlusterState *s = acb->common.bs->opaque;
> +
> +    acb->ret = ret;
> +    if (qemu_gluster_send_pipe(s, acb) < 0) {
> +        /*
> +         * Gluster AIO callback thread failed to notify the waiting
> +         * QEMU thread about IO completion. Nothing much can be done
> +         * here but to abruptly abort.
> +         *
> +         * FIXME: Check if the read side of the fd handler can somehow
> +         * be notified of this failure paving the way for a graceful exit.
> +         */
> +        error_report("Gluster failed to notify QEMU about IO completion");
> +        abort();

We can fix it later with a list of AIOCBs that are ready to process (and
a QemuMutex to protect the list).  An EventNotifier can trigger the read
handler to examine the list.

> +    }
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
> +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
> +        BlockDriverCompletionFunc *cb, void *opaque, int write)
> +{
> +    int ret;
> +    GlusterAIOCB *acb;
> +    BDRVGlusterState *s = bs->opaque;
> +    size_t size;
> +    off_t offset;
> +
> +    offset = sector_num * BDRV_SECTOR_SIZE;
> +    size = nb_sectors * BDRV_SECTOR_SIZE;
> +    s->qemu_aio_count++;
> +
> +    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
> +    acb->size = size;
> +    acb->ret = 0;
> +    acb->canceled = false;
> +
> +    if (write) {
> +        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
> +            &gluster_finish_aiocb, acb);
> +    } else {
> +        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
> +            &gluster_finish_aiocb, acb);
> +    }
> +
> +    if (ret < 0) {
> +        goto out;
> +    }
> +    return &acb->common;
> +
> +out:
> +    s->qemu_aio_count--;
> +    qemu_aio_release(acb);
> +    return NULL;
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
> +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
> +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    int ret;
> +    GlusterAIOCB *acb;
> +    BDRVGlusterState *s = bs->opaque;
> +
> +    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
> +    acb->size = 0;
> +    acb->ret = 0;
> +    acb->canceled = false;
> +    s->qemu_aio_count++;
> +
> +    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +    return &acb->common;
> +
> +out:
> +    s->qemu_aio_count--;
> +    qemu_aio_release(acb);
> +    return NULL;
> +}
> +
> +static int64_t qemu_gluster_getlength(BlockDriverState *bs)
> +{
> +    BDRVGlusterState *s = bs->opaque;
> +    struct stat st;
> +    int ret;
> +
> +    ret = glfs_fstat(s->fd, &st);
> +    if (ret < 0) {
> +        return -errno;
> +    } else {
> +        return st.st_size;
> +    }
> +}
> +
> +static void qemu_gluster_close(BlockDriverState *bs)
> +{
> +    BDRVGlusterState *s = bs->opaque;
> +
> +    if (s->fd) {
> +        glfs_close(s->fd);
> +        s->fd = NULL;
> +    }
> +    glfs_fini(s->glfs);
> +}
> +
> +static QEMUOptionParameter qemu_gluster_create_options[] = {
> +    {
> +        .name = BLOCK_OPT_SIZE,
> +        .type = OPT_SIZE,
> +        .help = "Virtual disk size"
> +    },
> +    { NULL }
> +};
> +
> +static BlockDriver bdrv_gluster = {
> +    .format_name = "gluster",
> +    .protocol_name = "gluster",
> +    .instance_size = sizeof(BDRVGlusterState),
> +    .bdrv_file_open = qemu_gluster_open,
> +    .bdrv_close = qemu_gluster_close,
> +    .bdrv_create = qemu_gluster_create,
> +    .bdrv_getlength = qemu_gluster_getlength,
> +
> +    .bdrv_aio_readv = qemu_gluster_aio_readv,
> +    .bdrv_aio_writev = qemu_gluster_aio_writev,
> +    .bdrv_aio_flush = qemu_gluster_aio_flush,
> +
> +    .create_options = qemu_gluster_create_options,
> +};
> +
> +static void bdrv_gluster_init(void)
> +{
> +    bdrv_register(&bdrv_gluster);
> +}
> +
> +block_init(bdrv_gluster_init);
> 
> 
>
Avi Kivity Sept. 6, 2012, 8:29 a.m. UTC | #16
On 08/14/2012 12:58 PM, Kevin Wolf wrote:
> 
>> While we are at this, let me bring out another issue. Gluster supports 3
>> transport types:
>> 
>> - socket in which case the server will be hostname, ipv4 or ipv4 address.
>> - rdma in which case server will be interpreted similar to socket.
>> - unix in which case server will be a path to unix domain socket and this
>>   will look like any other filesystem path. (Eg. /tmp/glusterd.socket)
>> 
>> I don't think we can fit 'unix' within the standard URI scheme (RFC 3986)
>> easily, but I am planning to specify the 'unix' transport as below:
>> 
>> gluster://[/path/to/unix/domain/socket]/volname/image?transport=unix
>> 
>> i,e., I am asking the user to put the unix domain socket path within
>> square brackets when transport type is unix.
>> 
>> Do you think this is fine ?
> 
> Never saw something like this before, but it does seem reasonable to me.
> Excludes ] from the valid characters in the file name of the socket, but
> that shouldn't be a problem in practice.

Bikeshedding, but I prefer

  gluster:///path/to/unix/domain/socket:/volname/image?transport=unix

as being more similar to file://, or even

  gluster:///path/to/unix/domain/socket/volname/image?transport=unix

with the last two components implied to be part of the payload, not the
path.
Kevin Wolf Sept. 6, 2012, 9:06 a.m. UTC | #17
Am 06.09.2012 09:23, schrieb Paolo Bonzini:
> Il 05/09/2012 11:57, Bharata B Rao ha scritto:
>>>> What could be the issue here ? In general, how do I ensure that my
>>>> aio calls get completed correctly in such scenarios where bdrv_read etc
>>>> are called from coroutine context rather than from main thread context ?
>> One way to handle this is not to do completion from gluster thread but
>> instead schedule a BH that does the completion. In fact I had this approach
>> in the earlier versions, but resorted to directly calling completion from
>> gluster thread as I didn't see the value of using a BH for completion.
>> But I guess its indeed needed to support such scenarios (qcow2 image creation
>> on gluster backend).
> 
> I think the problem is that we're calling bdrv_drain_all() from
> coroutine context.  This is problematic because then the current
> coroutine won't yield and won't give other coroutines an occasion to run.
> 
> This could be fixed by checking whether we're in coroutine context in
> bdrv_drain_all().  If so, instead of draining the queues directly,
> schedule a bottom half that does bdrv_drain_all() followed by
> qemu_coroutine_enter(), and yield.

Hm, could be an option, but I'm not sure if there's too much magic going
on then...

> If it works, I think this change would be preferrable to using a "magic"
> BH in every driver.

The way it works in posix-aio-compat is that the request is first
removed from the list and then the callback is called. This way
posix_aio_flush() can return 0 and bdrv_drain_all() completes.

Kevin
Paolo Bonzini Sept. 6, 2012, 9:38 a.m. UTC | #18
Il 06/09/2012 11:06, Kevin Wolf ha scritto:
>> > If it works, I think this change would be preferrable to using a "magic"
>> > BH in every driver.
> The way it works in posix-aio-compat is that the request is first
> removed from the list and then the callback is called. This way
> posix_aio_flush() can return 0 and bdrv_drain_all() completes.

So the same could be done in gluster: first decrease qemu_aio_count,
then call the callback, then call qemu_aio_release.

But in either case, wouldn't that leak the AIOCBs until the end of
qcow2_create?

The AIOCB is already invalid at the time the callback is entered, so we
could release it before the call.  However, not all implementation of
AIO are ready for that and I'm not really in the mood for large scale
refactoring...

Paolo
Kevin Wolf Sept. 6, 2012, 10:07 a.m. UTC | #19
Am 06.09.2012 11:38, schrieb Paolo Bonzini:
> Il 06/09/2012 11:06, Kevin Wolf ha scritto:
>>>> If it works, I think this change would be preferrable to using a "magic"
>>>> BH in every driver.
>> The way it works in posix-aio-compat is that the request is first
>> removed from the list and then the callback is called. This way
>> posix_aio_flush() can return 0 and bdrv_drain_all() completes.
> 
> So the same could be done in gluster: first decrease qemu_aio_count,
> then call the callback, then call qemu_aio_release.
> 
> But in either case, wouldn't that leak the AIOCBs until the end of
> qcow2_create?
> 
> The AIOCB is already invalid at the time the callback is entered, so we
> could release it before the call.  However, not all implementation of
> AIO are ready for that and I'm not really in the mood for large scale
> refactoring...

But the way, what I'd really want to see in the end is to get rid of
qemu_aio_flush() and replace it by .bdrv_drain() callbacks in each
BlockDriver. The way we're doing it today is a layering violation.

Doesn't change anything about this problem, though. So the options that
we have are:

1. Delay the callback using a BH. Doing this in each driver is ugly.
   But is there actually more than one possible callback in today's
   coroutine world? I only see bdrv_co_io_em_complete(), which could
   reenter the coroutine from a BH.

2. Delay the callback by just calling it later when the cleanup has
   been completed and .io_flush() can return 0. You say that it's hard
   to implement for some drivers, except if the AIOCB are leaked until
   the end of functions like qcow2_create().

3. Add a delay only later in functions like bdrv_drain_all() that assume
   that the request has completed. Are there more of this type? AIOCBs
   are leaked until a bdrv_drain_all() call. Does it work with draining
   specific BDSes instead of everything?

Unless I forgot some important point, it almost looks like option 1 is
the easiest and safest.

Kevin
Paolo Bonzini Sept. 6, 2012, 10:18 a.m. UTC | #20
Il 06/09/2012 12:07, Kevin Wolf ha scritto:
>> The AIOCB is already invalid at the time the callback is entered, so we
>> could release it before the call.  However, not all implementation of
>> AIO are ready for that and I'm not really in the mood for large scale
>> refactoring...
> 
> But the way, what I'd really want to see in the end is to get rid of
> qemu_aio_flush() and replace it by .bdrv_drain() callbacks in each
> BlockDriver. The way we're doing it today is a layering violation.

That's quite difficult.  Completion of an I/O operation can trigger
another I/O operation on another block device, and so on until we go
back to the first device (think of a hypothetical RAID-5 device).

> Doesn't change anything about this problem, though. So the options that
> we have are:
> 
> 1. Delay the callback using a BH. Doing this in each driver is ugly.
>    But is there actually more than one possible callback in today's
>    coroutine world? I only see bdrv_co_io_em_complete(), which could
>    reenter the coroutine from a BH.

Easy and safe, but it feels a bit like a timebomb.  Also, I'm not
entirely sure of _why_ the bottom half works. :)

> 2. Delay the callback by just calling it later when the cleanup has
>    been completed and .io_flush() can return 0. You say that it's hard
>    to implement for some drivers, except if the AIOCB are leaked until
>    the end of functions like qcow2_create().

... which is what we do in posix-aio-compat.c; nobody screamed so far.

Not really hard, it just has to be assessed for each driver separately.
 We can just do it in gluster and refactor it later.

> 3. Add a delay only later in functions like bdrv_drain_all() that assume
>    that the request has completed. Are there more of this type? AIOCBs
>    are leaked until a bdrv_drain_all() call. Does it work with draining
>    specific BDSes instead of everything?
> 
> Unless I forgot some important point, it almost looks like option 1 is
> the easiest and safest.

I agree with your opinion, but I would feel better if I understood
better why it works.  (2) can be done easily in each driver (no
ugliness) and refactored later.

Paolo
Kevin Wolf Sept. 6, 2012, 10:29 a.m. UTC | #21
Am 06.09.2012 12:18, schrieb Paolo Bonzini:
> Il 06/09/2012 12:07, Kevin Wolf ha scritto:
>>> The AIOCB is already invalid at the time the callback is entered, so we
>>> could release it before the call.  However, not all implementation of
>>> AIO are ready for that and I'm not really in the mood for large scale
>>> refactoring...
>>
>> But the way, what I'd really want to see in the end is to get rid of
>> qemu_aio_flush() and replace it by .bdrv_drain() callbacks in each
>> BlockDriver. The way we're doing it today is a layering violation.
> 
> That's quite difficult.  Completion of an I/O operation can trigger
> another I/O operation on another block device, and so on until we go
> back to the first device (think of a hypothetical RAID-5 device).

You always have a tree of BDSes, and children should only ever trigger
completion of I/O operations in their parents. Am I missing anything?

>> Doesn't change anything about this problem, though. So the options that
>> we have are:
>>
>> 1. Delay the callback using a BH. Doing this in each driver is ugly.
>>    But is there actually more than one possible callback in today's
>>    coroutine world? I only see bdrv_co_io_em_complete(), which could
>>    reenter the coroutine from a BH.
> 
> Easy and safe, but it feels a bit like a timebomb.  Also, I'm not
> entirely sure of _why_ the bottom half works. :)

Hm, safe and time bomb is contradictory in my book. :-)

The bottom half work because we're not reentering the qcow2_create
coroutine immediately, so the gluster AIO callback can complete all of
its cleanup work without being interrupted by code that might wait on
this particular request and create a deadlock this way.

>> 2. Delay the callback by just calling it later when the cleanup has
>>    been completed and .io_flush() can return 0. You say that it's hard
>>    to implement for some drivers, except if the AIOCB are leaked until
>>    the end of functions like qcow2_create().
> 
> ... which is what we do in posix-aio-compat.c; nobody screamed so far.

True. Would be easy to fix in posix-aio-compat, though, or can a
callback expect that the AIOCB is still valid?

> Not really hard, it just has to be assessed for each driver separately.
>  We can just do it in gluster and refactor it later.

Okay, so let's keep it as an option for now.

>> 3. Add a delay only later in functions like bdrv_drain_all() that assume
>>    that the request has completed. Are there more of this type? AIOCBs
>>    are leaked until a bdrv_drain_all() call. Does it work with draining
>>    specific BDSes instead of everything?
>>
>> Unless I forgot some important point, it almost looks like option 1 is
>> the easiest and safest.
> 
> I agree with your opinion, but I would feel better if I understood
> better why it works.  (2) can be done easily in each driver (no
> ugliness) and refactored later.

I think option 2 must be done in each driver by design, or do you see
even a theoretical way how to do it generically?

Kevin
Paolo Bonzini Sept. 6, 2012, 11:01 a.m. UTC | #22
Il 06/09/2012 12:29, Kevin Wolf ha scritto:
>> That's quite difficult.  Completion of an I/O operation can trigger
>> another I/O operation on another block device, and so on until we go
>> back to the first device (think of a hypothetical RAID-5 device).
> 
> You always have a tree of BDSes, and children should only ever trigger
> completion of I/O operations in their parents. Am I missing anything?

Yes, it can only ever trigger completion in the parents.  So if you had
bdrv_drain in the children, it could leave other I/O pending on the
siblings, but that's okay.  Very nice!!  I hadn't thought of the tree.

>>> Doesn't change anything about this problem, though. So the options that
>>> we have are:
>>>
>>> 1. Delay the callback using a BH. Doing this in each driver is ugly.
>>>    But is there actually more than one possible callback in today's
>>>    coroutine world? I only see bdrv_co_io_em_complete(), which could
>>>    reenter the coroutine from a BH.
>>
>> Easy and safe, but it feels a bit like a timebomb.  Also, I'm not
>> entirely sure of _why_ the bottom half works. :)
> 
> Hm, safe and time bomb is contradictory in my book. :-)

Well, safe for now. :)

> The bottom half work because we're not reentering the qcow2_create
> coroutine immediately, so the gluster AIO callback can complete all of
> its cleanup work without being interrupted by code that might wait on
> this particular request and create a deadlock this way.

Got it now.  It's just (2) with a bottom half instead of simple code
reorganization.

>>> 2. Delay the callback by just calling it later when the cleanup has
>>>    been completed and .io_flush() can return 0. You say that it's hard
>>>    to implement for some drivers, except if the AIOCB are leaked until
>>>    the end of functions like qcow2_create().
>>
>> ... which is what we do in posix-aio-compat.c; nobody screamed so far.
> 
> True. Would be easy to fix in posix-aio-compat, though, or can a
> callback expect that the AIOCB is still valid?

IMO no.  What would you use it for, anyway?  It's opaque, all you could
do is bdrv_aio_cancel it.  I checked all of the callers of
bdrv_aio_cancel.  SCSI always zeroes their aiocb pointers on entry to
the AIO callback.  IDE is a bit less explicit, but in the end will
always zero the aiocb as well.  AHCI probably has a bug there (it does
not NULL the aiocb in ncq_cb).

virtio and Xen do not even store the aiocb, i.e. they couldn't care less.

>> Not really hard, it just has to be assessed for each driver separately.
>>  We can just do it in gluster and refactor it later.
> 
> Okay, so let's keep it as an option for now.
> 
>>> 3. Add a delay only later in functions like bdrv_drain_all() that assume
>>>    that the request has completed. Are there more of this type? AIOCBs
>>>    are leaked until a bdrv_drain_all() call. Does it work with draining
>>>    specific BDSes instead of everything?
>>>
>>> Unless I forgot some important point, it almost looks like option 1 is
>>> the easiest and safest.
>>
>> I agree with your opinion, but I would feel better if I understood
>> better why it works.  (2) can be done easily in each driver (no
>> ugliness) and refactored later.
> 
> I think option 2 must be done in each driver by design, or do you see
> even a theoretical way how to do it generically?

Yes, it has to be done in every driver.  If we added something like
qemu_aio_complete(acb, ret) that does

    cb = acb->cb;
    opaque = acb->opaque;
    qemu_aio_release(acb);
    cb(opaque, ret);

by converting the driver to qemu_aio_complete you would avoid the leak.

Paolo
Bharata B Rao Sept. 6, 2012, 3:40 p.m. UTC | #23
On Thu, Sep 06, 2012 at 11:29:36AM +0300, Avi Kivity wrote:
> On 08/14/2012 12:58 PM, Kevin Wolf wrote:
> > 
> >> While we are at this, let me bring out another issue. Gluster supports 3
> >> transport types:
> >> 
> >> - socket in which case the server will be hostname, ipv4 or ipv4 address.
> >> - rdma in which case server will be interpreted similar to socket.
> >> - unix in which case server will be a path to unix domain socket and this
> >>   will look like any other filesystem path. (Eg. /tmp/glusterd.socket)
> >> 
> >> I don't think we can fit 'unix' within the standard URI scheme (RFC 3986)
> >> easily, but I am planning to specify the 'unix' transport as below:
> >> 
> >> gluster://[/path/to/unix/domain/socket]/volname/image?transport=unix
> >> 
> >> i,e., I am asking the user to put the unix domain socket path within
> >> square brackets when transport type is unix.
> >> 
> >> Do you think this is fine ?
> > 
> > Never saw something like this before, but it does seem reasonable to me.
> > Excludes ] from the valid characters in the file name of the socket, but
> > that shouldn't be a problem in practice.
> 
> Bikeshedding, but I prefer
> 
>   gluster:///path/to/unix/domain/socket:/volname/image?transport=unix

So if the unix domain socket is /tmp/glusterd.socket, then this would look like:

gluster:///tmp/glusterd.socket:/volname/image?transport=unix.

So you are saying : will the separator b/n the unix domain socket path
and rest of the URI components.

Unless you or others strongly feel about this, I would like to go with
[ ] based spec, which I feel is less prone to errors like missing a colon
by mistake :)

> 
> as being more similar to file://, or even
> 
>   gluster:///path/to/unix/domain/socket/volname/image?transport=unix
> 
> with the last two components implied to be part of the payload, not the
> path.

Note that image is a file path by itself like /dir1/a.img. So I guess it
becomes difficult to figure out where the unix domain socket path ends
and rest of the URI components begin w/o a separator in between.

Regards,
Bharata.
Paolo Bonzini Sept. 6, 2012, 3:44 p.m. UTC | #24
Il 06/09/2012 17:40, Bharata B Rao ha scritto:
> > > > I don't think we can fit 'unix' within the standard URI scheme (RFC 3986)
> > > > easily, but I am planning to specify the 'unix' transport as below:
> > > > 
> > > > gluster://[/path/to/unix/domain/socket]/volname/image?transport=unix
> > > > 
> > > > i,e., I am asking the user to put the unix domain socket path within
> > > > square brackets when transport type is unix.
> > > 
> > > Never saw something like this before, but it does seem reasonable to me.
> > > Excludes ] from the valid characters in the file name of the socket, but
> > > that shouldn't be a problem in practice.
> > 
> > Bikeshedding, but I prefer
> > 
> >   gluster:///path/to/unix/domain/socket:/volname/image?transport=unix
> 
> Unless you or others strongly feel about this, I would like to go with
> [ ] based spec, which I feel is less prone to errors like missing a colon
> by mistake :)

Your proposed spec has the disadvantage of not being a proper URL.

What about this instead:

gluster:///path/to/unix/domain/socket!/volname/image?transport=unix

since (http://www.w3.org/Addressing/URL/uri-spec.html) "The asterisk
("*", ASCII 2A hex) and exclamation mark ("!" , ASCII 21 hex) are
reserved for use as having special signifiance within specific schemes".

Paolo
Daniel P. Berrangé Sept. 6, 2012, 3:47 p.m. UTC | #25
On Thu, Sep 06, 2012 at 09:10:04PM +0530, Bharata B Rao wrote:
> On Thu, Sep 06, 2012 at 11:29:36AM +0300, Avi Kivity wrote:
> > On 08/14/2012 12:58 PM, Kevin Wolf wrote:
> > > 
> > >> While we are at this, let me bring out another issue. Gluster supports 3
> > >> transport types:
> > >> 
> > >> - socket in which case the server will be hostname, ipv4 or ipv4 address.
> > >> - rdma in which case server will be interpreted similar to socket.
> > >> - unix in which case server will be a path to unix domain socket and this
> > >>   will look like any other filesystem path. (Eg. /tmp/glusterd.socket)
> > >> 
> > >> I don't think we can fit 'unix' within the standard URI scheme (RFC 3986)
> > >> easily, but I am planning to specify the 'unix' transport as below:
> > >> 
> > >> gluster://[/path/to/unix/domain/socket]/volname/image?transport=unix
> > >> 
> > >> i,e., I am asking the user to put the unix domain socket path within
> > >> square brackets when transport type is unix.
> > >> 
> > >> Do you think this is fine ?
> > > 
> > > Never saw something like this before, but it does seem reasonable to me.
> > > Excludes ] from the valid characters in the file name of the socket, but
> > > that shouldn't be a problem in practice.
> > 
> > Bikeshedding, but I prefer
> > 
> >   gluster:///path/to/unix/domain/socket:/volname/image?transport=unix
> 
> So if the unix domain socket is /tmp/glusterd.socket, then this would look like:
> 
> gluster:///tmp/glusterd.socket:/volname/image?transport=unix.
> 
> So you are saying : will the separator b/n the unix domain socket path
> and rest of the URI components.
> 
> Unless you or others strongly feel about this, I would like to go with
> [ ] based spec, which I feel is less prone to errors like missing a colon
> by mistake :)
> 
> > 
> > as being more similar to file://, or even
> > 
> >   gluster:///path/to/unix/domain/socket/volname/image?transport=unix
> > 
> > with the last two components implied to be part of the payload, not the
> > path.
> 
> Note that image is a file path by itself like /dir1/a.img. So I guess it
> becomes difficult to figure out where the unix domain socket path ends
> and rest of the URI components begin w/o a separator in between.

IMHO this is all gross. URIs already have a well defined way to provide
multiple parameters, dealing with escaping of special characters. ie query
parameters. The whole benefit of using URI syntax is to let apps process
the URIs using standard APIs. We should not be trying to define some extra
magic encoding to let us stuff 2 separate parameters into the path component
since that means apps have to now write custom parsing code again. Either
the UNIX socket path, or the volume path should be in the URI path, not
both. The other part should be a URI parameter. I'd really expect us to
use:

  gluster:///volname/image?transport=unix&sockpath=/path/to/unix/sock


Daniel
ronnie sahlberg Sept. 6, 2012, 4:04 p.m. UTC | #26
On Fri, Sep 7, 2012 at 1:47 AM, Daniel P. Berrange <berrange@redhat.com>wrote:

> On Thu, Sep 06, 2012 at 09:10:04PM +0530, Bharata B Rao wrote:
> > On Thu, Sep 06, 2012 at 11:29:36AM +0300, Avi Kivity wrote:
> > > On 08/14/2012 12:58 PM, Kevin Wolf wrote:
> > > >
> > > >> While we are at this, let me bring out another issue. Gluster
> supports 3
> > > >> transport types:
> > > >>
> > > >> - socket in which case the server will be hostname, ipv4 or ipv4
> address.
> > > >> - rdma in which case server will be interpreted similar to socket.
> > > >> - unix in which case server will be a path to unix domain socket
> and this
> > > >>   will look like any other filesystem path. (Eg.
> /tmp/glusterd.socket)
> > > >>
> > > >> I don't think we can fit 'unix' within the standard URI scheme (RFC
> 3986)
> > > >> easily, but I am planning to specify the 'unix' transport as below:
> > > >>
> > > >> gluster://[/path/to/unix/domain/socket]/volname/image?transport=unix
> > > >>
> > > >> i,e., I am asking the user to put the unix domain socket path within
> > > >> square brackets when transport type is unix.
> > > >>
> > > >> Do you think this is fine ?
> > > >
> > > > Never saw something like this before, but it does seem reasonable to
> me.
> > > > Excludes ] from the valid characters in the file name of the socket,
> but
> > > > that shouldn't be a problem in practice.
> > >
> > > Bikeshedding, but I prefer
> > >
> > >   gluster:///path/to/unix/domain/socket:/volname/image?transport=unix
> >
> > So if the unix domain socket is /tmp/glusterd.socket, then this would
> look like:
> >
> > gluster:///tmp/glusterd.socket:/volname/image?transport=unix.
> >
> > So you are saying : will the separator b/n the unix domain socket path
> > and rest of the URI components.
> >
> > Unless you or others strongly feel about this, I would like to go with
> > [ ] based spec, which I feel is less prone to errors like missing a colon
> > by mistake :)
> >
> > >
> > > as being more similar to file://, or even
> > >
> > >   gluster:///path/to/unix/domain/socket/volname/image?transport=unix
> > >
> > > with the last two components implied to be part of the payload, not the
> > > path.
> >
> > Note that image is a file path by itself like /dir1/a.img. So I guess it
> > becomes difficult to figure out where the unix domain socket path ends
> > and rest of the URI components begin w/o a separator in between.
>
> IMHO this is all gross. URIs already have a well defined way to provide
> multiple parameters, dealing with escaping of special characters. ie query
> parameters. The whole benefit of using URI syntax is to let apps process
> the URIs using standard APIs. We should not be trying to define some extra
> magic encoding to let us stuff 2 separate parameters into the path
> component
> since that means apps have to now write custom parsing code again. Either
> the UNIX socket path, or the volume path should be in the URI path, not
> both. The other part should be a URI parameter. I'd really expect us to
> use:
>
>   gluster:///volname/image?transport=unix&sockpath=/path/to/unix/sock
>
>
+1

ronnie sahlberg
Avi Kivity Sept. 6, 2012, 4:06 p.m. UTC | #27
On 09/06/2012 06:47 PM, Daniel P. Berrange wrote:
> 
>   gluster:///volname/image?transport=unix&sockpath=/path/to/unix/sock

Like.
Bharata B Rao Sept. 7, 2012, 3:24 a.m. UTC | #28
On Thu, Sep 06, 2012 at 04:47:17PM +0100, Daniel P. Berrange wrote:
> IMHO this is all gross. URIs already have a well defined way to provide
> multiple parameters, dealing with escaping of special characters. ie query
> parameters. The whole benefit of using URI syntax is to let apps process
> the URIs using standard APIs. We should not be trying to define some extra
> magic encoding to let us stuff 2 separate parameters into the path component
> since that means apps have to now write custom parsing code again. Either
> the UNIX socket path, or the volume path should be in the URI path, not
> both. The other part should be a URI parameter. I'd really expect us to
> use:
> 
>   gluster:///volname/image?transport=unix&sockpath=/path/to/unix/sock
            ^why 3 /// here ? volname is not a path, but image is.

So when transport=socket or rdma, is it acceptable to still use the following
format ?

gluster://server[:port]/volname/path/to/image[?transport=...]

With that, the different options possible are

unix - gluster://testvol/dir/a.img?transport=unix&sockpath=/tmp/glusterd.socket
ipv4 - gluster://1.2.3.4:0/testvol/dir/a.img?transport=socket
ipv6 - gluster://[1:2:3:4:5:6:7:8]:0/testvol/a.img
hostname - gluster://example.org/testvol/dir/a.img
rdma - gluster://1.2.3.4:0/testvol/a.img?transport=rdma

Does this look complete from the specification point of view ?

Regards,
Bharata.
Bharata B Rao Sept. 7, 2012, 5:46 a.m. UTC | #29
On Thu, Sep 06, 2012 at 09:35:04AM +0200, Paolo Bonzini wrote:
> > +static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
> > +    int bdrv_flags)
> > +{
> > +    BDRVGlusterState *s = bs->opaque;
> > +    int open_flags = 0;
> > +    int ret = 0;
> > +    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));
> > +
> > +    s->glfs = qemu_gluster_init(uri, filename);
> > +    if (!s->glfs) {
> > +        ret = -errno;
> > +        goto out;
> > +    }
> > +
> > +    open_flags |=  O_BINARY;
> > +    open_flags &= ~O_ACCMODE;
> > +    if (bdrv_flags & BDRV_O_RDWR) {
> > +        open_flags |= O_RDWR;
> > +    } else {
> > +        open_flags |= O_RDONLY;
> > +    }
> > +
> > +    if ((bdrv_flags & BDRV_O_NOCACHE)) {
> > +        open_flags |= O_DIRECT;
> > +    }
> > +
> > +    s->fd = glfs_open(s->glfs, uri->image, open_flags);
> > +    if (!s->fd) {
> > +        ret = -errno;
> > +        goto out;
> > +    }
> > +
> > +    ret = qemu_pipe(s->fds);
> > +    if (ret < 0) {
> > +        goto out;
> > +    }
> > +    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
> > +    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
> 
> A small thing I noticed while reviewing: since the write end of the pipe
> is used from the gluster thread, you do not need to make this nonblocking.

Ok.

> 
> Also, please use GLUSTER_FD_{READ,WRITE} instead.

Sure.

> > +static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
> > +{
> > +    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
> > +
> > +    acb->common.cb(acb->common.opaque, -ECANCELED);
> > +    acb->canceled = true;
> 
> I think this is wrong, because the write could still complete later and
> undo the effects of a second write that is done by the guest.  That is:
> 
>    gluster                  QEMU                      guest
> ----------------------------------------------------------
>                                              <---     write #1
>                      <---   write #1
>                                              <---     cancel write #1
>                             write #1 canceled --->
>                                              <---     write #2
>                      <---   write #2
>    write #2 completed --->
>                             write #2 completed -->
>    write #1 completed --->
> 
> Now, the persistent storage recorded the effect of write #1, but the
> guest thinks that it recorded the effect of write #2 instead.
> 
> You can simply do qemu_aio_flush() here.

Based on my discussions with Kevin, I have now followed block/qed.c for
aio_cancel which is based on using acb->finished.

> > +static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb)
> > +{
> > +    int ret = 0;
> > +    while (1) {
> > +        fd_set wfd;
> > +        int fd = s->fds[GLUSTER_FD_WRITE];
> > +
> > +        ret = write(fd, (void *)&acb, sizeof(acb));
> > +        if (ret >= 0) {
> > +            break;
> > +        }
> > +        if (errno == EINTR) {
> > +            continue;
> > +        }
> > +        if (errno != EAGAIN) {
> > +            break;
> > +        }
> > +
> > +        FD_ZERO(&wfd);
> > +        FD_SET(fd, &wfd);
> > +        do {
> > +            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
> > +        } while (ret < 0 && errno == EINTR);
> 
> If you make the fd non-blocking, you can avoid the select here.

Will change in in the next iteration.

Thanks for taking time to review.

Regards,
Bharata.
Daniel P. Berrangé Sept. 7, 2012, 9:19 a.m. UTC | #30
On Fri, Sep 07, 2012 at 08:54:02AM +0530, Bharata B Rao wrote:
> On Thu, Sep 06, 2012 at 04:47:17PM +0100, Daniel P. Berrange wrote:
> > IMHO this is all gross. URIs already have a well defined way to provide
> > multiple parameters, dealing with escaping of special characters. ie query
> > parameters. The whole benefit of using URI syntax is to let apps process
> > the URIs using standard APIs. We should not be trying to define some extra
> > magic encoding to let us stuff 2 separate parameters into the path component
> > since that means apps have to now write custom parsing code again. Either
> > the UNIX socket path, or the volume path should be in the URI path, not
> > both. The other part should be a URI parameter. I'd really expect us to
> > use:
> > 
> >   gluster:///volname/image?transport=unix&sockpath=/path/to/unix/sock
>             ^why 3 /// here ? volname is not a path, but image is.

The first 2 '/' are the protocol/hostname separator and the 3rd
/ is the hostname/path separator. Since there is no hostname in
this example, you get ///

> So when transport=socket or rdma, is it acceptable to still use the following
> format ?
> 
> gluster://server[:port]/volname/path/to/image[?transport=...]

Of course.

> With that, the different options possible are
> 
> unix - gluster://testvol/dir/a.img?transport=unix&sockpath=/tmp/glusterd.socket
> ipv4 - gluster://1.2.3.4:0/testvol/dir/a.img?transport=socket

I assume you intend 'transport=socket' to be optional here

> ipv6 - gluster://[1:2:3:4:5:6:7:8]:0/testvol/a.img
> hostname - gluster://example.org/testvol/dir/a.img
> rdma - gluster://1.2.3.4:0/testvol/a.img?transport=rdma
> 
> Does this look complete from the specification point of view ?

Yes, this looks like a reasonable use of URIs.

Daniel
Paolo Bonzini Sept. 7, 2012, 9:36 a.m. UTC | #31
Il 07/09/2012 05:24, Bharata B Rao ha scritto:
>>   gluster:///volname/image?transport=unix&sockpath=/path/to/unix/sock
>             ^why 3 /// here ? volname is not a path, but image is.

Because the host is the local computer, i.e. empty.

> gluster://server[:port]/volname/path/to/image[?transport=...]
> 
> With that, the different options possible are
> 
> unix - gluster://testvol/dir/a.img?transport=unix&sockpath=/tmp/glusterd.socket
> ipv4 - gluster://1.2.3.4:0/testvol/dir/a.img?transport=socket
> ipv6 - gluster://[1:2:3:4:5:6:7:8]:0/testvol/a.img
> hostname - gluster://example.org/testvol/dir/a.img
> rdma - gluster://1.2.3.4:0/testvol/a.img?transport=rdma
> 
> Does this look complete from the specification point of view ?

Hmm, why don't we do the exact same thing as libvirt (http://libvirt.org/remote.html):

ipv4 - gluster+tcp://1.2.3.4:0/testvol/dir/a.img
ipv6 - gluster+tcp://[1:2:3:4:5:6:7:8]:0/testvol/a.img
host - gluster+tcp://example.org/testvol/dir/a.img
unix - gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
rdma - gluster+rdma://1.2.3.4:0/testvol/a.img

Where the default transport is tcp (i.e. gluster+tcp is the same as gluster).
This is easily extensible, theoretically you could have something like this:

ssh - gluster+ssh://user@host/testvol/a.img?socket=/tmp/glusterd.socket

Paolo
Kevin Wolf Sept. 7, 2012, 9:57 a.m. UTC | #32
Am 07.09.2012 11:36, schrieb Paolo Bonzini:
> Il 07/09/2012 05:24, Bharata B Rao ha scritto:
>>>   gluster:///volname/image?transport=unix&sockpath=/path/to/unix/sock
>>             ^why 3 /// here ? volname is not a path, but image is.
> 
> Because the host is the local computer, i.e. empty.
> 
>> gluster://server[:port]/volname/path/to/image[?transport=...]
>>
>> With that, the different options possible are
>>
>> unix - gluster://testvol/dir/a.img?transport=unix&sockpath=/tmp/glusterd.socket
>> ipv4 - gluster://1.2.3.4:0/testvol/dir/a.img?transport=socket
>> ipv6 - gluster://[1:2:3:4:5:6:7:8]:0/testvol/a.img
>> hostname - gluster://example.org/testvol/dir/a.img
>> rdma - gluster://1.2.3.4:0/testvol/a.img?transport=rdma
>>
>> Does this look complete from the specification point of view ?
> 
> Hmm, why don't we do the exact same thing as libvirt (http://libvirt.org/remote.html):
> 
> ipv4 - gluster+tcp://1.2.3.4:0/testvol/dir/a.img
> ipv6 - gluster+tcp://[1:2:3:4:5:6:7:8]:0/testvol/a.img
> host - gluster+tcp://example.org/testvol/dir/a.img
> unix - gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
> rdma - gluster+rdma://1.2.3.4:0/testvol/a.img
> 
> Where the default transport is tcp (i.e. gluster+tcp is the same as gluster).
> This is easily extensible, theoretically you could have something like this:
> 
> ssh - gluster+ssh://user@host/testvol/a.img?socket=/tmp/glusterd.socket

I like this. Having the type only as a parameter when it decides how the
schema must be parsed has bothered me from the start, but I hadn't
thought of this way.

Strong +1 from me.

Kevin
Kevin Wolf Sept. 7, 2012, 10 a.m. UTC | #33
Am 06.09.2012 17:47, schrieb Daniel P. Berrange:
> On Thu, Sep 06, 2012 at 09:10:04PM +0530, Bharata B Rao wrote:
>> On Thu, Sep 06, 2012 at 11:29:36AM +0300, Avi Kivity wrote:
>>> On 08/14/2012 12:58 PM, Kevin Wolf wrote:
>>>>
>>>>> While we are at this, let me bring out another issue. Gluster supports 3
>>>>> transport types:
>>>>>
>>>>> - socket in which case the server will be hostname, ipv4 or ipv4 address.
>>>>> - rdma in which case server will be interpreted similar to socket.
>>>>> - unix in which case server will be a path to unix domain socket and this
>>>>>   will look like any other filesystem path. (Eg. /tmp/glusterd.socket)
>>>>>
>>>>> I don't think we can fit 'unix' within the standard URI scheme (RFC 3986)
>>>>> easily, but I am planning to specify the 'unix' transport as below:
>>>>>
>>>>> gluster://[/path/to/unix/domain/socket]/volname/image?transport=unix
>>>>>
>>>>> i,e., I am asking the user to put the unix domain socket path within
>>>>> square brackets when transport type is unix.
>>>>>
>>>>> Do you think this is fine ?
>>>>
>>>> Never saw something like this before, but it does seem reasonable to me.
>>>> Excludes ] from the valid characters in the file name of the socket, but
>>>> that shouldn't be a problem in practice.
>>>
>>> Bikeshedding, but I prefer
>>>
>>>   gluster:///path/to/unix/domain/socket:/volname/image?transport=unix
>>
>> So if the unix domain socket is /tmp/glusterd.socket, then this would look like:
>>
>> gluster:///tmp/glusterd.socket:/volname/image?transport=unix.
>>
>> So you are saying : will the separator b/n the unix domain socket path
>> and rest of the URI components.
>>
>> Unless you or others strongly feel about this, I would like to go with
>> [ ] based spec, which I feel is less prone to errors like missing a colon
>> by mistake :)
>>
>>>
>>> as being more similar to file://, or even
>>>
>>>   gluster:///path/to/unix/domain/socket/volname/image?transport=unix
>>>
>>> with the last two components implied to be part of the payload, not the
>>> path.
>>
>> Note that image is a file path by itself like /dir1/a.img. So I guess it
>> becomes difficult to figure out where the unix domain socket path ends
>> and rest of the URI components begin w/o a separator in between.
> 
> IMHO this is all gross. URIs already have a well defined way to provide
> multiple parameters, dealing with escaping of special characters. ie query
> parameters. The whole benefit of using URI syntax is to let apps process
> the URIs using standard APIs. We should not be trying to define some extra
> magic encoding to let us stuff 2 separate parameters into the path component
> since that means apps have to now write custom parsing code again. Either
> the UNIX socket path, or the volume path should be in the URI path, not
> both. The other part should be a URI parameter. I'd really expect us to
> use:
> 
>   gluster:///volname/image?transport=unix&sockpath=/path/to/unix/sock

I think doing it the other way round would be more logical:

  gluster+unix:///path/to/unix/sock?image=volname/image

This way you have the socket first, which you also must open first.
Having it as a parameter without which you can't make sense of the path
feels a bit less than ideal.

Kevin
Daniel P. Berrangé Sept. 7, 2012, 10:03 a.m. UTC | #34
On Fri, Sep 07, 2012 at 12:00:50PM +0200, Kevin Wolf wrote:
> Am 06.09.2012 17:47, schrieb Daniel P. Berrange:
> > On Thu, Sep 06, 2012 at 09:10:04PM +0530, Bharata B Rao wrote:
> >> On Thu, Sep 06, 2012 at 11:29:36AM +0300, Avi Kivity wrote:
> >>> On 08/14/2012 12:58 PM, Kevin Wolf wrote:
> >>>>
> >>>>> While we are at this, let me bring out another issue. Gluster supports 3
> >>>>> transport types:
> >>>>>
> >>>>> - socket in which case the server will be hostname, ipv4 or ipv4 address.
> >>>>> - rdma in which case server will be interpreted similar to socket.
> >>>>> - unix in which case server will be a path to unix domain socket and this
> >>>>>   will look like any other filesystem path. (Eg. /tmp/glusterd.socket)
> >>>>>
> >>>>> I don't think we can fit 'unix' within the standard URI scheme (RFC 3986)
> >>>>> easily, but I am planning to specify the 'unix' transport as below:
> >>>>>
> >>>>> gluster://[/path/to/unix/domain/socket]/volname/image?transport=unix
> >>>>>
> >>>>> i,e., I am asking the user to put the unix domain socket path within
> >>>>> square brackets when transport type is unix.
> >>>>>
> >>>>> Do you think this is fine ?
> >>>>
> >>>> Never saw something like this before, but it does seem reasonable to me.
> >>>> Excludes ] from the valid characters in the file name of the socket, but
> >>>> that shouldn't be a problem in practice.
> >>>
> >>> Bikeshedding, but I prefer
> >>>
> >>>   gluster:///path/to/unix/domain/socket:/volname/image?transport=unix
> >>
> >> So if the unix domain socket is /tmp/glusterd.socket, then this would look like:
> >>
> >> gluster:///tmp/glusterd.socket:/volname/image?transport=unix.
> >>
> >> So you are saying : will the separator b/n the unix domain socket path
> >> and rest of the URI components.
> >>
> >> Unless you or others strongly feel about this, I would like to go with
> >> [ ] based spec, which I feel is less prone to errors like missing a colon
> >> by mistake :)
> >>
> >>>
> >>> as being more similar to file://, or even
> >>>
> >>>   gluster:///path/to/unix/domain/socket/volname/image?transport=unix
> >>>
> >>> with the last two components implied to be part of the payload, not the
> >>> path.
> >>
> >> Note that image is a file path by itself like /dir1/a.img. So I guess it
> >> becomes difficult to figure out where the unix domain socket path ends
> >> and rest of the URI components begin w/o a separator in between.
> > 
> > IMHO this is all gross. URIs already have a well defined way to provide
> > multiple parameters, dealing with escaping of special characters. ie query
> > parameters. The whole benefit of using URI syntax is to let apps process
> > the URIs using standard APIs. We should not be trying to define some extra
> > magic encoding to let us stuff 2 separate parameters into the path component
> > since that means apps have to now write custom parsing code again. Either
> > the UNIX socket path, or the volume path should be in the URI path, not
> > both. The other part should be a URI parameter. I'd really expect us to
> > use:
> > 
> >   gluster:///volname/image?transport=unix&sockpath=/path/to/unix/sock
> 
> I think doing it the other way round would be more logical:
> 
>   gluster+unix:///path/to/unix/sock?image=volname/image
> 
> This way you have the socket first, which you also must open first.
> Having it as a parameter without which you can't make sense of the path
> feels a bit less than ideal.

The issue here is that the volume/path/to/image part is something that is
required for all gluster transports. The /path/to/unix/sock is something
that is only required for the unix transport. To have consistent URI
scheme across all transports, you really want the volume/path/to/image
bit to use the URI path component.


Daniel
Paolo Bonzini Sept. 7, 2012, 10:05 a.m. UTC | #35
Il 07/09/2012 12:03, Daniel P. Berrange ha scritto:
>> > I think doing it the other way round would be more logical:
>> > 
>> >   gluster+unix:///path/to/unix/sock?image=volname/image
>> > 
>> > This way you have the socket first, which you also must open first.
>> > Having it as a parameter without which you can't make sense of the path
>> > feels a bit less than ideal.
> The issue here is that the volume/path/to/image part is something that is
> required for all gluster transports. The /path/to/unix/sock is something
> that is only required for the unix transport. To have consistent URI
> scheme across all transports, you really want the volume/path/to/image
> bit to use the URI path component.

I was writing the same---plus, perhaps there is a default for the Unix
socket path, so that in the common case you could avoid the parameters
altogether?

Paolo
Bharata B Rao Sept. 7, 2012, 3:06 p.m. UTC | #36
On Thu, Sep 06, 2012 at 12:29:30PM +0200, Kevin Wolf wrote:
> Am 06.09.2012 12:18, schrieb Paolo Bonzini:
> > Il 06/09/2012 12:07, Kevin Wolf ha scritto:
> >>> The AIOCB is already invalid at the time the callback is entered, so we
> >>> could release it before the call.  However, not all implementation of
> >>> AIO are ready for that and I'm not really in the mood for large scale
> >>> refactoring...
> >>
> >> But the way, what I'd really want to see in the end is to get rid of
> >> qemu_aio_flush() and replace it by .bdrv_drain() callbacks in each
> >> BlockDriver. The way we're doing it today is a layering violation.
> > 
> > That's quite difficult.  Completion of an I/O operation can trigger
> > another I/O operation on another block device, and so on until we go
> > back to the first device (think of a hypothetical RAID-5 device).
> 
> You always have a tree of BDSes, and children should only ever trigger
> completion of I/O operations in their parents. Am I missing anything?
> 
> >> Doesn't change anything about this problem, though. So the options that
> >> we have are:
> >>
> >> 1. Delay the callback using a BH. Doing this in each driver is ugly.
> >>    But is there actually more than one possible callback in today's
> >>    coroutine world? I only see bdrv_co_io_em_complete(), which could
> >>    reenter the coroutine from a BH.
> > 
> > Easy and safe, but it feels a bit like a timebomb.  Also, I'm not
> > entirely sure of _why_ the bottom half works. :)
> 
> Hm, safe and time bomb is contradictory in my book. :-)
> 
> The bottom half work because we're not reentering the qcow2_create
> coroutine immediately, so the gluster AIO callback can complete all of
> its cleanup work without being interrupted by code that might wait on
> this particular request and create a deadlock this way.
> 
> >> 2. Delay the callback by just calling it later when the cleanup has
> >>    been completed and .io_flush() can return 0. You say that it's hard
> >>    to implement for some drivers, except if the AIOCB are leaked until
> >>    the end of functions like qcow2_create().
> > 
> > ... which is what we do in posix-aio-compat.c; nobody screamed so far.
> 
> True. Would be easy to fix in posix-aio-compat, though, or can a
> callback expect that the AIOCB is still valid?
> 
> > Not really hard, it just has to be assessed for each driver separately.
> >  We can just do it in gluster and refactor it later.
> 
> Okay, so let's keep it as an option for now.

I tried this approach (option 2) in gluster and I was able to go past the hang
I was seeing earlier, but this causes other problems.

Let me restate what I am doing so that you could tell me if I am indeed
following the option 2 you mention above. I am doing the cleanup first
(qemu_aio_count-- and releasing the AIOCB) before calling the callback at
the end.

static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s)
{
    int ret;
    bool *finished = acb->finished;
    BlockDriverCompletionFunc *cb = acb->common.cb;
    void *opaque = acb->common.opaque;
    
    if (!acb->ret || acb->ret == acb->size) {
        ret = 0; /* Success */
    } else if (acb->ret < 0) {
        ret = acb->ret; /* Read/Write failed */
    } else {
        ret = -EIO; /* Partial read/write - fail it */
    }
    s->qemu_aio_count--;
    qemu_aio_release(acb);

    cb(opaque, ret);
    if (finished) {
        *finished = true;
    }
}

static void qemu_gluster_aio_event_reader(void *opaque)
{
    BDRVGlusterState *s = opaque;
    ssize_t ret;

    do {
        char *p = (char *)&s->event_acb;

        ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos,
                   sizeof(s->event_acb) - s->event_reader_pos);
        if (ret > 0) {
            s->event_reader_pos += ret;
            if (s->event_reader_pos == sizeof(s->event_acb)) {
                s->event_reader_pos = 0;
                qemu_gluster_complete_aio(s->event_acb, s);
                //s->qemu_aio_count--;
            }
        }
    } while (ret < 0 && errno == EINTR);
}

qemu_gluster_aio_event_reader() is the node->io_read in qemu_aio_wait().

qemu_aio_wait() calls node->io_read() which calls qemu_gluster_complete_aio().
Before we return back to qemu_aio_wait(), many other things happen:

bdrv_close() gets called from qcow2_create2()
This closes the gluster connection, closes the pipe, does
qemu_set_fd_hander(read_pipe_fd, NULL, NULL, NULL, NULL), which results
in the AioHandler node being deleted from aio_handlers list.

Now qemu_gluster_aio_event_reader (node->io_read) which was called from
qemu_aio_wait() finally completes and goes ahead and accesses "node"
which has already been deleted. This causes segfault.

So I think the option 1 (scheduling a BH from node->io_read) would
be better for gluster.

Regards,
Bharata.
Bharata B Rao Sept. 12, 2012, 9:22 a.m. UTC | #37
On Fri, Sep 07, 2012 at 11:57:58AM +0200, Kevin Wolf wrote:
> Am 07.09.2012 11:36, schrieb Paolo Bonzini:
> > Hmm, why don't we do the exact same thing as libvirt (http://libvirt.org/remote.html):
> > 
> > ipv4 - gluster+tcp://1.2.3.4:0/testvol/dir/a.img
> > ipv6 - gluster+tcp://[1:2:3:4:5:6:7:8]:0/testvol/a.img
> > host - gluster+tcp://example.org/testvol/dir/a.img
> > unix - gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
> > rdma - gluster+rdma://1.2.3.4:0/testvol/a.img
> > 
> > Where the default transport is tcp (i.e. gluster+tcp is the same as gluster).
> > This is easily extensible, theoretically you could have something like this:
> > 
> > ssh - gluster+ssh://user@host/testvol/a.img?socket=/tmp/glusterd.socket
> 
> I like this. Having the type only as a parameter when it decides how the
> schema must be parsed has bothered me from the start, but I hadn't
> thought of this way.
> 
> Strong +1 from me.

FYI, bdrv_find_protocol() fails for protocols like this. It detects the protocol
as "gluster+tcp" and compares it with drv->protocol_name (which is only
"gluster").

I guess I will have to fix bdrv_find_protocol() to handle the '+' within
protocol string correctly.

Regards,
Bharata.

> 
> Kevin
Paolo Bonzini Sept. 12, 2012, 9:24 a.m. UTC | #38
Il 12/09/2012 11:22, Bharata B Rao ha scritto:
> FYI, bdrv_find_protocol() fails for protocols like this. It detects the protocol
> as "gluster+tcp" and compares it with drv->protocol_name (which is only
> "gluster").
> 
> I guess I will have to fix bdrv_find_protocol() to handle the '+' within
> protocol string correctly.

Yes, or you can declare separate protocols for each supported transport
(there are just 2 or 3, right?).

Paolo
diff mbox

Patch

diff --git a/block/Makefile.objs b/block/Makefile.objs
index b5754d3..a1ae67f 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -9,3 +9,4 @@  block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_GLUSTERFS) += gluster.o
diff --git a/block/gluster.c b/block/gluster.c
new file mode 100644
index 0000000..bbbaea8
--- /dev/null
+++ b/block/gluster.c
@@ -0,0 +1,623 @@ 
+/*
+ * GlusterFS backend for QEMU
+ *
+ * (AIO implementation is derived from block/rbd.c)
+ *
+ * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+#include <glusterfs/api/glfs.h>
+#include "block_int.h"
+
+typedef struct GlusterAIOCB {
+    BlockDriverAIOCB common;
+    bool canceled;
+    int64_t size;
+    int ret;
+} GlusterAIOCB;
+
+typedef struct BDRVGlusterState {
+    struct glfs *glfs;
+    int fds[2];
+    struct glfs_fd *fd;
+    int qemu_aio_count;
+} BDRVGlusterState;
+
+#define GLUSTER_FD_READ 0
+#define GLUSTER_FD_WRITE 1
+
+typedef struct GlusterURI {
+    char *server;
+    int port;
+    char *volname;
+    char *image;
+    char *transport;
+} GlusterURI;
+
+static void qemu_gluster_uri_free(GlusterURI *uri)
+{
+    g_free(uri->server);
+    g_free(uri->volname);
+    g_free(uri->image);
+    g_free(uri->transport);
+    g_free(uri);
+}
+
+/*
+ * We don't validate the transport option obtained here but
+ * instead depend on gluster to flag an error.
+ */
+static int parse_transport(GlusterURI *uri, char *transport)
+{
+    char *token, *saveptr;
+    int ret = -EINVAL;
+
+    if (!transport) {
+        uri->transport = g_strdup("socket");
+        ret = 0;
+        goto out;
+    }
+
+    token = strtok_r(transport, "=", &saveptr);
+    if (!token) {
+        goto out;
+    }
+    if (strcmp(token, "transport")) {
+        goto out;
+    }
+    token = strtok_r(NULL, "=", &saveptr);
+    if (!token) {
+        goto out;
+    }
+    uri->transport = g_strdup(token);
+    ret = 0;
+out:
+    return ret;
+}
+
+static int parse_server(GlusterURI *uri, char *server)
+{
+    int ret = -EINVAL;
+    char *token, *saveptr;
+    char *p, *q = server;
+
+    p = strchr(server, '[');
+    if (p) {
+        /* [ipv6] */
+        if (p != server) {
+            /* [ not in the beginning */
+            goto out;
+        }
+        q++;
+        p = strrchr(p, ']');
+        if (!p) {
+            /* No matching ] */
+            goto out;
+        }
+        *p++ = '\0';
+        uri->server = g_strdup(q);
+
+        if (*p) {
+            if (*p != ':') {
+                /* [ipv6] followed by something other than : */
+                goto out;
+            }
+            uri->port = strtoul(++p, NULL, 0);
+            if (uri->port < 0) {
+                goto out;
+            }
+        } else {
+            /* port not specified, use default */
+            uri->port = 0;
+        }
+
+    } else {
+        /* ipv4 or hostname */
+        if (*server == ':') {
+            /* port specified w/o a server */
+            goto out;
+        }
+        token = strtok_r(server, ":", &saveptr);
+        if (!token) {
+            goto out;
+        }
+        uri->server = g_strdup(token);
+        token = strtok_r(NULL, ":", &saveptr);
+        if (token) {
+            uri->port = strtoul(token, NULL, 0);
+            if (uri->port < 0) {
+                goto out;
+            }
+        } else {
+            uri->port = 0;
+        }
+    }
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * file=gluster://server:[port]/volname/image[?transport=socket]
+ *
+ * 'gluster' is the protocol.
+ *
+ * 'server' specifies the server where the volume file specification for
+ * the given volume resides. This can be either hostname or ipv4 address
+ * or ipv6 address. ipv6 address needs to be with in square brackets [ ].
+ *
+ * 'port' is the port number on which gluster management daemon (glusterd) is
+ * listening. This is optional and if not specified, QEMU will send 0 which
+ * will make libgfapi to use the default port.
+ *
+ * 'volname' is the name of the gluster volume which contains the VM image.
+ *
+ * 'image' is the path to the actual VM image in the gluster volume.
+ *
+ * 'transport' specifies the transport used to connect to glusterd. This is
+ * optional and if not specified, socket transport is used.
+ *
+ * Examples:
+ *
+ * file=gluster://1.2.3.4/testvol/a.img
+ * file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
+ * file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
+ * file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
+ * file=gluster://server.domain.com:5000/testvol/dir/a.img
+ *
+ * We just do minimal checking of the gluster options and mostly ensure
+ * that all the expected elements of the URI are present. We depend on libgfapi
+ * APIs to return appropriate errors in case of invalid arguments.
+ */
+static int qemu_gluster_parseuri(GlusterURI *uri, const char *filename)
+{
+    char *token, *saveptr;
+    char *p, *r;
+    int ret = -EINVAL;
+
+    p = r = g_strdup(filename);
+    if (strncmp(p, "gluster://", 10)) {
+        goto out;
+    }
+
+    /* Discard the protocol */
+    p += 10;
+
+    /* server */
+    token = strtok_r(p, "/", &saveptr);
+    if (!token) {
+        goto out;
+    }
+
+    ret = parse_server(uri, token);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* volname */
+    token = strtok_r(NULL, "/", &saveptr);
+    if (!token) {
+        ret = -EINVAL;
+        goto out;
+    }
+    uri->volname = g_strdup(token);
+
+    /* image */
+    token = strtok_r(NULL, "?", &saveptr);
+    if (!token) {
+        ret = -EINVAL;
+        goto out;
+    }
+    uri->image = g_strdup(token);
+
+    /* transport */
+    token = strtok_r(NULL, "?", &saveptr);
+    ret = parse_transport(uri, token);
+    if (ret < 0) {
+        goto out;
+     }
+
+    /* Flag error for extra options */
+    token = strtok_r(NULL, "?", &saveptr);
+    if (token) {
+        ret = -EINVAL;
+        goto out;
+    }
+    ret = 0;
+out:
+    g_free(r);
+    return ret;
+}
+
+static struct glfs *qemu_gluster_init(GlusterURI *uri, const char *filename)
+{
+    struct glfs *glfs = NULL;
+    int ret;
+
+    ret = qemu_gluster_parseuri(uri, filename);
+    if (ret < 0) {
+        error_report("Usage: file=gluster://server[:port]/volname/image"
+            "[?transport=socket]");
+        errno = -ret;
+        goto out;
+    }
+
+    glfs = glfs_new(uri->volname);
+    if (!glfs) {
+        goto out;
+    }
+
+    ret = glfs_set_volfile_server(glfs, uri->transport, uri->server,
+        uri->port);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /*
+     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
+     * GlusterFS exports it in a header.
+     */
+    ret = glfs_set_logging(glfs, "-", 4);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = glfs_init(glfs);
+    if (ret) {
+        error_report("Gluster connection failed for server=%s port=%d "
+             "volume=%s image=%s transport=%s\n", uri->server, uri->port,
+             uri->volname, uri->image, uri->transport);
+        goto out;
+    }
+    return glfs;
+
+out:
+    if (glfs) {
+        glfs_fini(glfs);
+    }
+    return NULL;
+}
+
+static void qemu_gluster_complete_aio(GlusterAIOCB *acb)
+{
+    int ret;
+
+    if (acb->canceled) {
+        qemu_aio_release(acb);
+        return;
+    }
+
+    if (acb->ret == acb->size) {
+        ret = 0; /* Success */
+    } else if (acb->ret < 0) {
+        ret = acb->ret; /* Read/Write failed */
+    } else {
+        ret = -EIO; /* Partial read/write - fail it */
+    }
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+static void qemu_gluster_aio_event_reader(void *opaque)
+{
+    BDRVGlusterState *s = opaque;
+    GlusterAIOCB *event_acb;
+    int event_reader_pos = 0;
+    ssize_t ret;
+
+    do {
+        char *p = (char *)&event_acb;
+
+        ret = read(s->fds[GLUSTER_FD_READ], p + event_reader_pos,
+                   sizeof(event_acb) - event_reader_pos);
+        if (ret > 0) {
+            event_reader_pos += ret;
+            if (event_reader_pos == sizeof(event_acb)) {
+                event_reader_pos = 0;
+                qemu_gluster_complete_aio(event_acb);
+                s->qemu_aio_count--;
+            }
+        }
+    } while (ret < 0 && errno == EINTR);
+}
+
+static int qemu_gluster_aio_flush_cb(void *opaque)
+{
+    BDRVGlusterState *s = opaque;
+
+    return (s->qemu_aio_count > 0);
+}
+
+static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
+    int bdrv_flags)
+{
+    BDRVGlusterState *s = bs->opaque;
+    int open_flags = 0;
+    int ret = 0;
+    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));
+
+    s->glfs = qemu_gluster_init(uri, filename);
+    if (!s->glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    open_flags |=  O_BINARY;
+    open_flags &= ~O_ACCMODE;
+    if (bdrv_flags & BDRV_O_RDWR) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+    }
+
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        open_flags |= O_DIRECT;
+    }
+
+    s->fd = glfs_open(s->glfs, uri->image, open_flags);
+    if (!s->fd) {
+        ret = -errno;
+        goto out;
+    }
+
+    ret = qemu_pipe(s->fds);
+    if (ret < 0) {
+        goto out;
+    }
+    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
+    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
+    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
+        qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
+
+out:
+    qemu_gluster_uri_free(uri);
+    if (!ret) {
+        return ret;
+    }
+    if (s->fd) {
+        glfs_close(s->fd);
+    }
+    if (s->glfs) {
+        glfs_fini(s->glfs);
+    }
+    return ret;
+}
+
+static int qemu_gluster_create(const char *filename,
+        QEMUOptionParameter *options)
+{
+    struct glfs *glfs;
+    struct glfs_fd *fd;
+    int ret = 0;
+    int64_t total_size = 0;
+    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));
+
+    glfs = qemu_gluster_init(uri, filename);
+    if (!glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            total_size = options->value.n / BDRV_SECTOR_SIZE;
+        }
+        options++;
+    }
+
+    fd = glfs_creat(glfs, uri->image,
+        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
+    if (!fd) {
+        ret = -errno;
+    } else {
+        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
+            ret = -errno;
+        }
+        if (glfs_close(fd) != 0) {
+            ret = -errno;
+        }
+    }
+out:
+    qemu_gluster_uri_free(uri);
+    if (glfs) {
+        glfs_fini(glfs);
+    }
+    return ret;
+}
+
+static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
+
+    acb->common.cb(acb->common.opaque, -ECANCELED);
+    acb->canceled = true;
+}
+
+static AIOPool gluster_aio_pool = {
+    .aiocb_size = sizeof(GlusterAIOCB),
+    .cancel = qemu_gluster_aio_cancel,
+};
+
+static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb)
+{
+    int ret = 0;
+    while (1) {
+        fd_set wfd;
+        int fd = s->fds[GLUSTER_FD_WRITE];
+
+        ret = write(fd, (void *)&acb, sizeof(acb));
+        if (ret >= 0) {
+            break;
+        }
+        if (errno == EINTR) {
+            continue;
+        }
+        if (errno != EAGAIN) {
+            break;
+        }
+
+        FD_ZERO(&wfd);
+        FD_SET(fd, &wfd);
+        do {
+            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
+        } while (ret < 0 && errno == EINTR);
+    }
+    return ret;
+}
+
+static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
+{
+    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
+    BDRVGlusterState *s = acb->common.bs->opaque;
+
+    acb->ret = ret;
+    if (qemu_gluster_send_pipe(s, acb) < 0) {
+        /*
+         * Gluster AIO callback thread failed to notify the waiting
+         * QEMU thread about IO completion. Nothing much can be done
+         * here but to abruptly abort.
+         *
+         * FIXME: Check if the read side of the fd handler can somehow
+         * be notified of this failure paving the way for a graceful exit.
+         */
+        error_report("Gluster failed to notify QEMU about IO completion");
+        abort();
+    }
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int write)
+{
+    int ret;
+    GlusterAIOCB *acb;
+    BDRVGlusterState *s = bs->opaque;
+    size_t size;
+    off_t offset;
+
+    offset = sector_num * BDRV_SECTOR_SIZE;
+    size = nb_sectors * BDRV_SECTOR_SIZE;
+    s->qemu_aio_count++;
+
+    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
+    acb->size = size;
+    acb->ret = 0;
+    acb->canceled = false;
+
+    if (write) {
+        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+            &gluster_finish_aiocb, acb);
+    } else {
+        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+            &gluster_finish_aiocb, acb);
+    }
+
+    if (ret < 0) {
+        goto out;
+    }
+    return &acb->common;
+
+out:
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    int ret;
+    GlusterAIOCB *acb;
+    BDRVGlusterState *s = bs->opaque;
+
+    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
+    acb->size = 0;
+    acb->ret = 0;
+    acb->canceled = false;
+    s->qemu_aio_count++;
+
+    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
+    if (ret < 0) {
+        goto out;
+    }
+    return &acb->common;
+
+out:
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+static int64_t qemu_gluster_getlength(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+    struct stat st;
+    int ret;
+
+    ret = glfs_fstat(s->fd, &st);
+    if (ret < 0) {
+        return -errno;
+    } else {
+        return st.st_size;
+    }
+}
+
+static void qemu_gluster_close(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+
+    if (s->fd) {
+        glfs_close(s->fd);
+        s->fd = NULL;
+    }
+    glfs_fini(s->glfs);
+}
+
+static QEMUOptionParameter qemu_gluster_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL }
+};
+
+static BlockDriver bdrv_gluster = {
+    .format_name = "gluster",
+    .protocol_name = "gluster",
+    .instance_size = sizeof(BDRVGlusterState),
+    .bdrv_file_open = qemu_gluster_open,
+    .bdrv_close = qemu_gluster_close,
+    .bdrv_create = qemu_gluster_create,
+    .bdrv_getlength = qemu_gluster_getlength,
+
+    .bdrv_aio_readv = qemu_gluster_aio_readv,
+    .bdrv_aio_writev = qemu_gluster_aio_writev,
+    .bdrv_aio_flush = qemu_gluster_aio_flush,
+
+    .create_options = qemu_gluster_create_options,
+};
+
+static void bdrv_gluster_init(void)
+{
+    bdrv_register(&bdrv_gluster);
+}
+
+block_init(bdrv_gluster_init);