diff mbox

[v5,2/2] block: Support GlusterFS as a QEMU block backend

Message ID 20120807080157.GD7480@in.ibm.com
State New
Headers show

Commit Message

Bharata B Rao Aug. 7, 2012, 8:01 a.m. UTC
block: Support GlusterFS as a QEMU block backend.

From: Bharata B Rao <bharata@linux.vnet.ibm.com>

This patch adds gluster as a new block backend in QEMU. This gives
QEMU the ability to boot VM images from gluster volumes. It is already
possible to boot from VM images on gluster volumes using a FUSE mount, but
this patchset provides the ability to boot VM images from gluster volumes
by bypassing the FUSE layer in gluster. This is made possible by
using libgfapi routines to perform IO on gluster volumes directly.

VM Image on gluster volume is specified like this:

file=gluster://server[:port]/volname/image[?transport=socket]

'gluster' is the protocol.

'server' specifies the server where the volume file specification for
the given volume resides. This can be either a hostname, an ipv4 address
or an ipv6 address. An ipv6 address needs to be within square brackets [ ].

'port' is the port number on which the gluster management daemon (glusterd) is
listening. This is optional and if not specified, QEMU will send 0 which
will make libgfapi use the default port.

'volname' is the name of the gluster volume which contains the VM image.

'image' is the path to the actual VM image in the gluster volume.

'transport' specifies the transport used to connect to glusterd. This is
optional and if not specified, socket transport is used.

Examples:

file=gluster://1.2.3.4/testvol/a.img
file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
file=gluster://server.domain.com:5000/testvol/dir/a.img

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---

 block/Makefile.objs |    1 
 block/gluster.c     |  623 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 624 insertions(+), 0 deletions(-)
 create mode 100644 block/gluster.c

Comments

Stefan Hajnoczi Aug. 8, 2012, 2:37 p.m. UTC | #1
On Tue, Aug 7, 2012 at 9:01 AM, Bharata B Rao
<bharata@linux.vnet.ibm.com> wrote:
> block: Support GlusterFS as a QEMU block backend.
>
> From: Bharata B Rao <bharata@linux.vnet.ibm.com>
>
> This patch adds gluster as the new block backend in QEMU. This gives
> QEMU the ability to boot VM images from gluster volumes. Its already
> possible to boot from VM images on gluster volumes using FUSE mount, but
> this patchset provides the ability to boot VM images from gluster volumes
> by by-passing the FUSE layer in gluster. This is made possible by
> using libgfapi routines to perform IO on gluster volumes directly.
>
> VM Image on gluster volume is specified like this:
>
> file=gluster://server:[port]/volname/image[?transport=socket]
>
> 'gluster' is the protocol.
>
> 'server' specifies the server where the volume file specification for
> the given volume resides. This can be either hostname or ipv4 address
> or ipv6 address. ipv6 address needs to be with in square brackets [ ].
>
> port' is the port number on which gluster management daemon (glusterd) is
> listening. This is optional and if not specified, QEMU will send 0 which
> will make libgfapi to use the default port.
>
> 'volname' is the name of the gluster volume which contains the VM image.
>
> 'image' is the path to the actual VM image in the gluster volume.
>
> 'transport' specifies the transport used to connect to glusterd. This is
> optional and if not specified, socket transport is used.
>
> Examples:
>
> file=gluster://1.2.3.4/testvol/a.img
> file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
> file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
> file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
> file=gluster://server.domain.com:5000/testvol/dir/a.img
>
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
>
>  block/Makefile.objs |    1
>  block/gluster.c     |  623 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 624 insertions(+), 0 deletions(-)
>  create mode 100644 block/gluster.c

I have left a few small comments.  Perhaps you can resend with your
fixes to Patch 1?

> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index b5754d3..a1ae67f 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -9,3 +9,4 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o
>  block-obj-$(CONFIG_LIBISCSI) += iscsi.o
>  block-obj-$(CONFIG_CURL) += curl.o
>  block-obj-$(CONFIG_RBD) += rbd.o
> +block-obj-$(CONFIG_GLUSTERFS) += gluster.o
> diff --git a/block/gluster.c b/block/gluster.c
> new file mode 100644
> index 0000000..39c55fe
> --- /dev/null
> +++ b/block/gluster.c
> @@ -0,0 +1,623 @@
> +/*
> + * GlusterFS backend for QEMU
> + *
> + * (AIO implementation is derived from block/rbd.c)
> + *
> + * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * (at your option) any later version. See the COPYING file in the top-level
> + * directory.
> + */
> +#include "block_int.h"
> +#include <glusterfs/api/glfs.h>

System headers followed by user headers is a good order to prevent
application-specific macros from interfering with system headers:

#include <glusterfs/api/glfs.h>
#include "block_int.h"

> +
> +typedef struct GlusterAIOCB {
> +    BlockDriverAIOCB common;
> +    bool canceled;
> +    int64_t size;
> +    int ret;
> +} GlusterAIOCB;
> +
> +typedef struct BDRVGlusterState {
> +    struct glfs *glfs;
> +    int fds[2];
> +    struct glfs_fd *fd;
> +    int qemu_aio_count;
> +} BDRVGlusterState;
> +
> +#define GLUSTER_FD_READ 0
> +#define GLUSTER_FD_WRITE 1
> +
> +typedef struct GlusterURI {
> +    char *server;
> +    int port;
> +    char *volname;
> +    char *image;
> +    char *transport;
> +} GlusterURI;
> +
> +static void qemu_gluster_uri_free(GlusterURI *uri)
> +{
> +    g_free(uri->server);
> +    g_free(uri->volname);
> +    g_free(uri->image);
> +    g_free(uri->transport);
> +    g_free(uri);
> +}
> +
> +/*
> + * We don't validate the transport option obtained here but
> + * instead depend on gluster to flag an error.
> + */
> +static int parse_transport(GlusterURI *uri, char *transport)
> +{
> +    char *token, *saveptr;
> +    int ret = -EINVAL;
> +
> +    if (!transport) {
> +        uri->transport = g_strdup("socket");
> +        ret = 0;
> +        goto out;
> +    }
> +
> +    token = strtok_r(transport, "=", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +    if (strcmp(token, "transport")) {
> +        goto out;
> +    }
> +    token = strtok_r(NULL, "=", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +    uri->transport = g_strdup(token);
> +    ret = 0;
> +out:
> +    return ret;
> +}
> +
> +static int parse_server(GlusterURI *uri, char *server)
> +{
> +    int ret = -EINVAL;
> +    char *token, *saveptr;
> +    char *p, *q = server;
> +
> +    p = strchr(server, '[');
> +    if (p) {
> +        /* [ipv6] */
> +        if (p != server) {
> +            /* [ not in the beginning */
> +            goto out;
> +        }
> +        q++;
> +        p = strrchr(p, ']');
> +        if (!p) {
> +            /* No matching ] */
> +            goto out;
> +        }
> +        *p++ = '\0';
> +        uri->server = g_strdup(q);
> +
> +        if (*p) {
> +            if (*p != ':') {
> +                /* [ipv6] followed by something other than : */
> +                goto out;
> +            }
> +            uri->port = strtoul(++p, NULL, 0);
> +            if (uri->port < 0) {
> +                goto out;
> +            }
> +        } else {
> +            /* port not specified, use default */
> +            uri->port = 0;
> +        }
> +
> +    } else {
> +        /* ipv4 or hostname */
> +        if (*server == ':') {
> +            /* port specified w/o a server */
> +            goto out;
> +        }
> +        token = strtok_r(server, ":", &saveptr);
> +        if (!token) {
> +            goto out;
> +        }
> +        uri->server = g_strdup(token);
> +        token = strtok_r(NULL, ":", &saveptr);
> +        if (token) {
> +            uri->port = strtoul(token, NULL, 0);
> +            if (uri->port < 0) {
> +                goto out;
> +            }
> +        } else {
> +            uri->port = 0;
> +        }
> +    }
> +    ret = 0;
> +out:
> +    return ret;
> +}
> +
> +/*
> + * file=gluster://server:[port]/volname/image[?transport=socket]
> + *
> + * 'gluster' is the protocol.
> + *
> + * 'server' specifies the server where the volume file specification for
> + * the given volume resides. This can be either hostname or ipv4 address
> + * or ipv6 address. ipv6 address needs to be with in square brackets [ ].
> + *
> + *'port' is the port number on which gluster management daemon (glusterd) is

Missing space: * 'port'

> + * listening. This is optional and if not specified, QEMU will send 0 which
> + * will make libgfapi to use the default port.
> + *
> + * 'volname' is the name of the gluster volume which contains the VM image.
> + *
> + * 'image' is the path to the actual VM image in the gluster volume.
> + *
> + * 'transport' specifies the transport used to connect to glusterd. This is
> + * optional and if not specified, socket transport is used.
> + *
> + * Examples:
> + *
> + * file=gluster://1.2.3.4/testvol/a.img
> + * file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
> + * file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
> + * file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
> + * file=gluster://server.domain.com:5000/testvol/dir/a.img
> + *
> + * We just do minimal checking of the gluster options and mostly ensure
> + * that all the expected elements of the URI are present. We depend on libgfapi
> + * APIs to return appropriate errors in case of invalid arguments.
> + */
> +static int qemu_gluster_parseuri(GlusterURI *uri, const char *filename)
> +{
> +    char *token, *saveptr;
> +    char *p, *r;
> +    int ret = -EINVAL;
> +
> +    p = r = g_strdup(filename);
> +    if (strncmp(p, "gluster://", 10)) {
> +        goto out;
> +    }
> +
> +    /* Discard the protocol */
> +    p += 10;
> +
> +    /* server */
> +    token = strtok_r(p, "/", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +
> +    ret = parse_server(uri, token);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* volname */
> +    token = strtok_r(NULL, "/", &saveptr);
> +    if (!token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    uri->volname = g_strdup(token);
> +
> +    /* image */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    if (!token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    uri->image = g_strdup(token);
> +
> +    /* transport */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    ret = parse_transport(uri, token);
> +    if (ret < 0) {
> +        goto out;
> +     }
> +
> +    /* Flag error for extra options */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    if (token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    ret = 0;
> +out:
> +    g_free(r);
> +    return ret;
> +}
> +
> +static struct glfs *qemu_gluster_init(GlusterURI *uri, const char *filename)
> +{
> +    struct glfs *glfs = NULL;
> +    int ret;
> +
> +    ret = qemu_gluster_parseuri(uri, filename);
> +    if (ret < 0) {
> +        error_report("Usage: file=gluster://server:[port]/volname/image"

server[:port]

> +            "[?transport=socket]");
> +        errno = -ret;
> +        goto out;
> +    }
> +
> +    glfs = glfs_new(uri->volname);
> +    if (!glfs) {
> +        goto out;
> +    }
> +
> +    ret = glfs_set_volfile_server(glfs, uri->transport, uri->server,
> +        uri->port);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /*
> +     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
> +     * GlusterFS exports it in a header.
> +     */
> +    ret = glfs_set_logging(glfs, "-", 4);

Are you submitting the GlusterFS patch to move
gf_loglevel_t/GF_LOG_ERROR to the API headers?
Bharata B Rao Aug. 8, 2012, 3:08 p.m. UTC | #2
On Wed, Aug 08, 2012 at 03:37:31PM +0100, Stefan Hajnoczi wrote:
> 
> I have left a few small comments.  Perhaps you can resend with your
> fixes to Patch 1?

Sure, Will send v6 with fixes to patch 1 and incorporating your suggestions.

> > + */
> > +#include "block_int.h"
> > +#include <glusterfs/api/glfs.h>
> 
> System headers followed by user headers is a good order to prevent
> application-specific macros from interfering with system headers:
> 
> #include <glusterfs/api/glfs.h>
> #include "block_int.h"

Ok.

> > + *'port' is the port number on which gluster management daemon (glusterd) is
> 
> Missing space: * 'port'
> > +        error_report("Usage: file=gluster://server:[port]/volname/image"
> 
> server[:port]

Sharp eyes! Will fix.

> > +
> > +    /*
> > +     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
> > +     * GlusterFS exports it in a header.
> > +     */
> > +    ret = glfs_set_logging(glfs, "-", 4);
> 
> Are you submitting the GlusterFS patch to move
> gf_loglevel_t/GF_LOG_ERROR to the API headers?

I have already informed them about this. I will work with them to make sure
that GF_LOG_* definitions are available for the users of libgfapi.

Regards,
Bharata.
diff mbox

Patch

diff --git a/block/Makefile.objs b/block/Makefile.objs
index b5754d3..a1ae67f 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -9,3 +9,4 @@  block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_GLUSTERFS) += gluster.o
diff --git a/block/gluster.c b/block/gluster.c
new file mode 100644
index 0000000..39c55fe
--- /dev/null
+++ b/block/gluster.c
@@ -0,0 +1,623 @@ 
+/*
+ * GlusterFS backend for QEMU
+ *
+ * (AIO implementation is derived from block/rbd.c)
+ *
+ * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+#include "block_int.h"
+#include <glusterfs/api/glfs.h>
+
+typedef struct GlusterAIOCB {   /* state of one in-flight gluster request */
+    BlockDriverAIOCB common;    /* generic part: holds cb, opaque and bs */
+    bool canceled;  /* set by qemu_gluster_aio_cancel(); suppresses cb later */
+    int64_t size;   /* bytes requested (0 for a flush) */
+    int ret;        /* result recorded by gluster_finish_aiocb() */
+} GlusterAIOCB;
+
+typedef struct BDRVGlusterState {   /* driver state kept in bs->opaque */
+    struct glfs *glfs;      /* connection from qemu_gluster_init() */
+    int fds[2];             /* completion pipe, see GLUSTER_FD_READ/WRITE */
+    struct glfs_fd *fd;     /* image handle returned by glfs_open() */
+    int qemu_aio_count;     /* requests submitted but not yet completed */
+} BDRVGlusterState;
+
+#define GLUSTER_FD_READ 0   /* fds[] index the event reader read()s from */
+#define GLUSTER_FD_WRITE 1  /* fds[] index qemu_gluster_send_pipe() writes */
+
+typedef struct GlusterURI {     /* components of a parsed gluster:// URI */
+    char *server;       /* hostname or address; ipv6 stored without [ ] */
+    int port;           /* glusterd port, 0 when not given in the URI */
+    char *volname;      /* gluster volume name */
+    char *image;        /* path of the image inside the volume */
+    char *transport;    /* transport name, "socket" when not given */
+} GlusterURI;
+
+static void qemu_gluster_uri_free(GlusterURI *uri)
+{   /* Release the strings held by @uri and then @uri itself. */
+    g_free(uri->server);    /* fields may still be NULL after a failed */
+    g_free(uri->volname);   /* parse; this relies on g_free(NULL) being */
+    g_free(uri->image);     /* harmless */
+    g_free(uri->transport);
+    g_free(uri);            /* @uri is dereferenced above: must not be NULL */
+}
+
+/*
+ * Parse "transport=<name>" into uri->transport; a NULL string means "socket".
+ * The name is not validated here; gluster is relied on to flag an error.
+ */
+static int parse_transport(GlusterURI *uri, char *transport)
+{
+    char *token, *saveptr;
+    int ret = -EINVAL;      /* result unless parsing succeeds */
+
+    if (!transport) {
+        uri->transport = g_strdup("socket");    /* default transport */
+        ret = 0;
+        goto out;
+    }
+
+    token = strtok_r(transport, "=", &saveptr); /* writes into @transport */
+    if (!token) {
+        goto out;
+    }
+    if (strcmp(token, "transport")) {       /* only this key is accepted */
+        goto out;
+    }
+    token = strtok_r(NULL, "=", &saveptr);  /* the value after '=' */
+    if (!token) {
+        goto out;
+    }
+    uri->transport = g_strdup(token);       /* copy is owned by @uri */
+    ret = 0;
+out:
+    return ret;
+}
+
+static int parse_server(GlusterURI *uri, char *server)
+{   /* Split "host[:port]" or "[ipv6][:port]" into uri->server/uri->port. */
+    int ret = -EINVAL;      /* result unless parsing succeeds */
+    char *token, *saveptr;
+    char *p, *q = server;
+
+    p = strchr(server, '[');
+    if (p) {
+        /* [ipv6] */
+        if (p != server) {
+            /* [ not in the beginning */
+            goto out;
+        }
+        q++;                    /* skip the leading '[' */
+        p = strrchr(p, ']');    /* last ']' in the string */
+        if (!p) {
+            /* No matching ] */
+            goto out;
+        }
+        *p++ = '\0';            /* cut at ']'; p now points just past it */
+        uri->server = g_strdup(q);
+
+        if (*p) {
+            if (*p != ':') {
+                /* [ipv6] followed by something other than : */
+                goto out;
+            }
+            uri->port = strtoul(++p, NULL, 0);  /* NOTE(review): no endptr */
+            if (uri->port < 0) {    /* unsigned long was narrowed to int */
+                goto out;
+            }
+        } else {
+            /* port not specified, use default */
+            uri->port = 0;
+        }
+
+    } else {
+        /* ipv4 or hostname */
+        if (*server == ':') {
+            /* port specified w/o a server */
+            goto out;
+        }
+        token = strtok_r(server, ":", &saveptr);    /* writes into @server */
+        if (!token) {
+            goto out;
+        }
+        uri->server = g_strdup(token);
+        token = strtok_r(NULL, ":", &saveptr);
+        if (token) {
+            uri->port = strtoul(token, NULL, 0);    /* NOTE(review): no endptr */
+            if (uri->port < 0) {    /* unsigned long was narrowed to int */
+                goto out;
+            }
+        } else {
+            uri->port = 0;          /* port not specified, use default */
+        }
+    }
+    ret = 0;
+out:    /* uri->server, if already set, is left for the caller to free */
+    return ret;
+}
+
+/*
+ * file=gluster://server[:port]/volname/image[?transport=socket]
+ *
+ * 'gluster' is the protocol.
+ *
+ * 'server' specifies the server where the volume file specification for
+ * the given volume resides. This can be either hostname or ipv4 address
+ * or ipv6 address. ipv6 address needs to be within square brackets [ ].
+ *
+ * 'port' is the port number on which gluster management daemon (glusterd) is
+ * listening. This is optional and if not specified, QEMU will send 0 which
+ * will make libgfapi use the default port.
+ *
+ * 'volname' is the name of the gluster volume which contains the VM image.
+ *
+ * 'image' is the path to the actual VM image in the gluster volume.
+ *
+ * 'transport' specifies the transport used to connect to glusterd. This is
+ * optional and if not specified, socket transport is used.
+ *
+ * Examples:
+ *
+ * file=gluster://1.2.3.4/testvol/a.img
+ * file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
+ * file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
+ * file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
+ * file=gluster://server.domain.com:5000/testvol/dir/a.img
+ *
+ * Only minimal checking is done (the expected URI elements must be present);
+ * libgfapi is relied on to reject invalid values. Returns 0 on success or
+ * -EINVAL on a malformed URI; fields parsed so far stay in @uri for the caller.
+ */
+static int qemu_gluster_parseuri(GlusterURI *uri, const char *filename)
+{
+    char *token, *saveptr;
+    char *p, *r;
+    int ret = -EINVAL;
+
+    p = r = g_strdup(filename);     /* private copy: strtok_r() writes to it */
+    if (strncmp(p, "gluster://", 10)) {     /* 10 == strlen("gluster://") */
+        goto out;
+    }
+
+    /* Discard the protocol */
+    p += 10;
+
+    /* server */
+    token = strtok_r(p, "/", &saveptr);
+    if (!token) {
+        goto out;
+    }
+
+    ret = parse_server(uri, token);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* volname */
+    token = strtok_r(NULL, "/", &saveptr);
+    if (!token) {
+        ret = -EINVAL;
+        goto out;
+    }
+    uri->volname = g_strdup(token);
+
+    /* image */
+    token = strtok_r(NULL, "?", &saveptr);  /* rest of the path up to '?' */
+    if (!token) {
+        ret = -EINVAL;
+        goto out;
+    }
+    uri->image = g_strdup(token);
+
+    /* transport */
+    token = strtok_r(NULL, "?", &saveptr);  /* NULL when no option is given */
+    ret = parse_transport(uri, token);
+    if (ret < 0) {
+        goto out;
+     }
+
+    /* Flag error for extra options */
+    token = strtok_r(NULL, "?", &saveptr);
+    if (token) {
+        ret = -EINVAL;
+        goto out;
+    }
+    ret = 0;
+out:
+    g_free(r);      /* tokens were g_strdup()ed, so the copy can be dropped */
+    return ret;
+}
+
+static struct glfs *qemu_gluster_init(GlusterURI *uri, const char *filename)
+{   /* Fill @uri from @filename and connect to the volume; NULL on failure. */
+    struct glfs *glfs = NULL;
+    int ret;
+
+    ret = qemu_gluster_parseuri(uri, filename);
+    if (ret < 0) {
+        error_report("Usage: file=gluster://server[:port]/volname/image"
+            "[?transport=socket]");
+        errno = -ret;       /* callers derive their return value from errno */
+        goto out;
+    }
+
+    glfs = glfs_new(uri->volname);
+    if (!glfs) {
+        goto out;
+    }
+
+    ret = glfs_set_volfile_server(glfs, uri->transport, uri->server,
+        uri->port);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /*
+     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
+     * GlusterFS exports it in a header.
+     */
+    ret = glfs_set_logging(glfs, "-", 4);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = glfs_init(glfs);
+    if (ret) {      /* message has no trailing newline, like the others here */
+        error_report("Gluster connection failed for server=%s port=%d "
+             "volume=%s image=%s transport=%s", uri->server, uri->port,
+             uri->volname, uri->image, uri->transport);
+        goto out;
+    }
+    return glfs;    /* success: the caller ends the handle with glfs_fini() */
+
+out:
+    if (glfs) {     /* tear down a handle that was created but not usable */
+        glfs_fini(glfs);
+    }
+    return NULL;
+}
+
+static void qemu_gluster_complete_aio(GlusterAIOCB *acb)
+{   /* Report the outcome of @acb to its callback and release the acb. */
+    int ret;
+
+    if (acb->canceled) {    /* cb already ran in qemu_gluster_aio_cancel() */
+        qemu_aio_release(acb);
+        return;
+    }
+
+    if (acb->ret == acb->size) {    /* all requested bytes were transferred */
+        ret = 0; /* Success */
+    } else if (acb->ret < 0) {
+        ret = acb->ret; /* Read/Write failed */
+    } else {
+        ret = -EIO; /* Partial read/write - fail it */
+    }
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+static void qemu_gluster_aio_event_reader(void *opaque)
+{   /* fd handler: read one acb pointer from the pipe and complete it. */
+    BDRVGlusterState *s = opaque;
+    GlusterAIOCB *event_acb;
+    int event_reader_pos = 0;   /* bytes of event_acb received so far */
+    ssize_t ret;
+
+    do {
+        char *p = (char *)&event_acb;   /* the pointer is filled bytewise */
+
+        ret = read(s->fds[GLUSTER_FD_READ], p + event_reader_pos,
+                   sizeof(event_acb) - event_reader_pos);
+        if (ret > 0) {  /* NOTE(review): a short read is dropped on return */
+            event_reader_pos += ret;
+            if (event_reader_pos == sizeof(event_acb)) {
+                event_reader_pos = 0;
+                qemu_gluster_complete_aio(event_acb);
+                s->qemu_aio_count--;    /* one request fewer in flight */
+            }
+        }
+    } while (ret < 0 && errno == EINTR);    /* retry only when interrupted */
+}
+
+static int qemu_gluster_aio_flush_cb(void *opaque)
+{   /* Non-zero while requests are still pending on @opaque's state. */
+    BDRVGlusterState *s = opaque;
+
+    return (s->qemu_aio_count > 0);
+}
+
+static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
+    int bdrv_flags)
+{   /* Open the image named by @filename; 0 on success, negative on error. */
+    BDRVGlusterState *s = bs->opaque;
+    int open_flags = 0;
+    int ret = 0;
+    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));    /* freed at out: */
+
+    s->glfs = qemu_gluster_init(uri, filename);
+    if (!s->glfs) {
+        ret = -errno;   /* NOTE(review): assumes errno != 0 on failure */
+        goto out;
+    }
+
+    open_flags |=  O_BINARY;
+    open_flags &= ~O_ACCMODE;
+    if (bdrv_flags & BDRV_O_RDWR) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+    }
+
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        open_flags |= O_DIRECT;
+    }
+
+    s->fd = glfs_open(s->glfs, uri->image, open_flags);
+    if (!s->fd) {
+        ret = -errno;
+        goto out;
+    }
+
+    ret = qemu_pipe(s->fds);    /* pipe carrying completed acb pointers */
+    if (ret < 0) {
+        goto out;
+    }
+    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);  /* NOTE(review): result unchecked */
+    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
+    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
+        qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
+
+out:    /* reached on success as well as on failure */
+    qemu_gluster_uri_free(uri);
+    if (!ret) {     /* success: s->fd and s->glfs stay open */
+        return ret;
+    }
+    if (s->fd) {
+        glfs_close(s->fd);
+    }
+    if (s->glfs) {
+        glfs_fini(s->glfs);
+    }
+    return ret;
+}
+
+static int qemu_gluster_create(const char *filename,
+        QEMUOptionParameter *options)
+{   /* Create the image named by @filename with the size from @options. */
+    struct glfs *glfs;
+    struct glfs_fd *fd;
+    int ret = 0;            /* 0 on success, -errno of the last failure */
+    int64_t total_size = 0; /* in sectors; stays 0 without a size option */
+    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));    /* freed at out: */
+
+    glfs = qemu_gluster_init(uri, filename);
+    if (!glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    while (options && options->name) {  /* list ends at a NULL name */
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            total_size = options->value.n / BDRV_SECTOR_SIZE;  /* rounds down */
+        }
+        options++;
+    }
+
+    fd = glfs_creat(glfs, uri->image,
+        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
+    if (!fd) {
+        ret = -errno;
+    } else {
+        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
+            ret = -errno;
+        }
+        if (glfs_close(fd) != 0) {
+            ret = -errno;   /* replaces an earlier ftruncate error, if any */
+        }
+    }
+out:
+    qemu_gluster_uri_free(uri);
+    if (glfs) {
+        glfs_fini(glfs);
+    }
+    return ret;
+}
+
+static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
+{   /* Report -ECANCELED now; the acb itself is released on completion. */
+    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;   /* common is first member */
+
+    acb->common.cb(acb->common.opaque, -ECANCELED);
+    acb->canceled = true;   /* checked by qemu_gluster_complete_aio() */
+}
+
+static AIOPool gluster_aio_pool = {     /* handed to qemu_aio_get() below */
+    .aiocb_size = sizeof(GlusterAIOCB),
+    .cancel = qemu_gluster_aio_cancel,
+};
+
+static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb)
+{   /* Send the @acb pointer through the pipe; negative on write failure. */
+    int ret = 0;
+    while (1) {
+        fd_set wfd;
+        int fd = s->fds[GLUSTER_FD_WRITE];
+
+        ret = write(fd, (void *)&acb, sizeof(acb)); /* the pointer value */
+        if (ret >= 0) {
+            break;
+        }
+        if (errno == EINTR) {   /* interrupted: try the write again */
+            continue;
+        }
+        if (errno != EAGAIN) {  /* hard error: give up, ret is negative */
+            break;
+        }
+
+        FD_ZERO(&wfd);          /* EAGAIN: wait until the fd is writable */
+        FD_SET(fd, &wfd);
+        do {
+            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
+        } while (ret < 0 && errno == EINTR);
+    }
+    return ret;
+}
+
+static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
+{   /* Completion callback given to glfs_*_async(): store @ret and notify. */
+    GlusterAIOCB *acb = (GlusterAIOCB *)arg;    /* acb passed at submission */
+    BDRVGlusterState *s = acb->common.bs->opaque;
+
+    acb->ret = ret;     /* evaluated later by qemu_gluster_complete_aio() */
+    if (qemu_gluster_send_pipe(s, acb) < 0) {
+        /*
+         * Gluster AIO callback thread failed to notify the waiting
+         * QEMU thread about IO completion. Nothing much can be done
+         * here but to abruptly abort.
+         *
+         * FIXME: Check if the read side of the fd handler can somehow
+         * be notified of this failure paving the way for a graceful exit.
+         */
+        error_report("Gluster failed to notify QEMU about IO completion");
+        abort();
+    }
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int write)
+{   /* Submit an async write (@write != 0) or read; NULL if submit fails. */
+    int ret;
+    GlusterAIOCB *acb;
+    BDRVGlusterState *s = bs->opaque;
+    size_t size;
+    off_t offset;
+
+    offset = sector_num * BDRV_SECTOR_SIZE;     /* byte offset in the image */
+    size = nb_sectors * BDRV_SECTOR_SIZE;       /* request length in bytes */
+    s->qemu_aio_count++;    /* undone at out: if submission fails */
+
+    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
+    acb->size = size;       /* compared with the result on completion */
+    acb->ret = 0;
+    acb->canceled = false;
+
+    if (write) {
+        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+            &gluster_finish_aiocb, acb);
+    } else {
+        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+            &gluster_finish_aiocb, acb);
+    }
+
+    if (ret < 0) {
+        goto out;
+    }
+    return &acb->common;    /* completion arrives via gluster_finish_aiocb() */
+
+out:
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{   /* Read entry point: qemu_gluster_aio_rw() with write == 0. */
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{   /* Write entry point: qemu_gluster_aio_rw() with write == 1. */
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{   /* Submit an async fsync on the image; NULL if submission fails. */
+    int ret;
+    GlusterAIOCB *acb;
+    BDRVGlusterState *s = bs->opaque;
+
+    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
+    acb->size = 0;          /* a result of 0 then counts as success */
+    acb->ret = 0;
+    acb->canceled = false;
+    s->qemu_aio_count++;    /* undone at out: if submission fails */
+
+    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
+    if (ret < 0) {
+        goto out;
+    }
+    return &acb->common;    /* completion arrives via gluster_finish_aiocb() */
+
+out:
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+static int64_t qemu_gluster_getlength(BlockDriverState *bs)
+{   /* Return st_size reported by glfs_fstat(), or -errno on failure. */
+    BDRVGlusterState *s = bs->opaque;
+    struct stat st;
+    int ret;
+
+    ret = glfs_fstat(s->fd, &st);
+    if (ret < 0) {
+        return -errno;
+    } else {
+        return st.st_size;
+    }
+}
+
+static void qemu_gluster_close(BlockDriverState *bs)
+{   /* Close the image handle and end the gluster connection. */
+    BDRVGlusterState *s = bs->opaque;
+
+    if (s->fd) {
+        glfs_close(s->fd);
+        s->fd = NULL;   /* guards against a second glfs_close() */
+    }
+    glfs_fini(s->glfs); /* NOTE(review): s->fds and the fd handler set up in */
+}                       /* qemu_gluster_open() are not torn down here */
+
+static QEMUOptionParameter qemu_gluster_create_options[] = {
+    {   /* the only option qemu_gluster_create() looks at */
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL }    /* terminator: a NULL name ends the list */
+};
+
+static BlockDriver bdrv_gluster = {     /* driver table given to bdrv_register() */
+    .format_name = "gluster",
+    .protocol_name = "gluster",         /* scheme of the gluster:// URIs */
+    .instance_size = sizeof(BDRVGlusterState),  /* size of bs->opaque */
+    .bdrv_file_open = qemu_gluster_open,
+    .bdrv_close = qemu_gluster_close,
+    .bdrv_create = qemu_gluster_create,
+    .bdrv_getlength = qemu_gluster_getlength,
+
+    .bdrv_aio_readv = qemu_gluster_aio_readv,
+    .bdrv_aio_writev = qemu_gluster_aio_writev,
+    .bdrv_aio_flush = qemu_gluster_aio_flush,
+
+    .create_options = qemu_gluster_create_options,
+};
+
+static void bdrv_gluster_init(void)
+{   /* Register the gluster driver table with the block layer. */
+    bdrv_register(&bdrv_gluster);
+}
+
+block_init(bdrv_gluster_init);  /* hooks the function above into block init */