Patchwork [RFC,2/2] block: gluster as block backend

login
register
mail settings
Submitter Bharata B Rao
Date July 21, 2012, 8:31 a.m.
Message ID <20120721083159.GE1046@in.ibm.com>
Download mbox | patch
Permalink /patch/172413/
State New
Headers show

Comments

Bharata B Rao - July 21, 2012, 8:31 a.m.
block: gluster as block backend

From: Bharata B Rao <bharata@linux.vnet.ibm.com>

This patch adds gluster as the new block backend in QEMU. This gives QEMU
the ability to boot VM images from gluster volumes.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---

 block/Makefile.objs |    1 
 block/gluster.c     |  483 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 484 insertions(+), 0 deletions(-)
 create mode 100644 block/gluster.c
Stefan Hajnoczi - July 22, 2012, 3:38 p.m.
On Sat, Jul 21, 2012 at 9:31 AM, Bharata B Rao
<bharata@linux.vnet.ibm.com> wrote:
> +typedef struct GlusterAIOCB {
> +    BlockDriverAIOCB common;
> +    QEMUIOVector *qiov;

The qiov field is unused.

> +    char *bounce;

Unused.

> +    struct BDRVGlusterState *s;

You can get this through common.bs->opaque, but if you like having a
shortcut, that's fine.

> +    int cancelled;

bool

> +} GlusterAIOCB;
> +
> +typedef struct GlusterCBKData {
> +    GlusterAIOCB *acb;
> +    struct BDRVGlusterState *s;
> +    int64_t size;
> +    int ret;
> +} GlusterCBKData;

I think GlusterCBKData could just be part of GlusterAIOCB.  That would
simplify the code a little and avoid some malloc/free.

> +
> +typedef struct BDRVGlusterState {
> +    struct glfs *glfs;
> +    int fds[2];
> +    int open_flags;
> +    struct glfs_fd *fd;
> +    int qemu_aio_count;
> +    int event_reader_pos;
> +    GlusterCBKData *event_gcbk;
> +} BDRVGlusterState;
> +
> +#define GLUSTER_FD_READ 0
> +#define GLUSTER_FD_WRITE 1
> +
> +static void qemu_gluster_complete_aio(GlusterCBKData *gcbk)
> +{
> +    GlusterAIOCB *acb = gcbk->acb;
> +    int ret;
> +
> +    if (acb->cancelled) {

Where does cancelled get set?

> +        qemu_aio_release(acb);
> +        goto done;
> +    }
> +
> +    if (gcbk->ret == gcbk->size) {
> +        ret = 0; /* Success */
> +    } else if (gcbk->ret < 0) {
> +        ret = gcbk->ret; /* Read/Write failed */
> +    } else {
> +        ret = -EINVAL; /* Partial read/write - fail it */

EINVAL is for invalid arguments.  EIO would be better.

> +/*
> + * file=protocol:server@port:volname:image
> + */
> +static int qemu_gluster_parsename(GlusterConf *c, const char *filename)
> +{
> +    char *file = g_strdup(filename);
> +    char *token, *next_token, *saveptr;
> +    char *token_s, *next_token_s, *saveptr_s;
> +    int ret = -EINVAL;
> +
> +    /* Discard the protocol */
> +    token = strtok_r(file, ":", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +
> +    /* server@port */
> +    next_token = strtok_r(NULL, ":", &saveptr);
> +    if (!next_token) {
> +        goto out;
> +    }
> +    if (strchr(next_token, '@')) {
> +        token_s = strtok_r(next_token, "@", &saveptr_s);
> +        if (!token_s) {
> +            goto out;
> +        }
> +        strncpy(c->server, token_s, HOST_NAME_MAX);

strncpy(3) will not NUL-terminate when token_s is HOST_NAME_MAX
characters long.  QEMU has cutils.c:pstrcpy().

When the argument is too long we should probably report an error
instead of truncating.

Same below.

> +        next_token_s = strtok_r(NULL, "@", &saveptr_s);
> +        if (!next_token_s) {
> +            goto out;
> +        }
> +        c->port = atoi(next_token_s);

No error checking.  If the input is invalid an error message would
help the user here.

> +static struct glfs *qemu_gluster_init(GlusterConf *c, const char *filename)
> +{
> +    struct glfs *glfs = NULL;
> +    int ret;
> +
> +    ret = qemu_gluster_parsename(c, filename);
> +    if (ret < 0) {
> +        errno = -ret;
> +        goto out;
> +    }
> +
> +    glfs = glfs_new(c->volname);
> +    if (!glfs) {
> +        goto out;
> +    }
> +
> +    ret = glfs_set_volfile_server(glfs, "socket", c->server, c->port);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /*
> +     * TODO: Logging is not necessary but instead nice to have.
> +     * Can QEMU optionally log into a standard place ?

QEMU prints to stderr, can you do that here too?  The global log file
is not okay, especially when multiple QEMU instances are running.

> +     * Need to use defines like gf_loglevel_t:GF_LOG_INFO instead of
> +     * hard coded values like 7 here.
> +     */
> +    ret = glfs_set_logging(glfs, "/tmp/qemu-gluster.log", 7);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    ret = glfs_init(glfs);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +    return glfs;
> +
> +out:
> +    if (glfs) {
> +        (void)glfs_fini(glfs);
> +    }
> +    return NULL;
> +}
> +
> +static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
> +    int bdrv_flags)
> +{
> +    BDRVGlusterState *s = bs->opaque;
> +    GlusterConf *c = g_malloc(sizeof(GlusterConf));

Can this be allocated on the stack?

> +    int ret;
> +
> +    s->glfs = qemu_gluster_init(c, filename);
> +    if (!s->glfs) {
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    s->open_flags |=  O_BINARY;

Can open_flags be a local variable?

> +static int qemu_gluster_create(const char *filename,
> +        QEMUOptionParameter *options)
> +{
> +    struct glfs *glfs;
> +    struct glfs_fd *fd;
> +    GlusterConf *c = g_malloc(sizeof(GlusterConf));
> +    int ret = 0;
> +    int64_t total_size = 0;
> +
> +    glfs = qemu_gluster_init(c, filename);
> +    if (!glfs) {
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    /* Read out options */
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            total_size = options->value.n / BDRV_SECTOR_SIZE;
> +        }
> +        options++;
> +    }
> +
> +    fd = glfs_creat(glfs, c->image, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, S_IRWXU);

Why set the execute permission bit?

> +static void qemu_gluster_close(BlockDriverState *bs)
> +{
> +    BDRVGlusterState *s = bs->opaque;
> +
> +    if (s->fd) {
> +        glfs_close(s->fd);
> +        s->fd = NULL;
> +    }

Why not call glfs_fini() here?
Bharata B Rao - July 23, 2012, 8:32 a.m.
On Sun, Jul 22, 2012 at 04:38:00PM +0100, Stefan Hajnoczi wrote:
> On Sat, Jul 21, 2012 at 9:31 AM, Bharata B Rao
> <bharata@linux.vnet.ibm.com> wrote:
> > +typedef struct GlusterAIOCB {
> > +    BlockDriverAIOCB common;
> > +    QEMUIOVector *qiov;
> 
> The qiov field is unused.
> 
> > +    char *bounce;
> 
> Unused.

Yes, removed these two.

> 
> > +    struct BDRVGlusterState *s;
> 
> You can get this through common.bs->opaque, but if you like having a
> shortcut, that's fine.
> 
> > +    int cancelled;
> 
> bool

Ok.

> 
> > +} GlusterAIOCB;
> > +
> > +typedef struct GlusterCBKData {
> > +    GlusterAIOCB *acb;
> > +    struct BDRVGlusterState *s;
> > +    int64_t size;
> > +    int ret;
> > +} GlusterCBKData;
> 
> I think GlusterCBKData could just be part of GlusterAIOCB.  That would
> simplify the code a little and avoid some malloc/free.

Are you suggesting to put a field

GlusterCBKData gcbk;

inside GlusterAIOCB and use gcbk from there or

Are you suggesting that I make the fields of GlusterCBKData part of
GlusterAIOCB and get rid of GlusterCBKData altogether ? This means I would
have to pass the GlusterAIOCB to gluster async calls and update its fields from
gluster callback routine. I can do this, but I am not sure if you can touch
the fields of GlusterAIOCB in non-QEMU threads (gluster callback thread).

> 
> > +
> > +typedef struct BDRVGlusterState {
> > +    struct glfs *glfs;
> > +    int fds[2];
> > +    int open_flags;
> > +    struct glfs_fd *fd;
> > +    int qemu_aio_count;
> > +    int event_reader_pos;
> > +    GlusterCBKData *event_gcbk;
> > +} BDRVGlusterState;
> > +
> > +#define GLUSTER_FD_READ 0
> > +#define GLUSTER_FD_WRITE 1
> > +
> > +static void qemu_gluster_complete_aio(GlusterCBKData *gcbk)
> > +{
> > +    GlusterAIOCB *acb = gcbk->acb;
> > +    int ret;
> > +
> > +    if (acb->cancelled) {
> 
> Where does cancelled get set?

I realised that I am not supporting bdrv_aio_cancel(). I guess I will have
to add support for this in next version.

> 
> > +        qemu_aio_release(acb);
> > +        goto done;
> > +    }
> > +
> > +    if (gcbk->ret == gcbk->size) {
> > +        ret = 0; /* Success */
> > +    } else if (gcbk->ret < 0) {
> > +        ret = gcbk->ret; /* Read/Write failed */
> > +    } else {
> > +        ret = -EINVAL; /* Partial read/write - fail it */
> 
> EINVAL is for invalid arguments.  EIO would be better.

Ok.

> 
> > +/*
> > + * file=protocol:server@port:volname:image
> > + */
> > +static int qemu_gluster_parsename(GlusterConf *c, const char *filename)
> > +{
> > +    char *file = g_strdup(filename);
> > +    char *token, *next_token, *saveptr;
> > +    char *token_s, *next_token_s, *saveptr_s;
> > +    int ret = -EINVAL;
> > +
> > +    /* Discard the protocol */
> > +    token = strtok_r(file, ":", &saveptr);
> > +    if (!token) {
> > +        goto out;
> > +    }
> > +
> > +    /* server@port */
> > +    next_token = strtok_r(NULL, ":", &saveptr);
> > +    if (!next_token) {
> > +        goto out;
> > +    }
> > +    if (strchr(next_token, '@')) {
> > +        token_s = strtok_r(next_token, "@", &saveptr_s);
> > +        if (!token_s) {
> > +            goto out;
> > +        }
> > +        strncpy(c->server, token_s, HOST_NAME_MAX);
> 
> strncpy(3) will not NUL-terminate when token_s is HOST_NAME_MAX
> characters long.  QEMU has cutils.c:pstrcpy().

Will use pstrcpy.

> 
> When the argument is too long we should probably report an error
> instead of truncating.

Or should we let gluster APIs to flag an error with truncated
server and volume names ?

> 
> Same below.
> 
> > +        next_token_s = strtok_r(NULL, "@", &saveptr_s);
> > +        if (!next_token_s) {
> > +            goto out;
> > +        }
> > +        c->port = atoi(next_token_s);
> 
> No error checking.  If the input is invalid an error message would
> help the user here.

Fixed.

> 
> > +static struct glfs *qemu_gluster_init(GlusterConf *c, const char *filename)
> > +{
> > +    struct glfs *glfs = NULL;
> > +    int ret;
> > +
> > +    ret = qemu_gluster_parsename(c, filename);
> > +    if (ret < 0) {
> > +        errno = -ret;
> > +        goto out;
> > +    }
> > +
> > +    glfs = glfs_new(c->volname);
> > +    if (!glfs) {
> > +        goto out;
> > +    }
> > +
> > +    ret = glfs_set_volfile_server(glfs, "socket", c->server, c->port);
> > +    if (ret < 0) {
> > +        goto out;
> > +    }
> > +
> > +    /*
> > +     * TODO: Logging is not necessary but instead nice to have.
> > +     * Can QEMU optionally log into a standard place ?
> 
> QEMU prints to stderr, can you do that here too?  The global log file
> is not okay, especially when multiple QEMU instances are running.

Ok, I can do glfs_set_logging(glfs, "/dev/stderr", loglevel);

> 
> > +     * Need to use defines like gf_loglevel_t:GF_LOG_INFO instead of
> > +     * hard coded values like 7 here.
> > +     */
> > +    ret = glfs_set_logging(glfs, "/tmp/qemu-gluster.log", 7);
> > +    if (ret < 0) {
> > +        goto out;
> > +    }
> > +
> > +    ret = glfs_init(glfs);
> > +    if (ret < 0) {
> > +        goto out;
> > +    }
> > +    return glfs;
> > +
> > +out:
> > +    if (glfs) {
> > +        (void)glfs_fini(glfs);
> > +    }
> > +    return NULL;
> > +}
> > +
> > +static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
> > +    int bdrv_flags)
> > +{
> > +    BDRVGlusterState *s = bs->opaque;
> > +    GlusterConf *c = g_malloc(sizeof(GlusterConf));
> 
> Can this be allocated on the stack?

It consists of PATH_MAX(4096), HOST_NAME_MAX(255) and GLUSTERD_MAX_VOLUME_NAME
(1000). A bit heavy to be on stack ?

> 
> > +    int ret;
> > +
> > +    s->glfs = qemu_gluster_init(c, filename);
> > +    if (!s->glfs) {
> > +        ret = -errno;
> > +        goto out;
> > +    }
> > +
> > +    s->open_flags |=  O_BINARY;
> 
> Can open_flags be a local variable?

Yes, fixed.

> 
> > +static int qemu_gluster_create(const char *filename,
> > +        QEMUOptionParameter *options)
> > +{
> > +    struct glfs *glfs;
> > +    struct glfs_fd *fd;
> > +    GlusterConf *c = g_malloc(sizeof(GlusterConf));
> > +    int ret = 0;
> > +    int64_t total_size = 0;
> > +
> > +    glfs = qemu_gluster_init(c, filename);
> > +    if (!glfs) {
> > +        ret = -errno;
> > +        goto out;
> > +    }
> > +
> > +    /* Read out options */
> > +    while (options && options->name) {
> > +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> > +            total_size = options->value.n / BDRV_SECTOR_SIZE;
> > +        }
> > +        options++;
> > +    }
> > +
> > +    fd = glfs_creat(glfs, c->image, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, S_IRWXU);
> 
> Why set the execute permission bit?

Changed to read and write only.

> 
> > +static void qemu_gluster_close(BlockDriverState *bs)
> > +{
> > +    BDRVGlusterState *s = bs->opaque;
> > +
> > +    if (s->fd) {
> > +        glfs_close(s->fd);
> > +        s->fd = NULL;
> > +    }
> 
> Why not call glfs_fini() here?

Missed that, fixed now.

Thanks for your comments.

Regards,
Bharata.
Stefan Hajnoczi - July 23, 2012, 9:06 a.m.
On Mon, Jul 23, 2012 at 9:32 AM, Bharata B Rao
<bharata@linux.vnet.ibm.com> wrote:
> On Sun, Jul 22, 2012 at 04:38:00PM +0100, Stefan Hajnoczi wrote:
>> On Sat, Jul 21, 2012 at 9:31 AM, Bharata B Rao
>> <bharata@linux.vnet.ibm.com> wrote:
>> > +} GlusterAIOCB;
>> > +
>> > +typedef struct GlusterCBKData {
>> > +    GlusterAIOCB *acb;
>> > +    struct BDRVGlusterState *s;
>> > +    int64_t size;
>> > +    int ret;
>> > +} GlusterCBKData;
>>
>> I think GlusterCBKData could just be part of GlusterAIOCB.  That would
>> simplify the code a little and avoid some malloc/free.
>
> Are you suggesting to put a field
>
> GlusterCBKData gcbk;
>
> inside GlusterAIOCB and use gcbk from there or
>
> Are you suggesting that I make the fields of GlusterCBKData part of
> GlusterAIOCB and get rid of GlusterCBKData altogether ? This means I would
> have to pass the GlusterAIOCB to gluster async calls and update its fields from
> gluster callback routine. I can do this, but I am not sure if you can touch
> the fields of GlusterAIOCB in non-QEMU threads (gluster callback thread).

The fields in GlusterCBKData could become part of GlusterAIOCB.
Different threads can access fields in a struct, they just need to
ensure access is synchronized if they touch the same fields.  In the
case of this code I think there is nothing that requires
synchronization beyond the pipe mechanism that you already use to
complete processing in a QEMU thread.

>> When the argument is too long we should probably report an error
>> instead of truncating.
>
> Or should we let gluster APIs to flag an error with truncated
> server and volume names ?

What if the truncated name is a valid but different object?  For example:
Max chars = 5
Objects:
"helloworld"
"hello"

If "helloworld" is truncated to "hello" we get no error back because
it's a valid object!

We need to either check sizes explicitly without truncating or use a
g_strdup() approach without any size limits and let the gfapi
functions error out if the input string is too long.

>> > +static struct glfs *qemu_gluster_init(GlusterConf *c, const char *filename)
>> > +{
>> > +    struct glfs *glfs = NULL;
>> > +    int ret;
>> > +
>> > +    ret = qemu_gluster_parsename(c, filename);
>> > +    if (ret < 0) {
>> > +        errno = -ret;
>> > +        goto out;
>> > +    }
>> > +
>> > +    glfs = glfs_new(c->volname);
>> > +    if (!glfs) {
>> > +        goto out;
>> > +    }
>> > +
>> > +    ret = glfs_set_volfile_server(glfs, "socket", c->server, c->port);
>> > +    if (ret < 0) {
>> > +        goto out;
>> > +    }
>> > +
>> > +    /*
>> > +     * TODO: Logging is not necessary but instead nice to have.
>> > +     * Can QEMU optionally log into a standard place ?
>>
>> QEMU prints to stderr, can you do that here too?  The global log file
>> is not okay, especially when multiple QEMU instances are running.
>
> Ok, I can do glfs_set_logging(glfs, "/dev/stderr", loglevel);

Yes.  I think "-" is best since it is supported by gfapi
(libglusterfs/src/logging.c:gf_log_init).  /dev/stderr is not POSIX.

>> > +     * Need to use defines like gf_loglevel_t:GF_LOG_INFO instead of
>> > +     * hard coded values like 7 here.
>> > +     */
>> > +    ret = glfs_set_logging(glfs, "/tmp/qemu-gluster.log", 7);
>> > +    if (ret < 0) {
>> > +        goto out;
>> > +    }
>> > +
>> > +    ret = glfs_init(glfs);
>> > +    if (ret < 0) {
>> > +        goto out;
>> > +    }
>> > +    return glfs;
>> > +
>> > +out:
>> > +    if (glfs) {
>> > +        (void)glfs_fini(glfs);
>> > +    }
>> > +    return NULL;
>> > +}
>> > +
>> > +static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
>> > +    int bdrv_flags)
>> > +{
>> > +    BDRVGlusterState *s = bs->opaque;
>> > +    GlusterConf *c = g_malloc(sizeof(GlusterConf));
>>
>> Can this be allocated on the stack?
>
> It consists of PATH_MAX(4096), HOST_NAME_MAX(255) and GLUSTERD_MAX_VOLUME_NAME
> (1000). A bit heavy to be on stack ?

This is userspace, stacks are big but it's up to you.

Stefan

Patch

diff --git a/block/Makefile.objs b/block/Makefile.objs
index b5754d3..a1ae67f 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -9,3 +9,4 @@  block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_GLUSTERFS) += gluster.o
diff --git a/block/gluster.c b/block/gluster.c
new file mode 100644
index 0000000..c33a006
--- /dev/null
+++ b/block/gluster.c
@@ -0,0 +1,483 @@ 
+/*
+ * GlusterFS backend for QEMU
+ *
+ * (AIO implementation is derived from block/rbd.c)
+ *
+ * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+#include "block_int.h"
+#include <glusterfs/api/glfs.h>
+
+/* Connection parameters parsed out of the file=protocol:server@port:volname:image string */
+typedef struct GlusterConf {
+    char server[HOST_NAME_MAX];    /* volfile server host name */
+    int port;                      /* volfile server port; 0 means gluster default */
+    char volname[128]; /* TODO: use GLUSTERD_MAX_VOLUME_NAME */
+    char image[PATH_MAX];          /* path of the image within the volume */
+} GlusterConf;
+
+/* Per-request AIO control block handed back to the block layer */
+typedef struct GlusterAIOCB {
+    BlockDriverAIOCB common;
+    QEMUIOVector *qiov;            /* NOTE(review): set but never read - candidate for removal */
+    char *bounce;                  /* NOTE(review): never used - candidate for removal */
+    struct BDRVGlusterState *s;    /* shortcut for common.bs->opaque */
+    int cancelled;                 /* NOTE(review): nothing sets this; bdrv_aio_cancel not yet supported */
+} GlusterAIOCB;
+
+/*
+ * Completion record allocated on submit, filled in by the gluster
+ * callback thread and passed through the completion pipe to a QEMU
+ * thread (see qemu_gluster_send_pipe / qemu_gluster_aio_event_reader).
+ */
+typedef struct GlusterCBKData {
+    GlusterAIOCB *acb;             /* the request being completed */
+    struct BDRVGlusterState *s;    /* owning driver state */
+    int64_t size;                  /* bytes requested; 0 for flush */
+    int ret;                       /* bytes transferred, or negative errno */
+} GlusterCBKData;
+
+/* Per-BlockDriverState driver state */
+typedef struct BDRVGlusterState {
+    struct glfs *glfs;             /* connection to the gluster volume */
+    int fds[2];                    /* pipe: gluster callback thread -> QEMU thread */
+    int open_flags;                /* NOTE(review): only written in qemu_gluster_open(); could be a local there */
+    struct glfs_fd *fd;            /* open image */
+    int qemu_aio_count;            /* number of in-flight requests */
+    int event_reader_pos;          /* bytes of event_gcbk assembled so far */
+    GlusterCBKData *event_gcbk;    /* pointer value being reassembled from the pipe */
+} BDRVGlusterState;
+
+/* Indices into BDRVGlusterState.fds */
+#define GLUSTER_FD_READ 0
+#define GLUSTER_FD_WRITE 1
+
+/*
+ * Complete an AIO request in a QEMU thread.
+ *
+ * Translates the byte count the gluster callback recorded in @gcbk
+ * into a QEMU-style return code and invokes the caller's completion
+ * callback.  Called from qemu_gluster_aio_event_reader() once a full
+ * GlusterCBKData pointer has been read from the completion pipe.
+ * Frees @gcbk in all cases.
+ */
+static void qemu_gluster_complete_aio(GlusterCBKData *gcbk)
+{
+    GlusterAIOCB *acb = gcbk->acb;
+    int ret;
+
+    if (acb->cancelled) {
+        qemu_aio_release(acb);
+        goto done;
+    }
+
+    if (gcbk->ret == gcbk->size) {
+        ret = 0; /* Success */
+    } else if (gcbk->ret < 0) {
+        ret = gcbk->ret; /* Read/Write failed */
+    } else {
+        /*
+         * Partial read/write: the arguments were valid but the I/O
+         * came up short, so this is an I/O error, not EINVAL.
+         */
+        ret = -EIO;
+    }
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+
+done:
+    g_free(gcbk);
+}
+
+/*
+ * Read handler for the completion pipe (registered in qemu_gluster_open).
+ *
+ * The gluster callback thread writes a GlusterCBKData pointer into the
+ * pipe.  A pointer value may arrive across several short reads, so the
+ * partial value is accumulated into s->event_gcbk at byte offset
+ * s->event_reader_pos; only when a complete pointer has been assembled
+ * is the request completed and the in-flight count decremented.
+ */
+static void qemu_gluster_aio_event_reader(void *opaque)
+{
+    BDRVGlusterState *s = opaque;
+    ssize_t ret;
+
+    do {
+        char *p = (char *)&s->event_gcbk;
+
+        ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos,
+                   sizeof(s->event_gcbk) - s->event_reader_pos);
+        if (ret > 0) {
+            s->event_reader_pos += ret;
+            if (s->event_reader_pos == sizeof(s->event_gcbk)) {
+                s->event_reader_pos = 0;
+                qemu_gluster_complete_aio(s->event_gcbk);
+                s->qemu_aio_count--;
+            }
+        }
+    } while (ret < 0 && errno == EINTR);
+}
+
+/* Tell the AIO layer whether this driver still has requests in flight. */
+static int qemu_gluster_aio_flush_cb(void *opaque)
+{
+    BDRVGlusterState *state = opaque;
+
+    return state->qemu_aio_count > 0;
+}
+
+/*
+ * Parse a filename of the form
+ *
+ *     file=protocol:server[@port]:volname:image
+ *
+ * into @c.  Returns 0 on success, -EINVAL on malformed input.
+ *
+ * Note: pstrcpy() always NUL-terminates (unlike strncpy()) but still
+ * truncates over-long components; gluster will then fail to resolve
+ * the truncated name.  TODO: reject over-long names explicitly so a
+ * truncated name cannot silently match a different object.
+ */
+static int qemu_gluster_parsename(GlusterConf *c, const char *filename)
+{
+    char *file = g_strdup(filename);
+    char *token, *next_token, *saveptr;
+    char *token_s, *next_token_s, *saveptr_s;
+    char *endptr;
+    long port;
+    int ret = -EINVAL;
+
+    /* Discard the protocol */
+    token = strtok_r(file, ":", &saveptr);
+    if (!token) {
+        goto out;
+    }
+
+    /* server@port */
+    next_token = strtok_r(NULL, ":", &saveptr);
+    if (!next_token) {
+        goto out;
+    }
+    if (strchr(next_token, '@')) {
+        token_s = strtok_r(next_token, "@", &saveptr_s);
+        if (!token_s) {
+            goto out;
+        }
+        pstrcpy(c->server, HOST_NAME_MAX, token_s);
+        next_token_s = strtok_r(NULL, "@", &saveptr_s);
+        if (!next_token_s) {
+            goto out;
+        }
+        /* Validate the port instead of silently accepting atoi() garbage */
+        port = strtol(next_token_s, &endptr, 10);
+        if (endptr == next_token_s || *endptr != '\0' ||
+            port < 0 || port > 65535) {
+            error_report("Invalid port number: %s", next_token_s);
+            goto out;
+        }
+        c->port = port;
+    } else {
+        pstrcpy(c->server, HOST_NAME_MAX, next_token);
+        c->port = 0;
+    }
+
+    /* volname */
+    next_token = strtok_r(NULL, ":", &saveptr);
+    if (!next_token) {
+        goto out;
+    }
+    pstrcpy(c->volname, 128, next_token);
+
+    /* image */
+    next_token = strtok_r(NULL, ":", &saveptr);
+    if (!next_token) {
+        goto out;
+    }
+    pstrcpy(c->image, PATH_MAX, next_token);
+    ret = 0;
+out:
+    g_free(file);
+    return ret;
+}
+
+/*
+ * Parse @filename into @c and set up an initialized glfs object for
+ * the volume.
+ *
+ * Returns the glfs object on success, NULL on failure with errno set
+ * (either by the failing gfapi call or explicitly from the parse error).
+ */
+static struct glfs *qemu_gluster_init(GlusterConf *c, const char *filename)
+{
+    struct glfs *glfs = NULL;
+    int ret;
+
+    ret = qemu_gluster_parsename(c, filename);
+    if (ret < 0) {
+        errno = -ret;
+        goto out;
+    }
+
+    glfs = glfs_new(c->volname);
+    if (!glfs) {
+        goto out;
+    }
+
+    ret = glfs_set_volfile_server(glfs, "socket", c->server, c->port);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /*
+     * Log to stderr: "-" is gfapi's name for stderr (gf_log_init).
+     * A fixed global log file is wrong when multiple QEMU instances
+     * are running.
+     * TODO: use gf_loglevel_t:GF_LOG_INFO instead of the hard coded 7.
+     */
+    ret = glfs_set_logging(glfs, "-", 7);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = glfs_init(glfs);
+    if (ret < 0) {
+        goto out;
+    }
+    return glfs;
+
+out:
+    if (glfs) {
+        (void)glfs_fini(glfs);
+    }
+    return NULL;
+}
+
+/*
+ * Open a gluster-backed image.
+ *
+ * Connects to the volume encoded in @filename, opens the image and
+ * sets up the non-blocking completion pipe used to hand events from
+ * the gluster callback thread back to a QEMU thread.
+ *
+ * Returns 0 on success, negative errno on failure.
+ */
+static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
+    int bdrv_flags)
+{
+    BDRVGlusterState *s = bs->opaque;
+    GlusterConf *c = g_malloc(sizeof(GlusterConf));
+    /* open_flags is only needed here; no reason to keep it in the state */
+    int open_flags = O_BINARY;
+    int ret;
+
+    s->glfs = qemu_gluster_init(c, filename);
+    if (!s->glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    if (bdrv_flags & BDRV_O_RDWR) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+    }
+
+    if (bdrv_flags & BDRV_O_NOCACHE) {
+        open_flags |= O_DIRECT;
+    }
+
+    s->fd = glfs_open(s->glfs, c->image, open_flags);
+    if (!s->fd) {
+        ret = -errno;
+        goto out;
+    }
+
+    ret = qemu_pipe(s->fds);
+    if (ret < 0) {
+        goto out;
+    }
+    fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK);
+    fcntl(s->fds[GLUSTER_FD_WRITE], F_SETFL, O_NONBLOCK);
+    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
+        qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
+    g_free(c);
+    return ret;
+
+out:
+    g_free(c);
+    if (s->fd) {
+        glfs_close(s->fd);
+    }
+    if (s->glfs) {
+        (void) glfs_fini(s->glfs);
+    }
+    return ret;
+}
+
+/*
+ * Create a new image on a gluster volume.
+ *
+ * Only the size option is honoured; the image is created mode 0600
+ * (a disk image has no business being executable) and truncated to
+ * the requested size.  Returns 0 on success, negative errno on failure.
+ */
+static int qemu_gluster_create(const char *filename,
+        QEMUOptionParameter *options)
+{
+    struct glfs *glfs;
+    struct glfs_fd *fd;
+    GlusterConf *c = g_malloc(sizeof(GlusterConf));
+    int ret = 0;
+    int64_t total_size = 0;
+
+    glfs = qemu_gluster_init(c, filename);
+    if (!glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    /* Read out options */
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            total_size = options->value.n / BDRV_SECTOR_SIZE;
+        }
+        options++;
+    }
+
+    fd = glfs_creat(glfs, c->image, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY,
+        S_IRUSR | S_IWUSR);
+    if (!fd) {
+        ret = -errno;
+    } else {
+        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
+            ret = -errno;
+        }
+        if (glfs_close(fd) != 0) {
+            ret = -errno;
+        }
+    }
+out:
+    g_free(c);
+    if (glfs) {
+        (void) glfs_fini(glfs);
+    }
+    return ret;
+}
+
+/* AIOCB pool so qemu_aio_get() hands out GlusterAIOCB-sized blocks */
+static AIOPool gluster_aio_pool = {
+    .aiocb_size = sizeof(GlusterAIOCB),
+};
+
+/*
+ * Push a completed request's GlusterCBKData pointer into the pipe.
+ *
+ * Runs in the gluster callback thread.  The pipe is non-blocking, so
+ * on EAGAIN we select() until it is writable and retry; EINTR retries
+ * immediately.  Returns the write() result: >= 0 on success, < 0 on
+ * any other error (or a failed select()).
+ */
+static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterCBKData *gcbk)
+{
+    int ret = 0;
+    while (1) {
+        fd_set wfd;
+        int fd = s->fds[GLUSTER_FD_WRITE];
+
+        ret = write(fd, (void *)&gcbk, sizeof(gcbk));
+        if (ret >= 0) {
+            break;
+        }
+        if (errno == EINTR) {
+            continue;
+        }
+        if (errno != EAGAIN) {
+            break;
+        }
+
+        /* Pipe full: wait until it drains, then retry the write */
+        FD_ZERO(&wfd);
+        FD_SET(fd, &wfd);
+        do {
+            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
+        } while (ret < 0 && errno == EINTR);
+    }
+    return ret;
+}
+
+/*
+ * Completion callback invoked by gfapi in a gluster (non-QEMU) thread.
+ *
+ * Records the result and forwards the GlusterCBKData to a QEMU thread
+ * through the pipe; the actual completion happens there.  If the pipe
+ * write fails the request would be lost, so abort.
+ */
+static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
+{
+    GlusterCBKData *gcbk = (GlusterCBKData *)arg;
+    BDRVGlusterState *s = gcbk->s;
+
+    gcbk->ret = ret;
+    if (qemu_gluster_send_pipe(s, gcbk) < 0) {
+        error_report("Could not complete read/write/flush from gluster");
+        abort();
+    }
+}
+
+/*
+ * Submit an asynchronous read (@write == 0) or write (@write != 0).
+ *
+ * Allocates a GlusterCBKData completion record and hands it to the
+ * gfapi async call; gluster_finish_aiocb() later routes it back to a
+ * QEMU thread.  On submission failure the record and AIOCB are freed
+ * and NULL is returned.
+ */
+static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int write)
+{
+    int ret;
+    GlusterAIOCB *acb;
+    GlusterCBKData *gcbk;
+    BDRVGlusterState *s = bs->opaque;
+    size_t size;
+    off_t offset;
+
+    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
+    acb->qiov = qiov;
+    acb->s = s;
+
+    offset = sector_num * BDRV_SECTOR_SIZE;
+    size = nb_sectors * BDRV_SECTOR_SIZE;
+    s->qemu_aio_count++;
+
+    gcbk = g_malloc(sizeof(GlusterCBKData));
+    gcbk->acb = acb;
+    gcbk->s = s;
+    gcbk->size = size;
+
+    if (write) {
+        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+            &gluster_finish_aiocb, gcbk);
+    } else {
+        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+            &gluster_finish_aiocb, gcbk);
+    }
+
+    if (ret < 0) {
+        goto out;
+    }
+    return &acb->common;
+
+out:
+    /* Submission failed: undo the bookkeeping done above */
+    g_free(gcbk);
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+/* Submit an async read; thin wrapper around qemu_gluster_aio_rw(). */
+static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+/* Submit an async write; thin wrapper around qemu_gluster_aio_rw(). */
+static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+/*
+ * Submit an asynchronous flush via glfs_fsync_async().
+ *
+ * Mirrors qemu_gluster_aio_rw(): a GlusterCBKData record travels with
+ * the request and comes back through the completion pipe.  size is 0
+ * so a successful fsync (ret == 0) completes with success in
+ * qemu_gluster_complete_aio().
+ */
+static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    int ret;
+    GlusterAIOCB *acb;
+    GlusterCBKData *gcbk;
+    BDRVGlusterState *s = bs->opaque;
+
+    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
+    acb->s = s;
+    s->qemu_aio_count++;
+
+    gcbk = g_malloc(sizeof(GlusterCBKData));
+    gcbk->acb = acb;
+    gcbk->s = s;
+    gcbk->size = 0;
+
+    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, gcbk);
+    if (ret < 0) {
+        goto out;
+    }
+    return &acb->common;
+
+out:
+    /* Submission failed: undo the bookkeeping done above */
+    g_free(gcbk);
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+/*
+ * Return the image size in bytes as reported by gluster's fstat,
+ * or negative errno on failure.
+ */
+static int64_t qemu_gluster_getlength(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+    struct stat st;
+
+    if (glfs_fstat(s->fd, &st) < 0) {
+        return -errno;
+    }
+    return st.st_size;
+}
+
+/*
+ * Close the image and tear down the connection to the volume.
+ */
+static void qemu_gluster_close(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+
+    if (s->fd) {
+        glfs_close(s->fd);
+        s->fd = NULL;
+    }
+    /* Previously missing: the glfs connection leaked on close */
+    if (s->glfs) {
+        (void) glfs_fini(s->glfs);
+        s->glfs = NULL;
+    }
+}
+
+/* Options accepted by qemu_gluster_create() */
+static QEMUOptionParameter qemu_gluster_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL }
+};
+
+/* Block driver for the "gluster" protocol (file=gluster:...) */
+static BlockDriver bdrv_gluster = {
+    .format_name = "gluster",
+    .protocol_name = "gluster",
+    .instance_size = sizeof(BDRVGlusterState),
+    .bdrv_file_open = qemu_gluster_open,
+    .bdrv_close = qemu_gluster_close,
+    .bdrv_create = qemu_gluster_create,
+    .bdrv_getlength = qemu_gluster_getlength,
+
+    .bdrv_aio_readv = qemu_gluster_aio_readv,
+    .bdrv_aio_writev = qemu_gluster_aio_writev,
+    .bdrv_aio_flush = qemu_gluster_aio_flush,
+
+    .create_options = qemu_gluster_create_options,
+};
+
+/* Register the gluster driver with the block layer at startup. */
+static void bdrv_gluster_init(void)
+{
+    bdrv_register(&bdrv_gluster);
+}
+
+block_init(bdrv_gluster_init);