Patchwork [RFC,1/1] ceph/rbd block driver for qemu-kvm

login
register
mail settings
Submitter Christian Brunner
Date May 19, 2010, 7:22 p.m.
Message ID <20100519192222.GD61706@ncolin.muc.de>
Download mbox | patch
Permalink /patch/53014/
State New
Headers show

Comments

Christian Brunner - May 19, 2010, 7:22 p.m.
The attached patch is a block driver for the distributed file system
Ceph (http://ceph.newdream.net/). This driver uses librados (which
is part of the Ceph server) for direct access to the Ceph object
store and is running entirely in userspace. Therefore it is
called "rbd" - rados block device.

To compile the driver a recent version of ceph (>= 0.20.1) is needed
and you have to "--enable-rbd" when running configure.

Additional information is available on the Ceph-Wiki:

http://ceph.newdream.net/wiki/Kvm-rbd

---
 Makefile          |    3 +
 Makefile.objs     |    1 +
 block/rados.h     |  376 ++++++++++++++++++++++++++++++++++
 block/rbd.c       |  585 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/rbd_types.h |   48 +++++
 configure         |   27 +++
 6 files changed, 1040 insertions(+), 0 deletions(-)
 create mode 100644 block/rados.h
 create mode 100644 block/rbd.c
 create mode 100644 block/rbd_types.h
Blue Swirl - May 20, 2010, 8:31 p.m.
On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
> The attached patch is a block driver for the distributed file system
> Ceph (http://ceph.newdream.net/). This driver uses librados (which
> is part of the Ceph server) for direct access to the Ceph object
> store and is running entirely in userspace. Therefore it is
> called "rbd" - rados block device.
>
> To compile the driver a recent version of ceph (>= 0.20.1) is needed
> and you have to "--enable-rbd" when running configure.
>
> Additional information is available on the Ceph-Wiki:
>
> http://ceph.newdream.net/wiki/Kvm-rbd


I have no idea whether it makes sense to add Ceph (no objection
either). I have some minor comments below.

>
> ---
>  Makefile          |    3 +
>  Makefile.objs     |    1 +
>  block/rados.h     |  376 ++++++++++++++++++++++++++++++++++
>  block/rbd.c       |  585 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/rbd_types.h |   48 +++++
>  configure         |   27 +++
>  6 files changed, 1040 insertions(+), 0 deletions(-)
>  create mode 100644 block/rados.h
>  create mode 100644 block/rbd.c
>  create mode 100644 block/rbd_types.h
>
> diff --git a/Makefile b/Makefile
> index eb9e02b..b1ab3e9 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -27,6 +27,9 @@ configure: ;
>  $(call set-vpath, $(SRC_PATH):$(SRC_PATH)/hw)
>
>  LIBS+=-lz $(LIBS_TOOLS)
> +ifdef CONFIG_RBD
> +LIBS+=-lrados
> +endif
>
>  ifdef BUILD_DOCS
>  DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8
> diff --git a/Makefile.objs b/Makefile.objs
> index acbaf22..85791ac 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -18,6 +18,7 @@ block-nested-y += parallels.o nbd.o blkdebug.o
>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>  block-nested-$(CONFIG_CURL) += curl.o
> +block-nested-$(CONFIG_RBD) += rbd.o
>
>  block-obj-y +=  $(addprefix block/, $(block-nested-y))
>
> diff --git a/block/rados.h b/block/rados.h
> new file mode 100644
> index 0000000..6cde9a1
> --- /dev/null
> +++ b/block/rados.h
> @@ -0,0 +1,376 @@
> +#ifndef __RADOS_H
> +#define __RADOS_H

IIRC underscores here may conflict with system header use. Please use
something like QEMU_BLOCK_RADOS_H.

> +
> +/*
> + * Data types for the Ceph distributed object storage layer RADOS
> + * (Reliable Autonomic Distributed Object Store).
> + */
> +
> +
> +
> +/*
> + * osdmap encoding versions
> + */
> +#define CEPH_OSDMAP_INC_VERSION     5
> +#define CEPH_OSDMAP_INC_VERSION_EXT 5
> +#define CEPH_OSDMAP_VERSION         5
> +#define CEPH_OSDMAP_VERSION_EXT     5
> +
> +/*
> + * fs id
> + */
> +struct ceph_fsid {
> +       unsigned char fsid[16];

Too large indent, please check also elsewhere.

> +};
> +
> +static inline int ceph_fsid_compare(const struct ceph_fsid *a,
> +                                   const struct ceph_fsid *b)
> +{
> +       return memcmp(a, b, sizeof(*a));
> +}
> +
> +/*
> + * ino, object, etc.
> + */
> +typedef __le64 ceph_snapid_t;

Please use uint64_t and le_to_cpu()/cpu_to_le().

> +#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */

Likewise, uint64_t is the standard type. Also other places.

> +#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
> +#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
> +
> +struct ceph_timespec {
> +       __le32 tv_sec;
> +       __le32 tv_nsec;
> +} __attribute__ ((packed));
> +
> +
> +/*
> + * object layout - how objects are mapped into PGs
> + */
> +#define CEPH_OBJECT_LAYOUT_HASH     1
> +#define CEPH_OBJECT_LAYOUT_LINEAR   2
> +#define CEPH_OBJECT_LAYOUT_HASHINO  3
> +
> +/*
> + * pg layout -- how PGs are mapped onto (sets of) OSDs
> + */
> +#define CEPH_PG_LAYOUT_CRUSH  0
> +#define CEPH_PG_LAYOUT_HASH   1
> +#define CEPH_PG_LAYOUT_LINEAR 2
> +#define CEPH_PG_LAYOUT_HYBRID 3
> +
> +
> +/*
> + * placement group.
> + * we encode this into one __le64.
> + */
> +struct ceph_pg {
> +       __le16 preferred; /* preferred primary osd */
> +       __le16 ps;        /* placement seed */
> +       __le32 pool;      /* object pool */
> +} __attribute__ ((packed));
> +
> +/*
> + * pg_pool is a set of pgs storing a pool of objects
> + *
> + *  pg_num -- base number of pseudorandomly placed pgs
> + *
> + *  pgp_num -- effective number when calculating pg placement.  this
> + * is used for pg_num increases.  new pgs result in data being "split"
> + * into new pgs.  for this to proceed smoothly, new pgs are initially
> + * colocated with their parents; that is, pgp_num doesn't increase
> + * until the new pgs have successfully split.  only _then_ are the new
> + * pgs placed independently.
> + *
> + *  lpg_num -- localized pg count (per device).  replicas are randomly
> + * selected.
> + *
> + *  lpgp_num -- as above.
> + */
> +#define CEPH_PG_TYPE_REP     1
> +#define CEPH_PG_TYPE_RAID4   2
> +#define CEPH_PG_POOL_VERSION 2
> +struct ceph_pg_pool {
> +       __u8 type;                /* CEPH_PG_TYPE_* */
> +       __u8 size;                /* number of osds in each pg */
> +       __u8 crush_ruleset;       /* crush placement rule */
> +       __u8 object_hash;         /* hash mapping object name to ps */
> +       __le32 pg_num, pgp_num;   /* number of pg's */
> +       __le32 lpg_num, lpgp_num; /* number of localized pg's */
> +       __le32 last_change;       /* most recent epoch changed */

Is the intent here that one uint32_t is implicitly added for padding
or is the structure really unaligned? I'd make the padding explicit to
be sure.

> +       __le64 snap_seq;          /* seq for per-pool snapshot */
> +       __le32 snap_epoch;        /* epoch of last snap */
> +       __le32 num_snaps;
> +       __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */

Unaligned?

> +       __le64 auid;               /* who owns the pg */
> +} __attribute__ ((packed));
> +
> +/*
> + * stable_mod func is used to control number of placement groups.
> + * similar to straight-up modulo, but produces a stable mapping as b
> + * increases over time.  b is the number of bins, and bmask is the
> + * containing power of 2 minus 1.
> + *
> + * b <= bmask and bmask=(2**n)-1
> + * e.g., b=12 -> bmask=15, b=123 -> bmask=127
> + */
> +static inline int ceph_stable_mod(int x, int b, int bmask)
> +{
> +       if ((x & bmask) < b)
> +               return x & bmask;
> +       else
> +               return x & (bmask >> 1);

Please check CODING_STYLE for brace use.

> +}
> +
> +/*
> + * object layout - how a given object should be stored.
> + */
> +struct ceph_object_layout {
> +       struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
> +       __le32 ol_stripe_unit;    /* for per-object parity, if any */
> +} __attribute__ ((packed));
> +
> +/*
> + * compound epoch+version, used by storage layer to serialize mutations
> + */
> +struct ceph_eversion {
> +       __le32 epoch;

Unaligned?

> +       __le64 version;
> +} __attribute__ ((packed));
> +
> +/*
> + * osd map bits
> + */
> +
> +/* status bits */
> +#define CEPH_OSD_EXISTS 1
> +#define CEPH_OSD_UP     2
> +
> +/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
> +#define CEPH_OSD_IN  0x10000
> +#define CEPH_OSD_OUT 0
> +
> +
> +/*
> + * osd map flag bits
> + */
> +#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
> +#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
> +#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
> +#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
> +#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
> +
> +/*
> + * osd ops
> + */
> +#define CEPH_OSD_OP_MODE       0xf000
> +#define CEPH_OSD_OP_MODE_RD    0x1000
> +#define CEPH_OSD_OP_MODE_WR    0x2000
> +#define CEPH_OSD_OP_MODE_RMW   0x3000
> +#define CEPH_OSD_OP_MODE_SUB   0x4000
> +
> +#define CEPH_OSD_OP_TYPE       0x0f00
> +#define CEPH_OSD_OP_TYPE_LOCK  0x0100
> +#define CEPH_OSD_OP_TYPE_DATA  0x0200
> +#define CEPH_OSD_OP_TYPE_ATTR  0x0300
> +#define CEPH_OSD_OP_TYPE_EXEC  0x0400
> +#define CEPH_OSD_OP_TYPE_PG    0x0500
> +
> +enum {
> +       /** data **/
> +       /* read */
> +       CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
> +       CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
> +
> +       /* fancy read */
> +       CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
> +
> +       /* write */
> +       CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
> +       CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
> +       CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
> +       CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
> +       CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
> +
> +       /* fancy write */
> +       CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
> +       CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
> +       CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
> +       CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
> +
> +       CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
> +       CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
> +       CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
> +
> +       CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
> +
> +       /** attrs **/
> +       /* read */
> +       CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
> +       CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
> +
> +       /* write */
> +       CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
> +       CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
> +       CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
> +       CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
> +
> +       /** subop **/
> +       CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
> +       CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
> +       CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
> +       CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
> +       CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
> +
> +       /** lock **/
> +       CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
> +       CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
> +       CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
> +       CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
> +       CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
> +       CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
> +
> +       /** exec **/
> +       CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
> +
> +       /** pg **/
> +       CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
> +};
> +
> +static inline int ceph_osd_op_type_lock(int op)
> +{
> +       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
> +}
> +static inline int ceph_osd_op_type_data(int op)
> +{
> +       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
> +}
> +static inline int ceph_osd_op_type_attr(int op)
> +{
> +       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
> +}
> +static inline int ceph_osd_op_type_exec(int op)
> +{
> +       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
> +}
> +static inline int ceph_osd_op_type_pg(int op)
> +{
> +       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
> +}
> +
> +static inline int ceph_osd_op_mode_subop(int op)
> +{
> +       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
> +}
> +static inline int ceph_osd_op_mode_read(int op)
> +{
> +       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
> +}
> +static inline int ceph_osd_op_mode_modify(int op)
> +{
> +       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
> +}
> +
> +#define CEPH_OSD_TMAP_HDR 'h'
> +#define CEPH_OSD_TMAP_SET 's'
> +#define CEPH_OSD_TMAP_RM  'r'
> +
> +extern const char *ceph_osd_op_name(int op);
> +
> +
> +/*
> + * osd op flags
> + *
> + * An op may be READ, WRITE, or READ|WRITE.
> + */
> +enum {
> +       CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
> +       CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
> +       CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
> +       CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
> +       CEPH_OSD_FLAG_READ = 16,        /* op may read */
> +       CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
> +       CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
> +       CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
> +       CEPH_OSD_FLAG_BALANCE_READS = 256,
> +       CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
> +       CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
> +       CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
> +};
> +
> +enum {
> +       CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
> +};
> +
> +#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
> +#define EBLACKLISTED ESHUTDOWN /* blacklisted */

Are these used somewhere? Maybe these could clash with system errnos.

> +
> +/*
> + * an individual object operation.  each may be accompanied by some data
> + * payload
> + */
> +struct ceph_osd_op {
> +       __le16 op;           /* CEPH_OSD_OP_* */

Unaligned?

> +       __le32 flags;        /* CEPH_OSD_FLAG_* */
> +       union {
> +               struct {
> +                       __le64 offset, length;
> +                       __le64 truncate_size;
> +                       __le32 truncate_seq;
> +               } __attribute__ ((packed)) extent;
> +               struct {
> +                       __le32 name_len;
> +                       __le32 value_len;
> +               } __attribute__ ((packed)) xattr;
> +               struct {
> +                       __u8 class_len;
> +                       __u8 method_len;
> +                       __u8 argc;

Unaligned?

> +                       __le32 indata_len;
> +               } __attribute__ ((packed)) cls;
> +               struct {
> +                       __le64 cookie, count;
> +               } __attribute__ ((packed)) pgls;
> +       };
> +       __le32 payload_len;
> +} __attribute__ ((packed));
> +
> +/*
> + * osd request message header.  each request may include multiple
> + * ceph_osd_op object operations.
> + */
> +struct ceph_osd_request_head {
> +       __le32 client_inc;                 /* client incarnation */
> +       struct ceph_object_layout layout;  /* pgid */

Unaligned on 64 bit hosts?

> +       __le32 osdmap_epoch;               /* client's osdmap epoch */
> +
> +       __le32 flags;
> +
> +       struct ceph_timespec mtime;        /* for mutations only */
> +       struct ceph_eversion reassert_version; /* if we are replaying op */
> +
> +       __le32 object_len;     /* length of object name */
> +
> +       __le64 snapid;         /* snapid to read */

Unaligned?

> +       __le64 snap_seq;       /* writer's snap context */
> +       __le32 num_snaps;
> +
> +       __le16 num_ops;
> +       struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */

Unaligned?

> +} __attribute__ ((packed));
> +
> +struct ceph_osd_reply_head {
> +       __le32 client_inc;                /* client incarnation */
> +       __le32 flags;
> +       struct ceph_object_layout layout;
> +       __le32 osdmap_epoch;

Unaligned on 64 bit hosts?

> +       struct ceph_eversion reassert_version; /* for replaying uncommitted */
> +
> +       __le32 result;                    /* result code */
> +
> +       __le32 object_len;                /* length of object name */
> +       __le32 num_ops;
> +       struct ceph_osd_op ops[0];  /* ops[], object */

Unaligned on 64 bit hosts?

> +} __attribute__ ((packed));
> +
> +
> +#endif
> diff --git a/block/rbd.c b/block/rbd.c
> new file mode 100644
> index 0000000..eedae50
> --- /dev/null
> +++ b/block/rbd.c
> @@ -0,0 +1,585 @@
> +/*
> + * QEMU Block driver for RADOS (Ceph)
> + *
> + * Copyright (C) 2010 Christian Brunner <chb@muc.de>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include <sys/types.h>
> +#include <stdbool.h>
> +
> +#include <qemu-common.h>
> +
> +#include "rbd_types.h"
> +#include "rados.h"
> +#include "module.h"
> +#include "block_int.h"
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <rados/librados.h>
> +
> +#include <signal.h>
> +
> +/*
> + * When specifying the image filename use:
> + *
> + * rbd:poolname/devicename
> + *
> + * poolname must be the name of an existing rados pool
> + *
> + * devicename is the basename for all objects used to
> + * emulate the raw device.
> + *
> + * Metadata information (image size, ...) is stored in an
> + * object with the name "devicename.rbd".
> + *
> + * The raw device is split into 4MB sized objects by default.
> + * The sequencenumber is encoded in a 12 byte long hex-string,
> + * and is attached to the devicename, separated by a dot.
> + * e.g. "devicename.1234567890ab"
> + *
> + */
> +
> +#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
> +
> +typedef struct RBDAIOCB {
> +    BlockDriverAIOCB common;
> +    QEMUBH *bh;
> +    int ret;
> +    QEMUIOVector *qiov;
> +    char *bounce;
> +    int write;
> +    int64_t sector_num;
> +    int aiocnt;
> +    int error;
> +} RBDAIOCB;
> +
> +typedef struct RADOSCB {
> +    int rcbid;
> +    RBDAIOCB *acb;
> +    int done;
> +    int64_t segsize;
> +    char *buf;
> +} RADOSCB;
> +
> +typedef struct RBDRVRBDState {
> +    rados_pool_t pool;
> +    char name[RBD_MAX_OBJ_NAME_SIZE];
> +    int name_len;
> +    uint64_t size;
> +    uint64_t objsize;
> +} RBDRVRBDState;
> +
> +typedef struct rbd_obj_header_ondisk RbdHeader1;
> +
> +static int rbd_parsename(const char *filename, char *pool, char *name)
> +{
> +    const char *rbdname;
> +    char *p, *n;
> +    int l;
> +
> +    if (!strstart(filename, "rbd:", &rbdname)) {
> +        return -EINVAL;
> +    }
> +
> +    pstrcpy(pool, 2 * RBD_MAX_SEG_NAME_SIZE, rbdname);
> +    p = strchr(pool, '/');
> +    if (p == NULL) {
> +        return -EINVAL;
> +    }
> +
> +    *p = '\0';
> +    n = ++p;
> +
> +    l = strlen(n);
> +
> +    if (l > RBD_MAX_OBJ_NAME_SIZE) {
> +        fprintf(stderr, "object name too long\n");
> +        return -EINVAL;
> +    } else if (l <= 0) {
> +        fprintf(stderr, "object name too short\n");
> +        return -EINVAL;
> +    }
> +
> +    strcpy(name, n);

pstrcpy(name, l + 1, n);  /* size includes the terminating NUL */

> +
> +    return l;
> +}
> +
> +static int create_tmap_op(uint8_t op, const char *name, char **tmap_desc)
> +{
> +    uint32_t len = strlen(name);
> +    uint32_t total_len = 1 + (sizeof(uint32_t) + len) + sizeof(uint32_t);       /* encoding op + name + empty buffer */
> +    char *desc;
> +
> +    desc = qemu_malloc(total_len);
> +    if (!desc) {
> +        return -ENOMEM;
> +    }

qemu_malloc won't return NULL, the check is not useful.

> +
> +    *tmap_desc = desc;
> +
> +    *desc = op;
> +    desc++;
> +    memcpy(desc, &len, sizeof(len));
> +    desc += sizeof(len);
> +    memcpy(desc, name, len);
> +    desc += len;
> +    len = 0;
> +    memcpy(desc, &len, sizeof(len));
> +    desc += sizeof(len);
> +
> +    return desc - *tmap_desc;
> +}
> +
> +static void free_tmap_op(char *tmap_desc)
> +{
> +    qemu_free(tmap_desc);
> +}
> +
> +static int rbd_register_image(rados_pool_t pool, const char *name)
> +{
> +    char *tmap_desc;
> +    const char *dir = RBD_DIRECTORY;
> +    int ret;
> +
> +    ret = create_tmap_op(CEPH_OSD_TMAP_SET, name, &tmap_desc);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = rados_tmap_update(pool, dir, tmap_desc, ret);
> +    free_tmap_op(tmap_desc);
> +
> +    return ret;
> +}
> +
> +static int rbd_create(const char *filename, QEMUOptionParameter *options)
> +{
> +    int64_t bytes = 0;
> +    int64_t objsize;
> +    uint64_t size;
> +    time_t mtime;
> +    uint8_t obj_order = RBD_DEFAULT_OBJ_ORDER;
> +    char pool[RBD_MAX_SEG_NAME_SIZE];
> +    char n[RBD_MAX_SEG_NAME_SIZE];
> +    char name[RBD_MAX_SEG_NAME_SIZE];
> +    RbdHeader1 header;
> +    rados_pool_t p;
> +    int name_len;
> +    int ret;
> +
> +    if ((name_len = rbd_parsename(filename, pool, name)) < 0) {
> +        return -EINVAL;
> +    }
> +
> +    snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", name, RBD_SUFFIX);
> +
> +    /* Read out options */
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            bytes = options->value.n;
> +        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
> +            if (options->value.n) {
> +                objsize = options->value.n;
> +                if (!objsize || ((objsize - 1) & objsize)) {    /* not a power of 2? */
> +                    fprintf(stderr, "obj size needs to be power of 2\n");
> +                    return -EINVAL;
> +                }
> +                if (objsize < 4096) {
> +                    fprintf(stderr, "obj size too small\n");
> +                    return -EINVAL;
> +                }
> +
> +                for (obj_order = 0; obj_order < 64; obj_order++) {
> +                    if (objsize == 1)
> +                        break;
> +                    objsize >>= 1;
> +                }
> +            }
> +        }
> +        options++;
> +    }
> +
> +    memset(&header, 0, sizeof(header));
> +    pstrcpy(header.text, sizeof(header.text), rbd_text);
> +    pstrcpy(header.signature, sizeof(header.signature), rbd_signature);
> +    pstrcpy(header.version, sizeof(header.version), rbd_version);
> +    header.image_size = bytes;
> +    cpu_to_le64s((uint64_t *) & header.image_size);
> +    header.obj_order = obj_order;
> +    header.crypt_type = RBD_CRYPT_NONE;
> +    header.comp_type = RBD_COMP_NONE;
> +    header.snap_seq = 0;
> +    header.snap_count = 0;
> +    cpu_to_le32s(&header.snap_count);
> +
> +    if (rados_initialize(0, NULL) < 0) {
> +        fprintf(stderr, "error initializing\n");
> +        return -EIO;
> +    }
> +
> +    if (rados_open_pool(pool, &p)) {
> +        fprintf(stderr, "error opening pool %s\n", pool);
> +        return -EIO;
> +    }
> +
> +    /* check for existing rbd header file */
> +    ret = rados_stat(p, n, &size, &mtime);
> +    if (ret == 0) {
> +        ret=-EEXIST;
> +        goto done;
> +    }
> +
> +    /* create header file */
> +    ret = rados_write(p, n, 0, (const char *)&header, sizeof(header));
> +    if (ret < 0) {
> +        goto done;
> +    }
> +
> +    ret = rbd_register_image(p, name);
> +done:
> +    rados_close_pool(p);
> +    rados_deinitialize();
> +
> +    return ret;
> +}
> +
> +static int rbd_open(BlockDriverState *bs, const char *filename, int flags)
> +{
> +    RBDRVRBDState *s = bs->opaque;
> +    char pool[RBD_MAX_SEG_NAME_SIZE];
> +    char n[RBD_MAX_SEG_NAME_SIZE];
> +    char hbuf[4096];
> +
> +    if ((s->name_len = rbd_parsename(filename, pool, s->name)) < 0) {
> +        return -EINVAL;
> +    }
> +    snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", s->name, RBD_SUFFIX);
> +
> +    if (rados_initialize(0, NULL) < 0) {
> +        fprintf(stderr, "error initializing\n");
> +        return -EIO;
> +    }
> +
> +    if (rados_open_pool(pool, &s->pool)) {
> +        fprintf(stderr, "error opening pool %s\n", pool);
> +        return -EIO;
> +    }
> +
> +    if (rados_read(s->pool, n, 0, hbuf, 4096) < 0) {
> +        fprintf(stderr, "error reading header from %s\n", s->name);
> +        return -EIO;
> +    }
> +    if (!strncmp(hbuf + 64, rbd_signature, 4)) {
> +        if (!strncmp(hbuf + 68, rbd_version, 8)) {
> +            RbdHeader1 *header;
> +
> +            header = (RbdHeader1 *) hbuf;
> +            le64_to_cpus((uint64_t *) & header->image_size);
> +            s->size = header->image_size;
> +            s->objsize = 1 << header->obj_order;
> +        } else {
> +            fprintf(stderr, "Unknown image version %s\n", hbuf + 68);
> +            return -EIO;
> +        }
> +    } else {
> +        fprintf(stderr, "Invalid header signature %s\n", hbuf + 64);
> +        return -EIO;
> +    }
> +
> +    return 0;
> +}
> +
> +static void rbd_close(BlockDriverState *bs)
> +{
> +    RBDRVRBDState *s = bs->opaque;
> +
> +    rados_close_pool(s->pool);
> +    rados_deinitialize();
> +}
> +
> +static int rbd_rw(BlockDriverState *bs, int64_t sector_num,
> +                  uint8_t *buf, int nb_sectors, int write)
> +{
> +    RBDRVRBDState *s = bs->opaque;
> +    char n[RBD_MAX_SEG_NAME_SIZE];
> +
> +    int64_t segnr, segoffs, segsize, r;
> +    int64_t off, size;
> +
> +    off = sector_num * 512;
> +    size = nb_sectors * 512;
> +    segnr = (int64_t) (off / s->objsize);
> +    segoffs = (int64_t) (off % s->objsize);
> +    segsize = (int64_t) (s->objsize - segoffs);
> +
> +    while (size > 0) {
> +        if (size < segsize) {
> +            segsize = size;
> +        }
> +
> +        snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name,
> +                 (long long unsigned int)segnr);

Please use PRIx64 instead of llx, the cast won't be needed then. Also elsewhere.

> +
> +        if (write) {
> +            if ((r = rados_write(s->pool, n, segoffs, (const char *)buf,
> +                segsize)) < 0) {
> +                return r;
> +            }
> +        } else {
> +            r = rados_read(s->pool, n, segoffs, (char *)buf, segsize);
> +            if (r == -ENOENT) {
> +                memset(buf, 0, segsize);
> +            } else if (r < 0) {
> +                return(r);
> +            } else if (r < segsize) {
> +                memset(buf + r, 0, segsize - r);
> +            }
> +            r = segsize;
> +        }
> +
> +        buf += segsize;
> +        size -= segsize;
> +        segoffs = 0;
> +        segsize = s->objsize;
> +        segnr++;
> +    }
> +
> +    return (0);
> +}
> +
> +static int rbd_read(BlockDriverState *bs, int64_t sector_num,
> +                    uint8_t *buf, int nb_sectors)
> +{
> +    return rbd_rw(bs, sector_num, buf, nb_sectors, 0);
> +}
> +
> +static int rbd_write(BlockDriverState *bs, int64_t sector_num,
> +                     const uint8_t *buf, int nb_sectors)
> +{
> +    return rbd_rw(bs, sector_num, (uint8_t *) buf, nb_sectors, 1);
> +}
> +
> +static void rbd_aio_cancel(BlockDriverAIOCB *blockacb)
> +{
> +    RBDAIOCB *acb = (RBDAIOCB *) blockacb;
> +    qemu_bh_delete(acb->bh);
> +    acb->bh = NULL;
> +    qemu_aio_release(acb);
> +}
> +
> +static AIOPool rbd_aio_pool = {
> +    .aiocb_size = sizeof(RBDAIOCB),
> +    .cancel = rbd_aio_cancel,
> +};
> +
> +/* This is the callback function for rados_aio_read and _write */
> +static void rbd_finish_aiocb(rados_completion_t c, RADOSCB *rcb)
> +{
> +    RBDAIOCB *acb = rcb->acb;
> +    int64_t r;
> +    int i;
> +
> +    acb->aiocnt--;
> +    r = rados_aio_get_return_value(c);
> +    rados_aio_release(c);
> +    if (acb->write) {
> +        if (r < 0) {
> +            acb->ret = r;
> +            acb->error = 1;
> +        } else if (!acb->error) {
> +            acb->ret += rcb->segsize;
> +        }
> +    } else {
> +        if (r == -ENOENT) {
> +            memset(rcb->buf, 0, rcb->segsize);
> +            if (!acb->error) {
> +                acb->ret += rcb->segsize;
> +            }
> +        } else if (r < 0) {
> +            acb->ret = r;
> +            acb->error = 1;
> +        } else if (r < rcb->segsize) {
> +            memset(rcb->buf + r, 0, rcb->segsize - r);
> +            if (!acb->error) {
> +                acb->ret += rcb->segsize;
> +            }
> +        } else if (!acb->error) {
> +            acb->ret += r;
> +        }
> +    }
> +    qemu_free(rcb);
> +    i = 0;
> +    if (!acb->aiocnt && acb->bh) {
> +        qemu_bh_schedule(acb->bh);
> +    }
> +}
> +
> +/* Callback when all queued rados_aio requests are complete */
> +static void rbd_aio_bh_cb(void *opaque)
> +{
> +    RBDAIOCB *acb = opaque;
> +
> +    if (!acb->write) {
> +        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
> +    }
> +    qemu_vfree(acb->bounce);
> +    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
> +    qemu_bh_delete(acb->bh);
> +    acb->bh = NULL;
> +    qemu_aio_release(acb);
> +}
> +
> +static BlockDriverAIOCB *rbd_aio_rw_vector(BlockDriverState *bs,
> +                                           int64_t sector_num,
> +                                           QEMUIOVector *qiov,
> +                                           int nb_sectors,
> +                                           BlockDriverCompletionFunc *cb,
> +                                           void *opaque, int write)
> +{
> +    RBDAIOCB *acb;
> +    RADOSCB *rcb;
> +    rados_completion_t c;
> +    char n[RBD_MAX_SEG_NAME_SIZE];
> +    int64_t segnr, segoffs, segsize, last_segnr;
> +    int64_t off, size;
> +    char *buf;
> +
> +    RBDRVRBDState *s = bs->opaque;
> +
> +    acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque);
> +    acb->write = write;
> +    acb->qiov = qiov;
> +    acb->bounce = qemu_blockalign(bs, qiov->size);
> +    acb->aiocnt = 0;
> +    acb->ret = 0;
> +    acb->error = 0;
> +
> +    if (!acb->bh) {
> +        acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
> +    }
> +
> +    if (write) {
> +        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
> +    }
> +
> +    buf = acb->bounce;
> +
> +    off = sector_num * 512;
> +    size = nb_sectors * 512;
> +    segnr = (int64_t) (off / s->objsize);
> +    segoffs = (int64_t) (off % s->objsize);
> +    segsize = (int64_t) (s->objsize - segoffs);
> +
> +    last_segnr = ((off + size - 1) / s->objsize);
> +    acb->aiocnt = (last_segnr - segnr) + 1;
> +
> +    while (size > 0) {
> +        if (size < segsize) {
> +            segsize = size;
> +        }
> +
> +        snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name,
> +                 (long long unsigned int)segnr);
> +
> +        rcb = qemu_malloc(sizeof(RADOSCB));
> +        rcb->done = 0;
> +        rcb->acb = acb;
> +        rcb->segsize = segsize;
> +        rcb->buf = buf;
> +
> +        if (write) {
> +            rados_aio_create_completion(rcb, NULL,
> +                                        (rados_callback_t) rbd_finish_aiocb, &c);
> +            rados_aio_write(s->pool, n, segoffs, buf, segsize, c);
> +        } else {
> +            rados_aio_create_completion(rcb, (rados_callback_t) rbd_finish_aiocb,
> +                                        NULL, &c);
> +            rados_aio_read(s->pool, n, segoffs, buf, segsize, c);
> +        }
> +
> +        buf += segsize;
> +        size -= segsize;
> +        segoffs = 0;
> +        segsize = s->objsize;
> +        segnr++;
> +    }
> +
> +    return &acb->common;
> +}
> +
> +static BlockDriverAIOCB *rbd_aio_readv(BlockDriverState *bs,
> +                                       int64_t sector_num, QEMUIOVector *qiov,
> +                                       int nb_sectors,
> +                                       BlockDriverCompletionFunc *cb,
> +                                       void *opaque)
> +{
> +    return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
> +}
> +
> +static BlockDriverAIOCB *rbd_aio_writev(BlockDriverState *bs,
> +                                        int64_t sector_num, QEMUIOVector *qiov,
> +                                        int nb_sectors,
> +                                        BlockDriverCompletionFunc *cb,
> +                                        void *opaque)
> +{
> +    return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
> +}
> +
> +static int rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
> +{
> +    RBDRVRBDState *s = bs->opaque;
> +    bdi->cluster_size = s->objsize;
> +    return 0;
> +}
> +
> +static int64_t rbd_getlength(BlockDriverState *bs)
> +{
> +    RBDRVRBDState *s = bs->opaque;
> +
> +    return s->size;
> +}
> +
> +static QEMUOptionParameter rbd_create_options[] = {
> +    {
> +     .name = BLOCK_OPT_SIZE,
> +     .type = OPT_SIZE,
> +     .help = "Virtual disk size"
> +    },
> +    {
> +     .name = BLOCK_OPT_CLUSTER_SIZE,
> +     .type = OPT_SIZE,
> +     .help = "RBD object size"
> +    },
> +    {NULL}
> +};
> +
> +static BlockDriver bdrv_rbd = {
> +    .format_name = "rbd",
> +    .instance_size = sizeof(RBDRVRBDState),
> +    .bdrv_open = rbd_open,
> +    .bdrv_read = rbd_read,
> +    .bdrv_write = rbd_write,
> +    .bdrv_close = rbd_close,
> +    .bdrv_create = rbd_create,
> +    .bdrv_get_info = rbd_getinfo,
> +    .create_options = rbd_create_options,
> +    .bdrv_getlength = rbd_getlength,
> +    .protocol_name = "rbd",
> +
> +    .bdrv_aio_readv = rbd_aio_readv,
> +    .bdrv_aio_writev = rbd_aio_writev,
> +};
> +
> +static void bdrv_rbd_init(void)
> +{
> +    bdrv_register(&bdrv_rbd);
> +}
> +
> +block_init(bdrv_rbd_init);
> diff --git a/block/rbd_types.h b/block/rbd_types.h
> new file mode 100644
> index 0000000..dfd5aa0
> --- /dev/null
> +++ b/block/rbd_types.h
> @@ -0,0 +1,48 @@
> +#ifndef _FS_CEPH_RBD
> +#define _FS_CEPH_RBD

QEMU_BLOCK_RBD?

> +
> +#include <linux/types.h>

Can you use standard includes, like <sys/types.h> or <inttypes.h>? Are
Ceph libraries used in other systems than Linux?

> +
> +/*
> + * rbd image 'foo' consists of objects
> + *   foo.rbd      - image metadata
> + *   foo.00000000
> + *   foo.00000001
> + *   ...          - data
> + */
> +
> +#define RBD_SUFFIX             ".rbd"
> +#define RBD_DIRECTORY           "rbd_directory"
> +
> +#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
> +
> +#define RBD_MAX_OBJ_NAME_SIZE  96
> +#define RBD_MAX_SEG_NAME_SIZE  128
> +
> +#define RBD_COMP_NONE          0
> +#define RBD_CRYPT_NONE         0
> +
> +static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
> +static const char rbd_signature[] = "RBD";
> +static const char rbd_version[] = "001.001";
> +
> +struct rbd_obj_snap_ondisk {
> +       __le64 id;
> +       __le64 image_size;
> +} __attribute__((packed));
> +
> +struct rbd_obj_header_ondisk {
> +       char text[64];
> +       char signature[4];
> +       char version[8];
> +       __le64 image_size;

Unaligned? Is the disk format fixed?

> +       __u8 obj_order;
> +       __u8 crypt_type;
> +       __u8 comp_type;

Unaligned?

> +       __le32 snap_seq;
> +       __le32 snap_count;
> +       __le64 snap_names_len;

Unaligned?

> +       struct rbd_obj_snap_ondisk snaps[0];
> +} __attribute__((packed));
> +
> +#endif
> diff --git a/configure b/configure
> index 36d028f..d07a7e5 100755
> --- a/configure
> +++ b/configure
> @@ -299,6 +299,7 @@ pkgversion=""
>  check_utests="no"
>  user_pie="no"
>  zero_malloc=""
> +rbd="no"
>
>  # OS specific
>  if check_define __linux__ ; then
> @@ -660,6 +661,8 @@ for opt do
>   ;;
>   --enable-vhost-net) vhost_net="yes"
>   ;;
> +  --enable-rbd) rbd="yes"
> +  ;;
>   *) echo "ERROR: unknown option $opt"; show_help="yes"
>   ;;
>   esac
> @@ -826,6 +829,7 @@ echo "  --enable-docs            enable documentation build"
>  echo "  --disable-docs           disable documentation build"
>  echo "  --disable-vhost-net      disable vhost-net acceleration support"
>  echo "  --enable-vhost-net       enable vhost-net acceleration support"
> +echo "  --enable-rbd            enable building the rados block device (rbd)"
>  echo ""
>  echo "NOTE: The object files are built at the place where configure is launched"
>  exit 1
> @@ -1569,6 +1573,25 @@ if test "$mingw32" != yes -a "$pthread" = no; then
>  fi
>
>  ##########################################
> +# rbd probe
> +if test "$rbd" != "no" ; then
> +  cat > $TMPC <<EOF
> +#include <stdio.h>
> +#include <rados/librados.h>
> +int main(void) { rados_initialize(0, NULL); return 0; }
> +EOF
> +  if compile_prog "" "-lrados -lcrypto" ; then
> +    rbd=yes
> +    LIBS="$LIBS -lrados -lcrypto"
> +  else
> +    if test "$rbd" = "yes" ; then
> +      feature_not_found "rados block device"
> +    fi
> +    rbd=no
> +  fi
> +fi
> +
> +##########################################
>  # linux-aio probe
>
>  if test "$linux_aio" != "no" ; then
> @@ -2031,6 +2054,7 @@ echo "preadv support    $preadv"
>  echo "fdatasync         $fdatasync"
>  echo "uuid support      $uuid"
>  echo "vhost-net support $vhost_net"
> +echo "rbd support       $rbd"
>
>  if test $sdl_too_old = "yes"; then
>  echo "-> Your SDL version is too old - please upgrade to have SDL support"
> @@ -2260,6 +2284,9 @@ echo "CONFIG_UNAME_RELEASE=\"$uname_release\"" >> $config_host_mak
>  if test "$zero_malloc" = "yes" ; then
>   echo "CONFIG_ZERO_MALLOC=y" >> $config_host_mak
>  fi
> +if test "$rbd" = "yes" ; then
> +  echo "CONFIG_RBD=y" >> $config_host_mak
> +fi
>
>  # USB host support
>  case "$usb" in
> --
> 1.7.0.4
>
>
>
Christian Brunner - May 20, 2010, 9:18 p.m.
2010/5/20 Blue Swirl <blauwirbel@gmail.com>:
> On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
>> The attached patch is a block driver for the distributed file system
>> Ceph (http://ceph.newdream.net/). This driver uses librados (which
>> is part of the Ceph server) for direct access to the Ceph object
>> store and is running entirely in userspace. Therefore it is
>> called "rbd" - rados block device.
>>
>> To compile the driver a recent version of ceph (>= 0.20.1) is needed
>> and you have to "--enable-rbd" when running configure.
>>
>> Additional information is available on the Ceph-Wiki:
>>
>> http://ceph.newdream.net/wiki/Kvm-rbd
>
>
> I have no idea whether it makes sense to add Ceph (no objection
> either). I have some minor comments below.

Thanks for your comments. I'll send an updated patch in a few days.

Having a central storage system is quite essential in larger hosting
environments, it enables you to move your guest systems from one node
to another easily (live-migration or dynamic restart). Traditionally
this has been done using SAN, iSCSI or NFS. However most of these
systems don't scale very well and the costs for high-availability
are quite high.

With new approaches like Sheepdog or Ceph, things are getting a lot
cheaper and you can scale your system without disrupting your service.
The concepts are quite similar to what Amazon is doing in their EC2
environment, but they certainly won't publish it as OpenSource anytime
soon.

Both projects have advantages and disadvantages. Ceph is a bit more
universal as it implements a whole filesystem. Sheepdog is more
feature complete in regards of managing images (e.g. snapshots). Both
projects require some additional work to become stable, but they are
on a good way.

I would really like to see both drivers in the qemu tree, as they are
the key to a design shift in how storage in the datacenter is being
built.

Christian
Anthony Liguori - May 20, 2010, 9:29 p.m.
On 05/20/2010 04:18 PM, Christian Brunner wrote:
> Thanks for your comments. I'll send an updated patch in a few days.
>
> Having a central storage system is quite essential in larger hosting
> environments, it enables you to move your guest systems from one node
> to another easily (live-migration or dynamic restart). Traditionally
> this has been done using SAN, iSCSI or NFS. However most of these
> systems don't scale very well and and the costs for high-availability
> are quite high.
>
> With new approaches like Sheepdog or Ceph, things are getting a lot
> cheaper and you can scale your system without disrupting your service.
> The concepts are quite similar to what Amazon is doing in their EC2
> environment, but they certainly won't publish it as OpenSource anytime
> soon.
>
> Both projects have advantages and disadvantages. Ceph is a bit more
> universal as it implements a whole filesystem. Sheepdog is more
> feature complete in regards of managing images (e.g. snapshots). Both
> projects require some additional work to become stable, but they are
> on a good way.
>
> I would really like to see both drivers in the qemu tree, as they are
> the key to a design shift in how storage in the datacenter is being
> built.
>    

I'd be more interested in enabling people to build these types of 
storage systems without touching qemu.

Both sheepdog and ceph ultimately transmit I/O over a socket to a 
central daemon, right?  So could we not standardize a protocol for this 
that both sheepdog and ceph could implement?

Regards,

Anthony Liguori

> Christian
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Christian Brunner - May 20, 2010, 10:16 p.m.
2010/5/20 Anthony Liguori <anthony@codemonkey.ws>:
>> With new approaches like Sheepdog or Ceph, things are getting a lot
>> cheaper and you can scale your system without disrupting your service.
>> The concepts are quite similar to what Amazon is doing in their EC2
>> environment, but they certainly won't publish it as OpenSource anytime
>> soon.
>>
>> Both projects have advantages and disadvantages. Ceph is a bit more
>> universal as it implements a whole filesystem. Sheepdog is more
>> feature complete in regards of managing images (e.g. snapshots). Both
>> projects require some additional work to become stable, but they are
>> on a good way.
>>
>> I would really like to see both drivers in the qemu tree, as they are
>> the key to a design shift in how storage in the datacenter is being
>> built.
>>
>
> I'd be more interested in enabling people to build these types of storage
> systems without touching qemu.

You could do this by using Yehuda's rbd kernel driver, but I think
that it would be better to avoid this additional layer.

> Both sheepdog and ceph ultimately transmit I/O over a socket to a central
> daemon, right?  So could we not standardize a protocol for this that both
> sheepdog and ceph could implement?

There is no central daemon. The concept is that they talk to many
storage nodes at the same time. Data is distributed and replicated
over many nodes in the network. The mechanism to do this is quite
complex. I don't know about sheepdog, but in Ceph this is called RADOS
(reliable autonomic distributed object store). Sheepdog and Ceph may
look similar, but this is where they act different. I don't think that
it would be possible to implement a common protocol.

Regards,
Christian
Yehuda Sadeh Weinraub - May 20, 2010, 11:02 p.m.
On Thu, May 20, 2010 at 1:31 PM, Blue Swirl <blauwirbel@gmail.com> wrote:
> On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
>> The attached patch is a block driver for the distributed file system
>> Ceph (http://ceph.newdream.net/). This driver uses librados (which
>> is part of the Ceph server) for direct access to the Ceph object
>> store and is running entirely in userspace. Therefore it is
>> called "rbd" - rados block device.
...
>
> IIRC underscores here may conflict with system header use. Please use
> something like QEMU_BLOCK_RADOS_H.

This header is shared between the linux kernel client and the ceph
userspace servers and client. We can actually get rid of it, as we
only need it to define CEPH_OSD_TMAP_SET. We can move this definition
to librados.h.

>> diff --git a/block/rbd_types.h b/block/rbd_types.h
>> new file mode 100644
>> index 0000000..dfd5aa0
>> --- /dev/null
>> +++ b/block/rbd_types.h
>> @@ -0,0 +1,48 @@
>> +#ifndef _FS_CEPH_RBD
>> +#define _FS_CEPH_RBD
>
> QEMU_BLOCK_RBD?

This header is shared between the ceph kernel client, between the qemu
rbd module (and between other ceph utilities). It'd be much easier
maintaining it without having to have a different implementation for
each. The same goes to the use of __le32/64 and __u32/64 within these
headers.

>
>> +
>> +#include <linux/types.h>
>
> Can you use standard includes, like <sys/types.h> or <inttypes.h>? Are
> Ceph libraries used in other systems than Linux?

Not at the moment. I guess that we can take this include out.

>
>> +
>> +/*
>> + * rbd image 'foo' consists of objects
>> + *   foo.rbd      - image metadata
>> + *   foo.00000000
>> + *   foo.00000001
>> + *   ...          - data
>> + */
>> +
>> +#define RBD_SUFFIX             ".rbd"
>> +#define RBD_DIRECTORY           "rbd_directory"
>> +
>> +#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
>> +
>> +#define RBD_MAX_OBJ_NAME_SIZE  96
>> +#define RBD_MAX_SEG_NAME_SIZE  128
>> +
>> +#define RBD_COMP_NONE          0
>> +#define RBD_CRYPT_NONE         0
>> +
>> +static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
>> +static const char rbd_signature[] = "RBD";
>> +static const char rbd_version[] = "001.001";
>> +
>> +struct rbd_obj_snap_ondisk {
>> +       __le64 id;
>> +       __le64 image_size;
>> +} __attribute__((packed));
>> +
>> +struct rbd_obj_header_ondisk {
>> +       char text[64];
>> +       char signature[4];
>> +       char version[8];
>> +       __le64 image_size;
>
> Unaligned? Is the disk format fixed?

This is a packed structure that represents the on disk format.
Operations on it are being done only to read from the disk header or
to write to the disk header.


Yehuda
Stefan Hajnoczi - May 21, 2010, 5:28 a.m.
On Thu, May 20, 2010 at 11:16 PM, Christian Brunner <chb@muc.de> wrote:
> 2010/5/20 Anthony Liguori <anthony@codemonkey.ws>:
>> Both sheepdog and ceph ultimately transmit I/O over a socket to a central
>> daemon, right?  So could we not standardize a protocol for this that both
>> sheepdog and ceph could implement?
>
> There is no central daemon. The concept is that they talk to many
> storage nodes at the same time. Data is distributed and replicated
> over many nodes in the network. The mechanism to do this is quite
> complex. I don't know about sheepdog, but in Ceph this is called RADOS
> (reliable autonomic distributed object store). Sheepdog and Ceph may
> look similar, but this is where they act different. I don't think that
> it would be possible to implement a common protocol.

I believe Sheepdog has a local daemon on each node.  The QEMU storage
backend talks to the daemon on the same node, which then does the real
network communication with the rest of the distributed storage system.
 So I think we're not talking about a network protocol here, we're
talking about a common interface that can be used by QEMU and other
programs to take advantage of Ceph, Sheepdog, etc services available
on the local node.

Haven't looked into your patch enough yet, but does librados talk
directly over the network or does it connect to a local daemon/driver?

Stefan
MORITA Kazutaka - May 21, 2010, 5:54 a.m.
At Fri, 21 May 2010 00:16:46 +0200,
Christian Brunner wrote:
> 
> 2010/5/20 Anthony Liguori <anthony@codemonkey.ws>:
> >> With new approaches like Sheepdog or Ceph, things are getting a lot
> >> cheaper and you can scale your system without disrupting your service.
> >> The concepts are quite similar to what Amazon is doing in their EC2
> >> environment, but they certainly won't publish it as OpenSource anytime
> >> soon.
> >>
> >> Both projects have advantages and disadvantages. Ceph is a bit more
> >> universal as it implements a whole filesystem. Sheepdog is more
> >> feature complete in regards of managing images (e.g. snapshots). Both

I think a major difference is that Sheepdog servers act fully
autonomously.  Any Sheepdog server has no fixed role such as a monitor
server, and Sheepdog doesn't require any configuration about a list of
nodes in the cluster.


> >> projects require some additional work to become stable, but they are
> >> on a good way.
> >>
> >> I would really like to see both drivers in the qemu tree, as they are
> >> the key to a design shift in how storage in the datacenter is being
> >> built.
> >>
> >
> > I'd be more interested in enabling people to build these types of storage
> > systems without touching qemu.
> 
> You could do this by using Yehuda's rbd kernel driver, but I think
> that it would be better to avoid this additional layer.
> 

I agree.  In addition, if a storage client is a qemu driver, the
storage system can support some features specific to qemu such as live
snapshot from qemu monitor.

Regards,

Kazutaka
MORITA Kazutaka - May 21, 2010, 6:13 a.m.
At Fri, 21 May 2010 06:28:42 +0100,
Stefan Hajnoczi wrote:
> 
> On Thu, May 20, 2010 at 11:16 PM, Christian Brunner <chb@muc.de> wrote:
> > 2010/5/20 Anthony Liguori <anthony@codemonkey.ws>:
> >> Both sheepdog and ceph ultimately transmit I/O over a socket to a central
> >> daemon, right?  So could we not standardize a protocol for this that both
> >> sheepdog and ceph could implement?
> >
> > There is no central daemon. The concept is that they talk to many
> > storage nodes at the same time. Data is distributed and replicated
> > over many nodes in the network. The mechanism to do this is quite
> > complex. I don't know about sheepdog, but in Ceph this is called RADOS
> > (reliable autonomic distributed object store). Sheepdog and Ceph may
> > look similar, but this is where they act different. I don't think that
> > it would be possible to implement a common protocol.
> 
> I believe Sheepdog has a local daemon on each node.  The QEMU storage
> backend talks to the daemon on the same node, which then does the real
> network communication with the rest of the distributed storage system.

Yes.  It is because Sheepdog doesn't have a configuration about
cluster membership as I mentioned in another mail, so the driver
doesn't know which node to access other than localhost.

>  So I think we're not talking about a network protocol here, we're
> talking about a common interface that can be used by QEMU and other
> programs to take advantage of Ceph, Sheepdog, etc services available
> on the local node.
> 
> Haven't looked into your patch enough yet, but does librados talk
> directly over the network or does it connect to a local daemon/driver?
> 

AFAIK, librados accesses directly over the network, so I think it is
difficult to define a common interface.


Thanks,

Kazutaka
Blue Swirl - May 23, 2010, 7:59 a.m.
On Thu, May 20, 2010 at 11:02 PM, Yehuda Sadeh Weinraub
<yehudasa@gmail.com> wrote:
> On Thu, May 20, 2010 at 1:31 PM, Blue Swirl <blauwirbel@gmail.com> wrote:
>> On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
>>> The attached patch is a block driver for the distributed file system
>>> Ceph (http://ceph.newdream.net/). This driver uses librados (which
>>> is part of the Ceph server) for direct access to the Ceph object
>>> store and is running entirely in userspace. Therefore it is
>>> called "rbd" - rados block device.
> ...
>>
>> IIRC underscores here may conflict with system header use. Please use
>> something like QEMU_BLOCK_RADOS_H.
>
> This header is shared between the linux kernel client and the ceph
> userspace servers and client. We can actually get rid of it, as we
> only need it to define CEPH_OSD_TMAP_SET. We can move this definition
> to librados.h.
>
>>> diff --git a/block/rbd_types.h b/block/rbd_types.h
>>> new file mode 100644
>>> index 0000000..dfd5aa0
>>> --- /dev/null
>>> +++ b/block/rbd_types.h
>>> @@ -0,0 +1,48 @@
>>> +#ifndef _FS_CEPH_RBD
>>> +#define _FS_CEPH_RBD
>>
>> QEMU_BLOCK_RBD?
>
> This header is shared between the ceph kernel client, between the qemu
> rbd module (and between other ceph utilities). It'd be much easier
> maintaining it without having to have a different implementation for
> each. The same goes to the use of __le32/64 and __u32/64 within these
> headers.

This is user space, so identifiers must conform to C standards. The
identifiers beginning with underscores are reserved.

Doesn't __le32/64 also depend on some GCC extension? Or sparse magic?

>
>>
>>> +
>>> +#include <linux/types.h>
>>
>> Can you use standard includes, like <sys/types.h> or <inttypes.h>? Are
>> Ceph libraries used in other systems than Linux?
>
> Not at the moment. I guess that we can take this include out.
>
>>
>>> +
>>> +/*
>>> + * rbd image 'foo' consists of objects
>>> + *   foo.rbd      - image metadata
>>> + *   foo.00000000
>>> + *   foo.00000001
>>> + *   ...          - data
>>> + */
>>> +
>>> +#define RBD_SUFFIX             ".rbd"
>>> +#define RBD_DIRECTORY           "rbd_directory"
>>> +
>>> +#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
>>> +
>>> +#define RBD_MAX_OBJ_NAME_SIZE  96
>>> +#define RBD_MAX_SEG_NAME_SIZE  128
>>> +
>>> +#define RBD_COMP_NONE          0
>>> +#define RBD_CRYPT_NONE         0
>>> +
>>> +static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
>>> +static const char rbd_signature[] = "RBD";
>>> +static const char rbd_version[] = "001.001";
>>> +
>>> +struct rbd_obj_snap_ondisk {
>>> +       __le64 id;
>>> +       __le64 image_size;
>>> +} __attribute__((packed));
>>> +
>>> +struct rbd_obj_header_ondisk {
>>> +       char text[64];
>>> +       char signature[4];
>>> +       char version[8];
>>> +       __le64 image_size;
>>
>> Unaligned? Is the disk format fixed?
>
> This is a packed structure that represents the on disk format.
> Operations on it are being done only to read from the disk header or
> to write to the disk header.

That's clear. But what exactly is the alignment of field 'image_size'?
Could there be implicit padding to mod 8 between 'version' and
'image_size' with some compilers?

If there were no other constraints, I'd either make the padding
explicit, or rearrange/resize fields so that the field alignment is
natural. Thus my question, can you change the disk format or are there
already some deployments?

Otherwise, I'd just add some warning comment so people don't try to
use clever pointer tricks which will crash on machines with enforced
alignment.
Avi Kivity - May 23, 2010, 12:01 p.m.
On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>
> I'd be more interested in enabling people to build these types of 
> storage systems without touching qemu.
>
> Both sheepdog and ceph ultimately transmit I/O over a socket to a 
> central daemon, right? 

That incurs an extra copy.

> So could we not standardize a protocol for this that both sheepdog and 
> ceph could implement?

The protocol already exists, nbd.  It doesn't support snapshotting etc. 
but we could extend it.

But IMO what's needed is a plugin API for the block layer.
Yehuda Sadeh Weinraub - May 24, 2010, 2:17 a.m.
On Sun, May 23, 2010 at 12:59 AM, Blue Swirl <blauwirbel@gmail.com> wrote:
> On Thu, May 20, 2010 at 11:02 PM, Yehuda Sadeh Weinraub
> <yehudasa@gmail.com> wrote:
>> On Thu, May 20, 2010 at 1:31 PM, Blue Swirl <blauwirbel@gmail.com> wrote:
>>> On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
>>>> The attached patch is a block driver for the distributed file system
>>>> Ceph (http://ceph.newdream.net/). This driver uses librados (which
>>>> is part of the Ceph server) for direct access to the Ceph object
>>>> store and is running entirely in userspace. Therefore it is
>>>> called "rbd" - rados block device.
>> ...
>>>
>>> IIRC underscores here may conflict with system header use. Please use
>>> something like QEMU_BLOCK_RADOS_H.
>>
>> This header is shared between the linux kernel client and the ceph
>> userspace servers and client. We can actually get rid of it, as we
>> only need it to define CEPH_OSD_TMAP_SET. We can move this definition
>> to librados.h.
>>
>>>> diff --git a/block/rbd_types.h b/block/rbd_types.h
>>>> new file mode 100644
>>>> index 0000000..dfd5aa0
>>>> --- /dev/null
>>>> +++ b/block/rbd_types.h
>>>> @@ -0,0 +1,48 @@
>>>> +#ifndef _FS_CEPH_RBD
>>>> +#define _FS_CEPH_RBD
>>>
>>> QEMU_BLOCK_RBD?
>>
>> This header is shared between the ceph kernel client, between the qemu
>> rbd module (and between other ceph utilities). It'd be much easier
>> maintaining it without having to have a different implementation for
>> each. The same goes to the use of __le32/64 and __u32/64 within these
>> headers.
>
> This is user space, so identifiers must conform to C standards. The
> identifiers beginning with underscores are reserved.
>
> Doesn't __le32/64 also depend on some GCC extension? Or sparse magic?
It depends on gcc extension. If needed we can probably have a separate
header for the qemu block device that uses alternative types. Though
looking at the qemu code I see use of other gcc extensions so I'm not
sure this is a real issue.

>
>>
>>>
>>>> +
>>>> +#include <linux/types.h>
>>>
>>> Can you use standard includes, like <sys/types.h> or <inttypes.h>? Are
>>> Ceph libraries used in other systems than Linux?
>>
>> Not at the moment. I guess that we can take this include out.
>>
>>>
>>>> +
>>>> +/*
>>>> + * rbd image 'foo' consists of objects
>>>> + *   foo.rbd      - image metadata
>>>> + *   foo.00000000
>>>> + *   foo.00000001
>>>> + *   ...          - data
>>>> + */
>>>> +
>>>> +#define RBD_SUFFIX             ".rbd"
>>>> +#define RBD_DIRECTORY           "rbd_directory"
>>>> +
>>>> +#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
>>>> +
>>>> +#define RBD_MAX_OBJ_NAME_SIZE  96
>>>> +#define RBD_MAX_SEG_NAME_SIZE  128
>>>> +
>>>> +#define RBD_COMP_NONE          0
>>>> +#define RBD_CRYPT_NONE         0
>>>> +
>>>> +static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
>>>> +static const char rbd_signature[] = "RBD";
>>>> +static const char rbd_version[] = "001.001";
>>>> +
>>>> +struct rbd_obj_snap_ondisk {
>>>> +       __le64 id;
>>>> +       __le64 image_size;
>>>> +} __attribute__((packed));
>>>> +
>>>> +struct rbd_obj_header_ondisk {
>>>> +       char text[64];
>>>> +       char signature[4];
>>>> +       char version[8];
>>>> +       __le64 image_size;
>>>
>>> Unaligned? Is the disk format fixed?
>>
>> This is a packed structure that represents the on disk format.
>> Operations on it are being done only to read from the disk header or
>> to write to the disk header.
>
> That's clear. But what exactly is the alignment of field 'image_size'?
> Could there be implicit padding to mod 8 between 'version' and
> 'image_size' with some compilers?

Obviously it's not 64 bit aligned. As it's an on-disk header, I don't
see alignment as a real issue. As was said before, any operation on these
fields have to go through endianity conversion anyway, and this
structure should not be used directly. For such datastructures I'd
rather have the fields ordered in some logical order than maintaining
the alignment by ourselves. That's why we have that __attribute__
packed in the end to let the compiler deal with those issues. Other
compilers though have their own syntax for packed structures (but I do
see other uses of this packed syntax in the qemu code).

>
> If there were no other constraints, I'd either make the padding
> explicit, or rearrange/resize fields so that the field alignment is
> natural. Thus my question, can you change the disk format or are there
> already some deployments?

We can certainly make changes to the disk format at this point. I'm
not very happy with those 3 __u8 in the middle, and they can probably
be changed to a 32 bit flags field. We can get it 64 bit aligned too.

>
> Otherwise, I'd just add some warning comment so people don't try to
> use clever pointer tricks which will crash on machines with enforced
> alignment.
>
Any clever pointer tricks that'll work on one architecture will
probably be wrong on another (different word
size/alignment/endianity), so maybe crashing machines is a good
indicator to bad implementation. We shouldn't try to hide the
problems.

Thanks,
Yehuda
MORITA Kazutaka - May 24, 2010, 7:12 a.m.
At Sun, 23 May 2010 15:01:59 +0300,
Avi Kivity wrote:
> 
> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
> >
> > I'd be more interested in enabling people to build these types of 
> > storage systems without touching qemu.
> >
> > Both sheepdog and ceph ultimately transmit I/O over a socket to a 
> > central daemon, right? 
> 
> That incurs an extra copy.
> 
> > So could we not standardize a protocol for this that both sheepdog and 
> > ceph could implement?
> 
> The protocol already exists, nbd.  It doesn't support snapshotting etc. 
> but we could extend it.
> 

I have no objection to use another protocol for Sheepdog support, but
I think nbd protocol is unsuitable for the large storage pool with
many VM images.  It is because nbd protocol doesn't support specifying
a file name to open.  If we use nbd with such a storage system, the
server needs to listen ports as many as the number of VM images.  As
far as I see the protocol, it looks difficult to extend it without
breaking backward compatibility.

Regards,

Kazutaka

> But IMO what's needed is a plugin API for the block layer.
>
Stefan Hajnoczi - May 24, 2010, 8:27 a.m.
On Sun, May 23, 2010 at 1:01 PM, Avi Kivity <avi@redhat.com> wrote:
> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>
>> I'd be more interested in enabling people to build these types of storage
>> systems without touching qemu.
>>
>> Both sheepdog and ceph ultimately transmit I/O over a socket to a central
>> daemon, right?
>
> That incurs an extra copy.

Besides a shared memory approach, I wonder if the splice() family of
syscalls could be used to send/receive data through a storage daemon
without the daemon looking at or copying the data?

Stefan
Avi Kivity - May 24, 2010, 11:03 a.m.
On 05/24/2010 11:27 AM, Stefan Hajnoczi wrote:
> On Sun, May 23, 2010 at 1:01 PM, Avi Kivity<avi@redhat.com>  wrote:
>    
>> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>      
>>> I'd be more interested in enabling people to build these types of storage
>>> systems without touching qemu.
>>>
>>> Both sheepdog and ceph ultimately transmit I/O over a socket to a central
>>> daemon, right?
>>>        
>> That incurs an extra copy.
>>      
> Besides a shared memory approach, I wonder if the splice() family of
> syscalls could be used to send/receive data through a storage daemon
> without the daemon looking at or copying the data?
>    

Excellent idea.
Avi Kivity - May 24, 2010, 11:05 a.m.
On 05/24/2010 10:12 AM, MORITA Kazutaka wrote:
> At Sun, 23 May 2010 15:01:59 +0300,
> Avi Kivity wrote:
>    
>> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>      
>>> I'd be more interested in enabling people to build these types of
>>> storage systems without touching qemu.
>>>
>>> Both sheepdog and ceph ultimately transmit I/O over a socket to a
>>> central daemon, right?
>>>        
>> That incurs an extra copy.
>>
>>      
>>> So could we not standardize a protocol for this that both sheepdog and
>>> ceph could implement?
>>>        
>> The protocol already exists, nbd.  It doesn't support snapshotting etc.
>> but we could extend it.
>>
>>      
> I have no objection to use another protocol for Sheepdog support, but
> I think nbd protocol is unsuitable for the large storage pool with
> many VM images.  It is because nbd protocol doesn't support specifing
> a file name to open.  If we use nbd with such a storage system, the
> server needs to listen ports as many as the number of VM images.  As
> far as I see the protocol, It looks difficult to extend it without
> breaking backward compatibility.
>    

The server would be local and talk over a unix domain socket, perhaps 
anonymous.

nbd has other issues though, such as requiring a copy and no support for 
metadata operations such as snapshot and file size extension.
MORITA Kazutaka - May 24, 2010, 11:42 a.m.
At Mon, 24 May 2010 14:05:58 +0300,
Avi Kivity wrote:
> 
> On 05/24/2010 10:12 AM, MORITA Kazutaka wrote:
> > At Sun, 23 May 2010 15:01:59 +0300,
> > Avi Kivity wrote:
> >    
> >> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
> >>      
> >>> I'd be more interested in enabling people to build these types of
> >>> storage systems without touching qemu.
> >>>
> >>> Both sheepdog and ceph ultimately transmit I/O over a socket to a
> >>> central daemon, right?
> >>>        
> >> That incurs an extra copy.
> >>
> >>      
> >>> So could we not standardize a protocol for this that both sheepdog and
> >>> ceph could implement?
> >>>        
> >> The protocol already exists, nbd.  It doesn't support snapshotting etc.
> >> but we could extend it.
> >>
> >>      
> > I have no objection to use another protocol for Sheepdog support, but
> > I think nbd protocol is unsuitable for the large storage pool with
> > many VM images.  It is because nbd protocol doesn't support specifing
> > a file name to open.  If we use nbd with such a storage system, the
> > server needs to listen ports as many as the number of VM images.  As
> > far as I see the protocol, It looks difficult to extend it without
> > breaking backward compatibility.
> >    
> 
> The server would be local and talk over a unix domain socket, perhaps 
> anonymous.
> 
> nbd has other issues though, such as requiring a copy and no support for 
> metadata operations such as snapshot and file size extension.
> 

Sorry, my explanation was unclear.  I'm not sure how running servers
on localhost can solve the problem.

What I wanted to say was that we cannot specify the VM image. With
the nbd protocol, the command line arguments are as follows:

 $ qemu nbd:hostname:port

As this syntax shows, with nbd protocol the client cannot pass the VM
image name to the server.

Regards,

Kazutaka
Avi Kivity - May 24, 2010, 11:56 a.m.
On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
>
>> The server would be local and talk over a unix domain socket, perhaps
>> anonymous.
>>
>> nbd has other issues though, such as requiring a copy and no support for
>> metadata operations such as snapshot and file size extension.
>>
>>      
> Sorry, my explanation was unclear.  I'm not sure how running servers
> on localhost can solve the problem.
>    

The local server can convert from the local (nbd) protocol to the remote 
(sheepdog, ceph) protocol.

> What I wanted to say was that we cannot specify the image of VM. With
> nbd protocol, command line arguments are as follows:
>
>   $ qemu nbd:hostname:port
>
> As this syntax shows, with nbd protocol the client cannot pass the VM
> image name to the server.
>    

We would extend it to allow it to connect to a unix domain socket:

   qemu nbd:unix:/path/to/socket

The server at the other end would associate the socket with a filename 
and forward it to the server using the remote protocol.

However, I don't think nbd would be a good protocol.  My preference 
would be for a plugin API, or for a new local protocol that uses 
splice() to avoid copies.
Cláudio Martins - May 24, 2010, 12:07 p.m.
On Mon, 24 May 2010 14:56:29 +0300 Avi Kivity <avi@redhat.com> wrote:
> On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
> >
> >> The server would be local and talk over a unix domain socket, perhaps
> >> anonymous.
> >>
> >> nbd has other issues though, such as requiring a copy and no support for
> >> metadata operations such as snapshot and file size extension.
> >>
> >>      
> > Sorry, my explanation was unclear.  I'm not sure how running servers
> > on localhost can solve the problem.
> >    
> 
> The local server can convert from the local (nbd) protocol to the remote 
> (sheepdog, ceph) protocol.
> 

 Please note that this shouldn't be relevant to the block driver based
on ceph, as it does not use a local daemon -- it connects to the Object
Storage Devices directly over the network.

 Best regards

Cláudio
MORITA Kazutaka - May 24, 2010, 2:01 p.m.
At Mon, 24 May 2010 14:56:29 +0300,
Avi Kivity wrote:
> 
> On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
> >
> >> The server would be local and talk over a unix domain socket, perhaps
> >> anonymous.
> >>
> >> nbd has other issues though, such as requiring a copy and no support for
> >> metadata operations such as snapshot and file size extension.
> >>
> >>      
> > Sorry, my explanation was unclear.  I'm not sure how running servers
> > on localhost can solve the problem.
> >    
> 
> The local server can convert from the local (nbd) protocol to the remote 
> (sheepdog, ceph) protocol.
> 
> > What I wanted to say was that we cannot specify the image of VM. With
> > nbd protocol, command line arguments are as follows:
> >
> >   $ qemu nbd:hostname:port
> >
> > As this syntax shows, with nbd protocol the client cannot pass the VM
> > image name to the server.
> >    
> 
> We would extend it to allow it to connect to a unix domain socket:
> 
>    qemu nbd:unix:/path/to/socket
> 
> The server at the other end would associate the socket with a filename 
> and forward it to the server using the remote protocol.
> 

Thank you for the explanation.  Sheepdog could achieve desired
behavior by creating socket files for all the VM images when the
daemon starts up.

> However, I don't think nbd would be a good protocol.  My preference 
> would be for a plugin API, or for a new local protocol that uses 
> splice() to avoid copies.
> 

Both would be okay for Sheepdog.  I want to take a suitable approach
for qemu.

Thanks,

Kazutaka
Christian Brunner - May 24, 2010, 7:07 p.m.
2010/5/24 MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>:

>> However, I don't think nbd would be a good protocol.  My preference
>> would be for a plugin API, or for a new local protocol that uses
>> splice() to avoid copies.
>>
>
> Both would be okay for Sheepdog.  I want to take a suitable approach
> for qemu.

I think both should be possible:

- Using splice() we would need a daemon that is listening on a control
socket for
  requests from qemu-processes or admin commands. When a qemu-process
  wants to open an image it could call open_image("protocol:imagename") on the
  control socket and the daemon has to create a pipe to which the
image is mapped.
  (What I'm unsure about are the security implications. Do we need some kind of
  authentication for the sockets? What about sVirt?)

- Building a plugin API seems a bit simpler to me, although I'm not
sure if I get the
  idea correctly:
  The block layer has already some kind of api (.bdrv_file_open, .bdrv_read). We
  could simply compile the block-drivers as shared objects and create a method
  for loading the necessary modules at runtime.

Are you planing to use this for all block drivers?

Regards,
Christian
Anthony Liguori - May 24, 2010, 7:16 p.m.
On 05/24/2010 06:56 AM, Avi Kivity wrote:
> On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
>>
>>> The server would be local and talk over a unix domain socket, perhaps
>>> anonymous.
>>>
>>> nbd has other issues though, such as requiring a copy and no support 
>>> for
>>> metadata operations such as snapshot and file size extension.
>>>
>> Sorry, my explanation was unclear.  I'm not sure how running servers
>> on localhost can solve the problem.
>
> The local server can convert from the local (nbd) protocol to the 
> remote (sheepdog, ceph) protocol.
>
>> What I wanted to say was that we cannot specify the image of VM. With
>> nbd protocol, command line arguments are as follows:
>>
>>   $ qemu nbd:hostname:port
>>
>> As this syntax shows, with nbd protocol the client cannot pass the VM
>> image name to the server.
>
> We would extend it to allow it to connect to a unix domain socket:
>
>   qemu nbd:unix:/path/to/socket

nbd is a no-go because it only supports a single, synchronous I/O 
operation at a time and has no mechanism for extensibility.

If we go this route, I think two options are worth considering.  The 
first would be a purely socket based approach where we just accepted the 
extra copy.

The other potential approach would be shared memory based.  We export 
all guest ram as shared memory along with a small bounce buffer pool.  
We would then use a ring queue (potentially even using virtio-blk) and 
an eventfd for notification.

> The server at the other end would associate the socket with a filename 
> and forward it to the server using the remote protocol.
>
> However, I don't think nbd would be a good protocol.  My preference 
> would be for a plugin API, or for a new local protocol that uses 
> splice() to avoid copies.

I think a good shared memory implementation would be preferable to 
plugins.  I think it's worth attempting to do a plugin interface for the 
block layer but I strongly suspect it would not be sufficient.

I would not want to see plugins that interacted with BlockDriverState 
directly, for instance.  We change it far too often.  Our main loop 
functions are also not terribly stable so I'm not sure how we would 
handle that (unless we forced all block plugins to be in a separate thread).

Regards,

Anthony Liguori
Anthony Liguori - May 24, 2010, 7:19 p.m.
On 05/24/2010 06:03 AM, Avi Kivity wrote:
> On 05/24/2010 11:27 AM, Stefan Hajnoczi wrote:
>> On Sun, May 23, 2010 at 1:01 PM, Avi Kivity<avi@redhat.com>  wrote:
>>> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>>> I'd be more interested in enabling people to build these types of 
>>>> storage
>>>> systems without touching qemu.
>>>>
>>>> Both sheepdog and ceph ultimately transmit I/O over a socket to a 
>>>> central
>>>> daemon, right?
>>> That incurs an extra copy.
>> Besides a shared memory approach, I wonder if the splice() family of
>> syscalls could be used to send/receive data through a storage daemon
>> without the daemon looking at or copying the data?
>
> Excellent idea.

splice() eventually requires a copy.  You cannot splice() to linux-aio 
so you'd have to splice() to a temporary buffer and then call into 
linux-aio.  With shared memory, you can avoid ever bringing the data 
into memory via O_DIRECT and linux-aio.

Regards,

Anthony Liguori
Avi Kivity - May 25, 2010, 9:19 a.m.
On 05/24/2010 10:16 PM, Anthony Liguori wrote:
> On 05/24/2010 06:56 AM, Avi Kivity wrote:
>> On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
>>>
>>>> The server would be local and talk over a unix domain socket, perhaps
>>>> anonymous.
>>>>
>>>> nbd has other issues though, such as requiring a copy and no 
>>>> support for
>>>> metadata operations such as snapshot and file size extension.
>>>>
>>> Sorry, my explanation was unclear.  I'm not sure how running servers
>>> on localhost can solve the problem.
>>
>> The local server can convert from the local (nbd) protocol to the 
>> remote (sheepdog, ceph) protocol.
>>
>>> What I wanted to say was that we cannot specify the image of VM. With
>>> nbd protocol, command line arguments are as follows:
>>>
>>>   $ qemu nbd:hostname:port
>>>
>>> As this syntax shows, with nbd protocol the client cannot pass the VM
>>> image name to the server.
>>
>> We would extend it to allow it to connect to a unix domain socket:
>>
>>   qemu nbd:unix:/path/to/socket
>
> nbd is a no-go because it only supports a single, synchronous I/O 
> operation at a time and has no mechanism for extensibility.
>
> If we go this route, I think two options are worth considering.  The 
> first would be a purely socket based approach where we just accepted 
> the extra copy.
>
> The other potential approach would be shared memory based.  We export 
> all guest ram as shared memory along with a small bounce buffer pool.  
> We would then use a ring queue (potentially even using virtio-blk) and 
> an eventfd for notification.

We can't actually export guest memory unless we allocate it as a shared 
memory object, which has many disadvantages.  The only way to export 
anonymous memory now is vmsplice(), which is fairly limited.


>
>> The server at the other end would associate the socket with a 
>> filename and forward it to the server using the remote protocol.
>>
>> However, I don't think nbd would be a good protocol.  My preference 
>> would be for a plugin API, or for a new local protocol that uses 
>> splice() to avoid copies.
>
> I think a good shared memory implementation would be preferable to 
> plugins.  I think it's worth attempting to do a plugin interface for 
> the block layer but I strongly suspect it would not be sufficient.
>
> I would not want to see plugins that interacted with BlockDriverState 
> directly, for instance.  We change it far too often.  Our main loop 
> functions are also not terribly stable so I'm not sure how we would 
> handle that (unless we forced all block plugins to be in a separate 
> thread).

If we manage to make a good long-term stable plugin API, it would be a 
good candidate for the block layer itself.

Some OSes manage to have a stable block driver ABI, so it should be 
possible, if difficult.
Avi Kivity - May 25, 2010, 9:22 a.m.
On 05/24/2010 10:19 PM, Anthony Liguori wrote:
> On 05/24/2010 06:03 AM, Avi Kivity wrote:
>> On 05/24/2010 11:27 AM, Stefan Hajnoczi wrote:
>>> On Sun, May 23, 2010 at 1:01 PM, Avi Kivity<avi@redhat.com>  wrote:
>>>> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>>>> I'd be more interested in enabling people to build these types of 
>>>>> storage
>>>>> systems without touching qemu.
>>>>>
>>>>> Both sheepdog and ceph ultimately transmit I/O over a socket to a 
>>>>> central
>>>>> daemon, right?
>>>> That incurs an extra copy.
>>> Besides a shared memory approach, I wonder if the splice() family of
>>> syscalls could be used to send/receive data through a storage daemon
>>> without the daemon looking at or copying the data?
>>
>> Excellent idea.
>
> splice() eventually requires a copy.  You cannot splice() to linux-aio 
> so you'd have to splice() to a temporary buffer and then call into 
> linux-aio.  With shared memory, you can avoid ever bringing the data 
> into memory via O_DIRECT and linux-aio.

If the final destination is a socket, then you end up queuing guest 
memory as an skbuff.  In theory we could do an aio splice to block 
devices but I don't think that's realistic given our experience with aio 
changes.
Kevin Wolf - May 25, 2010, 11:02 a.m.
Am 23.05.2010 14:01, schrieb Avi Kivity:
> On 05/21/2010 12:29 AM, Anthony Liguori wrote:
>>
>> I'd be more interested in enabling people to build these types of 
>> storage systems without touching qemu.
>>
>> Both sheepdog and ceph ultimately transmit I/O over a socket to a 
>> central daemon, right? 
> 
> That incurs an extra copy.
> 
>> So could we not standardize a protocol for this that both sheepdog and 
>> ceph could implement?
> 
> The protocol already exists, nbd.  It doesn't support snapshotting etc. 
> but we could extend it.
> 
> But IMO what's needed is a plugin API for the block layer.

What would it buy us, apart from more downstreams and having to maintain
a stable API and ABI? Hiding block drivers somewhere else doesn't make
them stop existing, they just might not be properly integrated, but
rather hacked in to fit that limited stable API.

Kevin
Avi Kivity - May 25, 2010, 11:25 a.m.
On 05/25/2010 02:02 PM, Kevin Wolf wrote:
>
>>
>>> So could we not standardize a protocol for this that both sheepdog and
>>> ceph could implement?
>>>        
>> The protocol already exists, nbd.  It doesn't support snapshotting etc.
>> but we could extend it.
>>
>> But IMO what's needed is a plugin API for the block layer.
>>      
> What would it buy us, apart from more downstreams and having to maintain
> a stable API and ABI?

Currently if someone wants to add a new block format, they have to 
upstream it and wait for a new qemu to be released.  With a plugin API, 
they can add a new block format to an existing, supported qemu.

> Hiding block drivers somewhere else doesn't make
> them stop existing, they just might not be properly integrated, but
> rather hacked in to fit that limited stable API.
>    

They would hack it to fit the current API, and hack the API in qemu.git 
to fit their requirements for the next release.
Christoph Hellwig - May 25, 2010, 12:03 p.m.
On Tue, May 25, 2010 at 02:25:53PM +0300, Avi Kivity wrote:
> Currently if someone wants to add a new block format, they have to  
> upstream it and wait for a new qemu to be released.  With a plugin API,  
> they can add a new block format to an existing, supported qemu.

So?  Unless we want a stable driver ABI which I fundamentally oppose as
it would make block driver development hell they'd have to wait for
a new release of the block layer.  It's really just going to be a lot
of pain for no major gain.  qemu releases are frequent enough, and if
users care enough they can also easily patch qemu.
Avi Kivity - May 25, 2010, 12:13 p.m.
On 05/25/2010 03:03 PM, Christoph Hellwig wrote:
> On Tue, May 25, 2010 at 02:25:53PM +0300, Avi Kivity wrote:
>    
>> Currently if someone wants to add a new block format, they have to
>> upstream it and wait for a new qemu to be released.  With a plugin API,
>> they can add a new block format to an existing, supported qemu.
>>      
> So?  Unless we want a stable driver ABI which I fundamentally oppose as
> it would make block driver development hell

We'd only freeze it for a major release.

> they'd have to wait for
> a new release of the block layer.  It's really just going to be a lot
> of pain for no major gain.  qemu releases are frequent enough, and if
> users care enough they can also easily patch qemu.
>    

May not be so easy for them, they lose binary updates from their distro 
and have to keep repatching.
Anthony Liguori - May 25, 2010, 1:25 p.m.
On 05/25/2010 06:25 AM, Avi Kivity wrote:
> On 05/25/2010 02:02 PM, Kevin Wolf wrote:
>>
>>>
>>>> So could we not standardize a protocol for this that both sheepdog and
>>>> ceph could implement?
>>> The protocol already exists, nbd.  It doesn't support snapshotting etc.
>>> but we could extend it.
>>>
>>> But IMO what's needed is a plugin API for the block layer.
>> What would it buy us, apart from more downstreams and having to maintain
>> a stable API and ABI?
>
> Currently if someone wants to add a new block format, they have to 
> upstream it and wait for a new qemu to be released.  With a plugin 
> API, they can add a new block format to an existing, supported qemu.

Whether we have a plugin or protocol based mechanism to implement block 
formats really ends up being just an implementation detail.

In order to implement either, we need to take a subset of block 
functionality that we feel we can support long term and expose that.  
Right now, that's basically just querying characteristics (like size and 
geometry) and asynchronous reads and writes.

A protocol based mechanism has the advantage of being more robust in the 
face of poorly written block backends so if it's possible to make it 
perform as well as a plugin, it's a preferable approach.

Plugins that just expose chunks of QEMU internal state directly (like 
BlockDriver) are a really bad idea IMHO.

Regards,

Anthony Liguori
MORITA Kazutaka - May 25, 2010, 1:26 p.m.
At Mon, 24 May 2010 14:16:32 -0500,
Anthony Liguori wrote:
> 
> On 05/24/2010 06:56 AM, Avi Kivity wrote:
> > On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
> >>
> >>> The server would be local and talk over a unix domain socket, perhaps
> >>> anonymous.
> >>>
> >>> nbd has other issues though, such as requiring a copy and no support 
> >>> for
> >>> metadata operations such as snapshot and file size extension.
> >>>
> >> Sorry, my explanation was unclear.  I'm not sure how running servers
> >> on localhost can solve the problem.
> >
> > The local server can convert from the local (nbd) protocol to the 
> > remote (sheepdog, ceph) protocol.
> >
> >> What I wanted to say was that we cannot specify the image of VM. With
> >> nbd protocol, command line arguments are as follows:
> >>
> >>   $ qemu nbd:hostname:port
> >>
> >> As this syntax shows, with nbd protocol the client cannot pass the VM
> >> image name to the server.
> >
> > We would extend it to allow it to connect to a unix domain socket:
> >
> >   qemu nbd:unix:/path/to/socket
> 
> nbd is a no-go because it only supports a single, synchronous I/O 
> operation at a time and has no mechanism for extensibility.
> 
> If we go this route, I think two options are worth considering.  The 
> first would be a purely socket based approach where we just accepted the 
> extra copy.
> 
> The other potential approach would be shared memory based.  We export 
> all guest ram as shared memory along with a small bounce buffer pool.  
> We would then use a ring queue (potentially even using virtio-blk) and 
> an eventfd for notification.
> 

The shared memory approach assumes that there is a local server that
can talk with the storage system.  But Ceph doesn't require the local
server, and Sheepdog would be extended to support VMs running outside
the storage system.  We could run a local daemon acting only as a
proxy, but I don't think that is a clean approach.  So I think a
socket-based approach is the right way to go.

BTW, is it required to design a common interface?  The way Sheepdog
replicates data is different from Ceph, so I think it is not possible
to define a common protocol as Christian says.

Regards,

Kazutaka

> > The server at the other end would associate the socket with a filename 
> > and forward it to the server using the remote protocol.
> >
> > However, I don't think nbd would be a good protocol.  My preference 
> > would be for a plugin API, or for a new local protocol that uses 
> > splice() to avoid copies.
> 
> I think a good shared memory implementation would be preferable to 
> plugins.  I think it's worth attempting to do a plugin interface for the 
> block layer but I strongly suspect it would not be sufficient.
> 
> I would not want to see plugins that interacted with BlockDriverState 
> directly, for instance.  We change it far too often.  Our main loop 
> functions are also not terribly stable so I'm not sure how we would 
> handle that (unless we forced all block plugins to be in a separate thread).
>
Avi Kivity - May 25, 2010, 1:31 p.m.
On 05/25/2010 04:25 PM, Anthony Liguori wrote:
>> Currently if someone wants to add a new block format, they have to 
>> upstream it and wait for a new qemu to be released.  With a plugin 
>> API, they can add a new block format to an existing, supported qemu.
>
>
> Whether we have a plugin or protocol based mechanism to implement 
> block formats really ends up being just an implementation detail.

True.

> In order to implement either, we need to take a subset of block 
> functionality that we feel we can support long term and expose that.  
> Right now, that's basically just querying characteristics (like size 
> and geometry) and asynchronous reads and writes.

Unfortunately, you're right.

> A protocol based mechanism has the advantage of being more robust in 
> the face of poorly written block backends so if it's possible to make 
> it perform as well as a plugin, it's a preferable approach.

May be hard due to difficulty of exposing guest memory.

>
> Plugins that just expose chunks of QEMU internal state directly (like 
> BlockDriver) are a really bad idea IMHO.

Also, we don't want to expose all of the qemu API.  We should default 
the visibility attribute to "hidden" and expose only select functions, 
perhaps under their own interface.  And no inlines.
Anthony Liguori - May 25, 2010, 1:35 p.m.
On 05/25/2010 08:31 AM, Avi Kivity wrote:
>> A protocol based mechanism has the advantage of being more robust in 
>> the face of poorly written block backends so if it's possible to make 
>> it perform as well as a plugin, it's a preferable approach.
>
> May be hard due to difficulty of exposing guest memory.

If someone did a series to add plugins, I would expect a very strong 
argument as to why a shared memory mechanism was not possible or at 
least plausible.

I'm not sure I understand why shared memory is such a bad thing wrt 
KVM.  Can you elaborate?  Is it simply a matter of fork()?

>>
>> Plugins that just expose chunks of QEMU internal state directly (like 
>> BlockDriver) are a really bad idea IMHO.
>
> Also, we don't want to expose all of the qemu API.  We should default 
> the visibility attribute to "hidden" and expose only select functions, 
> perhaps under their own interface.  And no inlines.

Yeah, if we did plugins, this would be a key requirement.

Regards,

Anthony Liguori
Avi Kivity - May 25, 2010, 1:38 p.m.
On 05/25/2010 04:35 PM, Anthony Liguori wrote:
> On 05/25/2010 08:31 AM, Avi Kivity wrote:
>>> A protocol based mechanism has the advantage of being more robust in 
>>> the face of poorly written block backends so if it's possible to 
>>> make it perform as well as a plugin, it's a preferable approach.
>>
>> May be hard due to difficulty of exposing guest memory.
>
> If someone did a series to add plugins, I would expect a very strong 
> argument as to why a shared memory mechanism was not possible or at 
> least plausible.
>
> I'm not sure I understand why shared memory is such a bad thing wrt 
> KVM.  Can you elaborate?  Is it simply a matter of fork()?

fork() doesn't work in the face of memory hotplug.  What else is there?
Kevin Wolf - May 25, 2010, 1:53 p.m.
Am 25.05.2010 15:25, schrieb Anthony Liguori:
> On 05/25/2010 06:25 AM, Avi Kivity wrote:
>> On 05/25/2010 02:02 PM, Kevin Wolf wrote:
>>>
>>>>
>>>>> So could we not standardize a protocol for this that both sheepdog and
>>>>> ceph could implement?
>>>> The protocol already exists, nbd.  It doesn't support snapshotting etc.
>>>> but we could extend it.
>>>>
>>>> But IMO what's needed is a plugin API for the block layer.
>>> What would it buy us, apart from more downstreams and having to maintain
>>> a stable API and ABI?
>>
>> Currently if someone wants to add a new block format, they have to 
>> upstream it and wait for a new qemu to be released.  With a plugin 
>> API, they can add a new block format to an existing, supported qemu.
> 
> Whether we have a plugin or protocol based mechanism to implement block 
> formats really ends up being just an implementation detail.
> 
> In order to implement either, we need to take a subset of block 
> functionality that we feel we can support long term and expose that.  
> Right now, that's basically just querying characteristics (like size and 
> geometry) and asynchronous reads and writes.
> 
> A protocol based mechanism has the advantage of being more robust in the 
> face of poorly written block backends so if it's possible to make it 
> perform as well as a plugin, it's a preferable approach.
> 
> Plugins that just expose chunks of QEMU internal state directly (like 
> BlockDriver) are a really bad idea IMHO.

I'm still not convinced that we need either. I share Christoph's concern
that we would make our life harder for almost no gain. It's probably a
very small group of users (if it exists at all) that wants to add new
block drivers themselves, but at the same time can't run upstream qemu.

But if we were to decide that there's no way around it, I agree with you
that directly exposing the internal API isn't going to work.

Kevin
Anthony Liguori - May 25, 2010, 1:55 p.m.
On 05/25/2010 08:38 AM, Avi Kivity wrote:
> On 05/25/2010 04:35 PM, Anthony Liguori wrote:
>> On 05/25/2010 08:31 AM, Avi Kivity wrote:
>>>> A protocol based mechanism has the advantage of being more robust 
>>>> in the face of poorly written block backends so if it's possible to 
>>>> make it perform as well as a plugin, it's a preferable approach.
>>>
>>> May be hard due to difficulty of exposing guest memory.
>>
>> If someone did a series to add plugins, I would expect a very strong 
>> argument as to why a shared memory mechanism was not possible or at 
>> least plausible.
>>
>> I'm not sure I understand why shared memory is such a bad thing wrt 
>> KVM.  Can you elaborate?  Is it simply a matter of fork()?
>
> fork() doesn't work in the with of memory hotplug.  What else is there?
>

Is it that fork() doesn't work or is it that fork() is very expensive?

Regards,

Anthony Liguori
Avi Kivity - May 25, 2010, 1:55 p.m.
On 05/25/2010 04:53 PM, Kevin Wolf wrote:
>
> I'm still not convinced that we need either. I share Christoph's concern
> that we would make our life harder for almost no gain. It's probably a
> very small group of users (if it exists at all) that wants to add new
> block drivers themselves, but at the same time can't run upstream qemu.
>
>    

The first part of your argument may be true, but the second isn't.  No 
user can run upstream qemu.git.  It's not tested or supported, and has 
no backwards compatibility guarantees.
Avi Kivity - May 25, 2010, 2:01 p.m.
On 05/25/2010 04:55 PM, Anthony Liguori wrote:
> On 05/25/2010 08:38 AM, Avi Kivity wrote:
>> On 05/25/2010 04:35 PM, Anthony Liguori wrote:
>>> On 05/25/2010 08:31 AM, Avi Kivity wrote:
>>>>> A protocol based mechanism has the advantage of being more robust 
>>>>> in the face of poorly written block backends so if it's possible 
>>>>> to make it perform as well as a plugin, it's a preferable approach.
>>>>
>>>> May be hard due to difficulty of exposing guest memory.
>>>
>>> If someone did a series to add plugins, I would expect a very strong 
>>> argument as to why a shared memory mechanism was not possible or at 
>>> least plausible.
>>>
>>> I'm not sure I understand why shared memory is such a bad thing wrt 
>>> KVM.  Can you elaborate?  Is it simply a matter of fork()?
>>
>> fork() doesn't work in the with of memory hotplug.  What else is there?
>>
>
> Is it that fork() doesn't work or is it that fork() is very expensive?

It doesn't work, fork() is done at block device creation time, which 
freezes the child memory map, while guest memory is allocated at hotplug 
time.

fork() actually isn't very expensive since we use MADV_DONTFORK 
(probably fast enough for everything except realtime).

It may be possible to do a processfd() which can be mmap()ed by another 
process to export anonymous memory using mmu notifiers, not sure how 
easy or mergeable that is.
Anthony Liguori - May 25, 2010, 2:03 p.m.
On 05/25/2010 08:55 AM, Avi Kivity wrote:
> On 05/25/2010 04:53 PM, Kevin Wolf wrote:
>>
>> I'm still not convinced that we need either. I share Christoph's concern
>> that we would make our life harder for almost no gain. It's probably a
>> very small group of users (if it exists at all) that wants to add new
>> block drivers themselves, but at the same time can't run upstream qemu.
>>
>
> The first part of your argument may be true, but the second isn't.  No 
> user can run upstream qemu.git.  It's not tested or supported, and has 
> no backwards compatibility guarantees.

Yes, it does have backwards compatibility guarantees.

Regards,

Anthony Liguori
Anthony Liguori - May 25, 2010, 2:05 p.m.
On 05/25/2010 09:01 AM, Avi Kivity wrote:
> On 05/25/2010 04:55 PM, Anthony Liguori wrote:
>> On 05/25/2010 08:38 AM, Avi Kivity wrote:
>>> On 05/25/2010 04:35 PM, Anthony Liguori wrote:
>>>> On 05/25/2010 08:31 AM, Avi Kivity wrote:
>>>>>> A protocol based mechanism has the advantage of being more robust 
>>>>>> in the face of poorly written block backends so if it's possible 
>>>>>> to make it perform as well as a plugin, it's a preferable approach.
>>>>>
>>>>> May be hard due to difficulty of exposing guest memory.
>>>>
>>>> If someone did a series to add plugins, I would expect a very 
>>>> strong argument as to why a shared memory mechanism was not 
>>>> possible or at least plausible.
>>>>
>>>> I'm not sure I understand why shared memory is such a bad thing wrt 
>>>> KVM.  Can you elaborate?  Is it simply a matter of fork()?
>>>
>>> fork() doesn't work in the face of memory hotplug.  What else is there?
>>>
>>
>> Is it that fork() doesn't work or is it that fork() is very expensive?
>
> It doesn't work, fork() is done at block device creation time, which 
> freezes the child memory map, while guest memory is allocated at 
> hotplug time.

Now I'm confused.  I thought you were saying shared memory somehow 
affects fork().  If you're talking about shared memory inheritance via 
fork(), that's less important.  You can also pass /dev/shm fd's via 
SCM_RIGHTs to establish shared memory segments dynamically.

Regards,

Anthony Liguori

> fork() actually isn't very expensive since we use MADV_DONTFORK 
> (probably fast enough for everything except realtime).
>
> It may be possible to do a processfd() which can be mmap()ed by 
> another process to export anonymous memory using mmu notifiers, not 
> sure how easy or mergeable that is.
>
Kevin Wolf - May 25, 2010, 2:09 p.m.
Am 25.05.2010 15:55, schrieb Avi Kivity:
> On 05/25/2010 04:53 PM, Kevin Wolf wrote:
>>
>> I'm still not convinced that we need either. I share Christoph's concern
>> that we would make our life harder for almost no gain. It's probably a
>> very small group of users (if it exists at all) that wants to add new
>> block drivers themselves, but at the same time can't run upstream qemu.
>>
>>    
> 
> The first part of your argument may be true, but the second isn't.  No 
> user can run upstream qemu.git.  It's not tested or supported, and has 
> no backwards compatibility guarantees.

The second part was basically meant to say "developers don't count here".

Kevin
Avi Kivity - May 25, 2010, 3 p.m.
On 05/25/2010 05:05 PM, Anthony Liguori wrote:
> On 05/25/2010 09:01 AM, Avi Kivity wrote:
>> On 05/25/2010 04:55 PM, Anthony Liguori wrote:
>>> On 05/25/2010 08:38 AM, Avi Kivity wrote:
>>>> On 05/25/2010 04:35 PM, Anthony Liguori wrote:
>>>>> On 05/25/2010 08:31 AM, Avi Kivity wrote:
>>>>>>> A protocol based mechanism has the advantage of being more 
>>>>>>> robust in the face of poorly written block backends so if it's 
>>>>>>> possible to make it perform as well as a plugin, it's a 
>>>>>>> preferable approach.
>>>>>>
>>>>>> May be hard due to difficulty of exposing guest memory.
>>>>>
>>>>> If someone did a series to add plugins, I would expect a very 
>>>>> strong argument as to why a shared memory mechanism was not 
>>>>> possible or at least plausible.
>>>>>
>>>>> I'm not sure I understand why shared memory is such a bad thing 
>>>>> wrt KVM.  Can you elaborate?  Is it simply a matter of fork()?
>>>>
>>>> fork() doesn't work in the face of memory hotplug.  What else is 
>>>> there?
>>>>
>>>
>>> Is it that fork() doesn't work or is it that fork() is very expensive?
>>
>> It doesn't work, fork() is done at block device creation time, which 
>> freezes the child memory map, while guest memory is allocated at 
>> hotplug time.
>
> Now I'm confused.  I thought you were saying shared memory somehow 
> affects fork().  If you're talking about shared memory inheritance via 
> fork(), that's less important. 

The latter.  Why is it less important?  If you don't inherit the memory, 
you can't access it.

> You can also pass /dev/shm fd's via SCM_RIGHTs to establish shared 
> memory segments dynamically.

Doesn't work for anonymous memory.
Avi Kivity - May 25, 2010, 3:01 p.m.
On 05/25/2010 05:09 PM, Kevin Wolf wrote:
>
>> The first part of your argument may be true, but the second isn't.  No
>> user can run upstream qemu.git.  It's not tested or supported, and has
>> no backwards compatibility guarantees.
>>      
> The second part was basically meant to say "developers don't count here".
>    

Agreed.
Anthony Liguori - May 25, 2010, 3:01 p.m.
On 05/25/2010 10:00 AM, Avi Kivity wrote:
> The latter.  Why is it less important?  If you don't inherit the 
> memory, you can't access it.
>
>> You can also pass /dev/shm fd's via SCM_RIGHTs to establish shared 
>> memory segments dynamically.
>
> Doesn't work for anonymous memory.

What's wrong with /dev/shm memory?

Regards,

Anthony Liguori
Avi Kivity - May 25, 2010, 3:02 p.m.
On 05/25/2010 05:03 PM, Anthony Liguori wrote:
> On 05/25/2010 08:55 AM, Avi Kivity wrote:
>> On 05/25/2010 04:53 PM, Kevin Wolf wrote:
>>>
>>> I'm still not convinced that we need either. I share Christoph's 
>>> concern
>>> that we would make our life harder for almost no gain. It's probably a
>>> very small group of users (if it exists at all) that wants to add new
>>> block drivers themselves, but at the same time can't run upstream qemu.
>>>
>>
>> The first part of your argument may be true, but the second isn't.  
>> No user can run upstream qemu.git.  It's not tested or supported, and 
>> has no backwards compatibility guarantees.
>
> Yes, it does have backwards compatibility guarantees.

I meant a random untagged qemu.git snapshot.  Do we guarantee anything 
about it, except that it's likely to be broken?
Avi Kivity - May 25, 2010, 4:16 p.m.
On 05/25/2010 06:01 PM, Anthony Liguori wrote:
> On 05/25/2010 10:00 AM, Avi Kivity wrote:
>> The latter.  Why is it less important?  If you don't inherit the 
>> memory, you can't access it.
>>
>>> You can also pass /dev/shm fd's via SCM_RIGHTs to establish shared 
>>> memory segments dynamically.
>>
>> Doesn't work for anonymous memory.
>
> What's wrong with /dev/shm memory?

The kernel treats anonymous and nonymous memory differently for swapping 
(see /proc/sys/vm/swappiness); transparent hugepages won't work for 
/dev/shm (though it may be argued that that's a problem with thp); setup 
(/dev/shm defaults to half memory IIRC, we want mem+swap); different 
cgroup handling; somewhat clunky (a minor concern to be sure).

Nothing is a killer, but we should prefer anonymous memory.
Anthony Liguori - May 25, 2010, 4:21 p.m.
On 05/25/2010 11:16 AM, Avi Kivity wrote:
> On 05/25/2010 06:01 PM, Anthony Liguori wrote:
>> On 05/25/2010 10:00 AM, Avi Kivity wrote:
>>> The latter.  Why is it less important?  If you don't inherit the 
>>> memory, you can't access it.
>>>
>>>> You can also pass /dev/shm fd's via SCM_RIGHTs to establish shared 
>>>> memory segments dynamically.
>>>
>>> Doesn't work for anonymous memory.
>>
>> What's wrong with /dev/shm memory?
>
> The kernel treats anonymous and nonymous memory differently for 
> swapping (see /proc/sys/vm/swappiness); transparent hugepages won't 
> work for /dev/shm (though it may be argued that that's a problem with 
> thp); setup (/dev/shm defaults to half memory IIRC, we want mem+swap); 
> different cgroup handling; somewhat clunky (a minor concern to be sure).

Surely, with mmu notifiers, it wouldn't be that hard to share anonymous 
memory via an fd though, no?

Regards,

Anthony Liguori

>
> Nothing is a killer, but we should prefer anonymous memory.
>
Avi Kivity - May 25, 2010, 4:27 p.m.
On 05/25/2010 07:21 PM, Anthony Liguori wrote:
> On 05/25/2010 11:16 AM, Avi Kivity wrote:
>> On 05/25/2010 06:01 PM, Anthony Liguori wrote:
>>> On 05/25/2010 10:00 AM, Avi Kivity wrote:
>>>> The latter.  Why is it less important?  If you don't inherit the 
>>>> memory, you can't access it.
>>>>
>>>>> You can also pass /dev/shm fd's via SCM_RIGHTs to establish shared 
>>>>> memory segments dynamically.
>>>>
>>>> Doesn't work for anonymous memory.
>>>
>>> What's wrong with /dev/shm memory?
>>
>> The kernel treats anonymous and nonymous memory differently for 
>> swapping (see /proc/sys/vm/swappiness); transparent hugepages won't 
>> work for /dev/shm (though it may be argued that that's a problem with 
>> thp); setup (/dev/shm defaults to half memory IIRC, we want 
>> mem+swap); different cgroup handling; somewhat clunky (a minor 
>> concern to be sure).
>
> Surely, with mmu notifiers, it wouldn't be that hard to share 
> anonymous memory via an fd though, no?

That's what I suggested with processfd().  I wouldn't call it easy but 
it's likely doable.  Whether it's mergable is a different issue.
Blue Swirl - May 25, 2010, 8:13 p.m.
On Mon, May 24, 2010 at 2:17 AM, Yehuda Sadeh Weinraub
<yehudasa@gmail.com> wrote:
> On Sun, May 23, 2010 at 12:59 AM, Blue Swirl <blauwirbel@gmail.com> wrote:
>> On Thu, May 20, 2010 at 11:02 PM, Yehuda Sadeh Weinraub
>> <yehudasa@gmail.com> wrote:
>>> On Thu, May 20, 2010 at 1:31 PM, Blue Swirl <blauwirbel@gmail.com> wrote:
>>>> On Wed, May 19, 2010 at 7:22 PM, Christian Brunner <chb@muc.de> wrote:
>>>>> The attached patch is a block driver for the distributed file system
>>>>> Ceph (http://ceph.newdream.net/). This driver uses librados (which
>>>>> is part of the Ceph server) for direct access to the Ceph object
>>>>> store and is running entirely in userspace. Therefore it is
>>>>> called "rbd" - rados block device.
>>> ...
>>>>
>>>> IIRC underscores here may conflict with system header use. Please use
>>>> something like QEMU_BLOCK_RADOS_H.
>>>
>>> This header is shared between the linux kernel client and the ceph
>>> userspace servers and client. We can actually get rid of it, as we
>>> only need it to define CEPH_OSD_TMAP_SET. We can move this definition
>>> to librados.h.
>>>
>>>>> diff --git a/block/rbd_types.h b/block/rbd_types.h
>>>>> new file mode 100644
>>>>> index 0000000..dfd5aa0
>>>>> --- /dev/null
>>>>> +++ b/block/rbd_types.h
>>>>> @@ -0,0 +1,48 @@
>>>>> +#ifndef _FS_CEPH_RBD
>>>>> +#define _FS_CEPH_RBD
>>>>
>>>> QEMU_BLOCK_RBD?
>>>
>>> This header is shared between the ceph kernel client, between the qemu
>>> rbd module (and between other ceph utilities). It'd be much easier
>>> maintaining it without having to have a different implementation for
>>> each. The same goes to the use of __le32/64 and __u32/64 within these
>>> headers.
>>
>> This is user space, so identifiers must conform to C standards. The
>> identifiers beginning with underscores are reserved.
>>
>> Doesn't __le32/64 also depend on some GCC extension? Or sparse magic?
> It depends on gcc extension. If needed we can probably have a separate
> header for the qemu block device that uses alternative types. Though
> looking at the qemu code I see use of other gcc extensions so I'm not
> sure this is a real issue.

We use some (contained with for example macros if possible), but in
earlier discussions, __le32 etc. were considered problematic. IIRC
it's hard to provide alternate versions for other compilers (or older
versions of gcc).

>
>>
>>>
>>>>
>>>>> +
>>>>> +#include <linux/types.h>
>>>>
>>>> Can you use standard includes, like <sys/types.h> or <inttypes.h>? Are
>>>> Ceph libraries used in other systems than Linux?
>>>
>>> Not at the moment. I guess that we can take this include out.
>>>
>>>>
>>>>> +
>>>>> +/*
>>>>> + * rbd image 'foo' consists of objects
>>>>> + *   foo.rbd      - image metadata
>>>>> + *   foo.00000000
>>>>> + *   foo.00000001
>>>>> + *   ...          - data
>>>>> + */
>>>>> +
>>>>> +#define RBD_SUFFIX             ".rbd"
>>>>> +#define RBD_DIRECTORY           "rbd_directory"
>>>>> +
>>>>> +#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
>>>>> +
>>>>> +#define RBD_MAX_OBJ_NAME_SIZE  96
>>>>> +#define RBD_MAX_SEG_NAME_SIZE  128
>>>>> +
>>>>> +#define RBD_COMP_NONE          0
>>>>> +#define RBD_CRYPT_NONE         0
>>>>> +
>>>>> +static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
>>>>> +static const char rbd_signature[] = "RBD";
>>>>> +static const char rbd_version[] = "001.001";
>>>>> +
>>>>> +struct rbd_obj_snap_ondisk {
>>>>> +       __le64 id;
>>>>> +       __le64 image_size;
>>>>> +} __attribute__((packed));
>>>>> +
>>>>> +struct rbd_obj_header_ondisk {
>>>>> +       char text[64];
>>>>> +       char signature[4];
>>>>> +       char version[8];
>>>>> +       __le64 image_size;
>>>>
>>>> Unaligned? Is the disk format fixed?
>>>
>>> This is a packed structure that represents the on disk format.
>>> Operations on it are being done only to read from the disk header or
>>> to write to the disk header.
>>
>> That's clear. But what exactly is the alignment of field 'image_size'?
>> Could there be implicit padding to mod 8 between 'version' and
>> 'image_size' with some compilers?
>
> Obviously it's not 64 bit aligned. As it's an on-disk header, I don't
> see alignment a real issue. As was said before, any operation on these
> fields have to go through endianity conversion anyway, and this
> structure should not be used directly. For such datastructures I'd
> rather have the fields ordered in some logical order than maintaining
> the alignment by ourselves. That's why we have that __attribute__
> packed in the end to let the compiler deal with those issues. Other
> compilers though have their own syntax for packed structures (but I do
> see other uses of this packed syntax in the qemu code).

Packed structures are OK, but the padding should be explicit to avoid
compiler problems.

Eventually the disk format is read into memory buffer and then aligned
fields should be also faster on all architectures, even on x86.

>>
>> If there were no other constraints, I'd either make the padding
>> explicit, or rearrange/resize fields so that the field alignment is
>> natural. Thus my question, can you change the disk format or are there
>> already some deployments?
>
> We can certainly make changes to the disk format at this point. I'm
> not very happy with those 3 __u8 in the middle, and they can probably
> be changed to a 32 bit flags field. We can get it 64 bit aligned too.

I hope my comments helped you to avoid possible problems in the
future. From purely QEMU code base point of view, any architecture
goes. Some architectures are faster to emulate, others are slower.

>>
>> Otherwise, I'd just add some warning comment so people don't try to
>> use clever pointer tricks which will crash on machines with enforced
>> alignment.
>>
> Any clever pointer tricks that'll work on one architecture will
> probably be wrong on another (different word
> size/alignment/endianity), so maybe crashing machines is a good
> indicator to bad implementation. We shouldn't try to hide the
> problems.
>
> Thanks,
> Yehuda
>

Patch

diff --git a/Makefile b/Makefile
index eb9e02b..b1ab3e9 100644
--- a/Makefile
+++ b/Makefile
@@ -27,6 +27,9 @@  configure: ;
 $(call set-vpath, $(SRC_PATH):$(SRC_PATH)/hw)
 
 LIBS+=-lz $(LIBS_TOOLS)
+ifdef CONFIG_RBD
+LIBS+=-lrados
+endif
 
 ifdef BUILD_DOCS
 DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8
diff --git a/Makefile.objs b/Makefile.objs
index acbaf22..85791ac 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -18,6 +18,7 @@  block-nested-y += parallels.o nbd.o blkdebug.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
+block-nested-$(CONFIG_RBD) += rbd.o
 
 block-obj-y +=  $(addprefix block/, $(block-nested-y))
 
diff --git a/block/rados.h b/block/rados.h
new file mode 100644
index 0000000..6cde9a1
--- /dev/null
+++ b/block/rados.h
@@ -0,0 +1,376 @@ 
+#ifndef __RADOS_H
+#define __RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+
+
+/*
+ * osdmap encoding versions
+ */
+#define CEPH_OSDMAP_INC_VERSION     5
+#define CEPH_OSDMAP_INC_VERSION_EXT 5
+#define CEPH_OSDMAP_VERSION         5
+#define CEPH_OSDMAP_VERSION_EXT     5
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+	unsigned char fsid[16];	/* opaque 16-byte cluster identifier */
+};
+
+/* Byte-wise fsid comparison; memcmp() semantics (0 means equal). */
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+				    const struct ceph_fsid *b)
+{
+	return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;	/* snapshot id, little-endian on the wire */
+#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
+
+/* On-wire timestamp; both fields little-endian.  Note tv_sec is 32-bit. */
+struct ceph_timespec {
+	__le32 tv_sec;
+	__le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH     1
+#define CEPH_OBJECT_LAYOUT_LINEAR   2
+#define CEPH_OBJECT_LAYOUT_HASHINO  3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH  0
+#define CEPH_PG_LAYOUT_HASH   1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+/* 8 bytes on the wire (2+2+4, little-endian) -- the single __le64 encoding. */
+struct ceph_pg {
+	__le16 preferred; /* preferred primary osd */
+	__le16 ps;        /* placement seed */
+	__le32 pool;      /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ *  pg_num -- base number of pseudorandomly placed pgs
+ *
+ *  pgp_num -- effective number when calculating pg placement.  this
+ * is used for pg_num increases.  new pgs result in data being "split"
+ * into new pgs.  for this to proceed smoothly, new pgs are initially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split.  only _then_ are the new
+ * pgs placed independently.
+ *
+ *  lpg_num -- localized pg count (per device).  replicas are randomly
+ * selected.
+ *
+ *  lpgp_num -- as above.
+ */
+#define CEPH_PG_TYPE_REP     1
+#define CEPH_PG_TYPE_RAID4   2
+#define CEPH_PG_POOL_VERSION 2
+/*
+ * NOTE(review): the four leading __u8 fields leave the later __le64
+ * members unaligned; the struct is packed, so access fields only via
+ * byte copies / endianness helpers, never through casted pointers.
+ */
+struct ceph_pg_pool {
+	__u8 type;                /* CEPH_PG_TYPE_* */
+	__u8 size;                /* number of osds in each pg */
+	__u8 crush_ruleset;       /* crush placement rule */
+	__u8 object_hash;         /* hash mapping object name to ps */
+	__le32 pg_num, pgp_num;   /* number of pg's */
+	__le32 lpg_num, lpgp_num; /* number of localized pg's */
+	__le32 last_change;       /* most recent epoch changed */
+	__le64 snap_seq;          /* seq for per-pool snapshot */
+	__le32 snap_epoch;        /* epoch of last snap */
+	__le32 num_snaps;
+	__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
+	__le64 auid;               /* who owns the pg */
+} __attribute__ ((packed));
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time.  b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+	/* masked value falls inside the current bin count -> use it as-is */
+	if ((x & bmask) < b)
+		return x & bmask;
+	else
+		/* otherwise fall back to the previous power-of-two range */
+		return x & (bmask >> 1);
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+/* On-wire placement of one object: its pg plus an optional stripe unit. */
+struct ceph_object_layout {
+	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
+	__le32 ol_stripe_unit;    /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+	__le32 epoch;
+	__le64 version;	/* NOTE(review): 64-bit field at offset 4 -- packed, unaligned */
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS 1
+#define CEPH_OSD_UP     2
+
+/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN  0x10000
+#define CEPH_OSD_OUT 0
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
+
+/*
+ * osd ops
+ */
+#define CEPH_OSD_OP_MODE       0xf000
+#define CEPH_OSD_OP_MODE_RD    0x1000
+#define CEPH_OSD_OP_MODE_WR    0x2000
+#define CEPH_OSD_OP_MODE_RMW   0x3000
+#define CEPH_OSD_OP_MODE_SUB   0x4000
+
+#define CEPH_OSD_OP_TYPE       0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK  0x0100
+#define CEPH_OSD_OP_TYPE_DATA  0x0200
+#define CEPH_OSD_OP_TYPE_ATTR  0x0300
+#define CEPH_OSD_OP_TYPE_EXEC  0x0400
+#define CEPH_OSD_OP_TYPE_PG    0x0500
+
+enum {
+	/** data **/
+	/* read */
+	CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
+	CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
+
+	/* fancy read */
+	CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
+
+	/* write */
+	CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
+	CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
+	CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
+	CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
+	CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
+
+	/* fancy write */
+	CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
+	CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
+	CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
+	CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
+
+	CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
+	CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
+	CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
+
+	CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
+
+	/** attrs **/
+	/* read */
+	CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
+	CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+
+	/* write */
+	CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
+	CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
+	CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
+	CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
+
+	/** subop **/
+	CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
+	CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
+	CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
+	CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
+	CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
+
+	/** lock **/
+	CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
+	CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
+	CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
+	CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
+	CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
+	CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
+
+	/** exec **/
+	CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
+
+	/** pg **/
+	CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
+};
+
+/* Predicates over a packed op code: test its CEPH_OSD_OP_TYPE_* bits. */
+static inline int ceph_osd_op_type_lock(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
+}
+static inline int ceph_osd_op_type_data(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+
+/* Predicates over a packed op code: test its CEPH_OSD_OP_MODE_* bits. */
+static inline int ceph_osd_op_mode_subop(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
+}
+
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_RM  'r'
+
+extern const char *ceph_osd_op_name(int op);
+
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+	CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
+	CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
+	CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
+	CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
+	CEPH_OSD_FLAG_READ = 16,        /* op may read */
+	CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
+	CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
+	CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
+	CEPH_OSD_FLAG_BALANCE_READS = 256,
+	CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
+	CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
+	CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
+};
+
+enum {
+	CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
+};
+
+#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+
+/*
+ * an individual object operation.  each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+	__le16 op;           /* CEPH_OSD_OP_* */
+	__le32 flags;        /* CEPH_OSD_FLAG_* */
+	union {
+		struct {
+			__le64 offset, length;
+			__le64 truncate_size;
+			__le32 truncate_seq;
+		} __attribute__ ((packed)) extent;	/* byte-range data ops */
+		struct {
+			__le32 name_len;
+			__le32 value_len;
+		} __attribute__ ((packed)) xattr;	/* attr ops: name/value lengths */
+		struct {
+			__u8 class_len;
+			__u8 method_len;
+			__u8 argc;
+			__le32 indata_len;
+		} __attribute__ ((packed)) cls;	/* exec ops: class/method call, presumably */
+		struct {
+			__le64 cookie, count;
+		} __attribute__ ((packed)) pgls;	/* pg listing cursor, presumably */
+	};
+	__le32 payload_len;	/* length of data payload following the op */
+} __attribute__ ((packed));
+
+/*
+ * osd request message header.  each request may include multiple
+ * ceph_osd_op object operations.
+ */
+struct ceph_osd_request_head {
+	__le32 client_inc;                 /* client incarnation */
+	struct ceph_object_layout layout;  /* pgid */
+	__le32 osdmap_epoch;               /* client's osdmap epoch */
+
+	__le32 flags;                      /* CEPH_OSD_FLAG_*, presumably */
+
+	struct ceph_timespec mtime;        /* for mutations only */
+	struct ceph_eversion reassert_version; /* if we are replaying op */
+
+	__le32 object_len;     /* length of object name */
+
+	__le64 snapid;         /* snapid to read */
+	__le64 snap_seq;       /* writer's snap context */
+	__le32 num_snaps;
+
+	__le16 num_ops;        /* number of entries in ops[] */
+	struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
+} __attribute__ ((packed));
+
+struct ceph_osd_reply_head {
+	__le32 client_inc;                /* client incarnation */
+	__le32 flags;
+	struct ceph_object_layout layout;
+	__le32 osdmap_epoch;
+	struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+	__le32 result;                    /* result code */
+
+	__le32 object_len;                /* length of object name */
+	__le32 num_ops;
+	struct ceph_osd_op ops[0];  /* ops[], object; NOTE(review): [0] here vs C99 [] in request head */
+} __attribute__ ((packed));
+
+
+#endif
diff --git a/block/rbd.c b/block/rbd.c
new file mode 100644
index 0000000..eedae50
--- /dev/null
+++ b/block/rbd.c
@@ -0,0 +1,585 @@ 
+/*
+ * QEMU Block driver for RADOS (Ceph)
+ *
+ * Copyright (C) 2010 Christian Brunner <chb@muc.de>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include <sys/types.h>
+#include <stdbool.h>
+
+#include <qemu-common.h>
+
+#include "rbd_types.h"
+#include "rados.h"
+#include "module.h"
+#include "block_int.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <rados/librados.h>
+
+#include <signal.h>
+
+/*
+ * When specifying the image filename use:
+ *
+ * rbd:poolname/devicename
+ *
+ * poolname must be the name of an existing rados pool
+ *
+ * devicename is the basename for all objects used to
+ * emulate the raw device.
+ *
+ * Metadata information (image size, ...) is stored in an
+ * object with the name "devicename.rbd".
+ *
+ * The raw device is split into 4MB sized objects by default.
+ * The sequence number is encoded as a 12-character hex string
+ * and is attached to the devicename, separated by a dot.
+ * e.g. "devicename.1234567890ab"
+ *
+ */
+
+#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
+
+/*
+ * Per-request state for the asynchronous I/O path.  One RBDAIOCB
+ * tracks a whole guest request; it fans out into one rados operation
+ * per touched object (counted by aiocnt).
+ */
+typedef struct RBDAIOCB {
+    BlockDriverAIOCB common;
+    QEMUBH *bh;          /* bottom half that completes the request */
+    int ret;             /* accumulated byte count, or negative errno */
+    QEMUIOVector *qiov;  /* the guest's scatter/gather list */
+    char *bounce;        /* linear bounce buffer covering the whole qiov */
+    int write;           /* 1 for writes, 0 for reads */
+    int64_t sector_num;
+    int aiocnt;          /* rados operations still outstanding */
+    int error;           /* set once any segment has failed */
+} RBDAIOCB;
+
+/*
+ * Per-segment context handed to the rados completion callback.
+ */
+typedef struct RADOSCB {
+    int rcbid;           /* NOTE(review): never assigned in this file — confirm intent */
+    RBDAIOCB *acb;       /* owning request */
+    int done;
+    int64_t segsize;     /* bytes covered by this segment */
+    char *buf;           /* this segment's position inside acb->bounce */
+} RADOSCB;
+
+/*
+ * Per-BlockDriverState driver state, filled in by rbd_open().
+ */
+typedef struct RBDRVRBDState {
+    rados_pool_t pool;                 /* open rados pool handle */
+    char name[RBD_MAX_OBJ_NAME_SIZE];  /* image name = object basename */
+    int name_len;
+    uint64_t size;                     /* image size in bytes */
+    uint64_t objsize;                  /* bytes per rados object */
+} RBDRVRBDState;
+
+/* version 1 of the on-disk header format (see rbd_types.h) */
+typedef struct rbd_obj_header_ondisk RbdHeader1;
+
+/*
+ * Parse an image filename of the form "rbd:poolname/devicename".
+ *
+ * On success the pool name is copied into @pool (which must hold at
+ * least RBD_MAX_SEG_NAME_SIZE bytes), the device name into @name (at
+ * least RBD_MAX_OBJ_NAME_SIZE bytes), and the device name length is
+ * returned.  Returns -EINVAL on malformed input.
+ */
+static int rbd_parsename(const char *filename, char *pool, char *name)
+{
+    const char *rbdname;
+    char *p, *n;
+    int l;
+
+    if (!strstart(filename, "rbd:", &rbdname)) {
+        return -EINVAL;
+    }
+
+    /* All callers pass a pool buffer of RBD_MAX_SEG_NAME_SIZE bytes;
+     * the previous bound of 2 * RBD_MAX_SEG_NAME_SIZE could overflow it. */
+    pstrcpy(pool, RBD_MAX_SEG_NAME_SIZE, rbdname);
+    p = strchr(pool, '/');
+    if (p == NULL) {
+        return -EINVAL;
+    }
+
+    *p = '\0';
+    n = ++p;
+
+    l = strlen(n);
+
+    /* ">=" because the terminating NUL must fit in the buffer too */
+    if (l >= RBD_MAX_OBJ_NAME_SIZE) {
+        fprintf(stderr, "object name too long\n");
+        return -EINVAL;
+    } else if (l <= 0) {
+        fprintf(stderr, "object name too short\n");
+        return -EINVAL;
+    }
+
+    pstrcpy(name, RBD_MAX_OBJ_NAME_SIZE, n);
+
+    return l;
+}
+
+/*
+ * Build a single CEPH_OSD_TMAP_* operation buffer for
+ * rados_tmap_update().  The encoding is: one op byte, the
+ * length-prefixed key name, then an empty length-prefixed value
+ * buffer.  Length prefixes are little-endian on the wire, so encode
+ * them explicitly instead of copying host-endian integers.
+ *
+ * On success *tmap_desc points to a qemu_malloc'ed buffer owned by
+ * the caller (release with free_tmap_op()) and the encoded size is
+ * returned.
+ */
+static int create_tmap_op(uint8_t op, const char *name, char **tmap_desc)
+{
+    uint32_t len = strlen(name);
+    uint32_t le_len;
+    /* encoding op + name + empty buffer */
+    uint32_t total_len = 1 + (sizeof(uint32_t) + len) + sizeof(uint32_t);
+    char *desc;
+
+    desc = qemu_malloc(total_len);
+    if (!desc) {
+        /* qemu_malloc() aborts on OOM; this check is purely defensive */
+        return -ENOMEM;
+    }
+
+    *tmap_desc = desc;
+
+    *desc = op;
+    desc++;
+    le_len = cpu_to_le32(len);          /* wire format is little-endian */
+    memcpy(desc, &le_len, sizeof(le_len));
+    desc += sizeof(le_len);
+    memcpy(desc, name, len);
+    desc += len;
+    le_len = 0;                         /* empty value buffer */
+    memcpy(desc, &le_len, sizeof(le_len));
+    desc += sizeof(le_len);
+
+    return desc - *tmap_desc;
+}
+
+/* Release a tmap operation buffer allocated by create_tmap_op(). */
+static void free_tmap_op(char *tmap_desc)
+{
+    qemu_free(tmap_desc);
+}
+
+/*
+ * Record a newly created image in the pool-wide RBD_DIRECTORY object
+ * so that tools can enumerate the images stored in a pool.
+ */
+static int rbd_register_image(rados_pool_t pool, const char *name)
+{
+    char *desc;
+    int len;
+
+    len = create_tmap_op(CEPH_OSD_TMAP_SET, name, &desc);
+    if (len < 0) {
+        return len;
+    }
+
+    len = rados_tmap_update(pool, RBD_DIRECTORY, desc, len);
+    free_tmap_op(desc);
+
+    return len;
+}
+
+/*
+ * Create a new rbd image.
+ *
+ * The image metadata (size, object order, ...) is written to the
+ * "<name>.rbd" header object and the image is then registered in the
+ * pool's directory object.  Fails with -EEXIST if a header object of
+ * that name already exists.
+ */
+static int rbd_create(const char *filename, QEMUOptionParameter *options)
+{
+    int64_t bytes = 0;
+    int64_t objsize;
+    uint64_t size;
+    time_t mtime;
+    uint8_t obj_order = RBD_DEFAULT_OBJ_ORDER;
+    char pool[RBD_MAX_SEG_NAME_SIZE];
+    char n[RBD_MAX_SEG_NAME_SIZE];
+    char name[RBD_MAX_SEG_NAME_SIZE];
+    RbdHeader1 header;
+    rados_pool_t p;
+    int name_len;
+    int ret;
+
+    if ((name_len = rbd_parsename(filename, pool, name)) < 0) {
+        return -EINVAL;
+    }
+
+    snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", name, RBD_SUFFIX);
+
+    /* Read out options */
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            bytes = options->value.n;
+        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+            if (options->value.n) {
+                objsize = options->value.n;
+                if ((objsize - 1) & objsize) {    /* not a power of 2? */
+                    fprintf(stderr, "obj size needs to be power of 2\n");
+                    return -EINVAL;
+                }
+                if (objsize < 4096) {
+                    fprintf(stderr, "obj size too small\n");
+                    return -EINVAL;
+                }
+
+                /* convert the object size to its log2 (the "order") */
+                for (obj_order = 0; obj_order < 64; obj_order++) {
+                    if (objsize == 1) {
+                        break;
+                    }
+                    objsize >>= 1;
+                }
+            }
+        }
+        options++;
+    }
+
+    /* fill in the on-disk header; multi-byte fields are little-endian */
+    memset(&header, 0, sizeof(header));
+    pstrcpy(header.text, sizeof(header.text), rbd_text);
+    pstrcpy(header.signature, sizeof(header.signature), rbd_signature);
+    pstrcpy(header.version, sizeof(header.version), rbd_version);
+    header.image_size = bytes;
+    cpu_to_le64s((uint64_t *)&header.image_size);
+    header.obj_order = obj_order;
+    header.crypt_type = RBD_CRYPT_NONE;
+    header.comp_type = RBD_COMP_NONE;
+    header.snap_seq = 0;
+    header.snap_count = 0;
+    cpu_to_le32s(&header.snap_count);
+
+    if (rados_initialize(0, NULL) < 0) {
+        fprintf(stderr, "error initializing\n");
+        return -EIO;
+    }
+
+    if (rados_open_pool(pool, &p)) {
+        fprintf(stderr, "error opening pool %s\n", pool);
+        rados_deinitialize();    /* don't leak the rados connection */
+        return -EIO;
+    }
+
+    /* check for existing rbd header file */
+    ret = rados_stat(p, n, &size, &mtime);
+    if (ret == 0) {
+        ret = -EEXIST;
+        goto done;
+    }
+
+    /* create header file */
+    ret = rados_write(p, n, 0, (const char *)&header, sizeof(header));
+    if (ret < 0) {
+        goto done;
+    }
+
+    ret = rbd_register_image(p, name);
+done:
+    rados_close_pool(p);
+    rados_deinitialize();
+
+    return ret;
+}
+
+/*
+ * Open an rbd image: connect to rados, open the pool and read the
+ * "<name>.rbd" header object to learn the image size and object size.
+ *
+ * On failure the pool handle and the rados connection are released
+ * again (the original code leaked them on every error path).
+ */
+static int rbd_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    RBDRVRBDState *s = bs->opaque;
+    char pool[RBD_MAX_SEG_NAME_SIZE];
+    char n[RBD_MAX_SEG_NAME_SIZE];
+    char hbuf[4096];
+    int r;
+
+    if ((s->name_len = rbd_parsename(filename, pool, s->name)) < 0) {
+        return -EINVAL;
+    }
+    snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s%s", s->name, RBD_SUFFIX);
+
+    if (rados_initialize(0, NULL) < 0) {
+        fprintf(stderr, "error initializing\n");
+        return -EIO;
+    }
+
+    if (rados_open_pool(pool, &s->pool)) {
+        fprintf(stderr, "error opening pool %s\n", pool);
+        rados_deinitialize();
+        return -EIO;
+    }
+
+    /* require a complete header: a short read would leave hbuf
+     * (partially) uninitialized and the checks below meaningless */
+    r = rados_read(s->pool, n, 0, hbuf, 4096);
+    if (r < (int)sizeof(RbdHeader1)) {
+        fprintf(stderr, "error reading header from %s\n", s->name);
+        goto failed;
+    }
+    if (!strncmp(hbuf + 64, rbd_signature, 4)) {
+        if (!strncmp(hbuf + 68, rbd_version, 8)) {
+            RbdHeader1 *header;
+
+            header = (RbdHeader1 *)hbuf;
+            le64_to_cpus((uint64_t *)&header->image_size);
+            s->size = header->image_size;
+            /* 1ULL: obj_order may exceed the width of a plain int shift */
+            s->objsize = 1ULL << header->obj_order;
+        } else {
+            /* the version field is 8 bytes and need not be NUL
+             * terminated, so bound the print with a precision */
+            fprintf(stderr, "Unknown image version %.8s\n", hbuf + 68);
+            goto failed;
+        }
+    } else {
+        fprintf(stderr, "Invalid header signature %.4s\n", hbuf + 64);
+        goto failed;
+    }
+
+    return 0;
+
+failed:
+    rados_close_pool(s->pool);
+    rados_deinitialize();
+    return -EIO;
+}
+
+/* Close the image: drop the pool handle and the rados connection. */
+static void rbd_close(BlockDriverState *bs)
+{
+    RBDRVRBDState *s = bs->opaque;
+
+    rados_close_pool(s->pool);
+    rados_deinitialize();
+}
+
+/*
+ * Synchronous read/write path.
+ *
+ * The request is split along object boundaries; each iteration issues
+ * one rados_read/rados_write against the object holding the current
+ * offset.  Objects that do not exist (or return short) are read back
+ * as zeros, since unwritten parts of the device are sparse.
+ */
+static int rbd_rw(BlockDriverState *bs, int64_t sector_num,
+                  uint8_t *buf, int nb_sectors, int write)
+{
+    RBDRVRBDState *s = bs->opaque;
+    char n[RBD_MAX_SEG_NAME_SIZE];
+
+    int64_t segnr, segoffs, segsize, r;
+    int64_t off, size;
+
+    off = sector_num * 512;
+    size = nb_sectors * 512;
+    segnr = off / s->objsize;
+    segoffs = off % s->objsize;
+    segsize = s->objsize - segoffs;
+
+    while (size > 0) {
+        if (size < segsize) {
+            segsize = size;
+        }
+
+        /* object names are "<image>.<segment number as 12 hex digits>" */
+        snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name,
+                 (long long unsigned int)segnr);
+
+        if (write) {
+            r = rados_write(s->pool, n, segoffs, (const char *)buf, segsize);
+            if (r < 0) {
+                return r;
+            }
+        } else {
+            r = rados_read(s->pool, n, segoffs, (char *)buf, segsize);
+            if (r == -ENOENT) {
+                /* nonexistent object: sparse area reads as zeros */
+                memset(buf, 0, segsize);
+            } else if (r < 0) {
+                return r;
+            } else if (r < segsize) {
+                /* short read: zero-fill the tail of the segment */
+                memset(buf + r, 0, segsize - r);
+            }
+            /* (the old dead store "r = segsize" served no purpose) */
+        }
+
+        buf += segsize;
+        size -= segsize;
+        segoffs = 0;
+        segsize = s->objsize;
+        segnr++;
+    }
+
+    return 0;
+}
+
+/* bdrv_read entry point: delegate to the common rbd_rw() helper. */
+static int rbd_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    return rbd_rw(bs, sector_num, buf, nb_sectors, 0);
+}
+
+/* bdrv_write entry point: rbd_rw() never modifies the buffer on the
+ * write path, so casting away const here is safe. */
+static int rbd_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    return rbd_rw(bs, sector_num, (uint8_t *) buf, nb_sectors, 1);
+}
+
+/*
+ * Cancel an in-flight request: drop its bottom half and release the
+ * AIOCB.
+ *
+ * NOTE(review): outstanding rados_aio completions still hold a
+ * pointer to this acb (via their RADOSCB) and may fire after the
+ * release — confirm that in-flight rados operations are drained
+ * before cancel can run, otherwise this is a use-after-free.
+ */
+static void rbd_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    RBDAIOCB *acb = (RBDAIOCB *) blockacb;
+    qemu_bh_delete(acb->bh);
+    acb->bh = NULL;
+    qemu_aio_release(acb);
+}
+
+/* AIOCB allocation pool for this driver. */
+static AIOPool rbd_aio_pool = {
+    .aiocb_size = sizeof(RBDAIOCB),
+    .cancel = rbd_aio_cancel,
+};
+
+/*
+ * This is the callback function for rados_aio_read and _write
+ *
+ * It accumulates the per-segment results into the owning AIOCB and,
+ * once the last outstanding segment completes, schedules the bottom
+ * half that finishes the request.  (The unused variable "i" and its
+ * dead store have been removed.)
+ */
+static void rbd_finish_aiocb(rados_completion_t c, RADOSCB *rcb)
+{
+    RBDAIOCB *acb = rcb->acb;
+    int64_t r;
+
+    acb->aiocnt--;
+    r = rados_aio_get_return_value(c);
+    rados_aio_release(c);
+    if (acb->write) {
+        if (r < 0) {
+            acb->ret = r;
+            acb->error = 1;
+        } else if (!acb->error) {
+            acb->ret += rcb->segsize;
+        }
+    } else {
+        if (r == -ENOENT) {
+            /* nonexistent object: the segment reads back as zeros */
+            memset(rcb->buf, 0, rcb->segsize);
+            if (!acb->error) {
+                acb->ret += rcb->segsize;
+            }
+        } else if (r < 0) {
+            acb->ret = r;
+            acb->error = 1;
+        } else if (r < rcb->segsize) {
+            /* short read: zero-fill the tail of the segment */
+            memset(rcb->buf + r, 0, rcb->segsize - r);
+            if (!acb->error) {
+                acb->ret += rcb->segsize;
+            }
+        } else if (!acb->error) {
+            acb->ret += r;
+        }
+    }
+    qemu_free(rcb);
+    /* last completion schedules the bottom half (if not cancelled) */
+    if (!acb->aiocnt && acb->bh) {
+        qemu_bh_schedule(acb->bh);
+    }
+}
+
+/* Callback when all queued rados_aio requests are complete */
+static void rbd_aio_bh_cb(void *opaque)
+{
+    RBDAIOCB *acb = opaque;
+
+    /* for reads, copy the bounce buffer back into the guest's iovec
+     * before it is freed */
+    if (!acb->write) {
+        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
+    }
+    qemu_vfree(acb->bounce);
+    /* any positive accumulated byte count is reported as success (0) */
+    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+    qemu_bh_delete(acb->bh);
+    acb->bh = NULL;
+    /* must be last: the acb may be reused immediately after release */
+    qemu_aio_release(acb);
+}
+
+/*
+ * Common implementation of rbd_aio_readv/rbd_aio_writev.
+ *
+ * The guest request is staged through a linear bounce buffer and
+ * split along object boundaries; one asynchronous rados operation is
+ * issued per touched object.  rbd_finish_aiocb() collects the
+ * completions and schedules the bottom half that finishes the
+ * request.
+ */
+static BlockDriverAIOCB *rbd_aio_rw_vector(BlockDriverState *bs,
+                                           int64_t sector_num,
+                                           QEMUIOVector *qiov,
+                                           int nb_sectors,
+                                           BlockDriverCompletionFunc *cb,
+                                           void *opaque, int write)
+{
+    RBDAIOCB *acb;
+    RADOSCB *rcb;
+    rados_completion_t c;
+    char n[RBD_MAX_SEG_NAME_SIZE];
+    int64_t segnr, segoffs, segsize, last_segnr;
+    int64_t off, size;
+    char *buf;
+
+    RBDRVRBDState *s = bs->opaque;
+
+    acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque);
+    acb->write = write;
+    acb->qiov = qiov;
+    acb->bounce = qemu_blockalign(bs, qiov->size);
+    acb->aiocnt = 0;
+    acb->ret = 0;
+    acb->error = 0;
+
+    /* acb comes from a pool, so bh may hold a stale pointer; it is
+     * reset to NULL whenever the bh is deleted (bh_cb / cancel) */
+    if (!acb->bh) {
+        acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
+    }
+
+    /* writes: linearize the guest iovec into the bounce buffer */
+    if (write) {
+        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
+    }
+
+    buf = acb->bounce;
+
+    off = sector_num * 512;
+    size = nb_sectors * 512;
+    segnr = (int64_t) (off / s->objsize);
+    segoffs = (int64_t) (off % s->objsize);
+    segsize = (int64_t) (s->objsize - segoffs);
+
+    /* number of objects touched = number of completions expected;
+     * set before submitting so a fast completion cannot finish early */
+    last_segnr = ((off + size - 1) / s->objsize);
+    acb->aiocnt = (last_segnr - segnr) + 1;
+
+    while (size > 0) {
+        if (size < segsize) {
+            segsize = size;
+        }
+
+        snprintf(n, RBD_MAX_SEG_NAME_SIZE, "%s.%012llx", s->name,
+                 (long long unsigned int)segnr);
+
+        rcb = qemu_malloc(sizeof(RADOSCB));
+        rcb->done = 0;
+        rcb->acb = acb;
+        rcb->segsize = segsize;
+        rcb->buf = buf;
+
+        /* NOTE(review): the return values of rados_aio_create_completion
+         * and rados_aio_write/read are ignored; a submission failure
+         * would leave aiocnt permanently non-zero and hang the request
+         * — confirm and add error handling */
+        if (write) {
+            rados_aio_create_completion(rcb, NULL,
+                                        (rados_callback_t) rbd_finish_aiocb, &c);
+            rados_aio_write(s->pool, n, segoffs, buf, segsize, c);
+        } else {
+            rados_aio_create_completion(rcb, (rados_callback_t) rbd_finish_aiocb,
+                                        NULL, &c);
+            rados_aio_read(s->pool, n, segoffs, buf, segsize, c);
+        }
+
+        buf += segsize;
+        size -= segsize;
+        segoffs = 0;
+        segsize = s->objsize;
+        segnr++;
+    }
+
+    return &acb->common;
+}
+
+/* bdrv_aio_readv entry point: delegate to the common vector path. */
+static BlockDriverAIOCB *rbd_aio_readv(BlockDriverState *bs,
+                                       int64_t sector_num, QEMUIOVector *qiov,
+                                       int nb_sectors,
+                                       BlockDriverCompletionFunc *cb,
+                                       void *opaque)
+{
+    return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+/* bdrv_aio_writev entry point: delegate to the common vector path. */
+static BlockDriverAIOCB *rbd_aio_writev(BlockDriverState *bs,
+                                        int64_t sector_num, QEMUIOVector *qiov,
+                                        int nb_sectors,
+                                        BlockDriverCompletionFunc *cb,
+                                        void *opaque)
+{
+    return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+/* Report the RADOS object size as the image's cluster size. */
+static int rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    RBDRVRBDState *s = bs->opaque;
+
+    bdi->cluster_size = s->objsize;
+    return 0;
+}
+
+/* Image size in bytes, as read from the rbd header object at open. */
+static int64_t rbd_getlength(BlockDriverState *bs)
+{
+    RBDRVRBDState *s = bs->opaque;
+
+    return s->size;
+}
+
+/* options accepted by image creation (qemu-img create -f rbd) */
+static QEMUOptionParameter rbd_create_options[] = {
+    {
+     .name = BLOCK_OPT_SIZE,
+     .type = OPT_SIZE,
+     .help = "Virtual disk size"
+    },
+    {
+     .name = BLOCK_OPT_CLUSTER_SIZE,
+     .type = OPT_SIZE,
+     .help = "RBD object size"
+    },
+    {NULL}
+};
+
+/* Block driver definition: both synchronous (bdrv_read/bdrv_write)
+ * and asynchronous (bdrv_aio_readv/writev) paths are provided. */
+static BlockDriver bdrv_rbd = {
+    .format_name = "rbd",
+    .instance_size = sizeof(RBDRVRBDState),
+    .bdrv_open = rbd_open,
+    .bdrv_read = rbd_read,
+    .bdrv_write = rbd_write,
+    .bdrv_close = rbd_close,
+    .bdrv_create = rbd_create,
+    .bdrv_get_info = rbd_getinfo,
+    .create_options = rbd_create_options,
+    .bdrv_getlength = rbd_getlength,
+    .protocol_name = "rbd",
+
+    .bdrv_aio_readv = rbd_aio_readv,
+    .bdrv_aio_writev = rbd_aio_writev,
+};
+
+/* Register the driver with the qemu block layer at startup. */
+static void bdrv_rbd_init(void)
+{
+    bdrv_register(&bdrv_rbd);
+}
+
+block_init(bdrv_rbd_init);
diff --git a/block/rbd_types.h b/block/rbd_types.h
new file mode 100644
index 0000000..dfd5aa0
--- /dev/null
+++ b/block/rbd_types.h
@@ -0,0 +1,48 @@ 
+#ifndef _FS_CEPH_RBD
+#define _FS_CEPH_RBD
+
+#include <linux/types.h>
+
+/*
+ * rbd image 'foo' consists of objects
+ *   foo.rbd      - image metadata
+ *   foo.00000000
+ *   foo.00000001
+ *   ...          - data
+ */
+
+#define RBD_SUFFIX	 	".rbd"
+#define RBD_DIRECTORY           "rbd_directory"
+
+#define RBD_DEFAULT_OBJ_ORDER	22   /* 4MB */
+
+#define RBD_MAX_OBJ_NAME_SIZE	96
+#define RBD_MAX_SEG_NAME_SIZE	128
+
+#define RBD_COMP_NONE		0
+#define RBD_CRYPT_NONE		0
+
+static const char rbd_text[] = "<<< Rados Block Device Image >>>\n";
+static const char rbd_signature[] = "RBD";
+static const char rbd_version[] = "001.001";
+
+/* on-disk per-snapshot record; fields are little-endian */
+struct rbd_obj_snap_ondisk {
+	__le64 id;          /* snapshot id */
+	__le64 image_size;  /* image size at snapshot time */
+} __attribute__((packed));
+
+/*
+ * On-disk layout of the "<image>.rbd" header object.  Multi-byte
+ * fields are little-endian.  A variable number of snapshot records
+ * follows the fixed part.
+ */
+struct rbd_obj_header_ondisk {
+	char text[64];       /* human readable banner (rbd_text) */
+	char signature[4];   /* "RBD" (rbd_signature) */
+	char version[8];     /* "001.001" (rbd_version) */
+	__le64 image_size;   /* device size in bytes */
+	__u8 obj_order;      /* log2 of the object size */
+	__u8 crypt_type;     /* RBD_CRYPT_* */
+	__u8 comp_type;      /* RBD_COMP_* */
+	__le32 snap_seq;
+	__le32 snap_count;
+	__le64 snap_names_len;
+	/* C99 flexible array member instead of the GNU [0] extension;
+	 * sizeof still excludes the trailing records */
+	struct rbd_obj_snap_ondisk snaps[];
+} __attribute__((packed));
+
+#endif
diff --git a/configure b/configure
index 36d028f..d07a7e5 100755
--- a/configure
+++ b/configure
@@ -299,6 +299,7 @@  pkgversion=""
 check_utests="no"
 user_pie="no"
 zero_malloc=""
+rbd="no"
 
 # OS specific
 if check_define __linux__ ; then
@@ -660,6 +661,8 @@  for opt do
   ;;
   --enable-vhost-net) vhost_net="yes"
   ;;
+  --enable-rbd) rbd="yes"
+  ;;
   *) echo "ERROR: unknown option $opt"; show_help="yes"
   ;;
   esac
@@ -826,6 +829,7 @@  echo "  --enable-docs            enable documentation build"
 echo "  --disable-docs           disable documentation build"
 echo "  --disable-vhost-net      disable vhost-net acceleration support"
 echo "  --enable-vhost-net       enable vhost-net acceleration support"
+echo "  --enable-rbd		 enable building the rados block device (rbd)"
 echo ""
 echo "NOTE: The object files are built at the place where configure is launched"
 exit 1
@@ -1569,6 +1573,25 @@  if test "$mingw32" != yes -a "$pthread" = no; then
 fi
 
 ##########################################
+# rbd probe
+if test "$rbd" != "no" ; then
+  cat > $TMPC <<EOF
+#include <stdio.h>
+#include <rados/librados.h>
+int main(void) { rados_initialize(0, NULL); return 0; }
+EOF
+  if compile_prog "" "-lrados -lcrypto" ; then
+    rbd=yes
+    LIBS="$LIBS -lrados -lcrypto"
+  else
+    if test "$rbd" = "yes" ; then
+      feature_not_found "rados block device"
+    fi
+    rbd=no
+  fi
+fi
+
+##########################################
 # linux-aio probe
 
 if test "$linux_aio" != "no" ; then
@@ -2031,6 +2054,7 @@  echo "preadv support    $preadv"
 echo "fdatasync         $fdatasync"
 echo "uuid support      $uuid"
 echo "vhost-net support $vhost_net"
+echo "rbd support       $rbd"
 
 if test $sdl_too_old = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -2260,6 +2284,9 @@  echo "CONFIG_UNAME_RELEASE=\"$uname_release\"" >> $config_host_mak
 if test "$zero_malloc" = "yes" ; then
   echo "CONFIG_ZERO_MALLOC=y" >> $config_host_mak
 fi
+if test "$rbd" = "yes" ; then
+  echo "CONFIG_RBD=y" >> $config_host_mak
+fi
 
 # USB host support
 case "$usb" in