
[v13,19/20] file-posix: Add image locking in perm operations

Message ID 20170420075237.18219-20-famz@redhat.com
State New

Commit Message

Fam Zheng April 20, 2017, 7:52 a.m. UTC
virtlockd in libvirt locks the first byte, so we start our lock bytes at
offset 0x10.

The complication is in the transactional interface.  To make the reopen
logic manageable, and to allow better reuse, the code is internally
organized around a table mapping the old lock mode to the new one.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 block/file-posix.c | 744 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 741 insertions(+), 3 deletions(-)
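
A note on the primitives: the qemu_lock_fd(), qemu_unlock_fd() and
qemu_lock_fd_test() helpers used below are not part of this patch
(presumably they are added earlier in the series). Assuming they amount to
non-blocking fcntl() OFD byte-range locks, a minimal standalone sketch of
the "exclusive writer" case would look roughly like this (file name and
error handling are purely illustrative):

#define _GNU_SOURCE              /* F_OFD_SETLK on glibc */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Byte offsets matching the patch: virtlockd takes byte 0, QEMU's lock
 * bytes start at 0x10. */
#define RAW_LOCK_BYTE_NO_OTHER_WRITER 0x10
#define RAW_LOCK_BYTE_WRITE           0x11

/* Take, downgrade or drop a non-blocking OFD lock on a single byte.
 * Returns 0 on success or -errno on failure/conflict. */
static int lock_byte(int fd, off_t start, short type)
{
    struct flock fl = {
        .l_whence = SEEK_SET,
        .l_start  = start,
        .l_len    = 1,
        .l_type   = type,        /* F_RDLCK, F_WRLCK or F_UNLCK */
    };

    return fcntl(fd, F_OFD_SETLK, &fl) == -1 ? -errno : 0;
}

int main(void)
{
    int fd = open("disk.img", O_RDWR);   /* illustrative image file */

    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* "Exclusive writer" in the patch's scheme: exclusive locks on both
     * bytes. Either call fails (typically -EAGAIN) if another process
     * holds a conflicting lock. */
    if (lock_byte(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, F_WRLCK) ||
        lock_byte(fd, RAW_LOCK_BYTE_WRITE, F_WRLCK)) {
        fprintf(stderr, "image is in use by another process\n");
        close(fd);
        return 1;
    }
    /* ... use the image; OFD locks go away when the last fd referring to
     * the open file description is closed. */
    close(fd);
    return 0;
}

An OFD lock belongs to the open file description rather than the process,
which is what lets the patch hand locks over between the old and new fd
during reopen.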

Comments

Kevin Wolf April 20, 2017, 11:26 a.m. UTC | #1
Am 20.04.2017 um 09:52 hat Fam Zheng geschrieben:
> virtlockd in libvirt locks the first byte, so we start our lock bytes at
> offset 0x10.
> 
> The complication is in the transactional interface.  To make the reopen
> logic manageable, and to allow better reuse, the code is internally
> organized around a table mapping the old lock mode to the new one.
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>

Looking at the very early patches in this series, I think it quickly
becomes obvious that we need to discuss one thing first:

> +static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
> +                          Error **errp)
> +{
> +    bool is_shared;
> +    BDRVRawState *s = bs->opaque;
> +
> +    if (!RAW_LOCK_SUPPORTED) {
> +        return 0;
> +    }
> +    if (s->lock_update) {
> +        /* Override the previously stashed update. */
> +        g_free(s->lock_update);
> +        s->lock_update = NULL;
> +    }
> +    is_shared = !(perm & BLK_PERM_CONSISTENT_READ) && (shared & BLK_PERM_WRITE);

Why do you check BLK_PERM_CONSISTENT_READ? The locks that we said we
would take on the image file represent BLK_PERM_WRITE, both in perm and
in shared, so this is what they should be checked against. Opening the
image in another process is fine if BLK_PERM_WRITE is set in shared,
even if BLK_PERM_CONSISTENT_READ is also set in perm.

BLK_PERM_CONSISTENT_READ is for cases where the contents of an image
are inherently invalid, not just because of a concurrent writer that we
might not be aware of, but because the image just doesn't make sense on
its own. It may make sense as part of a larger backing chain, though (the
only place where we clear the flag is for intermediate nodes in the
commit job). These semantics are more or less separate from what we want
to achieve here.

Of course, if we wanted, I guess we could individually map all 64
bits of each perm and shared to bytes to be locked in the file, so that
all permissions would be shared between qemu instances. That's probably
not worth the effort though.

And even if we did that, most likely you still wouldn't need any special
exceptions for BLK_PERM_CONSISTENT_READ, because it is always shared
except when one process wants to run a commit job - and for a commit
job, making sure that nobody else uses the image would probably be
right.

So if we really treat the file system level locks just as a mapping of
BLK_PERM_WRITE, things should become a bit easier in the early patches
of this series.
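
To make that concrete, a rough sketch of what the mode selection could look
like if the file locks stand for BLK_PERM_WRITE alone (the flag value,
enum and helper name below are only illustrative; the real definitions
live in include/block/block.h and the patch):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative value; the real definition is in include/block/block.h. */
#define BLK_PERM_WRITE 0x02u

typedef enum {
    RAW_L_READ_SHARE_RW,   /* read-only, tolerate other writers   */
    RAW_L_READ,            /* read-only, forbid other writers     */
    RAW_L_WRITE_SHARE_RW,  /* read-write, tolerate other writers  */
    RAW_L_WRITE,           /* read-write, forbid other writers    */
} BDRVRawLockMode;

/* Derive the lock mode from BLK_PERM_WRITE only: whether we write, and
 * whether we tolerate other writers. BLK_PERM_CONSISTENT_READ is not
 * consulted at all. */
static BDRVRawLockMode lock_mode_for_perm(uint64_t perm, uint64_t shared)
{
    bool we_write         = perm & BLK_PERM_WRITE;
    bool others_may_write = shared & BLK_PERM_WRITE;

    if (we_write) {
        return others_may_write ? RAW_L_WRITE_SHARE_RW : RAW_L_WRITE;
    }
    return others_may_write ? RAW_L_READ_SHARE_RW : RAW_L_READ;
}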

Kevin

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index 24ea3ff..b85ac9c 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -131,8 +131,54 @@  do { \
 
 #define MAX_BLOCKSIZE	4096
 
+/* POSIX file locking bytes. Libvirt takes byte 0; we start from byte 0x10,
+ * leaving a few spare bytes for future use. */
+#define RAW_LOCK_BYTE_MIN             0x10
+#define RAW_LOCK_BYTE_NO_OTHER_WRITER 0x10
+#define RAW_LOCK_BYTE_WRITE           0x11
+#ifdef F_OFD_SETLK
+#define RAW_LOCK_SUPPORTED 1
+#else
+#define RAW_LOCK_SUPPORTED 0
+#endif
+
+/*
+ ** reader that can tolerate writers: Don't do anything
+ *
+ ** reader that can't tolerate writers: Take a shared lock on byte 0x10. Test
+ *  that byte 0x11 is unlocked.
+ *
+ ** shared writer: Take a shared lock on byte 0x11. Test that byte 0x10 is unlocked.
+ *
+ ** exclusive writer: Take exclusive locks on both bytes.
+ */
+
+typedef enum {
+    /* Read only and accept other writers. */
+    RAW_L_READ_SHARE_RW,
+    /* Read only and try to forbid other writers. */
+    RAW_L_READ,
+    /* Read/write and accept other writers. */
+    RAW_L_WRITE_SHARE_RW,
+    /* Read/write and try to forbid other writers. */
+    RAW_L_WRITE,
+} BDRVRawLockMode;
+
+typedef struct BDRVRawLockUpdateState {
+    /* A dup of @fd used for acquiring lock. */
+    int image_fd;
+    int lock_fd;
+    int open_flags;
+    BDRVRawLockMode new_lock;
+    bool use_lock;
+} BDRVRawLockUpdateState;
+
 typedef struct BDRVRawState {
     int fd;
+    /* A dup of @fd to make manipulating locks easier, especially during reopen,
+     * where this will accept BDRVRawReopenState.lock_fd. */
+    int lock_fd;
+    bool use_lock;
     int type;
     int open_flags;
     size_t buf_align;
@@ -147,6 +193,11 @@  typedef struct BDRVRawState {
     bool page_cache_inconsistent:1;
     bool has_fallocate;
     bool needs_alignment;
+    /* The current lock mode we are in. Note that in incoming migration this is
+     * the "desired" mode to be applied at bdrv_invalidate_cache. */
+    BDRVRawLockMode cur_lock_mode;
+    /* Used by raw_check_perm/raw_set_perm. */
+    BDRVRawLockUpdateState *lock_update;
 } BDRVRawState;
 
 typedef struct BDRVRawReopenState {
@@ -369,6 +420,64 @@  static void raw_parse_flags(int bdrv_flags, int *open_flags)
     }
 }
 
+static int raw_lock_fd(int fd, BDRVRawLockMode mode, Error **errp)
+{
+    int ret;
+    assert(fd >= 0);
+    assert(RAW_LOCK_SUPPORTED);
+    switch (mode) {
+    case RAW_L_READ_SHARE_RW:
+        ret = qemu_unlock_fd(fd, RAW_LOCK_BYTE_MIN, 2);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to unlock fd");
+            goto fail;
+        }
+        break;
+    case RAW_L_READ:
+        ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock share byte");
+            goto fail;
+        }
+        ret = qemu_lock_fd_test(fd, RAW_LOCK_BYTE_WRITE, 1, true);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Write byte lock is taken");
+            goto fail;
+        }
+        break;
+    case RAW_L_WRITE_SHARE_RW:
+        ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_WRITE, 1, true);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock write byte exclusively");
+            goto fail;
+        }
+        ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to downgrade lock write byte");
+            goto fail;
+        }
+        ret = qemu_lock_fd_test(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, true);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Share byte lock is taken");
+            goto fail;
+        }
+        break;
+    case RAW_L_WRITE:
+        ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_MIN, 2, true);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock image");
+            goto fail;
+        }
+        break;
+    default:
+        abort();
+    }
+    return 0;
+fail:
+    qemu_unlock_fd(fd, RAW_LOCK_BYTE_MIN, 2);
+    return ret;
+}
+
 static void raw_parse_filename(const char *filename, QDict *options,
                                Error **errp)
 {
@@ -403,6 +512,23 @@  static QemuOptsList raw_runtime_opts = {
     },
 };
 
+static BDRVRawLockMode raw_get_lock_mode(bool write, bool shared)
+{
+    if (write) {
+        if (shared) {
+            return RAW_L_WRITE_SHARE_RW;
+        } else {
+            return RAW_L_WRITE;
+        }
+    } else {
+        if (shared) {
+            return RAW_L_READ_SHARE_RW;
+        } else {
+            return RAW_L_READ;
+        }
+    }
+}
+
 static int raw_open_common(BlockDriverState *bs, QDict *options,
                            int bdrv_flags, int open_flags, Error **errp)
 {
@@ -442,10 +568,13 @@  static int raw_open_common(BlockDriverState *bs, QDict *options,
     }
     s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
 
+    s->use_lock = qemu_opt_get_bool(opts, "locking", true);
+
     s->open_flags = open_flags;
     raw_parse_flags(bdrv_flags, &s->open_flags);
 
     s->fd = -1;
+    s->lock_fd = -1;
     fd = qemu_open(filename, s->open_flags, 0644);
     if (fd < 0) {
         ret = -errno;
@@ -544,6 +673,509 @@  static int raw_open(BlockDriverState *bs, QDict *options, int flags,
     return raw_open_common(bs, options, flags, 0, errp);
 }
 
+typedef enum {
+    RAW_LT_PREPARE,
+    RAW_LT_COMMIT,
+    RAW_LT_ABORT
+} RawLockTransOp;
+
+typedef int (*RawLockTransFunc)(RawLockTransOp op,
+                                int old_lock_fd, int new_lock_fd,
+                                BDRVRawLockMode old_lock,
+                                BDRVRawLockMode new_lock,
+                                Error **errp);
+
+static int raw_lt_nop(RawLockTransOp op,
+                      int old_lock_fd, int new_lock_fd,
+                      BDRVRawLockMode old_lock,
+                      BDRVRawLockMode new_lock,
+                      Error **errp)
+{
+    assert(old_lock == new_lock || new_lock == RAW_L_READ_SHARE_RW);
+    return 0;
+}
+
+static int raw_lt_from_unlock(RawLockTransOp op,
+                              int old_lock_fd, int new_lock_fd,
+                              BDRVRawLockMode old_lock,
+                              BDRVRawLockMode new_lock,
+                              Error **errp)
+{
+    assert(old_lock != new_lock);
+    assert(old_lock == RAW_L_READ_SHARE_RW);
+    switch (op) {
+    case RAW_LT_PREPARE:
+        return raw_lock_fd(new_lock_fd, new_lock, errp);
+    case RAW_LT_COMMIT:
+        break;
+    case RAW_LT_ABORT:
+        break;
+    }
+
+    return 0;
+}
+
+static int raw_lt_read_to_write_share(RawLockTransOp op,
+                                      int old_lock_fd, int new_lock_fd,
+                                      BDRVRawLockMode old_lock,
+                                      BDRVRawLockMode new_lock,
+                                      Error **errp)
+{
+    int ret = 0;
+
+    assert(old_lock == RAW_L_READ);
+    assert(new_lock == RAW_L_WRITE_SHARE_RW);
+
+    /*
+     *        lock byte "no other writer"      lock byte "write"
+     * old                S                           0
+     * new                0                           S
+     *
+     * (0 = unlocked; S = shared; X = exclusive.)
+     */
+    switch (op) {
+    case RAW_LT_PREPARE:
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_WRITE, 1, true);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock new fd (write byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock new fd (share byte)");
+            break;
+        }
+        ret = qemu_unlock_fd(old_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to unlock old fd (share byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, true);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to upgrade new fd (share byte)");
+            break;
+        }
+        ret = qemu_unlock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1);
+        if (ret) {
+            /* This is very unlikely, but catch it anyway. */
+            error_setg_errno(errp, -ret, "Failed to unlock new fd (share byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to downgrade new fd (write byte)");
+            break;
+        }
+        break;
+    case RAW_LT_COMMIT:
+        break;
+    case RAW_LT_ABORT:
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            /* As unlikely as above unlock failure, but report it anyway. */
+            error_report("Failed to restore lock on old fd (share byte)");
+        }
+        break;
+    }
+    return ret;
+}
+
+static int raw_lt_read_to_write(RawLockTransOp op,
+                                int old_lock_fd, int new_lock_fd,
+                                BDRVRawLockMode old_lock,
+                                BDRVRawLockMode new_lock,
+                                Error **errp)
+{
+    int ret = 0;
+
+    assert(old_lock == RAW_L_READ);
+    assert(new_lock == RAW_L_WRITE);
+    /*
+     *        lock byte "no other writer"      lock byte "write"
+     * old                S                           0
+     * new                X                           X
+     *
+     * (0 = unlocked; S = shared; X = exclusive.)
+     */
+    switch (op) {
+    case RAW_LT_PREPARE:
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_WRITE, 1, true);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock new fd (write byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock new fd (share byte)");
+            break;
+        }
+        ret = qemu_unlock_fd(old_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to unlock old fd (share byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, true);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to upgrade new fd (share byte)");
+            break;
+        }
+        break;
+    case RAW_LT_COMMIT:
+        break;
+    case RAW_LT_ABORT:
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to downgrade new fd (share byte)");
+            break;
+        }
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_report("Failed to restore lock on old fd (share byte)");
+        }
+        break;
+    }
+    return ret;
+}
+
+static int raw_lt_write_share_to_read(RawLockTransOp op,
+                                      int old_lock_fd, int new_lock_fd,
+                                      BDRVRawLockMode old_lock,
+                                      BDRVRawLockMode new_lock,
+                                      Error **errp)
+{
+    int ret = 0;
+
+    assert(old_lock == RAW_L_WRITE_SHARE_RW);
+    assert(new_lock == RAW_L_READ);
+    /*
+     *        lock byte "no other writer"      lock byte "write"
+     * old                0                           S
+     * new                S                           0
+     *
+     * (0 = unlocked; S = shared; X = exclusive.)
+     */
+    switch (op) {
+    case RAW_LT_PREPARE:
+        /* Make sure there are no other writers. */
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_WRITE, 1, true);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock old fd (write byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (!ret) {
+            break;
+        }
+        error_setg_errno(errp, -ret, "Failed to lock new fd (share byte)");
+        /* fall through */
+    case RAW_LT_ABORT:
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_report("Failed to downgrade old fd (write byte)");
+        }
+        break;
+    case RAW_LT_COMMIT:
+        break;
+    }
+    return ret;
+}
+
+static int raw_lt_write_share_to_write(RawLockTransOp op,
+                                       int old_lock_fd, int new_lock_fd,
+                                       BDRVRawLockMode old_lock,
+                                       BDRVRawLockMode new_lock,
+                                       Error **errp)
+{
+    int ret = 0;
+
+    assert(old_lock == RAW_L_WRITE_SHARE_RW);
+    assert(new_lock == RAW_L_WRITE);
+    /*
+     *        lock byte "no other writer"      lock byte "write"
+     * old                0                           S
+     * new                X                           X
+     *
+     * (0 = unlocked; S = shared; X = exclusive.)
+     */
+    switch (op) {
+    case RAW_LT_PREPARE:
+        /* Make sure there are no other writers. */
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, true);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock new fd (share byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock new fd (write byte)");
+            break;
+        }
+        ret = qemu_unlock_fd(old_lock_fd, RAW_LOCK_BYTE_WRITE, 1);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to unlock old fd (write byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_WRITE, 1, true);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to upgrade new fd (write byte)");
+            break;
+        }
+        break;
+    case RAW_LT_COMMIT:
+        break;
+    case RAW_LT_ABORT:
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to downgrade new fd (write byte)");
+            break;
+        }
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to restore old fd (write byte)");
+            break;
+        }
+        break;
+    }
+    return ret;
+}
+
+static int raw_lt_write_to_read(RawLockTransOp op,
+                                int old_lock_fd, int new_lock_fd,
+                                BDRVRawLockMode old_lock,
+                                BDRVRawLockMode new_lock,
+                                Error **errp)
+{
+    int ret = 0;
+
+    assert(old_lock == RAW_L_WRITE);
+    assert(new_lock == RAW_L_READ);
+    /*
+     *        lock byte "no other writer"      lock byte "write"
+     * old                X                           X
+     * new                S                           0
+     *
+     * (0 = unlocked; S = shared; X = exclusive.)
+     */
+    switch (op) {
+    case RAW_LT_PREPARE:
+        break;
+    case RAW_LT_COMMIT:
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to downgrade old fd (share byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock new fd (share byte)");
+            break;
+        }
+        break;
+    case RAW_LT_ABORT:
+        break;
+    }
+    return ret;
+}
+
+static int raw_lt_write_to_write_share(RawLockTransOp op,
+                                       int old_lock_fd, int new_lock_fd,
+                                       BDRVRawLockMode old_lock,
+                                       BDRVRawLockMode new_lock,
+                                       Error **errp)
+{
+    int ret = 0;
+
+    assert(old_lock == RAW_L_WRITE);
+    assert(new_lock == RAW_L_WRITE_SHARE_RW);
+    /*
+     *        lock byte "no other writer"      lock byte "write"
+     * old                X                           X
+     * new                0                           S
+     *
+     * (0 = unlocked; S = shared; X = exclusive.)
+     */
+    switch (op) {
+    case RAW_LT_PREPARE:
+        break;
+    case RAW_LT_COMMIT:
+        ret = qemu_lock_fd(old_lock_fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_report("Failed to downgrade old fd (share byte)");
+            break;
+        }
+        ret = qemu_lock_fd(new_lock_fd, RAW_LOCK_BYTE_WRITE, 1, false);
+        if (ret) {
+            error_report("Failed to lock new fd (write byte)");
+        }
+        break;
+    case RAW_LT_ABORT:
+        break;
+    }
+    return ret;
+}
+
+/**
+ * Transactionally moving between possible locking states is tricky and must be
+ * done carefully. That is mostly because downgrading an exclusive lock to
+ * shared or unlocked is not guaranteed to be revertible. As a result, in such
+ * cases we have to defer the downgrading to "commit", given that no revert will
+ * happen after that point, and that downgrading a lock should never fail.
+ *
+ * On the other hand, upgrading a lock (e.g. from unlocked or shared to
+ * exclusive lock) must happen in "prepare" because it may fail.
+ *
+ * Manage the operation matrix with this state transition table to make
+ * fulfilling the above conditions easier.
+ */
+static const struct RawLockTransOp {
+    BDRVRawLockMode old_lock;
+    BDRVRawLockMode new_lock;
+    RawLockTransFunc func;
+    bool need_lock_fd;
+    bool close_old_lock_fd;
+} raw_lock_trans_ops[] = {
+
+    {RAW_L_READ_SHARE_RW,  RAW_L_READ_SHARE_RW,  raw_lt_nop,                  false, false},
+    {RAW_L_READ_SHARE_RW,  RAW_L_READ,           raw_lt_from_unlock,          true},
+    {RAW_L_READ_SHARE_RW,  RAW_L_WRITE_SHARE_RW, raw_lt_from_unlock,          true},
+    {RAW_L_READ_SHARE_RW,  RAW_L_WRITE,          raw_lt_from_unlock,          true},
+
+    {RAW_L_READ,           RAW_L_READ_SHARE_RW,  raw_lt_nop,                  false, true},
+    {RAW_L_READ,           RAW_L_READ,           raw_lt_nop,                  false, false},
+    {RAW_L_READ,           RAW_L_WRITE_SHARE_RW, raw_lt_read_to_write_share,  true},
+    {RAW_L_READ,           RAW_L_WRITE,          raw_lt_read_to_write,        true},
+
+    {RAW_L_WRITE_SHARE_RW, RAW_L_READ_SHARE_RW,  raw_lt_nop,                  false, true},
+    {RAW_L_WRITE_SHARE_RW, RAW_L_READ,           raw_lt_write_share_to_read,  true},
+    {RAW_L_WRITE_SHARE_RW, RAW_L_WRITE_SHARE_RW, raw_lt_nop,                  false, false},
+    {RAW_L_WRITE_SHARE_RW, RAW_L_WRITE,          raw_lt_write_share_to_write, true},
+
+    {RAW_L_WRITE,          RAW_L_READ_SHARE_RW,  raw_lt_nop,                  false, true},
+    {RAW_L_WRITE,          RAW_L_READ,           raw_lt_write_to_read,        true},
+    {RAW_L_WRITE,          RAW_L_WRITE_SHARE_RW, raw_lt_write_to_write_share, true},
+    {RAW_L_WRITE,          RAW_L_WRITE,          raw_lt_nop,                  false, false},
+};
+
+static int raw_handle_lock_update(BlockDriverState *bs,
+                                  RawLockTransOp op,
+                                  Error **errp)
+{
+    BDRVRawState *s = bs->opaque;
+    BDRVRawLockMode old_lock, new_lock;
+    const struct RawLockTransOp *rec;
+    int ret = 0;
+    Error *local_err = NULL;
+    BDRVRawLockUpdateState *lu = s->lock_update;
+    int lock_fd;
+
+    if (!RAW_LOCK_SUPPORTED) {
+        return 0;
+    }
+
+    if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
+        /* leave the work to bdrv_invalidate_cache. */
+        return 0;
+    }
+
+    if (op == RAW_LT_PREPARE) {
+        lock_fd = qemu_open(bs->filename, lu->open_flags);
+        if (lock_fd == -1) {
+            if (errno == ENOENT) {
+                /* The file is gone, probably BDRV_O_SNAPSHOT? Skip locking. */
+                lu->use_lock = false;
+            } else {
+                /* other errors handled later. */
+            }
+        }
+    }
+
+    old_lock = s->cur_lock_mode;
+    new_lock = lu->use_lock ? lu->new_lock : RAW_L_READ_SHARE_RW;
+    for (rec = &raw_lock_trans_ops[0];
+         rec < &raw_lock_trans_ops[ARRAY_SIZE(raw_lock_trans_ops)];
+         rec++) {
+        if (rec->old_lock == old_lock && rec->new_lock == new_lock) {
+            break;
+        }
+    }
+    assert(rec != &raw_lock_trans_ops[ARRAY_SIZE(raw_lock_trans_ops)]);
+
+    assert(old_lock == RAW_L_READ_SHARE_RW || s->lock_fd >= 0);
+
+    DPRINTF("handle lock %p old lock %d new lock %d op %d func %p\n", bs,
+            old_lock, new_lock, op, rec->func);
+    switch (op) {
+    case RAW_LT_PREPARE:
+        if (rec->need_lock_fd) {
+            if (lock_fd >= 0) {
+                lu->lock_fd = lock_fd;
+            } else {
+                error_setg(errp, "Failed to initialize lock fd");
+            }
+        } else {
+            if (lock_fd >= 0) {
+                qemu_close(lock_fd);
+                lock_fd = -1;
+            }
+        }
+        ret = rec->func(op, s->lock_fd, lu->lock_fd, old_lock, new_lock, errp);
+        if (!ret) {
+            break;
+        }
+        /* Only a successful preparation will be reverted by the block layer;
+         * we need to clean up this failure manually. */
+        op = RAW_LT_ABORT;
+        /* fall through */
+    case RAW_LT_ABORT:
+        rec->func(op, s->lock_fd, lu->lock_fd, old_lock, new_lock, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+        }
+        if (lu->lock_fd >= 0) {
+            qemu_close(lu->lock_fd);
+            lu->lock_fd = -1;
+        }
+        goto cleanup;
+    case RAW_LT_COMMIT:
+        rec->func(op, s->lock_fd, lu->lock_fd, old_lock, new_lock, &error_abort);
+        if ((rec->need_lock_fd || rec->close_old_lock_fd) && s->lock_fd >= 0) {
+            raw_lock_fd(s->lock_fd, RAW_L_READ_SHARE_RW, NULL);
+            qemu_close(s->lock_fd);
+            s->lock_fd = -1;
+        }
+        if (rec->need_lock_fd) {
+            s->lock_fd = lu->lock_fd;
+        }
+        assert(s->lock_fd >= 0 || new_lock == RAW_L_READ_SHARE_RW);
+        s->cur_lock_mode = new_lock;
+        s->use_lock = lu->use_lock;
+        goto cleanup;
+    }
+    return ret;
+cleanup:
+    g_free(s->lock_update);
+    s->lock_update = NULL;
+    return ret;
+}
+
+static void raw_init_lock_update(BlockDriverState *bs,
+                                 int image_fd,
+                                 bool write, bool shared,
+                                 bool use_lock)
+{
+    BDRVRawState *s = bs->opaque;
+
+    assert(!s->lock_update);
+    s->lock_update = g_new0(BDRVRawLockUpdateState, 1);
+    *s->lock_update = (BDRVRawLockUpdateState) {
+        .image_fd = image_fd,
+        .new_lock = raw_get_lock_mode(write, shared),
+        .use_lock = use_lock,
+        .open_flags = (s->open_flags & ~(O_RDWR | O_RDONLY)) |
+                       (write ? O_RDWR : O_RDONLY),
+    };
+}
+
 static int raw_reopen_prepare(BDRVReopenState *state,
                               BlockReopenQueue *queue, Error **errp)
 {
@@ -551,6 +1183,7 @@  static int raw_reopen_prepare(BDRVReopenState *state,
     BDRVRawReopenState *rs;
     int ret = 0;
     Error *local_err = NULL;
+    bool shared;
 
     assert(state != NULL);
     assert(state->bs != NULL);
@@ -615,13 +1248,27 @@  static int raw_reopen_prepare(BDRVReopenState *state,
     if (rs->fd != -1) {
         raw_probe_alignment(state->bs, rs->fd, &local_err);
         if (local_err) {
-            qemu_close(rs->fd);
-            rs->fd = -1;
             error_propagate(errp, local_err);
             ret = -EINVAL;
+            goto fail;
         }
     }
+    shared = s->cur_lock_mode == RAW_L_READ_SHARE_RW ||
+        s->cur_lock_mode == RAW_L_WRITE_SHARE_RW;
+    /* Shared perm doesn't change during reopen. */
+    raw_init_lock_update(state->bs, rs->fd, state->flags & BDRV_O_RDWR, shared,
+                         s->use_lock);
 
+    qdict_del(state->options, "locking");
+    ret = raw_handle_lock_update(state->bs, RAW_LT_PREPARE, errp);
+    if (ret) {
+        goto fail;
+    }
+
+    return 0;
+fail:
+    qemu_close(rs->fd);
+    rs->fd = -1;
     return ret;
 }
 
@@ -632,6 +1279,8 @@  static void raw_reopen_commit(BDRVReopenState *state)
 
     s->open_flags = rs->open_flags;
 
+    raw_handle_lock_update(state->bs, RAW_LT_COMMIT, &error_abort);
+
     qemu_close(s->fd);
     s->fd = rs->fd;
 
@@ -649,6 +1298,8 @@  static void raw_reopen_abort(BDRVReopenState *state)
         return;
     }
 
+    raw_handle_lock_update(state->bs, RAW_LT_ABORT, &error_abort);
+
     if (rs->fd >= 0) {
         qemu_close(rs->fd);
         rs->fd = -1;
@@ -1412,6 +2063,10 @@  static void raw_close(BlockDriverState *bs)
         qemu_close(s->fd);
         s->fd = -1;
     }
+    if (s->lock_fd >= 0) {
+        qemu_close(s->lock_fd);
+        s->lock_fd = -1;
+    }
 }
 
 static int raw_truncate(BlockDriverState *bs, int64_t offset)
@@ -1949,6 +2604,85 @@  static QemuOptsList raw_create_opts = {
     }
 };
 
+static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
+                          Error **errp)
+{
+    bool is_shared;
+    BDRVRawState *s = bs->opaque;
+
+    if (!RAW_LOCK_SUPPORTED) {
+        return 0;
+    }
+    if (s->lock_update) {
+        /* Override the previously stashed update. */
+        g_free(s->lock_update);
+        s->lock_update = NULL;
+    }
+    is_shared = !(perm & BLK_PERM_CONSISTENT_READ) && (shared & BLK_PERM_WRITE);
+    DPRINTF("raw check perm %p rw %d shared %d\n",
+            bs, perm & BLK_PERM_WRITE ? 1 : 0,
+            is_shared);
+    raw_init_lock_update(bs, s->fd,
+                         perm & BLK_PERM_WRITE, is_shared, s->use_lock);
+
+    return raw_handle_lock_update(bs, RAW_LT_PREPARE, errp);
+}
+
+static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (!RAW_LOCK_SUPPORTED) {
+        return;
+    }
+    assert(s->lock_update);
+
+    raw_handle_lock_update(bs, RAW_LT_COMMIT, NULL);
+}
+
+static void raw_abort_perm_update(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (!RAW_LOCK_SUPPORTED) {
+        return;
+    }
+    if (!s->lock_update) {
+        return;
+    }
+    raw_handle_lock_update(bs, RAW_LT_ABORT, NULL);
+}
+
+static int raw_inactivate(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    int r = 0;
+
+    if (RAW_LOCK_SUPPORTED && s->cur_lock_mode != RAW_L_READ_SHARE_RW) {
+        r = raw_lock_fd(s->lock_fd, RAW_L_READ_SHARE_RW, NULL);
+    }
+    return r;
+}
+
+static void raw_invalidate_cache(BlockDriverState *bs, Error **errp)
+{
+    int r;
+    BDRVRawState *s = bs->opaque;
+
+    if (!RAW_LOCK_SUPPORTED) {
+        return;
+    }
+    if (s->lock_update) {
+        /* Apply the pending lock update from perm or reopen. */
+        r = raw_handle_lock_update(bs, RAW_LT_PREPARE, errp);
+        if (r) {
+            return;
+        }
+        raw_handle_lock_update(bs, RAW_LT_COMMIT, errp);
+        assert(!s->lock_update);
+    }
+}
+
 BlockDriver bdrv_file = {
     .format_name = "file",
     .protocol_name = "file",
@@ -1979,7 +2713,11 @@  BlockDriver bdrv_file = {
     .bdrv_get_info = raw_get_info,
     .bdrv_get_allocated_file_size
                         = raw_get_allocated_file_size,
-
+    .bdrv_inactivate = raw_inactivate,
+    .bdrv_invalidate_cache = raw_invalidate_cache,
+    .bdrv_check_perm = raw_check_perm,
+    .bdrv_set_perm   = raw_set_perm,
+    .bdrv_abort_perm_update = raw_abort_perm_update,
     .create_opts = &raw_create_opts,
 };