Patchwork [V8,08/14] Introduce file lock for the block layer

login
register
mail settings
Submitter Stefan Berger
Date Aug. 31, 2011, 2:35 p.m.
Message ID <20110831143621.799480525@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/112578/
State New
Headers show

Comments

Stefan Berger - Aug. 31, 2011, 2:35 p.m.
This patch introduces file locking via fcntl() for the block layer so that
concurrent access to files shared by 2 Qemu instances, for example via NFS,
can be serialized. This feature is useful primarily during initial phases of
VM migration where the target machine's TIS driver validates the block
storage (and in a later patch checks for missing AES keys) and terminates
Qemu if the storage is found to be faulty. This then allows migration to
be gracefully terminated and Qemu continues running on the source machine.

Support for win32 is based on win32 API and has been lightly tested with a
standalone test program locking shared storage from two different machines.

To enable locking a file multiple times, a counter is used. Actual locking
happens only the very first time and unlocking happens when the counter drops back to zero.

v7:
 - fixed compilation error in win32 part

Signed-off-by: Stefan Berger <stefanb@linux.vnet.ibm.com>

---

---
 block.c           |   41 +++++++++++++++++++++++++++++++++++
 block.h           |    8 ++++++
 block/raw-posix.c |   63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/raw-win32.c |   52 ++++++++++++++++++++++++++++++++++++++++++++
 block_int.h       |    4 +++
 5 files changed, 168 insertions(+)
Michael S. Tsirkin - Sept. 1, 2011, 5:32 p.m.
On Wed, Aug 31, 2011 at 10:35:59AM -0400, Stefan Berger wrote:
> This patch introduces file locking via fcntl() for the block layer so that
> concurrent access to files shared by 2 Qemu instances, for example via NFS,
> can be serialized. This feature is useful primarily during initial phases of
> VM migration where the target machine's TIS driver validates the block
> storage (and in a later patch checks for missing AES keys) and terminates
> Qemu if the storage is found to be faulty. This then allows migration to
> be gracefully terminated and Qemu continues running on the source machine.
> 
> Support for win32 is based on win32 API and has been lightly tested with a
> standalone test program locking shared storage from two different machines.
> 
> To enable locking a file multiple times, a counter is used. Actual locking
> happens the very first time and unlocking happens when the counter is zero.
> 
> v7:
>  - fixed compilation error in win32 part
> 
> Signed-off-by: Stefan Berger <stefanb@linux.vnet.ibm.com>

Generally, what all other devices do is perform validation
as the last step in migration when device state
is restored. On failure, management can decide what to do:
retry migration or restart on source.

Why is TPM special and needs to be treated differently?



> ---
> 
> ---
>  block.c           |   41 +++++++++++++++++++++++++++++++++++
>  block.h           |    8 ++++++
>  block/raw-posix.c |   63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/raw-win32.c |   52 ++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h       |    4 +++
>  5 files changed, 168 insertions(+)
> 
> Index: qemu-git/block.c
> ===================================================================
> --- qemu-git.orig/block.c
> +++ qemu-git/block.c
> @@ -521,6 +521,8 @@ static int bdrv_open_common(BlockDriverS
>          goto free_and_fail;
>      }
>  
> +    drv->num_locks = 0;
> +
>      bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
>  
>      ret = refresh_total_sectors(bs, bs->total_sectors);
> @@ -1316,6 +1318,45 @@ void bdrv_get_geometry(BlockDriverState 
>      *nb_sectors_ptr = length;
>  }
>  
> +/* file locking */
> +static int bdrv_lock_common(BlockDriverState *bs, BDRVLockType lock_type)
> +{
> +    BlockDriver *drv = bs->drv;
> +
> +    if (!drv) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    if (bs->file) {
> +        drv = bs->file->drv;
> +        if (drv->bdrv_lock) {
> +            return drv->bdrv_lock(bs->file, lock_type);
> +        }
> +    }
> +
> +    if (drv->bdrv_lock) {
> +        return drv->bdrv_lock(bs, lock_type);
> +    }
> +
> +    return -ENOTSUP;
> +}
> +
> +
> +int bdrv_lock(BlockDriverState *bs)
> +{
> +    if (bdrv_is_read_only(bs)) {
> +        return bdrv_lock_common(bs, BDRV_F_RDLCK);
> +    }
> +
> +    return bdrv_lock_common(bs, BDRV_F_WRLCK);
> +}
> +
> +void bdrv_unlock(BlockDriverState *bs)
> +{
> +    bdrv_lock_common(bs, BDRV_F_UNLCK);
> +}
> +
> +
>  struct partition {
>          uint8_t boot_ind;           /* 0x80 - active */
>          uint8_t head;               /* starting head */
> Index: qemu-git/block.h
> ===================================================================
> --- qemu-git.orig/block.h
> +++ qemu-git/block.h
> @@ -43,6 +43,12 @@ typedef struct QEMUSnapshotInfo {
>  #define BDRV_SECTOR_MASK   ~(BDRV_SECTOR_SIZE - 1)
>  
>  typedef enum {
> +    BDRV_F_UNLCK,
> +    BDRV_F_RDLCK,
> +    BDRV_F_WRLCK,
> +} BDRVLockType;
> +
> +typedef enum {
>      BLOCK_ERR_REPORT, BLOCK_ERR_IGNORE, BLOCK_ERR_STOP_ENOSPC,
>      BLOCK_ERR_STOP_ANY
>  } BlockErrorAction;
> @@ -100,6 +106,8 @@ int bdrv_commit(BlockDriverState *bs);
>  void bdrv_commit_all(void);
>  int bdrv_change_backing_file(BlockDriverState *bs,
>      const char *backing_file, const char *backing_fmt);
> +int bdrv_lock(BlockDriverState *bs);
> +void bdrv_unlock(BlockDriverState *bs);
>  void bdrv_register(BlockDriver *bdrv);
>  
>  
> Index: qemu-git/block/raw-posix.c
> ===================================================================
> --- qemu-git.orig/block/raw-posix.c
> +++ qemu-git/block/raw-posix.c
> @@ -803,6 +803,67 @@ static int64_t raw_get_allocated_file_si
>      return (int64_t)st.st_blocks * 512;
>  }
>  
> +static int raw_lock(BlockDriverState *bs, BDRVLockType lock_type)
> +{
> +    BlockDriver *drv = bs->drv;
> +    BDRVRawState *s = bs->opaque;
> +    struct flock flock = {
> +        .l_whence = SEEK_SET,
> +        .l_start = 0,
> +        .l_len = 0,
> +    };
> +    int n;
> +
> +    switch (lock_type) {
> +    case BDRV_F_RDLCK:
> +    case BDRV_F_WRLCK:
> +        if (drv->num_locks) {
> +            drv->num_locks++;
> +            return 0;
> +        }
> +        flock.l_type = (lock_type == BDRV_F_RDLCK) ? F_RDLCK : F_WRLCK;
> +        break;
> +
> +    case BDRV_F_UNLCK:
> +        if (--drv->num_locks > 0) {
> +            return 0;
> +        }
> +
> +        assert(drv->num_locks == 0);
> +
> +        flock.l_type = F_UNLCK;
> +        break;
> +
> +    default:
> +        return -EINVAL;
> +    }
> +
> +    while (1) {
> +        n = fcntl(s->fd, F_SETLKW, &flock);
> +        if (n < 0) {
> +            if (errno == EINTR) {
> +                continue;
> +            }
> +            if (errno == EAGAIN) {
> +                usleep(10000);
> +                continue;
> +            }
> +        }
> +        break;
> +    }
> +
> +    if (n == 0 &&
> +        ((lock_type == BDRV_F_RDLCK) || (lock_type == BDRV_F_WRLCK))) {
> +        drv->num_locks = 1;
> +    }
> +
> +    if (n) {
> +        return -errno;
> +    }
> +
> +    return 0;
> +}
> +
>  static int raw_create(const char *filename, QEMUOptionParameter *options)
>  {
>      int fd;
> @@ -901,6 +962,8 @@ static BlockDriver bdrv_file = {
>      .bdrv_get_allocated_file_size
>                          = raw_get_allocated_file_size,
>  
> +    .bdrv_lock = raw_lock,
> +
>      .create_options = raw_create_options,
>  };
>  
> Index: qemu-git/block_int.h
> ===================================================================
> --- qemu-git.orig/block_int.h
> +++ qemu-git/block_int.h
> @@ -146,6 +146,10 @@ struct BlockDriver {
>       */
>      int (*bdrv_has_zero_init)(BlockDriverState *bs);
>  
> +    /* File locking */
> +    int num_locks;
> +    int (*bdrv_lock)(BlockDriverState *bs, BDRVLockType lock_type);
> +
>      QLIST_ENTRY(BlockDriver) list;
>  };
>  
> Index: qemu-git/block/raw-win32.c
> ===================================================================
> --- qemu-git.orig/block/raw-win32.c
> +++ qemu-git/block/raw-win32.c
> @@ -242,6 +242,57 @@ static int64_t raw_get_allocated_file_si
>      return st.st_size;
>  }
>  
> +static int raw_lock(BlockDriverState *bs, int lock_type)
> +{
> +    BlockDriver *drv = bs->drv;
> +    BDRVRawState *s = bs->opaque;
> +    OVERLAPPED ov;
> +    BOOL res;
> +    DWORD num_bytes;
> +
> +    switch (lock_type) {
> +    case BDRV_F_RDLCK:
> +    case BDRV_F_WRLCK:
> +        if (drv->num_locks) {
> +            drv->num_locks++;
> +            return 0;
> +        }
> +
> +        memset(&ov, 0, sizeof(ov));
> +
> +        res = LockFileEx(s->hfile, LOCKFILE_EXCLUSIVE_LOCK, 0, ~0, ~0, &ov);
> +
> +        if (res == FALSE) {
> +            res = GetOverlappedResult(s->hfile, &ov, &num_bytes, TRUE);
> +        }
> +
> +        if (res == TRUE) {
> +            drv->num_locks = 1;
> +        }
> +
> +        break;
> +
> +    case BDRV_F_UNLCK:
> +        if (--drv->num_locks > 0) {
> +            return 0;
> +        }
> +
> +        assert(drv->num_locks >= 0);
> +
> +        res = UnlockFile(s->hfile, 0, 0, ~0, ~0);
> +        break;
> +
> +    default:
> +        return -EINVAL;
> +    }
> +
> +    if (res == FALSE) {
> +        return -EIO;
> +    }
> +
> +    return 0;
> +}
> +
>  static int raw_create(const char *filename, QEMUOptionParameter *options)
>  {
>      int fd;
> @@ -289,6 +340,7 @@ static BlockDriver bdrv_file = {
>      .bdrv_get_allocated_file_size
>                          = raw_get_allocated_file_size,
>  
> +    .bdrv_lock		= raw_lock,
>      .create_options = raw_create_options,
>  };
>  
>
Stefan Berger - Sept. 2, 2011, 1:53 a.m.
On 09/01/2011 01:32 PM, Michael S. Tsirkin wrote:
> On Wed, Aug 31, 2011 at 10:35:59AM -0400, Stefan Berger wrote:
>> This patch introduces file locking via fcntl() for the block layer so that
>> concurrent access to files shared by 2 Qemu instances, for example via NFS,
>> can be serialized. This feature is useful primarily during initial phases of
>> VM migration where the target machine's TIS driver validates the block
>> storage (and in a later patch checks for missing AES keys) and terminates
>> Qemu if the storage is found to be faulty. This then allows migration to
>> be gracefully terminated and Qemu continues running on the source machine.
>>
>> Support for win32 is based on win32 API and has been lightly tested with a
>> standalone test program locking shared storage from two different machines.
>>
>> To enable locking a file multiple times, a counter is used. Actual locking
>> happens the very first time and unlocking happens when the counter is zero.
>>
>> v7:
>>   - fixed compilation error in win32 part
>>
>> Signed-off-by: Stefan Berger<stefanb@linux.vnet.ibm.com>
> Generally, what all other devices do is perform validation
> as the last step in migration when device state
> is restored. On failure, management can decide what to do:
> retry migration or restart on source.
>
> Why is TPM special and needs to be treated differently?
>
>
>
Some background on the TPM: Typically a TPM is a chip with built-in
NVRAM into which it writes its persistent state. We are simulating
this NVRAM through a file (QCoW2 image).

What's special about the TPM is that Qemu stores the libtpms-internal 
state onto that QCoW2 file whereas normally it's the OS that stores its 
data onto files accessed through BlockDriverState. This in turn is 
related to the fact that the TPM does not only have the internal state 
of the TPM TIS frontend but also that of the libtpms backend where for 
example keys and the owner's password are stored and have to be written 
into persistent storage.

More detail: Typically one starts out with an empty QCoW2 file created 
via qemu-img. Once Qemu starts and initializes the libtpms-based TPM, it 
tries to read existing state from that QCoW2 file. Since there is no 
state stored in the QCoW2, the TPM will start initializing itself to an 
initial 'blank' state. Then once the user has booted into the OS he can 
use the TPM. Assuming that user now creates an Endorsement Key (EK)
using a command sent to the TPM, then this EK becomes part of the 
persistent state of the TPM and the persistent state is written into the 
QCoW2 file. The next time the VM is (cold) started that EK has to be 
there. Assuming now the user takes ownership of the TPM, then his 
password(s) also becomes part of the persistent state of the TPM and has 
to be written into the QCoW2 file. Again, the fact that ownership was 
taken leads to the requirement that the passwords are expected to be 
there upon another (cold) restart of the VM. All this is covered by 
storing the TPM's persistent state into that file.

I hope this explains what is special about the TPM.

   Stefan
>> ---
>>
>> ---
>>   block.c           |   41 +++++++++++++++++++++++++++++++++++
>>   block.h           |    8 ++++++
>>   block/raw-posix.c |   63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   block/raw-win32.c |   52 ++++++++++++++++++++++++++++++++++++++++++++
>>   block_int.h       |    4 +++
>>   5 files changed, 168 insertions(+)
>>
>> Index: qemu-git/block.c
>> ===================================================================
>> --- qemu-git.orig/block.c
>> +++ qemu-git/block.c
>> @@ -521,6 +521,8 @@ static int bdrv_open_common(BlockDriverS
>>           goto free_and_fail;
>>       }
>>
>> +    drv->num_locks = 0;
>> +
>>       bs->keep_read_only = bs->read_only = !(open_flags&  BDRV_O_RDWR);
>>
>>       ret = refresh_total_sectors(bs, bs->total_sectors);
>> @@ -1316,6 +1318,45 @@ void bdrv_get_geometry(BlockDriverState
>>       *nb_sectors_ptr = length;
>>   }
>>
>> +/* file locking */
>> +static int bdrv_lock_common(BlockDriverState *bs, BDRVLockType lock_type)
>> +{
>> +    BlockDriver *drv = bs->drv;
>> +
>> +    if (!drv) {
>> +        return -ENOMEDIUM;
>> +    }
>> +
>> +    if (bs->file) {
>> +        drv = bs->file->drv;
>> +        if (drv->bdrv_lock) {
>> +            return drv->bdrv_lock(bs->file, lock_type);
>> +        }
>> +    }
>> +
>> +    if (drv->bdrv_lock) {
>> +        return drv->bdrv_lock(bs, lock_type);
>> +    }
>> +
>> +    return -ENOTSUP;
>> +}
>> +
>> +
>> +int bdrv_lock(BlockDriverState *bs)
>> +{
>> +    if (bdrv_is_read_only(bs)) {
>> +        return bdrv_lock_common(bs, BDRV_F_RDLCK);
>> +    }
>> +
>> +    return bdrv_lock_common(bs, BDRV_F_WRLCK);
>> +}
>> +
>> +void bdrv_unlock(BlockDriverState *bs)
>> +{
>> +    bdrv_lock_common(bs, BDRV_F_UNLCK);
>> +}
>> +
>> +
>>   struct partition {
>>           uint8_t boot_ind;           /* 0x80 - active */
>>           uint8_t head;               /* starting head */
>> Index: qemu-git/block.h
>> ===================================================================
>> --- qemu-git.orig/block.h
>> +++ qemu-git/block.h
>> @@ -43,6 +43,12 @@ typedef struct QEMUSnapshotInfo {
>>   #define BDRV_SECTOR_MASK   ~(BDRV_SECTOR_SIZE - 1)
>>
>>   typedef enum {
>> +    BDRV_F_UNLCK,
>> +    BDRV_F_RDLCK,
>> +    BDRV_F_WRLCK,
>> +} BDRVLockType;
>> +
>> +typedef enum {
>>       BLOCK_ERR_REPORT, BLOCK_ERR_IGNORE, BLOCK_ERR_STOP_ENOSPC,
>>       BLOCK_ERR_STOP_ANY
>>   } BlockErrorAction;
>> @@ -100,6 +106,8 @@ int bdrv_commit(BlockDriverState *bs);
>>   void bdrv_commit_all(void);
>>   int bdrv_change_backing_file(BlockDriverState *bs,
>>       const char *backing_file, const char *backing_fmt);
>> +int bdrv_lock(BlockDriverState *bs);
>> +void bdrv_unlock(BlockDriverState *bs);
>>   void bdrv_register(BlockDriver *bdrv);
>>
>>
>> Index: qemu-git/block/raw-posix.c
>> ===================================================================
>> --- qemu-git.orig/block/raw-posix.c
>> +++ qemu-git/block/raw-posix.c
>> @@ -803,6 +803,67 @@ static int64_t raw_get_allocated_file_si
>>       return (int64_t)st.st_blocks * 512;
>>   }
>>
>> +static int raw_lock(BlockDriverState *bs, BDRVLockType lock_type)
>> +{
>> +    BlockDriver *drv = bs->drv;
>> +    BDRVRawState *s = bs->opaque;
>> +    struct flock flock = {
>> +        .l_whence = SEEK_SET,
>> +        .l_start = 0,
>> +        .l_len = 0,
>> +    };
>> +    int n;
>> +
>> +    switch (lock_type) {
>> +    case BDRV_F_RDLCK:
>> +    case BDRV_F_WRLCK:
>> +        if (drv->num_locks) {
>> +            drv->num_locks++;
>> +            return 0;
>> +        }
>> +        flock.l_type = (lock_type == BDRV_F_RDLCK) ? F_RDLCK : F_WRLCK;
>> +        break;
>> +
>> +    case BDRV_F_UNLCK:
>> +        if (--drv->num_locks>  0) {
>> +            return 0;
>> +        }
>> +
>> +        assert(drv->num_locks == 0);
>> +
>> +        flock.l_type = F_UNLCK;
>> +        break;
>> +
>> +    default:
>> +        return -EINVAL;
>> +    }
>> +
>> +    while (1) {
>> +        n = fcntl(s->fd, F_SETLKW,&flock);
>> +        if (n<  0) {
>> +            if (errno == EINTR) {
>> +                continue;
>> +            }
>> +            if (errno == EAGAIN) {
>> +                usleep(10000);
>> +                continue;
>> +            }
>> +        }
>> +        break;
>> +    }
>> +
>> +    if (n == 0&&
>> +        ((lock_type == BDRV_F_RDLCK) || (lock_type == BDRV_F_WRLCK))) {
>> +        drv->num_locks = 1;
>> +    }
>> +
>> +    if (n) {
>> +        return -errno;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   static int raw_create(const char *filename, QEMUOptionParameter *options)
>>   {
>>       int fd;
>> @@ -901,6 +962,8 @@ static BlockDriver bdrv_file = {
>>       .bdrv_get_allocated_file_size
>>                           = raw_get_allocated_file_size,
>>
>> +    .bdrv_lock = raw_lock,
>> +
>>       .create_options = raw_create_options,
>>   };
>>
>> Index: qemu-git/block_int.h
>> ===================================================================
>> --- qemu-git.orig/block_int.h
>> +++ qemu-git/block_int.h
>> @@ -146,6 +146,10 @@ struct BlockDriver {
>>        */
>>       int (*bdrv_has_zero_init)(BlockDriverState *bs);
>>
>> +    /* File locking */
>> +    int num_locks;
>> +    int (*bdrv_lock)(BlockDriverState *bs, BDRVLockType lock_type);
>> +
>>       QLIST_ENTRY(BlockDriver) list;
>>   };
>>
>> Index: qemu-git/block/raw-win32.c
>> ===================================================================
>> --- qemu-git.orig/block/raw-win32.c
>> +++ qemu-git/block/raw-win32.c
>> @@ -242,6 +242,57 @@ static int64_t raw_get_allocated_file_si
>>       return st.st_size;
>>   }
>>
>> +static int raw_lock(BlockDriverState *bs, int lock_type)
>> +{
>> +    BlockDriver *drv = bs->drv;
>> +    BDRVRawState *s = bs->opaque;
>> +    OVERLAPPED ov;
>> +    BOOL res;
>> +    DWORD num_bytes;
>> +
>> +    switch (lock_type) {
>> +    case BDRV_F_RDLCK:
>> +    case BDRV_F_WRLCK:
>> +        if (drv->num_locks) {
>> +            drv->num_locks++;
>> +            return 0;
>> +        }
>> +
>> +        memset(&ov, 0, sizeof(ov));
>> +
>> +        res = LockFileEx(s->hfile, LOCKFILE_EXCLUSIVE_LOCK, 0, ~0, ~0,&ov);
>> +
>> +        if (res == FALSE) {
>> +            res = GetOverlappedResult(s->hfile,&ov,&num_bytes, TRUE);
>> +        }
>> +
>> +        if (res == TRUE) {
>> +            drv->num_locks = 1;
>> +        }
>> +
>> +        break;
>> +
>> +    case BDRV_F_UNLCK:
>> +        if (--drv->num_locks>  0) {
>> +            return 0;
>> +        }
>> +
>> +        assert(drv->num_locks>= 0);
>> +
>> +        res = UnlockFile(s->hfile, 0, 0, ~0, ~0);
>> +        break;
>> +
>> +    default:
>> +        return -EINVAL;
>> +    }
>> +
>> +    if (res == FALSE) {
>> +        return -EIO;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   static int raw_create(const char *filename, QEMUOptionParameter *options)
>>   {
>>       int fd;
>> @@ -289,6 +340,7 @@ static BlockDriver bdrv_file = {
>>       .bdrv_get_allocated_file_size
>>                           = raw_get_allocated_file_size,
>>
>> +    .bdrv_lock		= raw_lock,
>>       .create_options = raw_create_options,
>>   };
>>
>>
Michael S. Tsirkin - Sept. 4, 2011, 7:32 p.m.
On Thu, Sep 01, 2011 at 09:53:40PM -0400, Stefan Berger wrote:
> >Generally, what all other devices do is perform validation
> >as the last step in migration when device state
> >is restored. On failure, management can decide what to do:
> >retry migration or restart on source.
> >
> >Why is TPM special and needs to be treated differently?
> >
> >
> >

...

> 
> More detail: Typically one starts out with an empty QCoW2 file
> created via qemu-img. Once Qemu starts and initializes the
> libtpms-based TPM, it tries to read existing state from that QCoW2
> file. Since there is no state stored in the QCoW2, the TPM will
> start initializing itself to an initial 'blank' state.

So it looks like the problem is you access the file when guest isn't
running.  Delaying the initialization until the guest actually starts
running will solve the problem in a way that is more consistent with
other qemu devices.
Stefan Berger - Sept. 6, 2011, 11:55 p.m.
On 09/04/2011 03:32 PM, Michael S. Tsirkin wrote:
> On Thu, Sep 01, 2011 at 09:53:40PM -0400, Stefan Berger wrote:
>>> Generally, what all other devices do is perform validation
>>> as the last step in migration when device state
>>> is restored. On failure, management can decide what to do:
>>> retry migration or restart on source.
>>>
>>> Why is TPM special and needs to be treated differently?
>>>
>>>
>>>
> ...
>
>> More detail: Typically one starts out with an empty QCoW2 file
>> created via qemu-img. Once Qemu starts and initializes the
>> libtpms-based TPM, it tries to read existing state from that QCoW2
>> file. Since there is no state stored in the QCoW2, the TPM will
>> start initializing itself to an initial 'blank' state.
> So it looks like the problem is you access the file when guest isn't
> running.  Delaying the initialization until the guest actually starts
> running will solve the problem in a way that is more consistent with
> other qemu devices.
>
I'd agree if there wasn't one more thing: encryption of the data inside
the QCoW2 image.

First: There are two ways to encrypt the data.

One comes with the QCoW2 type of image and it comes for free. Set the 
encryption flag when creating the QCoW2 file and one has to provide a 
key to access the QCoW2. I found this mode problematic for users since 
it required me to go through the monitor every time I started the VM. 
Besides that the key is provided so late that all devices are already 
initialized and if the wrong key was provided the only thing the TPM can 
do is to go into shutdown mode since there is state on the QCoW2 but it 
cannot be decrypted. This also became problematic when doing migrations
with libvirt, for example if one were to have a wrong key/password
installed on the target side -- graceful termination of the migration is
impossible.

So the above drove the implementation of the encryption mode added in 
patch 10 in this series. Here the key is provided via command line and 
it can be used immediately. So I am reading the state blobs from the 
file, decrypt them, create the CRC32 on the plain data and check against 
the CRC32 stored in the 'directory'. If it doesn't match the expected 
CRC32 either the key was wrong or the state is corrupted and I can 
terminate Qemu gracefully. I can also react appropriately if no key was 
provided but one is expected and vice-versa. Also in case of migration 
this now allows me to terminate Qemu gracefully so it continues running 
on the source. This is an improvement over the situation described above 
where in case the target had the wrong key the TPM went into shutdown 
mode and the user would be wondering why that is -- the TPM becomes 
inaccessible. However, particularly in the case of migration with shared 
storage I need to access the QCoW2 file to check whether on the target I 
have the right key. And this happens very early during qemu startup on 
the target machine. So that's where the file locking on the block layer 
comes in. I want to serialize access to the QCoW2 so I don't read 
intermediate state on the migration-target host that can occur if the 
source is currently writing TPM state data.

This patch and the command line provided key along with the encryption 
mode added in patch 10 in this series add to usability and help prevent 
failures.

Regards,
      Stefan
Michael S. Tsirkin - Sept. 7, 2011, 11:18 a.m.
On Tue, Sep 06, 2011 at 07:55:48PM -0400, Stefan Berger wrote:
> On 09/04/2011 03:32 PM, Michael S. Tsirkin wrote:
> >On Thu, Sep 01, 2011 at 09:53:40PM -0400, Stefan Berger wrote:
> >>>Generally, what all other devices do is perform validation
> >>>as the last step in migration when device state
> >>>is restored. On failure, management can decide what to do:
> >>>retry migration or restart on source.
> >>>
> >>>Why is TPM special and needs to be treated differently?
> >>>
> >>>
> >>>
> >...
> >
> >>More detail: Typically one starts out with an empty QCoW2 file
> >>created via qemu-img. Once Qemu starts and initializes the
> >>libtpms-based TPM, it tries to read existing state from that QCoW2
> >>file. Since there is no state stored in the QCoW2, the TPM will
> >>start initializing itself to an initial 'blank' state.
> >So it looks like the problem is you access the file when guest isn't
> >running.  Delaying the initialization until the guest actually starts
> >running will solve the problem in a way that is more consistent with
> >other qemu devices.
> >
> I'd agree if there wasn't one more thing: encryption on the data
> inside the QCoW2 filesystem
> 
> First: There are two ways to encrypt the data.
> 
> One comes with the QCoW2 type of image and it comes for free. Set
> the encryption flag when creating the QCoW2 file and one has to
> provide a key to access the QCoW2. I found this mode problematic for
> users since it required me to go through the monitor every time I
> started the VM. Besides that the key is provided so late that all
> devices are already initialized and if the wrong key was provided
> the only thing the TPM can do is to go into shutdown mode since
> there is state on the QCoW2 but it cannot be decrypted. This also
> became problematic when doing migrations with libvirt for example
> and one was to have a wrong key/password installed on the target
> side -- graceful termination of the migration is impossible.
> 
> So the above drove the implementation of the encryption mode added
> in patch 10 in this series. Here the key is provided via command
> line and it can be used immediately. So I am reading the state blobs
> from the file, decrypt them, create the CRC32 on the plain data and
> check against the CRC32 stored in the 'directory'. If it doesn't
> match the expected CRC32 either the key was wrong or the state is
> corrupted and I can terminate Qemu gracefully. I can also react
> appropriately if no key was provided but one is expected and
> vice-versa. Also in case of migration this now allows me to
> terminate Qemu gracefully so it continues running on the source.
> This is an improvement over the situation described above where in
> case the target had the wrong key the TPM went into shutdown mode
> and the user would be wondering why that is -- the TPM becomes
> inaccessible. However, particularly in the case of migration with
> shared storage I need to access the QCoW2 file to check whether on
> the target I have the right key. And this happens very early during
> qemu startup on the target machine. So that's where the file locking
> on the block layer comes in. I want to serialize access to the QCoW2
> so I don't read intermediate state on the migration-target host that
> can occur if the source is currently writing TPM state data.
> 
> This patch and the command line provided key along with the
> encryption mode added in patch 10 in this series add to usability
> and help prevent failures.
> 
> Regards,
>      Stefan

Sure, it's a useful feature of validating the device early.
But as I said above, it's useful for other devices besides
TPM. However it introduces issues where state changes
since guest keeps running (such as what you have described here).

I think solving this in a way specific to TPM is a mistake.
Passing key on command line does not mean you must use it
immediately, this can still be done when guest starts running.

Further, the patchset seems to be split in
a way that introduces support headaches and makes review
harder, not easier. In this case, we will have to support
both a broken mode (key supplied later) and a non-broken one
(key supplied on init). It would be better to implement
a single mode, in a single patch.
Stefan Berger - Sept. 7, 2011, 1:06 p.m.
On 09/07/2011 07:18 AM, Michael S. Tsirkin wrote:
> On Tue, Sep 06, 2011 at 07:55:48PM -0400, Stefan Berger wrote:
>> On 09/04/2011 03:32 PM, Michael S. Tsirkin wrote:
>>> On Thu, Sep 01, 2011 at 09:53:40PM -0400, Stefan Berger wrote:
>>>>> Generally, what all other devices do is perform validation
>>>>> as the last step in migration when device state
>>>>> is restored. On failure, management can decide what to do:
>>>>> retry migration or restart on source.
>>>>>
>>>>> Why is TPM special and needs to be treated differently?
>>>>>
>>>>>
>>>>>
>>> ...
>>>
>>>> More detail: Typically one starts out with an empty QCoW2 file
>>>> created via qemu-img. Once Qemu starts and initializes the
>>>> libtpms-based TPM, it tries to read existing state from that QCoW2
>>>> file. Since there is no state stored in the QCoW2, the TPM will
>>>> start initializing itself to an initial 'blank' state.
>>> So it looks like the problem is you access the file when guest isn't
>>> running.  Delaying the initialization until the guest actually starts
>>> running will solve the problem in a way that is more consistent with
>>> other qemu devices.
>>>
>> I'd agree if there wasn't one more thing: encryption on the data
>> inside the QCoW2 filesystem
>>
>> First: There are two ways to encrypt the data.
>>
>> One comes with the QCoW2 type of image and it comes for free. Set
>> the encryption flag when creating the QCoW2 file and one has to
>> provide a key to access the QCoW2. I found this mode problematic for
>> users since it required me to go through the monitor every time I
>> started the VM. Besides that the key is provided so late that all
>> devices are already initialized and if the wrong key was provided
>> the only thing the TPM can do is to go into shutdown mode since
>> there is state on the QCoW2 but it cannot be decrypted. This also
>> became problematic when doing migrations with libvirt for example
>> and one was to have a wrong key/password installed on the target
>> side -- graceful termination of the migration is impossible.
>>
>> So the above drove the implementation of the encryption mode added
>> in patch 10 in this series. Here the key is provided via command
>> line and it can be used immediately. So I am reading the state blobs
>> from the file, decrypt them, create the CRC32 on the plain data and
>> check against the CRC32 stored in the 'directory'. If it doesn't
>> match the expected CRC32 either the key was wrong or the state is
>> corrupted and I can terminate Qemu gracefully. I can also react
>> appropriately if no key was provided but one is expected and
>> vice-versa. Also in case of migration this now allows me to
>> terminate Qemu gracefully so it continues running on the source.
>> This is an improvement over the situation described above where in
>> case the target had the wrong key the TPM went into shutdown mode
>> and the user would be wondering why that is -- the TPM becomes
>> inaccessible. However, particularly in the case of migration with
>> shared storage I need to access the QCoW2 file to check whether on
>> the target I have the right key. And this happens very early during
>> qemu startup on the target machine. So that's where the file locking
>> on the block layer comes in. I want to serialize access to the QCoW2
>> so I don't read intermediate state on the migration-target host that
>> can occur if the source is currently writing TPM state data.
>>
>> This patch and the command line provided key along with the
>> encryption mode added in patch 10 in this series add to usability
>> and help prevent failures.
>>
>> Regards,
>>       Stefan
> Sure, it's a useful feature of validating the device early.
> But as I said above, it's useful for other devices besides
> TPM. However it introduces issues where state changes
> since guest keeps running (such as what you have described here).
>
> I think solving this in a way specific to TPM is a mistake.
> Passing key on command line does not mean you must use it
> immediately, this can still be done when guest starts running.
>
If I was not using the key immediately I could just drop this patch and 
the following one introducing the blob encryption and we would have to 
go with the QCoW2 encryption mode. Delaying the key usage then becomes 
equivalent to how QCoW2 is handling its encryption mode along with the 
problems related to user-friendliness / handling of missing or wrong 
keys described earlier.

> Further, the patchset seems to be split in
> a way that introduces support headaches and makes review
Patch 8 introduces file locking for bdrv. Patch 9 implements support for 
storing TPM blobs inside a QCoW2 image and makes use of the locking. 
Patch 10 adds encryption support for the TPM blobs. What otherwise would 
be the logical split?
> harder, not easier. In this case, we will have to support
> both a broken mode (key supplied later) and a non-broken one
> (key supplied on init). It would be better to implement
> a single mode, in a single patch.
>
>
If we call QCoW2 encryption support the 'broken mode' and we want to do 
better than this then I do have to solve it in a TPM-specific way since 
Qemu otherwise does not support any better method (afaics).
QCoW2 encryption is there today and we get it for free. The only thing I 
could really do in patch 10 in this series is check whether the QCoW2 
image is encrypted and refuse to use it.
I find this contradicts what you said above.

Regards,
    Stefan
Michael S. Tsirkin - Sept. 7, 2011, 1:16 p.m.
On Wed, Sep 07, 2011 at 09:06:05AM -0400, Stefan Berger wrote:
> >>First: There are two ways to encrypt the data.
> >>
> >>One comes with the QCoW2 type of image and it comes for free. Set
> >>the encryption flag when creating the QCoW2 file and one has to
> >>provide a key to access the QCoW2. I found this mode problematic for
> >>users since it required me to go through the monitor every time I
> >>started the VM. Besides that the key is provided so late that all
> >>devices are already initialized and if the wrong key was provided
> >>the only thing the TPM can do is to go into shutdown mode since
> >>there is state on the QCoW2 but it cannot be decrypted. This also
> >>became problematic when doing migrations with libvirt for example
> >>and one was to have a wrong key/password installed on the target
> >>side -- graceful termination of the migration is impossible.

OK let's go back to this for a moment. Add a load
callback, access file there. On failure, return
an error. migration fails gracefully, and
management can retry, or migrate to another node,
or whatever.

What's the problem exactly?
Stefan Berger - Sept. 7, 2011, 1:56 p.m.
On 09/07/2011 09:16 AM, Michael S. Tsirkin wrote:
> On Wed, Sep 07, 2011 at 09:06:05AM -0400, Stefan Berger wrote:
>>>> First: There are two ways to encrypt the data.
>>>>
>>>> One comes with the QCoW2 type of image and it comes for free. Set
>>>> the encryption flag when creating the QCoW2 file and one has to
>>>> provide a key to access the QCoW2. I found this mode problematic for
>>>> users since it required me to go through the monitor every time I
>>>> started the VM. Besides that the key is provided so late that all
>>>> devices are already initialized and if the wrong key was provided
>>>> the only thing the TPM can do is to go into shutdown mode since
>>>> there is state on the QCoW2 but it cannot be decrypted. This also
>>>> became problematic when doing migrations with libvirt for example
>>>> and one was to have a wrong key/password installed on the target
>>>> side -- graceful termination of the migration is impossible.
> OK let's go back to this for a moment. Add a load
> callback, access file there. On failure, return
> an error. migration fails gracefully, and
> management can retry, or migrate to another node,
> or whatever.
>
> What's the problem exactly?
>
>
The switch-over from source to destination already happened when the key 
is finally passed and you just won't be able to access the QCoW2 in case 
the key was wrong. Similar problems occur when you start a VM with an 
encrypted QCoW2 image. The monitor will prompt you for the password and 
then you start the VM and if the password was wrong the OS just won't be 
able to access the image.

    Stefan
Michael S. Tsirkin - Sept. 7, 2011, 2:10 p.m.
On Wed, Sep 07, 2011 at 09:56:52AM -0400, Stefan Berger wrote:
> On 09/07/2011 09:16 AM, Michael S. Tsirkin wrote:
> >On Wed, Sep 07, 2011 at 09:06:05AM -0400, Stefan Berger wrote:
> >>>>First: There are two ways to encrypt the data.
> >>>>
> >>>>One comes with the QCoW2 type of image and it comes for free. Set
> >>>>the encryption flag when creating the QCoW2 file and one has to
> >>>>provide a key to access the QCoW2. I found this mode problematic for
> >>>>users since it required me to go through the monitor every time I
> >>>>started the VM. Besides that the key is provided so late that all
> >>>>devices are already initialized and if the wrong key was provided
> >>>>the only thing the TPM can do is to go into shutdown mode since
> >>>>there is state on the QCoW2 but it cannot be decrypted. This also
> >>>>became problematic when doing migrations with libvirt for example
> >>>>and one was to have a wrong key/password installed on the target
> >>>>side -- graceful termination of the migration is impossible.
> >OK let's go back to this for a moment. Add a load
> >callback, access file there. On failure, return
> >an error. migration fails gracefully, and
> >management can retry, or migrate to another node,
> >or whatever.
> >
> >What's the problem exactly?
> >
> >
> The switch-over from source to destination already happened when the
> key is finally passed and you just won't be able to access the QCoW2
> in case the key was wrong.

This is exactly what happens with any kind of other migration error.
So fail migration, and source can get restarted if necessary.

> Similar problems occur when you start a
> VM with an encrypted QCoW2 image. The monitor will prompt you for
> the password and then you start the VM and if the password was wrong
> the OS just won't be able to access the image.
> 
>    Stefan

Why can't you verify the password?
Stefan Berger - Sept. 7, 2011, 2:25 p.m.
On 09/07/2011 10:10 AM, Michael S. Tsirkin wrote:
> On Wed, Sep 07, 2011 at 09:56:52AM -0400, Stefan Berger wrote:
>> On 09/07/2011 09:16 AM, Michael S. Tsirkin wrote:
>>> On Wed, Sep 07, 2011 at 09:06:05AM -0400, Stefan Berger wrote:
>>>>>> First: There are two ways to encrypt the data.
>>>>>>
>>>>>> One comes with the QCoW2 type of image and it comes for free. Set
>>>>>> the encryption flag when creating the QCoW2 file and one has to
>>>>>> provide a key to access the QCoW2. I found this mode problematic for
>>>>>> users since it required me to go through the monitor every time I
>>>>>> started the VM. Besides that the key is provided so late that all
>>>>>> devices are already initialized and if the wrong key was provided
>>>>>> the only thing the TPM can do is to go into shutdown mode since
>>>>>> there is state on the QCoW2 but it cannot be decrypted. This also
>>>>>> became problematic when doing migrations with libvirt for example
>>>>>> and one was to have a wrong key/password installed on the target
>>>>>> side -- graceful termination of the migration is impossible.
>>> OK let's go back to this for a moment. Add a load
>>> callback, access file there. On failure, return
>>> an error. migration fails gracefully, and
>>> management can retry, or migrate to another node,
>>> or whatever.
>>>
>>> What's the problem exactly?
>>>
>>>
>> The switch-over from source to destination already happened when the
>> key is finally passed and you just won't be able to access the QCoW2
>> in case the key was wrong.
> This is exactly what happens with any kind of othe rmigration errror.
> So fail migration, and source can get restarted if necessary.
>
I guess I wanted to improve on this and catch user errors.
If we let migration fail then all you can do is try to terminate the VM 
on the destination and cold-start on the source.
>> Similar problems occur when you start a
>> VM with an encrypted QCoW2 image. The monitor will prompt you for
>> the password and then you start the VM and if the password was wrong
>> the OS just won't be able to access the image.
>>
>>     Stefan
> Why can't you verify the password?
>
I do verify the key/password in the TPM driver. If the driver cannot 
make sense of the contents of the QCoW2 due to wrong key I simply put 
the driver into failure mode. That's all I can do with encrypted QCoW2.

In case of a QCoW2 encrypted VM image it's different. There I guess the 
intelligence of what is good and bad data is only inside the OS.

    Stefan
Michael S. Tsirkin - Sept. 7, 2011, 2:35 p.m.
On Wed, Sep 07, 2011 at 10:25:08AM -0400, Stefan Berger wrote:
> On 09/07/2011 10:10 AM, Michael S. Tsirkin wrote:
> >On Wed, Sep 07, 2011 at 09:56:52AM -0400, Stefan Berger wrote:
> >>On 09/07/2011 09:16 AM, Michael S. Tsirkin wrote:
> >>>On Wed, Sep 07, 2011 at 09:06:05AM -0400, Stefan Berger wrote:
> >>>>>>First: There are two ways to encrypt the data.
> >>>>>>
> >>>>>>One comes with the QCoW2 type of image and it comes for free. Set
> >>>>>>the encryption flag when creating the QCoW2 file and one has to
> >>>>>>provide a key to access the QCoW2. I found this mode problematic for
> >>>>>>users since it required me to go through the monitor every time I
> >>>>>>started the VM. Besides that the key is provided so late that all
> >>>>>>devices are already initialized and if the wrong key was provided
> >>>>>>the only thing the TPM can do is to go into shutdown mode since
> >>>>>>there is state on the QCoW2 but it cannot be decrypted. This also
> >>>>>>became problematic when doing migrations with libvirt for example
> >>>>>>and one was to have a wrong key/password installed on the target
> >>>>>>side -- graceful termination of the migration is impossible.
> >>>OK let's go back to this for a moment. Add a load
> >>>callback, access file there. On failure, return
> >>>an error. migration fails gracefully, and
> >>>management can retry, or migrate to another node,
> >>>or whatever.
> >>>
> >>>What's the problem exactly?
> >>>
> >>>
> >>The switch-over from source to destination already happened when the
> >>key is finally passed and you just won't be able to access the QCoW2
> >>in case the key was wrong.
> >This is exactly what happens with any kind of othe rmigration errror.
> >So fail migration, and source can get restarted if necessary.
> >
> I guess I wanted to improve on this and catch user errors.
> If we let migration fail then all you can do is try to terminate the
> VM on the destination and cold-start on the source.

No, normally if migration fails VM is not started on destination,
and it can just continue on source.

> >>Similar problems occur when you start a
> >>VM with an encrypted QCoW2 image. The monitor will prompt you for
> >>the password and then you start the VM and if the password was wrong
> >>the OS just won't be able to access the image.
> >>
> >>    Stefan
> >Why can't you verify the password?
> >
> I do verify the key/password in the TPM driver. If the driver cannot
> make sense of the contents of the QCoW2 due to wrong key I simply
> put the driver into failure mode. That's all I can do with encrypted
> QCoW2.

You can return error from init script which will make qemu exit.

> In case of a QCoW2 encrypted VM image it's different. There I guess
> the intelligence of what is good and bad data is only inside the OS.
> 
>    Stefan
Stefan Berger - Sept. 7, 2011, 3:06 p.m.
On 09/07/2011 10:35 AM, Michael S. Tsirkin wrote:
> On Wed, Sep 07, 2011 at 10:25:08AM -0400, Stefan Berger wrote:
>> On 09/07/2011 10:10 AM, Michael S. Tsirkin wrote:
>>> On Wed, Sep 07, 2011 at 09:56:52AM -0400, Stefan Berger wrote:
>>>> On 09/07/2011 09:16 AM, Michael S. Tsirkin wrote:
>>>>> On Wed, Sep 07, 2011 at 09:06:05AM -0400, Stefan Berger wrote:
>>>>>>>> First: There are two ways to encrypt the data.
>>>>>>>>
>>>>>>>> One comes with the QCoW2 type of image and it comes for free. Set
>>>>>>>> the encryption flag when creating the QCoW2 file and one has to
>>>>>>>> provide a key to access the QCoW2. I found this mode problematic for
>>>>>>>> users since it required me to go through the monitor every time I
>>>>>>>> started the VM. Besides that the key is provided so late that all
>>>>>>>> devices are already initialized and if the wrong key was provided
>>>>>>>> the only thing the TPM can do is to go into shutdown mode since
>>>>>>>> there is state on the QCoW2 but it cannot be decrypted. This also
>>>>>>>> became problematic when doing migrations with libvirt for example
>>>>>>>> and one was to have a wrong key/password installed on the target
>>>>>>>> side -- graceful termination of the migration is impossible.
>>>>> OK let's go back to this for a moment. Add a load
>>>>> callback, access file there. On failure, return
>>>>> an error. migration fails gracefully, and
>>>>> management can retry, or migrate to another node,
>>>>> or whatever.
>>>>>
>>>>> What's the problem exactly?
>>>>>
>>>>>
>>>> The switch-over from source to destination already happened when the
>>>> key is finally passed and you just won't be able to access the QCoW2
>>>> in case the key was wrong.
>>> This is exactly what happens with any kind of othe rmigration errror.
>>> So fail migration, and source can get restarted if necessary.
>>>
>> I guess I wanted to improve on this and catch user errors.
>> If we let migration fail then all you can do is try to terminate the
>> VM on the destination and cold-start on the source.
> No, normally if migration fails VM is not started on destination,
> and it can just continue on source.
>
When I had tried this in conjunction with encrypted QCoW2 the 
switch-over already had happened and the source had died. So a wrong key 
on the destination was fatal.
>>>> Similar problems occur when you start a
>>>> VM with an encrypted QCoW2 image. The monitor will prompt you for
>>>> the password and then you start the VM and if the password was wrong
>>>> the OS just won't be able to access the image.
>>>>
>>>>     Stefan
>>> Why can't you verify the password?
>>>
>> I do verify the key/password in the TPM driver. If the driver cannot
>> make sense of the contents of the QCoW2 due to wrong key I simply
>> put the driver into failure mode. That's all I can do with encrypted
>> QCoW2.
> You can return error from init script which will make qemu exit.
>
I can return an error code when the front- and backend interfaces are 
initialized, but that happens really early and the encryption key 
entered through the monitor is not available at this point.

I also don't get a notification about when the key was entered. In case 
of QCoW2 encryption (and migration) I delay initialization until very 
late, basically when the VM accesses the tpm tis hardware emulation 
layer again (needs to be done this way I think to support block 
migration where I cannot even access the block device early on at all). 
Only then I find out that the key was wrong. I guess any other handling 
would require blockdev.c's invocation of monitor_read_bdrv_key_start() 
to be 'somehow' extended and ... do what ? loop until the correct 
password was entered?

    Stefan
>> In case of a QCoW2 encrypted VM image it's different. There I guess
>> the intelligence of what is good and bad data is only inside the OS.
>>
>>     Stefan
Michael S. Tsirkin - Sept. 7, 2011, 3:16 p.m.
On Wed, Sep 07, 2011 at 11:06:42AM -0400, Stefan Berger wrote:
> On 09/07/2011 10:35 AM, Michael S. Tsirkin wrote:
> >On Wed, Sep 07, 2011 at 10:25:08AM -0400, Stefan Berger wrote:
> >>On 09/07/2011 10:10 AM, Michael S. Tsirkin wrote:
> >>>On Wed, Sep 07, 2011 at 09:56:52AM -0400, Stefan Berger wrote:
> >>>>On 09/07/2011 09:16 AM, Michael S. Tsirkin wrote:
> >>>>>On Wed, Sep 07, 2011 at 09:06:05AM -0400, Stefan Berger wrote:
> >>>>>>>>First: There are two ways to encrypt the data.
> >>>>>>>>
> >>>>>>>>One comes with the QCoW2 type of image and it comes for free. Set
> >>>>>>>>the encryption flag when creating the QCoW2 file and one has to
> >>>>>>>>provide a key to access the QCoW2. I found this mode problematic for
> >>>>>>>>users since it required me to go through the monitor every time I
> >>>>>>>>started the VM. Besides that the key is provided so late that all
> >>>>>>>>devices are already initialized and if the wrong key was provided
> >>>>>>>>the only thing the TPM can do is to go into shutdown mode since
> >>>>>>>>there is state on the QCoW2 but it cannot be decrypted. This also
> >>>>>>>>became problematic when doing migrations with libvirt for example
> >>>>>>>>and one was to have a wrong key/password installed on the target
> >>>>>>>>side -- graceful termination of the migration is impossible.
> >>>>>OK let's go back to this for a moment. Add a load
> >>>>>callback, access file there. On failure, return
> >>>>>an error. migration fails gracefully, and
> >>>>>management can retry, or migrate to another node,
> >>>>>or whatever.
> >>>>>
> >>>>>What's the problem exactly?
> >>>>>
> >>>>>
> >>>>The switch-over from source to destination already happened when the
> >>>>key is finally passed and you just won't be able to access the QCoW2
> >>>>in case the key was wrong.
> >>>This is exactly what happens with any kind of othe rmigration errror.
> >>>So fail migration, and source can get restarted if necessary.
> >>>
> >>I guess I wanted to improve on this and catch user errors.
> >>If we let migration fail then all you can do is try to terminate the
> >>VM on the destination and cold-start on the source.
> >No, normally if migration fails VM is not started on destination,
> >and it can just continue on source.
> >
> When I had tried this in conjunction with encrypted QCoW2 the
> switch-over already had happened and the source had died.

Giving continue command should bring it back.

> So a wrong key on the destination was fatal.

So it's a bug in the code then?

> >>>>Similar problems occur when you start a
> >>>>VM with an encrypted QCoW2 image. The monitor will prompt you for
> >>>>the password and then you start the VM and if the password was wrong
> >>>>the OS just won't be able to access the image.
> >>>>
> >>>>    Stefan
> >>>Why can't you verify the password?
> >>>
> >>I do verify the key/password in the TPM driver. If the driver cannot
> >>make sense of the contents of the QCoW2 due to wrong key I simply
> >>put the driver into failure mode. That's all I can do with encrypted
> >>QCoW2.
> >You can return error from init script which will make qemu exit.
> >
> I can return an error code when the front- and backend interfaces
> are initialized, but that happens really early and the encyrption
> key entered through the monitor is not available at this point.
>
> I also don't get a notification about when the key was entered. In
> case of QCoW2 encryption (and migration) I delay initialization
> until very late, basically when the VM accesses the tpm tis hardware
> emulation layer again (needs to be done this way I think to support
> block migration where I cannot even access the block device early on
> at all).

So do it in the loadvm callback. This happens when guest is
stopped on source, so no need for locks.
On failure you return an error and migration fails
before destination is started. You can

> Only then I find out that the key was wrong. I guess any
> other handling would require blockdev.c's invocation of
> monitor_read_bdrv_key_start() to be 'somehow' extended and ... do
> what ? loop until the correct password was entered?

Return an error so vm start fails?

>    Stefan
> >>In case of a QCoW2 encrypted VM image it's different. There I guess
> >>the intelligence of what is good and bad data is only inside the OS.
> >>
> >>    Stefan
Stefan Berger - Sept. 7, 2011, 4:08 p.m.
On 09/07/2011 11:16 AM, Michael S. Tsirkin wrote:
> On Wed, Sep 07, 2011 at 11:06:42AM -0400, Stefan Berger wrote:
>> On 09/07/2011 10:35 AM, Michael S. Tsirkin wrote:
>>> On Wed, Sep 07, 2011 at 10:25:08AM -0400, Stefan Berger wrote:
>>>> On 09/07/2011 10:10 AM, Michael S. Tsirkin wrote:
>>>>> On Wed, Sep 07, 2011 at 09:56:52AM -0400, Stefan Berger wrote:
>>>>>> On 09/07/2011 09:16 AM, Michael S. Tsirkin wrote:
>>>>>>> On Wed, Sep 07, 2011 at 09:06:05AM -0400, Stefan Berger wrote:
>>>>>>>>>> First: There are two ways to encrypt the data.
>>>>>>>>>>
>>>>>>>>>> One comes with the QCoW2 type of image and it comes for free. Set
>>>>>>>>>> the encryption flag when creating the QCoW2 file and one has to
>>>>>>>>>> provide a key to access the QCoW2. I found this mode problematic for
>>>>>>>>>> users since it required me to go through the monitor every time I
>>>>>>>>>> started the VM. Besides that the key is provided so late that all
>>>>>>>>>> devices are already initialized and if the wrong key was provided
>>>>>>>>>> the only thing the TPM can do is to go into shutdown mode since
>>>>>>>>>> there is state on the QCoW2 but it cannot be decrypted. This also
>>>>>>>>>> became problematic when doing migrations with libvirt for example
>>>>>>>>>> and one was to have a wrong key/password installed on the target
>>>>>>>>>> side -- graceful termination of the migration is impossible.
>>>>>>> OK let's go back to this for a moment. Add a load
>>>>>>> callback, access file there. On failure, return
>>>>>>> an error. migration fails gracefully, and
>>>>>>> management can retry, or migrate to another node,
>>>>>>> or whatever.
>>>>>>>
>>>>>>> What's the problem exactly?
>>>>>>>
>>>>>>>
>>>>>> The switch-over from source to destination already happened when the
>>>>>> key is finally passed and you just won't be able to access the QCoW2
>>>>>> in case the key was wrong.
>>>>> This is exactly what happens with any kind of othe rmigration errror.
>>>>> So fail migration, and source can get restarted if necessary.
>>>>>
>>>> I guess I wanted to improve on this and catch user errors.
>>>> If we let migration fail then all you can do is try to terminate the
>>>> VM on the destination and cold-start on the source.
>>> No, normally if migration fails VM is not started on destination,
>>> and it can just continue on source.
>>>
>> When I had tried this in conjunction with encrypted QCoW2 the
>> switch-over already had happened and the source had died.
> Giving continue command should bring it back.
>
On the source? Qemu on the source didn't exist anymore.
>> So a wrong key on the destination was fatal.
> So it's a bug in the code then?
>
From what I saw, yes. Migration is not complete until the passwords had 
been entered. Though the requirement for a correct password wasn't there 
before because Qemu just couldn't know which password is correct since 
it doesn't know what content in a VM image is correct -- just using the 
wrong key gives you content but it's of course not understandable.
>>>>>> Similar problems occur when you start a
>>>>>> VM with an encrypted QCoW2 image. The monitor will prompt you for
>>>>>> the password and then you start the VM and if the password was wrong
>>>>>> the OS just won't be able to access the image.
>>>>>>
>>>>>>     Stefan
>>>>> Why can't you verify the password?
>>>>>
>>>> I do verify the key/password in the TPM driver. If the driver cannot
>>>> make sense of the contents of the QCoW2 due to wrong key I simply
>>>> put the driver into failure mode. That's all I can do with encrypted
>>>> QCoW2.
>>> You can return error from init script which will make qemu exit.
>>>
>> I can return an error code when the front- and backend interfaces
>> are initialized, but that happens really early and the encyrption
>> key entered through the monitor is not available at this point.
>>
>> I also don't get a notification about when the key was entered. In
>> case of QCoW2 encryption (and migration) I delay initialization
>> until very late, basically when the VM accesses the tpm tis hardware
>> emulation layer again (needs to be done this way I think to support
>> block migration where I cannot even access the block device early on
>> at all).
> So it in the loadvm callback. This happens when guest is
> stopped on source, so no need for locks.
Two bigger cases here:

1) Encryption key passed via command line:
     - Migration with shared storage: When Qemu is initializing on the 
destination side I try to access the QCoW2 file. I do some basic tests 
to check whether a key was needed but none was given or whether some of 
the content could be read to confirm a valid key. This is done really 
early on during startup of Qemu on the destination side while or before 
actually the memory pages were transferred. Graceful termination was 
easily possible here.
     - Migration using block migration: During initialization I only see 
an empty QCoW2 file (created by libvirt). I terminate at this point and 
do another initialization later on which basically comes down to 
initializing upon access of the TPM TIS interface. At this point 
graceful termination wasn't possible anymore. There may be a possibility 
to do this in the loadvm callback, assuming block migration at that 
point has already finished, which I am not quite sure. Though along with 
case 2) below this would then end up in 3 different times for 
initialization of the emulation layer.

2) QCoW2 encryption:
     - This maps to the last case above. Also here graceful termination 
wasn't possible.

As for the loadvm callback: I have a note in my code that in case of 
QCoW2 encryption the key is not available, yet. So I even have to defer 
initialization further. In this case Qemu on the source machine will 
have terminated.

    Stefan


> On failure you return an error and migration fails
> before destination is started. You can
>

>> Only then I find out that the key was wrong. I guess any
>> other handling would require blockdev.c's invocation of
>> monitor_read_bdrv_key_start() to be 'somehow' extended and ... do
>> what ? loop until the correct password was entered?
> Return an error so vm start fails?
>
>>     Stefan
>>>> In case of a QCoW2 encrypted VM image it's different. There I guess
>>>> the intelligence of what is good and bad data is only inside the OS.
>>>>
>>>>     Stefan
Michael S. Tsirkin - Sept. 7, 2011, 6:49 p.m.
On Wed, Sep 07, 2011 at 12:08:22PM -0400, Stefan Berger wrote:
> On 09/07/2011 11:16 AM, Michael S. Tsirkin wrote:
> >On Wed, Sep 07, 2011 at 11:06:42AM -0400, Stefan Berger wrote:
> >>On 09/07/2011 10:35 AM, Michael S. Tsirkin wrote:
> >>>On Wed, Sep 07, 2011 at 10:25:08AM -0400, Stefan Berger wrote:
> >>>>On 09/07/2011 10:10 AM, Michael S. Tsirkin wrote:
> >>>>>On Wed, Sep 07, 2011 at 09:56:52AM -0400, Stefan Berger wrote:
> >>>>>>On 09/07/2011 09:16 AM, Michael S. Tsirkin wrote:
> >>>>>>>On Wed, Sep 07, 2011 at 09:06:05AM -0400, Stefan Berger wrote:
> >>>>>>>>>>First: There are two ways to encrypt the data.
> >>>>>>>>>>
> >>>>>>>>>>One comes with the QCoW2 type of image and it comes for free. Set
> >>>>>>>>>>the encryption flag when creating the QCoW2 file and one has to
> >>>>>>>>>>provide a key to access the QCoW2. I found this mode problematic for
> >>>>>>>>>>users since it required me to go through the monitor every time I
> >>>>>>>>>>started the VM. Besides that the key is provided so late that all
> >>>>>>>>>>devices are already initialized and if the wrong key was provided
> >>>>>>>>>>the only thing the TPM can do is to go into shutdown mode since
> >>>>>>>>>>there is state on the QCoW2 but it cannot be decrypted. This also
> >>>>>>>>>>became problematic when doing migrations with libvirt for example
> >>>>>>>>>>and one was to have a wrong key/password installed on the target
> >>>>>>>>>>side -- graceful termination of the migration is impossible.
> >>>>>>>OK let's go back to this for a moment. Add a load
> >>>>>>>callback, access file there. On failure, return
> >>>>>>>an error. migration fails gracefully, and
> >>>>>>>management can retry, or migrate to another node,
> >>>>>>>or whatever.
> >>>>>>>
> >>>>>>>What's the problem exactly?
> >>>>>>>
> >>>>>>>
> >>>>>>The switch-over from source to destination already happened when the
> >>>>>>key is finally passed and you just won't be able to access the QCoW2
> >>>>>>in case the key was wrong.
> >>>>>This is exactly what happens with any kind of othe rmigration errror.
> >>>>>So fail migration, and source can get restarted if necessary.
> >>>>>
> >>>>I guess I wanted to improve on this and catch user errors.
> >>>>If we let migration fail then all you can do is try to terminate the
> >>>>VM on the destination and cold-start on the source.
> >>>No, normally if migration fails VM is not started on destination,
> >>>and it can just continue on source.
> >>>
> >>When I had tried this in conjunction with encrypted QCoW2 the
> >>switch-over already had happened and the source had died.
> >Giving continue command should bring it back.
> >
> On the source? Qemu on the source didn't exist anymore.

didn't exist? Well, fix your management to not kill
until destination starts running then.
Really, all other devices have the same problem,
we can't solve it 100% but yes, we could
make it more robust. But there appears to be
nothing TPM specific here.

> >>So a wrong key on the destination was fatal.
> >So it's a bug in the code then?
> >
> From what I saw, yes. Migration is not complete until the passwords
> had been entered. Though the requirement for a correct password
> wasn't there before because Qemu just couldn't know which password
> is correct since it doesn't know what content in a VM image is
> correct -- just using the wrong key gives you content but it's of
> course not understandable.

OK, we covered that on irc - the issue is that monitor
on destination is inactive until migration is complete.
Yes we need to fix that but no, it's not a tpm only
problem.

> >>>>>>Similar problems occur when you start a
> >>>>>>VM with an encrypted QCoW2 image. The monitor will prompt you for
> >>>>>>the password and then you start the VM and if the password was wrong
> >>>>>>the OS just won't be able to access the image.
> >>>>>>
> >>>>>>    Stefan
> >>>>>Why can't you verify the password?
> >>>>>
> >>>>I do verify the key/password in the TPM driver. If the driver cannot
> >>>>make sense of the contents of the QCoW2 due to wrong key I simply
> >>>>put the driver into failure mode. That's all I can do with encrypted
> >>>>QCoW2.
> >>>You can return error from init script which will make qemu exit.
> >>>
> >>I can return an error code when the front- and backend interfaces
> >>are initialized, but that happens really early and the encyrption
> >>key entered through the monitor is not available at this point.
> >>
> >>I also don't get a notification about when the key was entered. In
> >>case of QCoW2 encryption (and migration) I delay initialization
> >>until very late, basically when the VM accesses the tpm tis hardware
> >>emulation layer again (needs to be done this way I think to support
> >>block migration where I cannot even access the block device early on
> >>at all).
> >So it in the loadvm callback. This happens when guest is
> >stopped on source, so no need for locks.
> Two bigger cases here:
> 
> 1) Encryption key passed via command line:
>     - Migration with shared storage: When Qemu is initializing on
> the destination side I try to access the QCoW2 file. I do some basic
> tests to check whether a key was needed but none was given or
> whether some of the content could be read to confirm a valid key.
> This is done really early on during startup of Qemu on the
> destination side while or before actually the memory pages were
> transferred. Graceful termination was easily possible here.
>     - Migration using block migration: During initialization I only
> see an empty QCoW2 file (created by libvirt). I terminate at this
> point and do another initialization later on which basically comes
> down to initializing upon access of the TPM TIS interface. At this
> point graceful termination wasn't possible anymore. There may be a
> possibility to do this in the loadvm callback, assuming block
> migration at that point has already finished, which I am not quite
> sure. Though along with case 2) below this would then end up in 3
> different times for initialization of the emulation layer.
> 
> 2) QCoW2 encryption:
>     - This maps to the last case above. Also here graceful
> termination wasn't possible.
> 
> As for the loadvm callback: I have a note in my code that in case of
> QCoW2 encryption the key is not available, yet. So I even have to
> defer initialization further. In this case Qemu on the source
> machine will have terminated.
> 
>    Stefan

The point is to decrypt when you start running on dest.
At that point source is stopped. On failure,
notify management and have it restart source.
On success, management can kill source qemu.

> >On failure you return an error and migration fails
> >before destination is started. You can
> >
> 
> >>Only then I find out that the key was wrong. I guess any
> >>other handling would require blockdev.c's invocation of
> >>monitor_read_bdrv_key_start() to be 'somehow' extended and ... do
> >>what ? loop until the correct password was entered?
> >Return an error so vm start fails?
> >
> >>    Stefan
> >>>>In case of a QCoW2 encrypted VM image it's different. There I guess
> >>>>the intelligence of what is good and bad data is only inside the OS.
> >>>>
> >>>>    Stefan
>
Stefan Berger - Sept. 8, 2011, 12:31 a.m.
On 09/07/2011 02:49 PM, Michael S. Tsirkin wrote:
> On Wed, Sep 07, 2011 at 12:08:22PM -0400, Stefan Berger wrote:
>> On 09/07/2011 11:16 AM, Michael S. Tsirkin wrote:
>>>
>>> So it's a bug in the code then?
>>>
>>  From what I saw, yes. Migration is not complete until the passwords
>> had been entered. Though the requirement for a correct password
>> wasn't there before because Qemu just couldn't know which password
>> is correct since it doesn't know what content in a VM image is
>> correct -- just using the wrong key gives you content but it's of
>> course not understandable.
> OK, we covered that on irc - the issue is that monitor
> on destination is inactive until migration is complete.
> Yes we need to fix that but no, it's not a tpm only
> problem.
I think the TPM is the first device that needs that password before the 
migration switch-over happens. The reason is that the TPM emulation 
layer needs the password/key to read the data from the QCoW2 to be able 
to initialize a device BEFORE the Qemu on the source side terminates 
thinking that the migration went ok. Previously an OS image that was 
'opened' with the wrong key/password would probably cause the OS to not 
be able to read the data and hopefully not destroy it by writing wrongly 
encrypted data into it -- QEMU wasn't able to detect whether the QCoW2 
encryption key was correct or not since it has no knowledge of the 
organization of the data inside the image.
[[You'd need some form of reference point, like a sector that when 
written to a hash is calculated over its data and that hash is written 
into a location in the image. If a wrong key is given and the sector's 
hash ends up being != the reference hash you could say the key is wrong.]]
>>>>>>>> Similar problems occur when you start a
>>>>>>>> VM with an encrypted QCoW2 image. The monitor will prompt you for
>>>>>>>> the password and then you start the VM and if the password was wrong
>>>>>>>> the OS just won't be able to access the image.
>>>>>>>>
>>>>>>>>     Stefan
>>>>>>> Why can't you verify the password?
>>>>>>>
>>>>>> I do verify the key/password in the TPM driver. If the driver cannot
>>>>>> make sense of the contents of the QCoW2 due to wrong key I simply
>>>>>> put the driver into failure mode. That's all I can do with encrypted
>>>>>> QCoW2.
>>>>> You can return error from init script which will make qemu exit.
>>>>>
>>>> I can return an error code when the front- and backend interfaces
>>>> are initialized, but that happens really early and the encyrption
>>>> key entered through the monitor is not available at this point.
>>>>
>>>> I also don't get a notification about when the key was entered. In
>>>> case of QCoW2 encryption (and migration) I delay initialization
>>>> until very late, basically when the VM accesses the tpm tis hardware
>>>> emulation layer again (needs to be done this way I think to support
>>>> block migration where I cannot even access the block device early on
>>>> at all).
>>> So it in the loadvm callback. This happens when guest is
>>> stopped on source, so no need for locks.
>> Two bigger cases here:
>>
>> 1) Encryption key passed via command line:
>>      - Migration with shared storage: When Qemu is initializing on
>> the destination side I try to access the QCoW2 file. I do some basic
>> tests to check whether a key was needed but none was given or
>> whether some of the content could be read to confirm a valid key.
>> This is done really early on during startup of Qemu on the
>> destination side while or before actually the memory pages were
>> transferred. Graceful termination was easily possible here.
>>      - Migration using block migration: During initialization I only
>> see an empty QCoW2 file (created by libvirt). I terminate at this
>> point and do another initialization later on which basically comes
>> down to initializing upon access of the TPM TIS interface. At this
>> point graceful termination wasn't possible anymore. There may be a
>> possibility to do this in the loadvm callback, assuming block
>> migration at that point has already finished, which I am not quite
>> sure. Though along with case 2) below this would then end up in 3
>> different times for initialization of the emulation layer.
>>
>> 2) QCoW2 encryption:
>>      - This maps to the last case above. Also here graceful
>> termination wasn't possible.
>>
>> As for the loadvm callback: I have a note in my code that in case of
>> QCoW2 encryption the key is not available, yet. So I even have to
>> defer initialization further. In this case Qemu on the source
>> machine will have terminated.
>>
>>     Stefan
> The point is to decrypt when you start running on dest.
When the monitor gets the key for the QCoW2 it would have to invoke the 
TPM driver code (callback) and return an error code if the key was found 
to be wrong and display an error message that libvirt could react to. 
AFAIK none of the callback and error display logic is in place. Is that 
something we could add later as an improvement?
> At that point source is stopped. On failure,
> notify management and have it restart source.
> On success, management can kill source qemu.
>
Stefan
Michael S. Tsirkin - Sept. 8, 2011, 10:36 a.m.
On Wed, Sep 07, 2011 at 08:31:45PM -0400, Stefan Berger wrote:
> On 09/07/2011 02:49 PM, Michael S. Tsirkin wrote:
> >On Wed, Sep 07, 2011 at 12:08:22PM -0400, Stefan Berger wrote:
> >>On 09/07/2011 11:16 AM, Michael S. Tsirkin wrote:
> >>>
> >>>So it's a bug in the code then?
> >>>
> >> From what I saw, yes. Migration is not complete until the passwords
> >>had been entered. Though the requirement for a correct password
> >>wasn't there before because Qemu just couldn't know which password
> >>is correct since it doesn't know what content in a VM image is
> >>correct -- just using the wrong key gives you content but it's of
> >>course not understandable.
> >OK, we covered that on irc - the issue is that monitor
> >on destination is inactive until migration is complete.
> >Yes we need to fix that but no, it's not a tpm only
> >problem.
> I think the TPM is the first device that needs that password before
> the migration switch-over happens.

Yes. But we want the monitor on dest for other reasons,
for example to be able to check migration status.

> The reason is that the TPM
> emulation layer needs the password/key to read the data from the
> QCoW2 to be able to initialize a device BEFORE the Qemu on the
> source side terminates thinking that the migration went ok.
> Previously an OS image that was 'opened' with the wrong key/password
> would probably cause the OS to not be able to read the data and
> hopefully not destroy it by writing wrongly encrypted data into it
> -- QEMU wasn't able to detect whether the QCoW2 encryption key was
> correct or not since it has not knowledge of the organization of the
> data inside the image.
> [[You'd need some form of reference point, like a sector that when
> written to a hash is calculated over its data and that hash is
> written into a location in the image. If a wrong key is given and
> the sector's hash ends up being != the reference hash you could say
> the key is wrong.]]
> >>>>>>>>Similar problems occur when you start a
> >>>>>>>>VM with an encrypted QCoW2 image. The monitor will prompt you for
> >>>>>>>>the password and then you start the VM and if the password was wrong
> >>>>>>>>the OS just won't be able to access the image.
> >>>>>>>>
> >>>>>>>>    Stefan
> >>>>>>>Why can't you verify the password?
> >>>>>>>
> >>>>>>I do verify the key/password in the TPM driver. If the driver cannot
> >>>>>>make sense of the contents of the QCoW2 due to wrong key I simply
> >>>>>>put the driver into failure mode. That's all I can do with encrypted
> >>>>>>QCoW2.
> >>>>>You can return error from init script which will make qemu exit.
> >>>>>
> >>>>I can return an error code when the front- and backend interfaces
> >>>>are initialized, but that happens really early and the encyrption
> >>>>key entered through the monitor is not available at this point.
> >>>>
> >>>>I also don't get a notification about when the key was entered. In
> >>>>case of QCoW2 encryption (and migration) I delay initialization
> >>>>until very late, basically when the VM accesses the tpm tis hardware
> >>>>emulation layer again (needs to be done this way I think to support
> >>>>block migration where I cannot even access the block device early on
> >>>>at all).
> >>>So it in the loadvm callback. This happens when guest is
> >>>stopped on source, so no need for locks.
> >>Two bigger cases here:
> >>
> >>1) Encryption key passed via command line:
> >>     - Migration with shared storage: When Qemu is initializing on
> >>the destination side I try to access the QCoW2 file. I do some basic
> >>tests to check whether a key was needed but none was given or
> >>whether some of the content could be read to confirm a valid key.
> >>This is done really early on during startup of Qemu on the
> >>destination side while or before actually the memory pages were
> >>transferred. Graceful termination was easily possible here.
> >>     - Migration using block migration: During initialization I only
> >>see an empty QCoW2 file (created by libvirt). I terminate at this
> >>point and do another initialization later on which basically comes
> >>down to initializing upon access of the TPM TIS interface. At this
> >>point graceful termination wasn't possible anymore. There may be a
> >>possibility to do this in the loadvm callback, assuming block
> >>migration at that point has already finished, which I am not quite
> >>sure. Though along with case 2) below this would then end up in 3
> >>different times for initialization of the emulation layer.
> >>
> >>2) QCoW2 encryption:
> >>     - This maps to the last case above. Also here graceful
> >>termination wasn't possible.
> >>
> >>As for the loadvm callback: I have a note in my code that in case of
> >>QCoW2 encryption the key is not available, yet. So I even have to
> >>defer initialization further. In this case Qemu on the source
> >>machine will have terminated.
> >>
> >>    Stefan
> >The point is to decrypt when you start running on dest.
> When the monitor gets the key for the QCoW2 it would have to invoke
> the TPM driver code (callback) and return an error code if the key
> was found to be wrong and display an error message that libvirt
> could react to.
> Afaik none of the callback and error display logic
> is in place.
> Is that something we could add later as an improvement?

What we do for errors detected at destination is fail migration.
This logic is in place as migration can fail
when we get a state we don't recognize.
Management is prepared to handle that already.

> >At that point source is stopped. On failure,
> >notify management and have it restart source.
> >On success, management can kill source qemu.
> >
> Stefan

Patch

Index: qemu-git/block.c
===================================================================
--- qemu-git.orig/block.c
+++ qemu-git/block.c
@@ -521,6 +521,8 @@  static int bdrv_open_common(BlockDriverS
         goto free_and_fail;
     }
 
+    drv->num_locks = 0;
+
     bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
 
     ret = refresh_total_sectors(bs, bs->total_sectors);
@@ -1316,6 +1318,45 @@  void bdrv_get_geometry(BlockDriverState 
     *nb_sectors_ptr = length;
 }
 
+/* file locking */
+/*
+ * Acquire or release a file lock on the image backing @bs by delegating
+ * to the protocol driver's bdrv_lock callback.  When a format driver
+ * sits on top of a protocol driver (bs->file is set), the lock is taken
+ * on the underlying file rather than on @bs itself.
+ *
+ * Returns 0 on success, -ENOMEDIUM if no driver is attached, or
+ * -ENOTSUP if the driver does not implement locking.
+ */
+static int bdrv_lock_common(BlockDriverState *bs, BDRVLockType lock_type)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (!drv) {
+        return -ENOMEDIUM;
+    }
+
+    if (bs->file) {
+        drv = bs->file->drv;
+        if (drv->bdrv_lock) {
+            return drv->bdrv_lock(bs->file, lock_type);
+        }
+        /* NOTE(review): if bs->file's driver has no bdrv_lock we fall
+         * through below with drv still pointing at bs->file->drv, so the
+         * original bs->drv callback is never consulted -- confirm this
+         * is intentional. */
+    }
+
+    if (drv->bdrv_lock) {
+        return drv->bdrv_lock(bs, lock_type);
+    }
+
+    return -ENOTSUP;
+}
+
+
+/*
+ * Lock the image file of @bs: a shared (read) lock for read-only images,
+ * an exclusive (write) lock otherwise.
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+int bdrv_lock(BlockDriverState *bs)
+{
+    if (bdrv_is_read_only(bs)) {
+        return bdrv_lock_common(bs, BDRV_F_RDLCK);
+    }
+
+    return bdrv_lock_common(bs, BDRV_F_WRLCK);
+}
+
+/*
+ * Release a lock previously taken with bdrv_lock().  Any error reported
+ * by the driver is ignored; unlocking is best effort.
+ */
+void bdrv_unlock(BlockDriverState *bs)
+{
+    bdrv_lock_common(bs, BDRV_F_UNLCK);
+}
+
+
 struct partition {
         uint8_t boot_ind;           /* 0x80 - active */
         uint8_t head;               /* starting head */
Index: qemu-git/block.h
===================================================================
--- qemu-git.orig/block.h
+++ qemu-git/block.h
@@ -43,6 +43,12 @@  typedef struct QEMUSnapshotInfo {
 #define BDRV_SECTOR_MASK   ~(BDRV_SECTOR_SIZE - 1)
 
 typedef enum {
+    BDRV_F_UNLCK,
+    BDRV_F_RDLCK,
+    BDRV_F_WRLCK,
+} BDRVLockType;
+
+typedef enum {
     BLOCK_ERR_REPORT, BLOCK_ERR_IGNORE, BLOCK_ERR_STOP_ENOSPC,
     BLOCK_ERR_STOP_ANY
 } BlockErrorAction;
@@ -100,6 +106,8 @@  int bdrv_commit(BlockDriverState *bs);
 void bdrv_commit_all(void);
 int bdrv_change_backing_file(BlockDriverState *bs,
     const char *backing_file, const char *backing_fmt);
+int bdrv_lock(BlockDriverState *bs);
+void bdrv_unlock(BlockDriverState *bs);
 void bdrv_register(BlockDriver *bdrv);
 
 
Index: qemu-git/block/raw-posix.c
===================================================================
--- qemu-git.orig/block/raw-posix.c
+++ qemu-git/block/raw-posix.c
@@ -803,6 +803,67 @@  static int64_t raw_get_allocated_file_si
     return (int64_t)st.st_blocks * 512;
 }
 
+/*
+ * Lock or unlock the raw image file via fcntl() advisory locking.
+ *
+ * The whole file is covered (l_start = 0, l_len = 0).  Locking is
+ * reference counted: only the first request issues an fcntl() call,
+ * later requests just increment the counter, and the file is actually
+ * unlocked when the counter drops back to zero.
+ *
+ * Returns 0 on success, -EINVAL for an unknown @lock_type, or a
+ * negative errno value if fcntl() fails.
+ *
+ * NOTE(review): the counter lives in the shared BlockDriver object, not
+ * in the per-image BDRVRawState, so every image opened with this driver
+ * shares one counter -- presumably it should be per-state; confirm.
+ */
+static int raw_lock(BlockDriverState *bs, BDRVLockType lock_type)
+{
+    BlockDriver *drv = bs->drv;
+    BDRVRawState *s = bs->opaque;
+    struct flock flock = {
+        .l_whence = SEEK_SET,
+        .l_start = 0,
+        .l_len = 0,  /* zero length means: lock the entire file */
+    };
+    int n;
+
+    switch (lock_type) {
+    case BDRV_F_RDLCK:
+    case BDRV_F_WRLCK:
+        /* Already locked: just bump the reference count. */
+        if (drv->num_locks) {
+            drv->num_locks++;
+            return 0;
+        }
+        flock.l_type = (lock_type == BDRV_F_RDLCK) ? F_RDLCK : F_WRLCK;
+        break;
+
+    case BDRV_F_UNLCK:
+        /* Drop one reference; only the last unlock reaches fcntl().
+         * NOTE(review): an unbalanced unlock (counter already 0) would
+         * decrement to -1 and trip the assert below. */
+        if (--drv->num_locks > 0) {
+            return 0;
+        }
+
+        assert(drv->num_locks == 0);
+
+        flock.l_type = F_UNLCK;
+        break;
+
+    default:
+        return -EINVAL;
+    }
+
+    /* F_SETLKW waits for the lock; restart when interrupted by a signal.
+     * NOTE(review): this blocks the calling thread (and thus qemu's main
+     * loop) indefinitely -- confirm that is acceptable here.  Also,
+     * F_SETLKW reports EAGAIN only in unusual cases (that is the F_SETLK
+     * error); the usleep retry looks like a defensive fallback. */
+    while (1) {
+        n = fcntl(s->fd, F_SETLKW, &flock);
+        if (n < 0) {
+            if (errno == EINTR) {
+                continue;
+            }
+            if (errno == EAGAIN) {
+                usleep(10000);
+                continue;
+            }
+        }
+        break;
+    }
+
+    /* First successful lock: start the reference count at 1. */
+    if (n == 0 &&
+        ((lock_type == BDRV_F_RDLCK) || (lock_type == BDRV_F_WRLCK))) {
+        drv->num_locks = 1;
+    }
+
+    if (n) {
+        return -errno;
+    }
+
+    return 0;
+}
+
 static int raw_create(const char *filename, QEMUOptionParameter *options)
 {
     int fd;
@@ -901,6 +962,8 @@  static BlockDriver bdrv_file = {
     .bdrv_get_allocated_file_size
                         = raw_get_allocated_file_size,
 
+    .bdrv_lock = raw_lock,
+
     .create_options = raw_create_options,
 };
 
Index: qemu-git/block_int.h
===================================================================
--- qemu-git.orig/block_int.h
+++ qemu-git/block_int.h
@@ -146,6 +146,10 @@  struct BlockDriver {
      */
     int (*bdrv_has_zero_init)(BlockDriverState *bs);
 
+    /* File locking */
+    int num_locks;
+    int (*bdrv_lock)(BlockDriverState *bs, BDRVLockType lock_type);
+
     QLIST_ENTRY(BlockDriver) list;
 };
 
Index: qemu-git/block/raw-win32.c
===================================================================
--- qemu-git.orig/block/raw-win32.c
+++ qemu-git/block/raw-win32.c
@@ -242,6 +242,57 @@  static int64_t raw_get_allocated_file_si
     return st.st_size;
 }
 
+/*
+ * Lock or unlock the raw image file using the win32 LockFileEx() /
+ * UnlockFile() API.  As in the POSIX variant, locking is reference
+ * counted: only the first request locks the file and only the final
+ * unlock releases it.
+ *
+ * Both BDRV_F_RDLCK and BDRV_F_WRLCK take an exclusive lock
+ * (LOCKFILE_EXCLUSIVE_LOCK); no shared lock is used here.
+ *
+ * Returns 0 on success, -EINVAL for an unknown @lock_type, or -EIO if
+ * the win32 call fails.
+ *
+ * NOTE(review): @lock_type is declared int here but BDRVLockType in the
+ * POSIX variant -- the signatures should match.  The counter also lives
+ * in the shared BlockDriver, as in the POSIX version; confirm.
+ */
+static int raw_lock(BlockDriverState *bs, int lock_type)
+{
+    BlockDriver *drv = bs->drv;
+    BDRVRawState *s = bs->opaque;
+    OVERLAPPED ov;
+    BOOL res;
+    DWORD num_bytes;
+
+    switch (lock_type) {
+    case BDRV_F_RDLCK:
+    case BDRV_F_WRLCK:
+        /* Already locked: just bump the reference count. */
+        if (drv->num_locks) {
+            drv->num_locks++;
+            return 0;
+        }
+
+        memset(&ov, 0, sizeof(ov));
+
+        /* Lock the maximum range (low/high dwords both ~0). */
+        res = LockFileEx(s->hfile, LOCKFILE_EXCLUSIVE_LOCK, 0, ~0, ~0, &ov);
+
+        if (res == FALSE) {
+            /* NOTE(review): presumably this waits for a pending
+             * overlapped lock (ERROR_IO_PENDING); GetLastError() is not
+             * checked first, so a hard failure also ends up here --
+             * confirm this is the intended handling. */
+            res = GetOverlappedResult(s->hfile, &ov, &num_bytes, TRUE);
+        }
+
+        if (res == TRUE) {
+            drv->num_locks = 1;
+        }
+
+        break;
+
+    case BDRV_F_UNLCK:
+        /* Drop one reference; only the last unlock reaches UnlockFile().
+         * NOTE(review): an unbalanced unlock decrements the counter to
+         * -1, which trips the assert below. */
+        if (--drv->num_locks > 0) {
+            return 0;
+        }
+
+        assert(drv->num_locks >= 0);
+
+        res = UnlockFile(s->hfile, 0, 0, ~0, ~0);
+        break;
+
+    default:
+        return -EINVAL;
+    }
+
+    if (res == FALSE) {
+        return -EIO;
+    }
+
+    return 0;
+}
+
 static int raw_create(const char *filename, QEMUOptionParameter *options)
 {
     int fd;
@@ -289,6 +340,7 @@  static BlockDriver bdrv_file = {
     .bdrv_get_allocated_file_size
                         = raw_get_allocated_file_size,
 
+    .bdrv_lock		= raw_lock,
     .create_options = raw_create_options,
 };