diff mbox

[2/2] spapr: Fix stale HTAB during live migration (TCG)

Message ID 1415168221-2324-3-git-send-email-sam.mj@au1.ibm.com
State New
Headers show

Commit Message

Sam Mendoza-Jonas Nov. 5, 2014, 6:17 a.m. UTC
If a TCG guest reboots during a running migration, HTAB entries are not
marked dirty, and the destination boots with an invalid HTAB.

When a reboot occurs, reset the state of HTAB migration and explicitly
inform the destination of invalid entries.

Signed-off-by: Samuel Mendoza-Jonas <sam.mj@au1.ibm.com>
---
 hw/ppc/spapr.c         | 59 +++++++++++++++++++++++++++++++++++---------------
 include/hw/ppc/spapr.h |  1 +
 2 files changed, 42 insertions(+), 18 deletions(-)

Comments

Alexander Graf Nov. 5, 2014, 8:05 a.m. UTC | #1
On 05.11.14 07:17, Samuel Mendoza-Jonas wrote:
> If a TCG guest reboots during a running migration HTAB entries are not
> marked dirty, and the destination boots with an invalid HTAB.
> 
> When a reboot occurs reset the state of HTAB migration, and explicitly
> inform the destination of invalid entries.
> 
> Signed-off-by: Samuel Mendoza-Jonas <sam.mj@au1.ibm.com>
> ---
>  hw/ppc/spapr.c         | 59 +++++++++++++++++++++++++++++++++++---------------
>  include/hw/ppc/spapr.h |  1 +
>  2 files changed, 42 insertions(+), 18 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 1610c28..9f419e8 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -829,26 +829,30 @@ static void spapr_reset_htab(sPAPREnvironment *spapr)
>  
>      shift = kvmppc_reset_htab(spapr->htab_shift);
>  
> +    pthread_mutex_lock(&spapr->htab_mutex);
>      if (shift > 0) {
>          /* Kernel handles htab, we don't need to allocate one */
>          spapr->htab_shift = shift;
>          kvmppc_kern_htab = true;
>  
>          /* Tell readers to update their file descriptor */
> -        pthread_mutex_lock(&spapr->htab_mutex);
>          if (spapr->htab_fd > 0) {
>              spapr->htab_fd_stale = true;
>          }
> -        pthread_mutex_unlock(&spapr->htab_mutex);
>      } else {
>          if (!spapr->htab) {
>              /* Allocate an htab if we don't yet have one */
>              spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
> +        } else {
> +            spapr->htab_mig_full = true;
> +            spapr->htab_first_pass = true;
> +            spapr->htab_save_index = 0;

You could just set the dirty bitmap to "all dirty" here, no? Then you
don't need all the changes below, I presume?

>          }
>  
>          /* And clear it */
>          memset(spapr->htab, 0, HTAB_SIZE(spapr));

... so instead of memset(0)ing it, you could just

  ppc_hash64_store_hpte(env, i, HPTE64_V_HPTE_DIRTY, 0);

the HTAB in a loop.


Alex
Sam Mendoza-Jonas Nov. 5, 2014, 10 p.m. UTC | #2
On 05/11/14 19:05, Alexander Graf wrote:
> 
> 
> On 05.11.14 07:17, Samuel Mendoza-Jonas wrote:
>> If a TCG guest reboots during a running migration HTAB entries are not
>> marked dirty, and the destination boots with an invalid HTAB.
>>
>> When a reboot occurs reset the state of HTAB migration, and explicitly
>> inform the destination of invalid entries.
>>
>> Signed-off-by: Samuel Mendoza-Jonas <sam.mj@au1.ibm.com>
>> ---
>>  hw/ppc/spapr.c         | 59 +++++++++++++++++++++++++++++++++++---------------
>>  include/hw/ppc/spapr.h |  1 +
>>  2 files changed, 42 insertions(+), 18 deletions(-)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 1610c28..9f419e8 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -829,26 +829,30 @@ static void spapr_reset_htab(sPAPREnvironment *spapr)
>>  
>>      shift = kvmppc_reset_htab(spapr->htab_shift);
>>  
>> +    pthread_mutex_lock(&spapr->htab_mutex);
>>      if (shift > 0) {
>>          /* Kernel handles htab, we don't need to allocate one */
>>          spapr->htab_shift = shift;
>>          kvmppc_kern_htab = true;
>>  
>>          /* Tell readers to update their file descriptor */
>> -        pthread_mutex_lock(&spapr->htab_mutex);
>>          if (spapr->htab_fd > 0) {
>>              spapr->htab_fd_stale = true;
>>          }
>> -        pthread_mutex_unlock(&spapr->htab_mutex);
>>      } else {
>>          if (!spapr->htab) {
>>              /* Allocate an htab if we don't yet have one */
>>              spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
>> +        } else {
>> +            spapr->htab_mig_full = true;
>> +            spapr->htab_first_pass = true;
>> +            spapr->htab_save_index = 0;
> 
> You could just set the dirty bitmap to "all dirty" here, no? Then you
> don't need all the changes belong I presume?

Yes, then you just need to reset htab_save_index to zero. The idea of this approach
was to avoid walking the HTAB twice (once to dirty and once to read it). But it is
a lot of changes for a fairly small benefit. If setting it dirty is preferred I'll
test and send that version. Thanks!
> 
>>          }
>>  
>>          /* And clear it */
>>          memset(spapr->htab, 0, HTAB_SIZE(spapr));
> 
> ... so instead of memset(0)ing it, you could just
> 
>   ppc_hash64_store_hpte(env, i, HPTE64_V_HPTE_DIRTY, 0);
> 
> the HTAB in a loop.
> 
> 
> Alex
>
Alexander Graf Nov. 5, 2014, 10:04 p.m. UTC | #3
On 05.11.14 23:00, Samuel Mendoza-Jonas wrote:
> On 05/11/14 19:05, Alexander Graf wrote:
>>
>>
>> On 05.11.14 07:17, Samuel Mendoza-Jonas wrote:
>>> If a TCG guest reboots during a running migration HTAB entries are not
>>> marked dirty, and the destination boots with an invalid HTAB.
>>>
>>> When a reboot occurs reset the state of HTAB migration, and explicitly
>>> inform the destination of invalid entries.
>>>
>>> Signed-off-by: Samuel Mendoza-Jonas <sam.mj@au1.ibm.com>
>>> ---
>>>  hw/ppc/spapr.c         | 59 +++++++++++++++++++++++++++++++++++---------------
>>>  include/hw/ppc/spapr.h |  1 +
>>>  2 files changed, 42 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>> index 1610c28..9f419e8 100644
>>> --- a/hw/ppc/spapr.c
>>> +++ b/hw/ppc/spapr.c
>>> @@ -829,26 +829,30 @@ static void spapr_reset_htab(sPAPREnvironment *spapr)
>>>  
>>>      shift = kvmppc_reset_htab(spapr->htab_shift);
>>>  
>>> +    pthread_mutex_lock(&spapr->htab_mutex);
>>>      if (shift > 0) {
>>>          /* Kernel handles htab, we don't need to allocate one */
>>>          spapr->htab_shift = shift;
>>>          kvmppc_kern_htab = true;
>>>  
>>>          /* Tell readers to update their file descriptor */
>>> -        pthread_mutex_lock(&spapr->htab_mutex);
>>>          if (spapr->htab_fd > 0) {
>>>              spapr->htab_fd_stale = true;
>>>          }
>>> -        pthread_mutex_unlock(&spapr->htab_mutex);
>>>      } else {
>>>          if (!spapr->htab) {
>>>              /* Allocate an htab if we don't yet have one */
>>>              spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
>>> +        } else {
>>> +            spapr->htab_mig_full = true;
>>> +            spapr->htab_first_pass = true;
>>> +            spapr->htab_save_index = 0;
>>
>> You could just set the dirty bitmap to "all dirty" here, no? Then you
>> don't need all the changes belong I presume?
> 
> Yes, then you just need to reset htab_save_index to zero. The idea of this approach
> was to avoid walking the HTAB twice (once to dirty and once to read it). But it is
> a lot of changes for a fairly small benefit. If setting it dirty is preferred I'll
> test and send that version. Thanks!

Yes, please. I would prefer to keep this code as simple as I can ;). And
the fewer corner cases we need to watch out for, the easier it becomes.


Alex
Alexey Kardashevskiy Nov. 13, 2014, 7:12 a.m. UTC | #4
On 11/05/2014 05:17 PM, Samuel Mendoza-Jonas wrote:
> If a TCG guest reboots during a running migration HTAB entries are not
> marked dirty, and the destination boots with an invalid HTAB.
> 
> When a reboot occurs reset the state of HTAB migration, and explicitly
> inform the destination of invalid entries.
> 
> Signed-off-by: Samuel Mendoza-Jonas <sam.mj@au1.ibm.com>
> ---
>  hw/ppc/spapr.c         | 59 +++++++++++++++++++++++++++++++++++---------------
>  include/hw/ppc/spapr.h |  1 +
>  2 files changed, 42 insertions(+), 18 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 1610c28..9f419e8 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -829,26 +829,30 @@ static void spapr_reset_htab(sPAPREnvironment *spapr)
>  
>      shift = kvmppc_reset_htab(spapr->htab_shift);
>  
> +    pthread_mutex_lock(&spapr->htab_mutex);
>      if (shift > 0) {
>          /* Kernel handles htab, we don't need to allocate one */
>          spapr->htab_shift = shift;
>          kvmppc_kern_htab = true;
>  
>          /* Tell readers to update their file descriptor */
> -        pthread_mutex_lock(&spapr->htab_mutex);
>          if (spapr->htab_fd > 0) {
>              spapr->htab_fd_stale = true;
>          }
> -        pthread_mutex_unlock(&spapr->htab_mutex);
>      } else {
>          if (!spapr->htab) {
>              /* Allocate an htab if we don't yet have one */
>              spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
> +        } else {
> +            spapr->htab_mig_full = true;
> +            spapr->htab_first_pass = true;
> +            spapr->htab_save_index = 0;
>          }
>  
>          /* And clear it */
>          memset(spapr->htab, 0, HTAB_SIZE(spapr));
>      }
> +    pthread_mutex_unlock(&spapr->htab_mutex);


The pthread_mutex_(un)lock things from the chunk above should go to the
previous patch.


>  
>      /* Update the RMA size if necessary */
>      if (spapr->vrma_adjust) {
> @@ -1019,6 +1023,7 @@ static int htab_save_setup(QEMUFile *f, void *opaque)
>          pthread_mutex_lock(&spapr->htab_mutex);
>          spapr->htab_fd = kvmppc_get_htab_fd(false);
>          spapr->htab_fd_stale = false;
> +        spapr->htab_mig_full = false;
>          pthread_mutex_unlock(&spapr->htab_mutex);
>          if (spapr->htab_fd < 0) {
>              fprintf(stderr, "Unable to open fd for reading hash table from KVM: %s\n",
> @@ -1034,6 +1039,7 @@ static int htab_save_setup(QEMUFile *f, void *opaque)
>  static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
>                                   int64_t max_ns)
>  {
> +    bool final = max_ns < 0;
>      int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
>      int index = spapr->htab_save_index;
>      int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> @@ -1041,33 +1047,40 @@ static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
>      assert(spapr->htab_first_pass);
>  
>      do {
> -        int chunkstart;
> +        int chunkstart, invalidstart;
>  
> -        /* Consume invalid HPTEs */
> -        while ((index < htabslots)
> -               && !HPTE_VALID(HPTE(spapr->htab, index))) {


Nit: in most places in qemu "&&" would be at the end of the previous line :)


> +        chunkstart = index;
> +        /* Consume valid HPTEs */
> +        while ((index < htabslots && (index - chunkstart < USHRT_MAX))
> +               && HPTE_VALID(HPTE(spapr->htab, index))) {
>              index++;
>              CLEAN_HPTE(HPTE(spapr->htab, index));
>          }
>  
> -        /* Consume valid HPTEs */
> -        chunkstart = index;
> -        while ((index < htabslots)
> -               && HPTE_VALID(HPTE(spapr->htab, index))) {
> +        invalidstart = index;
> +        /* Consume invalid HPTEs */
> +        while ((index < htabslots && (index - invalidstart < USHRT_MAX))
> +               && !HPTE_VALID(HPTE(spapr->htab, index))) {
>              index++;
>              CLEAN_HPTE(HPTE(spapr->htab, index));
>          }
>  
> -        if (index > chunkstart) {
> -            int n_valid = index - chunkstart;
> +        /* Avoid writing an end marker (0,0,0) */
> +        if (index > chunkstart
> +               && !(chunkstart == invalidstart && !spapr->htab_mig_full)) {
> +            int n_valid = invalidstart - chunkstart;
> +            /* If a reset has occured we must explicitly overwrite the HTAB
> +             * of the destination */
> +            int n_invalid = spapr->htab_mig_full ? index - invalidstart : 0;
>  
>              qemu_put_be32(f, chunkstart);
>              qemu_put_be16(f, n_valid);
> -            qemu_put_be16(f, 0);
> +            qemu_put_be16(f, n_invalid);
>              qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
>                              HASH_PTE_SIZE_64 * n_valid);
>  
> -            if ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
> +            if ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns
> +                   && !final) {
>                  break;
>              }
>          }
> @@ -1182,10 +1195,14 @@ static int htab_save_iterate(QEMUFile *f, void *opaque)
>          if (rc < 0) {
>              return rc;
>          }
> -    } else  if (spapr->htab_first_pass) {
> -        htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
>      } else {
> -        rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
> +        pthread_mutex_lock(&spapr->htab_mutex);
> +        if (spapr->htab_first_pass) {
> +            htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
> +        } else {
> +            rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
> +        }
> +        pthread_mutex_unlock(&spapr->htab_mutex);
>      }
>  
>      /* End marker */
> @@ -1220,7 +1237,13 @@ static int htab_save_complete(QEMUFile *f, void *opaque)
>          close(spapr->htab_fd);
>          spapr->htab_fd = -1;
>      } else {
> -        htab_save_later_pass(f, spapr, -1);
> +        pthread_mutex_lock(&spapr->htab_mutex);
> +        if (spapr->htab_first_pass) {
> +            htab_save_first_pass(f, spapr, -1);
> +        } else {
> +            htab_save_later_pass(f, spapr, -1);
> +        }
> +        pthread_mutex_unlock(&spapr->htab_mutex);
>      }
>  
>      /* End marker */
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 5e29bec..ee95459 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -38,6 +38,7 @@ typedef struct sPAPREnvironment {
>      bool htab_first_pass;
>      int htab_fd;
>      bool htab_fd_stale;
> +    bool htab_mig_full;

Not sure what @htab_mig_full stands for exactly. Please explain it in the
commit log.


>      pthread_mutex_t htab_mutex;
>  } sPAPREnvironment;
>  
>
diff mbox

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 1610c28..9f419e8 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -829,26 +829,30 @@  static void spapr_reset_htab(sPAPREnvironment *spapr)
 
     shift = kvmppc_reset_htab(spapr->htab_shift);
 
+    pthread_mutex_lock(&spapr->htab_mutex);
     if (shift > 0) {
         /* Kernel handles htab, we don't need to allocate one */
         spapr->htab_shift = shift;
         kvmppc_kern_htab = true;
 
         /* Tell readers to update their file descriptor */
-        pthread_mutex_lock(&spapr->htab_mutex);
         if (spapr->htab_fd > 0) {
             spapr->htab_fd_stale = true;
         }
-        pthread_mutex_unlock(&spapr->htab_mutex);
     } else {
         if (!spapr->htab) {
             /* Allocate an htab if we don't yet have one */
             spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
+        } else {
+            spapr->htab_mig_full = true;
+            spapr->htab_first_pass = true;
+            spapr->htab_save_index = 0;
         }
 
         /* And clear it */
         memset(spapr->htab, 0, HTAB_SIZE(spapr));
     }
+    pthread_mutex_unlock(&spapr->htab_mutex);
 
     /* Update the RMA size if necessary */
     if (spapr->vrma_adjust) {
@@ -1019,6 +1023,7 @@  static int htab_save_setup(QEMUFile *f, void *opaque)
         pthread_mutex_lock(&spapr->htab_mutex);
         spapr->htab_fd = kvmppc_get_htab_fd(false);
         spapr->htab_fd_stale = false;
+        spapr->htab_mig_full = false;
         pthread_mutex_unlock(&spapr->htab_mutex);
         if (spapr->htab_fd < 0) {
             fprintf(stderr, "Unable to open fd for reading hash table from KVM: %s\n",
@@ -1034,6 +1039,7 @@  static int htab_save_setup(QEMUFile *f, void *opaque)
 static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
                                  int64_t max_ns)
 {
+    bool final = max_ns < 0;
     int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
     int index = spapr->htab_save_index;
     int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
@@ -1041,33 +1047,40 @@  static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
     assert(spapr->htab_first_pass);
 
     do {
-        int chunkstart;
+        int chunkstart, invalidstart;
 
-        /* Consume invalid HPTEs */
-        while ((index < htabslots)
-               && !HPTE_VALID(HPTE(spapr->htab, index))) {
+        chunkstart = index;
+        /* Consume valid HPTEs */
+        while ((index < htabslots && (index - chunkstart < USHRT_MAX))
+               && HPTE_VALID(HPTE(spapr->htab, index))) {
             index++;
             CLEAN_HPTE(HPTE(spapr->htab, index));
         }
 
-        /* Consume valid HPTEs */
-        chunkstart = index;
-        while ((index < htabslots)
-               && HPTE_VALID(HPTE(spapr->htab, index))) {
+        invalidstart = index;
+        /* Consume invalid HPTEs */
+        while ((index < htabslots && (index - invalidstart < USHRT_MAX))
+               && !HPTE_VALID(HPTE(spapr->htab, index))) {
             index++;
             CLEAN_HPTE(HPTE(spapr->htab, index));
         }
 
-        if (index > chunkstart) {
-            int n_valid = index - chunkstart;
+        /* Avoid writing an end marker (0,0,0) */
+        if (index > chunkstart
+               && !(chunkstart == invalidstart && !spapr->htab_mig_full)) {
+            int n_valid = invalidstart - chunkstart;
+            /* If a reset has occured we must explicitly overwrite the HTAB
+             * of the destination */
+            int n_invalid = spapr->htab_mig_full ? index - invalidstart : 0;
 
             qemu_put_be32(f, chunkstart);
             qemu_put_be16(f, n_valid);
-            qemu_put_be16(f, 0);
+            qemu_put_be16(f, n_invalid);
             qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
                             HASH_PTE_SIZE_64 * n_valid);
 
-            if ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
+            if ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns
+                   && !final) {
                 break;
             }
         }
@@ -1182,10 +1195,14 @@  static int htab_save_iterate(QEMUFile *f, void *opaque)
         if (rc < 0) {
             return rc;
         }
-    } else  if (spapr->htab_first_pass) {
-        htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
     } else {
-        rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
+        pthread_mutex_lock(&spapr->htab_mutex);
+        if (spapr->htab_first_pass) {
+            htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
+        } else {
+            rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
+        }
+        pthread_mutex_unlock(&spapr->htab_mutex);
     }
 
     /* End marker */
@@ -1220,7 +1237,13 @@  static int htab_save_complete(QEMUFile *f, void *opaque)
         close(spapr->htab_fd);
         spapr->htab_fd = -1;
     } else {
-        htab_save_later_pass(f, spapr, -1);
+        pthread_mutex_lock(&spapr->htab_mutex);
+        if (spapr->htab_first_pass) {
+            htab_save_first_pass(f, spapr, -1);
+        } else {
+            htab_save_later_pass(f, spapr, -1);
+        }
+        pthread_mutex_unlock(&spapr->htab_mutex);
     }
 
     /* End marker */
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 5e29bec..ee95459 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -38,6 +38,7 @@  typedef struct sPAPREnvironment {
     bool htab_first_pass;
     int htab_fd;
     bool htab_fd_stale;
+    bool htab_mig_full;
     pthread_mutex_t htab_mutex;
 } sPAPREnvironment;