diff mbox series

[v2,7/8] migration/ram: ensure write persistence on loading compressed pages to PMEM

Message ID 20180207073331.14158-8-haozhong.zhang@intel.com
State New
Headers show
Series nvdimm: guarantee persistence of QEMU writes to persistent memory | expand

Commit Message

Haozhong Zhang Feb. 7, 2018, 7:33 a.m. UTC
When loading a compressed page to persistent memory, flush CPU cache
after the data is decompressed. Combined with a call to pmem_drain()
at the end of memory loading, we can guarantee those compressed pages
are persistently loaded to PMEM.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
---
 include/qemu/pmem.h |  4 ++++
 migration/ram.c     | 16 +++++++++++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

Comments

Dr. David Alan Gilbert Feb. 7, 2018, 11:54 a.m. UTC | #1
* Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> When loading a compressed page to persistent memory, flush CPU cache
> after the data is decompressed. Combined with a call to pmem_drain()
> at the end of memory loading, we can guarantee those compressed pages
> are persistently loaded to PMEM.

Can you explain why this can use the flush and doesn't need the special
memset?

Dave

> Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> ---
>  include/qemu/pmem.h |  4 ++++
>  migration/ram.c     | 16 +++++++++++-----
>  2 files changed, 15 insertions(+), 5 deletions(-)
> 
> diff --git a/include/qemu/pmem.h b/include/qemu/pmem.h
> index 77ee1fc4eb..20e3f6e71d 100644
> --- a/include/qemu/pmem.h
> +++ b/include/qemu/pmem.h
> @@ -37,6 +37,10 @@ static inline void *pmem_memset_nodrain(void *pmemdest, int c, size_t len)
>      return memset(pmemdest, c, len);
>  }
>  
> +static inline void pmem_flush(const void *addr, size_t len)
> +{
> +}
> +
>  static inline void pmem_drain(void)
>  {
>  }
> diff --git a/migration/ram.c b/migration/ram.c
> index 5a79bbff64..924d2b9537 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -274,6 +274,7 @@ struct DecompressParam {
>      void *des;
>      uint8_t *compbuf;
>      int len;
> +    bool is_pmem;
>  };
>  typedef struct DecompressParam DecompressParam;
>  
> @@ -2502,7 +2503,7 @@ static void *do_data_decompress(void *opaque)
>      DecompressParam *param = opaque;
>      unsigned long pagesize;
>      uint8_t *des;
> -    int len;
> +    int len, rc;
>  
>      qemu_mutex_lock(&param->mutex);
>      while (!param->quit) {
> @@ -2518,8 +2519,11 @@ static void *do_data_decompress(void *opaque)
>               * not a problem because the dirty page will be retransferred
>               * and uncompress() won't break the data in other pages.
>               */
> -            uncompress((Bytef *)des, &pagesize,
> -                       (const Bytef *)param->compbuf, len);
> +            rc = uncompress((Bytef *)des, &pagesize,
> +                            (const Bytef *)param->compbuf, len);
> +            if (rc == Z_OK && param->is_pmem) {
> +                pmem_flush(des, len);
> +            }
>  
>              qemu_mutex_lock(&decomp_done_lock);
>              param->done = true;
> @@ -2605,7 +2609,8 @@ static void compress_threads_load_cleanup(void)
>  }
>  
>  static void decompress_data_with_multi_threads(QEMUFile *f,
> -                                               void *host, int len)
> +                                               void *host, int len,
> +                                               bool is_pmem)
>  {
>      int idx, thread_count;
>  
> @@ -2619,6 +2624,7 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
>                  qemu_get_buffer(f, decomp_param[idx].compbuf, len);
>                  decomp_param[idx].des = host;
>                  decomp_param[idx].len = len;
> +                decomp_param[idx].is_pmem = is_pmem;
>                  qemu_cond_signal(&decomp_param[idx].cond);
>                  qemu_mutex_unlock(&decomp_param[idx].mutex);
>                  break;
> @@ -2964,7 +2970,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
>                  ret = -EINVAL;
>                  break;
>              }
> -            decompress_data_with_multi_threads(f, host, len);
> +            decompress_data_with_multi_threads(f, host, len, is_pmem);
>              break;
>  
>          case RAM_SAVE_FLAG_XBZRLE:
> -- 
> 2.14.1
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Haozhong Zhang Feb. 7, 2018, 12:15 p.m. UTC | #2
On 02/07/18 11:54 +0000, Dr. David Alan Gilbert wrote:
> * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> > When loading a compressed page to persistent memory, flush CPU cache
> > after the data is decompressed. Combined with a call to pmem_drain()
> > at the end of memory loading, we can guarantee those compressed pages
> > are persistently loaded to PMEM.
> 
> Can you explain why this can use the flush and doesn't need the special
> memset?

The best approach to ensure the write persistence is to operate pmem
all via libpmem, e.g., pmem_memcpy_nodrain() + pmem_drain(). However,
the write to pmem in this case is performed by uncompress() which is
implemented out of QEMU and libpmem. It may or may not use libpmem,
which is not controlled by QEMU. Therefore, we have to use the less
optimal approach, that is to flush cache for all pmem addresses that
uncompress() may have written, i.e.,/e.g., memcpy() and/or memset() in
uncompress(), and pmem_flush() + pmem_drain() in QEMU.

Haozhong

> 
> Dave
> 
> > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > ---
> >  include/qemu/pmem.h |  4 ++++
> >  migration/ram.c     | 16 +++++++++++-----
> >  2 files changed, 15 insertions(+), 5 deletions(-)
> > 
> > diff --git a/include/qemu/pmem.h b/include/qemu/pmem.h
> > index 77ee1fc4eb..20e3f6e71d 100644
> > --- a/include/qemu/pmem.h
> > +++ b/include/qemu/pmem.h
> > @@ -37,6 +37,10 @@ static inline void *pmem_memset_nodrain(void *pmemdest, int c, size_t len)
> >      return memset(pmemdest, c, len);
> >  }
> >  
> > +static inline void pmem_flush(const void *addr, size_t len)
> > +{
> > +}
> > +
> >  static inline void pmem_drain(void)
> >  {
> >  }
> > diff --git a/migration/ram.c b/migration/ram.c
> > index 5a79bbff64..924d2b9537 100644
> > --- a/migration/ram.c
> > +++ b/migration/ram.c
> > @@ -274,6 +274,7 @@ struct DecompressParam {
> >      void *des;
> >      uint8_t *compbuf;
> >      int len;
> > +    bool is_pmem;
> >  };
> >  typedef struct DecompressParam DecompressParam;
> >  
> > @@ -2502,7 +2503,7 @@ static void *do_data_decompress(void *opaque)
> >      DecompressParam *param = opaque;
> >      unsigned long pagesize;
> >      uint8_t *des;
> > -    int len;
> > +    int len, rc;
> >  
> >      qemu_mutex_lock(&param->mutex);
> >      while (!param->quit) {
> > @@ -2518,8 +2519,11 @@ static void *do_data_decompress(void *opaque)
> >               * not a problem because the dirty page will be retransferred
> >               * and uncompress() won't break the data in other pages.
> >               */
> > -            uncompress((Bytef *)des, &pagesize,
> > -                       (const Bytef *)param->compbuf, len);
> > +            rc = uncompress((Bytef *)des, &pagesize,
> > +                            (const Bytef *)param->compbuf, len);
> > +            if (rc == Z_OK && param->is_pmem) {
> > +                pmem_flush(des, len);
> > +            }
> >  
> >              qemu_mutex_lock(&decomp_done_lock);
> >              param->done = true;
> > @@ -2605,7 +2609,8 @@ static void compress_threads_load_cleanup(void)
> >  }
> >  
> >  static void decompress_data_with_multi_threads(QEMUFile *f,
> > -                                               void *host, int len)
> > +                                               void *host, int len,
> > +                                               bool is_pmem)
> >  {
> >      int idx, thread_count;
> >  
> > @@ -2619,6 +2624,7 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
> >                  qemu_get_buffer(f, decomp_param[idx].compbuf, len);
> >                  decomp_param[idx].des = host;
> >                  decomp_param[idx].len = len;
> > +                decomp_param[idx].is_pmem = is_pmem;
> >                  qemu_cond_signal(&decomp_param[idx].cond);
> >                  qemu_mutex_unlock(&decomp_param[idx].mutex);
> >                  break;
> > @@ -2964,7 +2970,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
> >                  ret = -EINVAL;
> >                  break;
> >              }
> > -            decompress_data_with_multi_threads(f, host, len);
> > +            decompress_data_with_multi_threads(f, host, len, is_pmem);
> >              break;
> >  
> >          case RAM_SAVE_FLAG_XBZRLE:
> > -- 
> > 2.14.1
> > 
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert Feb. 7, 2018, 1:03 p.m. UTC | #3
* Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> On 02/07/18 11:54 +0000, Dr. David Alan Gilbert wrote:
> > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> > > When loading a compressed page to persistent memory, flush CPU cache
> > > after the data is decompressed. Combined with a call to pmem_drain()
> > > at the end of memory loading, we can guarantee those compressed pages
> > > are persistently loaded to PMEM.
> > 
> > Can you explain why this can use the flush and doesn't need the special
> > memset?
> 
> The best approach to ensure the write persistence is to operate pmem
> all via libpmem, e.g., pmem_memcpy_nodrain() + pmem_drain(). However,
> the write to pmem in this case is performed by uncompress() which is
> implemented out of QEMU and libpmem. It may or may not use libpmem,
> which is not controlled by QEMU. Therefore, we have to use the less
> optimal approach, that is to flush cache for all pmem addresses that
> uncompress() may have written, i.e.,/e.g., memcpy() and/or memset() in
> uncompress(), and pmem_flush() + pmem_drain() in QEMU.

In what way is it less optimal?
If that's a legal thing to do, then why not just do a pmem_flush +
pmem_drain right at the end of the ram loading and leave all the rest of
the code untouched?

Dave

> Haozhong
> 
> > 
> > Dave
> > 
> > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > > ---
> > >  include/qemu/pmem.h |  4 ++++
> > >  migration/ram.c     | 16 +++++++++++-----
> > >  2 files changed, 15 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/include/qemu/pmem.h b/include/qemu/pmem.h
> > > index 77ee1fc4eb..20e3f6e71d 100644
> > > --- a/include/qemu/pmem.h
> > > +++ b/include/qemu/pmem.h
> > > @@ -37,6 +37,10 @@ static inline void *pmem_memset_nodrain(void *pmemdest, int c, size_t len)
> > >      return memset(pmemdest, c, len);
> > >  }
> > >  
> > > +static inline void pmem_flush(const void *addr, size_t len)
> > > +{
> > > +}
> > > +
> > >  static inline void pmem_drain(void)
> > >  {
> > >  }
> > > diff --git a/migration/ram.c b/migration/ram.c
> > > index 5a79bbff64..924d2b9537 100644
> > > --- a/migration/ram.c
> > > +++ b/migration/ram.c
> > > @@ -274,6 +274,7 @@ struct DecompressParam {
> > >      void *des;
> > >      uint8_t *compbuf;
> > >      int len;
> > > +    bool is_pmem;
> > >  };
> > >  typedef struct DecompressParam DecompressParam;
> > >  
> > > @@ -2502,7 +2503,7 @@ static void *do_data_decompress(void *opaque)
> > >      DecompressParam *param = opaque;
> > >      unsigned long pagesize;
> > >      uint8_t *des;
> > > -    int len;
> > > +    int len, rc;
> > >  
> > >      qemu_mutex_lock(&param->mutex);
> > >      while (!param->quit) {
> > > @@ -2518,8 +2519,11 @@ static void *do_data_decompress(void *opaque)
> > >               * not a problem because the dirty page will be retransferred
> > >               * and uncompress() won't break the data in other pages.
> > >               */
> > > -            uncompress((Bytef *)des, &pagesize,
> > > -                       (const Bytef *)param->compbuf, len);
> > > +            rc = uncompress((Bytef *)des, &pagesize,
> > > +                            (const Bytef *)param->compbuf, len);
> > > +            if (rc == Z_OK && param->is_pmem) {
> > > +                pmem_flush(des, len);
> > > +            }
> > >  
> > >              qemu_mutex_lock(&decomp_done_lock);
> > >              param->done = true;
> > > @@ -2605,7 +2609,8 @@ static void compress_threads_load_cleanup(void)
> > >  }
> > >  
> > >  static void decompress_data_with_multi_threads(QEMUFile *f,
> > > -                                               void *host, int len)
> > > +                                               void *host, int len,
> > > +                                               bool is_pmem)
> > >  {
> > >      int idx, thread_count;
> > >  
> > > @@ -2619,6 +2624,7 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
> > >                  qemu_get_buffer(f, decomp_param[idx].compbuf, len);
> > >                  decomp_param[idx].des = host;
> > >                  decomp_param[idx].len = len;
> > > +                decomp_param[idx].is_pmem = is_pmem;
> > >                  qemu_cond_signal(&decomp_param[idx].cond);
> > >                  qemu_mutex_unlock(&decomp_param[idx].mutex);
> > >                  break;
> > > @@ -2964,7 +2970,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
> > >                  ret = -EINVAL;
> > >                  break;
> > >              }
> > > -            decompress_data_with_multi_threads(f, host, len);
> > > +            decompress_data_with_multi_threads(f, host, len, is_pmem);
> > >              break;
> > >  
> > >          case RAM_SAVE_FLAG_XBZRLE:
> > > -- 
> > > 2.14.1
> > > 
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Haozhong Zhang Feb. 7, 2018, 1:20 p.m. UTC | #4
On 02/07/18 13:03 +0000, Dr. David Alan Gilbert wrote:
> * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> > On 02/07/18 11:54 +0000, Dr. David Alan Gilbert wrote:
> > > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> > > > When loading a compressed page to persistent memory, flush CPU cache
> > > > after the data is decompressed. Combined with a call to pmem_drain()
> > > > at the end of memory loading, we can guarantee those compressed pages
> > > > are persistently loaded to PMEM.
> > > 
> > > Can you explain why this can use the flush and doesn't need the special
> > > memset?
> > 
> > The best approach to ensure the write persistence is to operate pmem
> > all via libpmem, e.g., pmem_memcpy_nodrain() + pmem_drain(). However,
> > the write to pmem in this case is performed by uncompress() which is
> > implemented out of QEMU and libpmem. It may or may not use libpmem,
> > which is not controlled by QEMU. Therefore, we have to use the less
> > optimal approach, that is to flush cache for all pmem addresses that
> > uncompress() may have written, i.e.,/e.g., memcpy() and/or memset() in
> > uncompress(), and pmem_flush() + pmem_drain() in QEMU.
> 
> In what way is it less optimal?
> If that's a legal thing to do, then why not just do a pmem_flush +
> pmem_drain right at the end of the ram loading and leave all the rest of
> the code untouched?

For example, the implementation pmem_memcpy_nodrain() prefers to use
movnt instructions w/o flush to write pmem if those instructions are
available, and falls back to memcpy() + flush if movnt are not
available, so I suppose the latter is less optimal.

Haozhong

> 
> Dave
> 
> > Haozhong
> > 
> > > 
> > > Dave
> > > 
> > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > > > ---
> > > >  include/qemu/pmem.h |  4 ++++
> > > >  migration/ram.c     | 16 +++++++++++-----
> > > >  2 files changed, 15 insertions(+), 5 deletions(-)
> > > > 
> > > > diff --git a/include/qemu/pmem.h b/include/qemu/pmem.h
> > > > index 77ee1fc4eb..20e3f6e71d 100644
> > > > --- a/include/qemu/pmem.h
> > > > +++ b/include/qemu/pmem.h
> > > > @@ -37,6 +37,10 @@ static inline void *pmem_memset_nodrain(void *pmemdest, int c, size_t len)
> > > >      return memset(pmemdest, c, len);
> > > >  }
> > > >  
> > > > +static inline void pmem_flush(const void *addr, size_t len)
> > > > +{
> > > > +}
> > > > +
> > > >  static inline void pmem_drain(void)
> > > >  {
> > > >  }
> > > > diff --git a/migration/ram.c b/migration/ram.c
> > > > index 5a79bbff64..924d2b9537 100644
> > > > --- a/migration/ram.c
> > > > +++ b/migration/ram.c
> > > > @@ -274,6 +274,7 @@ struct DecompressParam {
> > > >      void *des;
> > > >      uint8_t *compbuf;
> > > >      int len;
> > > > +    bool is_pmem;
> > > >  };
> > > >  typedef struct DecompressParam DecompressParam;
> > > >  
> > > > @@ -2502,7 +2503,7 @@ static void *do_data_decompress(void *opaque)
> > > >      DecompressParam *param = opaque;
> > > >      unsigned long pagesize;
> > > >      uint8_t *des;
> > > > -    int len;
> > > > +    int len, rc;
> > > >  
> > > >      qemu_mutex_lock(&param->mutex);
> > > >      while (!param->quit) {
> > > > @@ -2518,8 +2519,11 @@ static void *do_data_decompress(void *opaque)
> > > >               * not a problem because the dirty page will be retransferred
> > > >               * and uncompress() won't break the data in other pages.
> > > >               */
> > > > -            uncompress((Bytef *)des, &pagesize,
> > > > -                       (const Bytef *)param->compbuf, len);
> > > > +            rc = uncompress((Bytef *)des, &pagesize,
> > > > +                            (const Bytef *)param->compbuf, len);
> > > > +            if (rc == Z_OK && param->is_pmem) {
> > > > +                pmem_flush(des, len);
> > > > +            }
> > > >  
> > > >              qemu_mutex_lock(&decomp_done_lock);
> > > >              param->done = true;
> > > > @@ -2605,7 +2609,8 @@ static void compress_threads_load_cleanup(void)
> > > >  }
> > > >  
> > > >  static void decompress_data_with_multi_threads(QEMUFile *f,
> > > > -                                               void *host, int len)
> > > > +                                               void *host, int len,
> > > > +                                               bool is_pmem)
> > > >  {
> > > >      int idx, thread_count;
> > > >  
> > > > @@ -2619,6 +2624,7 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
> > > >                  qemu_get_buffer(f, decomp_param[idx].compbuf, len);
> > > >                  decomp_param[idx].des = host;
> > > >                  decomp_param[idx].len = len;
> > > > +                decomp_param[idx].is_pmem = is_pmem;
> > > >                  qemu_cond_signal(&decomp_param[idx].cond);
> > > >                  qemu_mutex_unlock(&decomp_param[idx].mutex);
> > > >                  break;
> > > > @@ -2964,7 +2970,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
> > > >                  ret = -EINVAL;
> > > >                  break;
> > > >              }
> > > > -            decompress_data_with_multi_threads(f, host, len);
> > > > +            decompress_data_with_multi_threads(f, host, len, is_pmem);
> > > >              break;
> > > >  
> > > >          case RAM_SAVE_FLAG_XBZRLE:
> > > > -- 
> > > > 2.14.1
> > > > 
> > > --
> > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Dr. David Alan Gilbert Feb. 7, 2018, 1:24 p.m. UTC | #5
* Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> On 02/07/18 13:03 +0000, Dr. David Alan Gilbert wrote:
> > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> > > On 02/07/18 11:54 +0000, Dr. David Alan Gilbert wrote:
> > > > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> > > > > When loading a compressed page to persistent memory, flush CPU cache
> > > > > after the data is decompressed. Combined with a call to pmem_drain()
> > > > > at the end of memory loading, we can guarantee those compressed pages
> > > > > are persistently loaded to PMEM.
> > > > 
> > > > Can you explain why this can use the flush and doesn't need the special
> > > > memset?
> > > 
> > > The best approach to ensure the write persistence is to operate pmem
> > > all via libpmem, e.g., pmem_memcpy_nodrain() + pmem_drain(). However,
> > > the write to pmem in this case is performed by uncompress() which is
> > > implemented out of QEMU and libpmem. It may or may not use libpmem,
> > > which is not controlled by QEMU. Therefore, we have to use the less
> > > optimal approach, that is to flush cache for all pmem addresses that
> > > uncompress() may have written, i.e.,/e.g., memcpy() and/or memset() in
> > > uncompress(), and pmem_flush() + pmem_drain() in QEMU.
> > 
> > In what way is it less optimal?
> > If that's a legal thing to do, then why not just do a pmem_flush +
> > pmem_drain right at the end of the ram loading and leave all the rest of
> > the code untouched?
> 
> For example, the implementation pmem_memcpy_nodrain() prefers to use
> movnt instructions w/o flush to write pmem if those instructions are
> available, and falls back to memcpy() + flush if movnt are not
> available, so I suppose the latter is less optimal.

But if you use normal memcpy calls to copy a few GB of RAM in an
incoming migrate and then do a single flush at the end, isn't that
better?

Dave

> Haozhong
> 
> > 
> > Dave
> > 
> > > Haozhong
> > > 
> > > > 
> > > > Dave
> > > > 
> > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > > > > ---
> > > > >  include/qemu/pmem.h |  4 ++++
> > > > >  migration/ram.c     | 16 +++++++++++-----
> > > > >  2 files changed, 15 insertions(+), 5 deletions(-)
> > > > > 
> > > > > diff --git a/include/qemu/pmem.h b/include/qemu/pmem.h
> > > > > index 77ee1fc4eb..20e3f6e71d 100644
> > > > > --- a/include/qemu/pmem.h
> > > > > +++ b/include/qemu/pmem.h
> > > > > @@ -37,6 +37,10 @@ static inline void *pmem_memset_nodrain(void *pmemdest, int c, size_t len)
> > > > >      return memset(pmemdest, c, len);
> > > > >  }
> > > > >  
> > > > > +static inline void pmem_flush(const void *addr, size_t len)
> > > > > +{
> > > > > +}
> > > > > +
> > > > >  static inline void pmem_drain(void)
> > > > >  {
> > > > >  }
> > > > > diff --git a/migration/ram.c b/migration/ram.c
> > > > > index 5a79bbff64..924d2b9537 100644
> > > > > --- a/migration/ram.c
> > > > > +++ b/migration/ram.c
> > > > > @@ -274,6 +274,7 @@ struct DecompressParam {
> > > > >      void *des;
> > > > >      uint8_t *compbuf;
> > > > >      int len;
> > > > > +    bool is_pmem;
> > > > >  };
> > > > >  typedef struct DecompressParam DecompressParam;
> > > > >  
> > > > > @@ -2502,7 +2503,7 @@ static void *do_data_decompress(void *opaque)
> > > > >      DecompressParam *param = opaque;
> > > > >      unsigned long pagesize;
> > > > >      uint8_t *des;
> > > > > -    int len;
> > > > > +    int len, rc;
> > > > >  
> > > > >      qemu_mutex_lock(&param->mutex);
> > > > >      while (!param->quit) {
> > > > > @@ -2518,8 +2519,11 @@ static void *do_data_decompress(void *opaque)
> > > > >               * not a problem because the dirty page will be retransferred
> > > > >               * and uncompress() won't break the data in other pages.
> > > > >               */
> > > > > -            uncompress((Bytef *)des, &pagesize,
> > > > > -                       (const Bytef *)param->compbuf, len);
> > > > > +            rc = uncompress((Bytef *)des, &pagesize,
> > > > > +                            (const Bytef *)param->compbuf, len);
> > > > > +            if (rc == Z_OK && param->is_pmem) {
> > > > > +                pmem_flush(des, len);
> > > > > +            }
> > > > >  
> > > > >              qemu_mutex_lock(&decomp_done_lock);
> > > > >              param->done = true;
> > > > > @@ -2605,7 +2609,8 @@ static void compress_threads_load_cleanup(void)
> > > > >  }
> > > > >  
> > > > >  static void decompress_data_with_multi_threads(QEMUFile *f,
> > > > > -                                               void *host, int len)
> > > > > +                                               void *host, int len,
> > > > > +                                               bool is_pmem)
> > > > >  {
> > > > >      int idx, thread_count;
> > > > >  
> > > > > @@ -2619,6 +2624,7 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
> > > > >                  qemu_get_buffer(f, decomp_param[idx].compbuf, len);
> > > > >                  decomp_param[idx].des = host;
> > > > >                  decomp_param[idx].len = len;
> > > > > +                decomp_param[idx].is_pmem = is_pmem;
> > > > >                  qemu_cond_signal(&decomp_param[idx].cond);
> > > > >                  qemu_mutex_unlock(&decomp_param[idx].mutex);
> > > > >                  break;
> > > > > @@ -2964,7 +2970,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
> > > > >                  ret = -EINVAL;
> > > > >                  break;
> > > > >              }
> > > > > -            decompress_data_with_multi_threads(f, host, len);
> > > > > +            decompress_data_with_multi_threads(f, host, len, is_pmem);
> > > > >              break;
> > > > >  
> > > > >          case RAM_SAVE_FLAG_XBZRLE:
> > > > > -- 
> > > > > 2.14.1
> > > > > 
> > > > --
> > > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dan Williams Feb. 7, 2018, 6:05 p.m. UTC | #6
On Wed, Feb 7, 2018 at 5:24 AM, Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
> * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
>> On 02/07/18 13:03 +0000, Dr. David Alan Gilbert wrote:
>> > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
>> > > On 02/07/18 11:54 +0000, Dr. David Alan Gilbert wrote:
>> > > > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
>> > > > > When loading a compressed page to persistent memory, flush CPU cache
>> > > > > after the data is decompressed. Combined with a call to pmem_drain()
>> > > > > at the end of memory loading, we can guarantee those compressed pages
>> > > > > are persistently loaded to PMEM.
>> > > >
>> > > > Can you explain why this can use the flush and doesn't need the special
>> > > > memset?
>> > >
>> > > The best approach to ensure the write persistence is to operate pmem
>> > > all via libpmem, e.g., pmem_memcpy_nodrain() + pmem_drain(). However,
>> > > the write to pmem in this case is performed by uncompress() which is
>> > > implemented out of QEMU and libpmem. It may or may not use libpmem,
>> > > which is not controlled by QEMU. Therefore, we have to use the less
>> > > optimal approach, that is to flush cache for all pmem addresses that
>> > > uncompress() may have written, i.e.,/e.g., memcpy() and/or memset() in
>> > > uncompress(), and pmem_flush() + pmem_drain() in QEMU.
>> >
>> > In what way is it less optimal?
>> > If that's a legal thing to do, then why not just do a pmem_flush +
>> > pmem_drain right at the end of the ram loading and leave all the rest of
>> > the code untouched?
>>
>> For example, the implementation pmem_memcpy_nodrain() prefers to use
>> movnt instructions w/o flush to write pmem if those instructions are
>> available, and falls back to memcpy() + flush if movnt are not
>> available, so I suppose the latter is less optimal.
>
> But if you use normal memcpy calls to copy a few GB of RAM in an
> incoming migrate and then do a single flush at the end, isn't that
> better?

Not really, because now you've needlessly polluted the cache and are
spending CPU looping over the cachelines that could have been bypassed
with movnt.
Dr. David Alan Gilbert Feb. 7, 2018, 6:08 p.m. UTC | #7
* Dan Williams (dan.j.williams@intel.com) wrote:
> On Wed, Feb 7, 2018 at 5:24 AM, Dr. David Alan Gilbert
> <dgilbert@redhat.com> wrote:
> > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> >> On 02/07/18 13:03 +0000, Dr. David Alan Gilbert wrote:
> >> > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> >> > > On 02/07/18 11:54 +0000, Dr. David Alan Gilbert wrote:
> >> > > > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> >> > > > > When loading a compressed page to persistent memory, flush CPU cache
> >> > > > > after the data is decompressed. Combined with a call to pmem_drain()
> >> > > > > at the end of memory loading, we can guarantee those compressed pages
> >> > > > > are persistently loaded to PMEM.
> >> > > >
> >> > > > Can you explain why this can use the flush and doesn't need the special
> >> > > > memset?
> >> > >
> >> > > The best approach to ensure the write persistence is to operate pmem
> >> > > all via libpmem, e.g., pmem_memcpy_nodrain() + pmem_drain(). However,
> >> > > the write to pmem in this case is performed by uncompress() which is
> >> > > implemented out of QEMU and libpmem. It may or may not use libpmem,
> >> > > which is not controlled by QEMU. Therefore, we have to use the less
> >> > > optimal approach, that is to flush cache for all pmem addresses that
> >> > > uncompress() may have written, i.e.,/e.g., memcpy() and/or memset() in
> >> > > uncompress(), and pmem_flush() + pmem_drain() in QEMU.
> >> >
> >> > In what way is it less optimal?
> >> > If that's a legal thing to do, then why not just do a pmem_flush +
> >> > pmem_drain right at the end of the ram loading and leave all the rest of
> >> > the code untouched?
> >>
> >> For example, the implementation pmem_memcpy_nodrain() prefers to use
> >> movnt instructions w/o flush to write pmem if those instructions are
> >> available, and falls back to memcpy() + flush if movnt are not
> >> available, so I suppose the latter is less optimal.
> >
> > But if you use normal memcpy calls to copy a few GB of RAM in an
> > incoming migrate and then do a single flush at the end, isn't that
> > better?
> 
> Not really, because now you've needlessly polluted the cache and are
> spending CPU looping over the cachelines that could have been bypassed
> with movnt.

What's different in the pmem case?   Isn't what you've said true in the
normal migrate case as well?

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dan Williams Feb. 7, 2018, 6:31 p.m. UTC | #8
On Wed, Feb 7, 2018 at 10:08 AM, Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
> * Dan Williams (dan.j.williams@intel.com) wrote:
>> On Wed, Feb 7, 2018 at 5:24 AM, Dr. David Alan Gilbert
>> <dgilbert@redhat.com> wrote:
>> > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
>> >> On 02/07/18 13:03 +0000, Dr. David Alan Gilbert wrote:
>> >> > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
>> >> > > On 02/07/18 11:54 +0000, Dr. David Alan Gilbert wrote:
>> >> > > > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
>> >> > > > > When loading a compressed page to persistent memory, flush CPU cache
>> >> > > > > after the data is decompressed. Combined with a call to pmem_drain()
>> >> > > > > at the end of memory loading, we can guarantee those compressed pages
>> >> > > > > are persistently loaded to PMEM.
>> >> > > >
>> >> > > > Can you explain why this can use the flush and doesn't need the special
>> >> > > > memset?
>> >> > >
>> >> > > The best approach to ensure the write persistence is to operate pmem
>> >> > > all via libpmem, e.g., pmem_memcpy_nodrain() + pmem_drain(). However,
>> >> > > the write to pmem in this case is performed by uncompress() which is
>> >> > > implemented out of QEMU and libpmem. It may or may not use libpmem,
>> >> > > which is not controlled by QEMU. Therefore, we have to use the less
>> >> > > optimal approach, that is to flush cache for all pmem addresses that
>> >> > > uncompress() may have written, i.e.,/e.g., memcpy() and/or memset() in
>> >> > > uncompress(), and pmem_flush() + pmem_drain() in QEMU.
>> >> >
>> >> > In what way is it less optimal?
>> >> > If that's a legal thing to do, then why not just do a pmem_flush +
>> >> > pmem_drain right at the end of the ram loading and leave all the rest of
>> >> > the code untouched?
>> >>
>> >> For example, the implementation pmem_memcpy_nodrain() prefers to use
>> >> movnt instructions w/o flush to write pmem if those instructions are
>> >> available, and falls back to memcpy() + flush if movnt are not
>> >> available, so I suppose the latter is less optimal.
>> >
>> > But if you use normal memcpy calls to copy a few GB of RAM in an
>> > incoming migrate and then do a single flush at the end, isn't that
>> > better?
>>
>> Not really, because now you've needlessly polluted the cache and are
>> spending CPU looping over the cachelines that could have been bypassed
>> with movnt.
>
> What's different in the pmem case?   Isn't what you've said true in the
> normal migrate case as well?
>

In the normal migrate case the memory is volatile so once the copy is
globally visiable you're done. In the pmem case the migration is not
complete until the persistent state is synchronized.

Note, I'm talking in generalities because I don't know the deeper
details of the migrate flow.
Dr. David Alan Gilbert Feb. 7, 2018, 6:37 p.m. UTC | #9
* Dan Williams (dan.j.williams@intel.com) wrote:
> On Wed, Feb 7, 2018 at 10:08 AM, Dr. David Alan Gilbert
> <dgilbert@redhat.com> wrote:
> > * Dan Williams (dan.j.williams@intel.com) wrote:
> >> On Wed, Feb 7, 2018 at 5:24 AM, Dr. David Alan Gilbert
> >> <dgilbert@redhat.com> wrote:
> >> > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> >> >> On 02/07/18 13:03 +0000, Dr. David Alan Gilbert wrote:
> >> >> > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> >> >> > > On 02/07/18 11:54 +0000, Dr. David Alan Gilbert wrote:
> >> >> > > > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
> >> >> > > > > When loading a compressed page to persistent memory, flush CPU cache
> >> >> > > > > after the data is decompressed. Combined with a call to pmem_drain()
> >> >> > > > > at the end of memory loading, we can guarantee those compressed pages
> >> >> > > > > are persistently loaded to PMEM.
> >> >> > > >
> >> >> > > > Can you explain why this can use the flush and doesn't need the special
> >> >> > > > memset?
> >> >> > >
> >> >> > > The best approach to ensure the write persistence is to operate pmem
> >> >> > > all via libpmem, e.g., pmem_memcpy_nodrain() + pmem_drain(). However,
> >> >> > > the write to pmem in this case is performed by uncompress() which is
> >> >> > > implemented out of QEMU and libpmem. It may or may not use libpmem,
> >> >> > > which is not controlled by QEMU. Therefore, we have to use the less
> >> >> > > optimal approach, that is to flush cache for all pmem addresses that
> >> >> > > uncompress() may have written, i.e.,/e.g., memcpy() and/or memset() in
> >> >> > > uncompress(), and pmem_flush() + pmem_drain() in QEMU.
> >> >> >
> >> >> > In what way is it less optimal?
> >> >> > If that's a legal thing to do, then why not just do a pmem_flush +
> >> >> > pmem_drain right at the end of the ram loading and leave all the rest of
> >> >> > the code untouched?
> >> >>
> >> >> For example, the implementation pmem_memcpy_nodrain() prefers to use
> >> >> movnt instructions w/o flush to write pmem if those instructions are
> >> >> available, and falls back to memcpy() + flush if movnt are not
> >> >> available, so I suppose the latter is less optimal.
> >> >
> >> > But if you use normal memcpy calls to copy a few GB of RAM in an
> >> > incoming migrate and then do a single flush at the end, isn't that
> >> > better?
> >>
> >> Not really, because now you've needlessly polluted the cache and are
> >> spending CPU looping over the cachelines that could have been bypassed
> >> with movnt.
> >
> > What's different in the pmem case?   Isn't what you've said true in the
> > normal migrate case as well?
> >
> 
> In the normal migrate case the memory is volatile so once the copy is
> globally visiable you're done. In the pmem case the migration is not
> complete until the persistent state is synchronized.
> 
> Note, I'm talking in generalities because I don't know the deeper
> details of the migrate flow.

On the receive side of a migrate, during a normal precopy migrate
(without xbzrle) nothing is going to be reading that RAM until
the whole RAM load has completed anyway - so we're not benefiting from
it being in the caches either.

In the pmem case, again since nothing is going to be reading from that
RAM until the end anyway, why bother flushing as we go as opposed to at
the end?

Note, I'm also talking generalities because I don't know the deeper
details of the pmem flow. Hmm.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dan Williams Feb. 7, 2018, 10:43 p.m. UTC | #10
On Wed, Feb 7, 2018 at 10:37 AM, Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
> * Dan Williams (dan.j.williams@intel.com) wrote:
>> On Wed, Feb 7, 2018 at 10:08 AM, Dr. David Alan Gilbert
>> <dgilbert@redhat.com> wrote:
>> > * Dan Williams (dan.j.williams@intel.com) wrote:
>> >> On Wed, Feb 7, 2018 at 5:24 AM, Dr. David Alan Gilbert
>> >> <dgilbert@redhat.com> wrote:
>> >> > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
>> >> >> On 02/07/18 13:03 +0000, Dr. David Alan Gilbert wrote:
>> >> >> > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
>> >> >> > > On 02/07/18 11:54 +0000, Dr. David Alan Gilbert wrote:
>> >> >> > > > * Haozhong Zhang (haozhong.zhang@intel.com) wrote:
>> >> >> > > > > When loading a compressed page to persistent memory, flush CPU cache
>> >> >> > > > > after the data is decompressed. Combined with a call to pmem_drain()
>> >> >> > > > > at the end of memory loading, we can guarantee those compressed pages
>> >> >> > > > > are persistently loaded to PMEM.
>> >> >> > > >
>> >> >> > > > Can you explain why this can use the flush and doesn't need the special
>> >> >> > > > memset?
>> >> >> > >
>> >> >> > > The best approach to ensure the write persistence is to operate pmem
>> >> >> > > all via libpmem, e.g., pmem_memcpy_nodrain() + pmem_drain(). However,
>> >> >> > > the write to pmem in this case is performed by uncompress() which is
>> >> >> > > implemented out of QEMU and libpmem. It may or may not use libpmem,
>> >> >> > > which is not controlled by QEMU. Therefore, we have to use the less
>> >> >> > > optimal approach, that is to flush cache for all pmem addresses that
>> >> >> > > uncompress() may have written, i.e.,/e.g., memcpy() and/or memset() in
>> >> >> > > uncompress(), and pmem_flush() + pmem_drain() in QEMU.
>> >> >> >
>> >> >> > In what way is it less optimal?
>> >> >> > If that's a legal thing to do, then why not just do a pmem_flush +
>> >> >> > pmem_drain right at the end of the ram loading and leave all the rest of
>> >> >> > the code untouched?
>> >> >>
>> >> >> For example, the implementation pmem_memcpy_nodrain() prefers to use
>> >> >> movnt instructions w/o flush to write pmem if those instructions are
>> >> >> available, and falls back to memcpy() + flush if movnt are not
>> >> >> available, so I suppose the latter is less optimal.
>> >> >
>> >> > But if you use normal memcpy calls to copy a few GB of RAM in an
>> >> > incoming migrate and then do a single flush at the end, isn't that
>> >> > better?
>> >>
>> >> Not really, because now you've needlessly polluted the cache and are
>> >> spending CPU looping over the cachelines that could have been bypassed
>> >> with movnt.
>> >
>> > What's different in the pmem case?   Isn't what you've said true in the
>> > normal migrate case as well?
>> >
>>
>> In the normal migrate case the memory is volatile so once the copy is
>> globally visiable you're done. In the pmem case the migration is not
>> complete until the persistent state is synchronized.
>>
>> Note, I'm talking in generalities because I don't know the deeper
>> details of the migrate flow.
>
> On the receive side of a migrate, during a normal precopy migrate
> (without xbzrle) nothing is going to be reading that RAM until
> the whole RAM load has completed anyway - so we're not benefiting from
> it being in the caches either.
>
> In the pmem case, again since nothing is going to be reading from that
> RAM until the end anyway, why bother flushing as we go as opposed to at
> the end?

Flushing at the end implies doing a large loop flushing the caches at
the end of the transfer because the x86 ISA only exposes a
line-by-line flush to unprivileged code rather than a full cache flush
like what the kernel can do with wbinvd. So, better to flush as we go
rather than incur the overhead of the loop at the end. I.e. I'm
assuming it is more efficient to do 'movnt' in the first instance and
not worry about the flush loop.
diff mbox series

Patch

diff --git a/include/qemu/pmem.h b/include/qemu/pmem.h
index 77ee1fc4eb..20e3f6e71d 100644
--- a/include/qemu/pmem.h
+++ b/include/qemu/pmem.h
@@ -37,6 +37,10 @@  static inline void *pmem_memset_nodrain(void *pmemdest, int c, size_t len)
     return memset(pmemdest, c, len);
 }
 
+static inline void pmem_flush(const void *addr, size_t len)
+{
+}
+
 static inline void pmem_drain(void)
 {
 }
diff --git a/migration/ram.c b/migration/ram.c
index 5a79bbff64..924d2b9537 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -274,6 +274,7 @@  struct DecompressParam {
     void *des;
     uint8_t *compbuf;
     int len;
+    bool is_pmem;
 };
 typedef struct DecompressParam DecompressParam;
 
@@ -2502,7 +2503,7 @@  static void *do_data_decompress(void *opaque)
     DecompressParam *param = opaque;
     unsigned long pagesize;
     uint8_t *des;
-    int len;
+    int len, rc;
 
     qemu_mutex_lock(&param->mutex);
     while (!param->quit) {
@@ -2518,8 +2519,11 @@  static void *do_data_decompress(void *opaque)
              * not a problem because the dirty page will be retransferred
              * and uncompress() won't break the data in other pages.
              */
-            uncompress((Bytef *)des, &pagesize,
-                       (const Bytef *)param->compbuf, len);
+            rc = uncompress((Bytef *)des, &pagesize,
+                            (const Bytef *)param->compbuf, len);
+            if (rc == Z_OK && param->is_pmem) {
+                pmem_flush(des, len);
+            }
 
             qemu_mutex_lock(&decomp_done_lock);
             param->done = true;
@@ -2605,7 +2609,8 @@  static void compress_threads_load_cleanup(void)
 }
 
 static void decompress_data_with_multi_threads(QEMUFile *f,
-                                               void *host, int len)
+                                               void *host, int len,
+                                               bool is_pmem)
 {
     int idx, thread_count;
 
@@ -2619,6 +2624,7 @@  static void decompress_data_with_multi_threads(QEMUFile *f,
                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                 decomp_param[idx].des = host;
                 decomp_param[idx].len = len;
+                decomp_param[idx].is_pmem = is_pmem;
                 qemu_cond_signal(&decomp_param[idx].cond);
                 qemu_mutex_unlock(&decomp_param[idx].mutex);
                 break;
@@ -2964,7 +2970,7 @@  static int ram_load(QEMUFile *f, void *opaque, int version_id)
                 ret = -EINVAL;
                 break;
             }
-            decompress_data_with_multi_threads(f, host, len);
+            decompress_data_with_multi_threads(f, host, len, is_pmem);
             break;
 
         case RAM_SAVE_FLAG_XBZRLE: