[RFC,2/2] KVM: add virtio-pmem driver

Message ID 20171012155027.3277-3-pagupta@redhat.com
State New
Headers show
Series
  • KVM "fake DAX" device flushing
Related show

Commit Message

Pankaj Gupta Oct. 12, 2017, 3:50 p.m.
This patch adds virtio-pmem driver for KVM guest.
  Guest reads the persistent memory range information
  over virtio bus from Qemu and reserves the range
  as persistent memory. Guest also allocates a block
  device corresponding to the pmem range which later
  can be accessed with DAX compatible file systems.
  Idea is to use the virtio channel between guest and
  host to perform the block device flush for guest pmem 
  DAX device.

  There is work to do including DAX file system support 
  and other advanced features.

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
---
 drivers/virtio/Kconfig           |  10 ++
 drivers/virtio/Makefile          |   1 +
 drivers/virtio/virtio_pmem.c     | 322 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/virtio_pmem.h |  55 +++++++
 4 files changed, 388 insertions(+)
 create mode 100644 drivers/virtio/virtio_pmem.c
 create mode 100644 include/uapi/linux/virtio_pmem.h

Comments

Dan Williams Oct. 12, 2017, 8:51 p.m. | #1
On Thu, Oct 12, 2017 at 8:50 AM, Pankaj Gupta <pagupta@redhat.com> wrote:
>   This patch adds virtio-pmem driver for KVM guest.
>   Guest reads the persistent memory range information
>   over virtio bus from Qemu and reserves the range
>   as persistent memory. Guest also allocates a block
>   device corresponding to the pmem range which later
>   can be accessed with DAX compatible file systems.
>   Idea is to use the virtio channel between guest and
>   host to perform the block device flush for guest pmem
>   DAX device.
>
>   There is work to do including DAX file system support
>   and other advanced features.
>
> Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> ---
>  drivers/virtio/Kconfig           |  10 ++
>  drivers/virtio/Makefile          |   1 +
>  drivers/virtio/virtio_pmem.c     | 322 +++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/virtio_pmem.h |  55 +++++++
>  4 files changed, 388 insertions(+)
>  create mode 100644 drivers/virtio/virtio_pmem.c
>  create mode 100644 include/uapi/linux/virtio_pmem.h
>
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index cff773f15b7e..0192c4bda54b 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY
>
>           If unsure, say Y.
>
> +config VIRTIO_PMEM
> +       tristate "Virtio pmem driver"
> +       depends on VIRTIO
> +       ---help---
> +        This driver adds persistent memory range within a KVM guest.

I think we need to call this something other than persistent memory to
make it clear that this not memory where the persistence can be
managed from userspace. The persistence point always requires a driver
call, so this is something distinctly different than "persistent
memory". For example, it's a bug if this memory range ends up backing
a device-dax range in the guest where there is no such thing as a
driver callback to perform the flushing. How does this solution
protect against that scenario?

> +         It also associates a block device corresponding to the pmem
> +        range.
> +
> +        If unsure, say M.
> +
>  config VIRTIO_BALLOON
>         tristate "Virtio balloon driver"
>         depends on VIRTIO
> diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
> index 41e30e3dc842..032ade725cc2 100644
> --- a/drivers/virtio/Makefile
> +++ b/drivers/virtio/Makefile
> @@ -5,3 +5,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
>  virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
>  obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
>  obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
> +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o
> diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c
> new file mode 100644
> index 000000000000..74e47cae0e24
> --- /dev/null
> +++ b/drivers/virtio/virtio_pmem.c
> @@ -0,0 +1,322 @@
> +/*
> + * virtio-pmem driver
> + */
> +
> +#include <linux/virtio.h>
> +#include <linux/swap.h>
> +#include <linux/workqueue.h>
> +#include <linux/delay.h>
> +#include <linux/slab.h>
> +#include <linux/module.h>
> +#include <linux/oom.h>
> +#include <linux/wait.h>
> +#include <linux/mm.h>
> +#include <linux/mount.h>
> +#include <linux/magic.h>
> +#include <linux/virtio_pmem.h>
> +
> +void devm_vpmem_disable(struct device *dev, struct resource *res, void *addr)
> +{
> +       devm_memunmap(dev, addr);
> +       devm_release_mem_region(dev, res->start, resource_size(res));
> +}
> +
> +static void pmem_flush_done(struct virtqueue *vq)
> +{
> +       return;
> +};
> +
> +static void virtio_pmem_release_queue(void *q)
> +{
> +       blk_cleanup_queue(q);
> +}
> +
> +static void virtio_pmem_freeze_queue(void *q)
> +{
> +       blk_freeze_queue_start(q);
> +}
> +
> +static void virtio_pmem_release_disk(void *__pmem)
> +{
> +       struct virtio_pmem *pmem = __pmem;
> +
> +       del_gendisk(pmem->disk);
> +       put_disk(pmem->disk);
> +}

This code seems identical to the base pmem case, it should move to the
shared code object.

> +
> +static int init_vq(struct virtio_pmem *vpmem)
> +{
> +       struct virtqueue *vq;
> +
> +       /* single vq */
> +       vq = virtio_find_single_vq(vpmem->vdev, pmem_flush_done, "flush_queue");
> +
> +       if (IS_ERR(vq))
> +               return PTR_ERR(vq);
> +
> +       return 0;
> +}
> +
> +static struct vmem_altmap *setup_pmem_pfn(struct virtio_pmem *vpmem,
> +                       struct resource *res, struct vmem_altmap *altmap)
> +{
> +       u32 start_pad = 0, end_trunc = 0;
> +       resource_size_t start, size;
> +       unsigned long npfns;
> +       phys_addr_t offset;
> +
> +       size = resource_size(res);
> +       start = PHYS_SECTION_ALIGN_DOWN(res->start);
> +
> +       if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> +               IORES_DESC_NONE) == REGION_MIXED) {
> +
> +               start = res->start;
> +               start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
> +       }
> +       start = res->start;
> +       size = PHYS_SECTION_ALIGN_UP(start + size) - start;
> +
> +       if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> +               IORES_DESC_NONE) == REGION_MIXED) {
> +
> +               size = resource_size(res);
> +               end_trunc = start + size -
> +                               PHYS_SECTION_ALIGN_DOWN(start + size);
> +       }
> +
> +       start += start_pad;
> +       size = resource_size(res);
> +       npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K)
> +                                               / PAGE_SIZE);
> +
> +      /*
> +       * vmemmap_populate_hugepages() allocates the memmap array in
> +       * HPAGE_SIZE chunks.
> +       */
> +       offset = ALIGN(start + SZ_8K + 64 * npfns, HPAGE_SIZE) - start;
> +       vpmem->data_offset = offset;
> +
> +       struct vmem_altmap __altmap = {
> +               .base_pfn = init_altmap_base(start+start_pad),
> +               .reserve = init_altmap_reserve(start+start_pad),
> +       };
> +
> +       res->start += start_pad;
> +       res->end -= end_trunc;
> +       memcpy(altmap, &__altmap, sizeof(*altmap));
> +       altmap->free = PHYS_PFN(offset - SZ_8K);
> +       altmap->alloc = 0;
> +
> +       return altmap;
> +}
> +
> +static blk_status_t pmem_do_bvec(struct virtio_pmem *pmem, struct page *page,
> +                       unsigned int len, unsigned int off, bool is_write,
> +                       sector_t sector)
> +{
> +       blk_status_t rc = BLK_STS_OK;
> +       phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
> +       void *pmem_addr = pmem->virt_addr + pmem_off;
> +
> +       if (!is_write) {
> +               rc = read_pmem(page, off, pmem_addr, len);
> +                       flush_dcache_page(page);
> +       } else {
> +               flush_dcache_page(page);
> +               write_pmem(pmem_addr, page, off, len);
> +       }
> +
> +       return rc;
> +}
> +
> +static int vpmem_rw_page(struct block_device *bdev, sector_t sector,
> +                      struct page *page, bool is_write)
> +{
> +       struct virtio_pmem  *pmem = bdev->bd_queue->queuedata;
> +       blk_status_t rc;
> +
> +       rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
> +                         0, is_write, sector);
> +
> +       if (rc == 0)
> +               page_endio(page, is_write, 0);
> +
> +       return blk_status_to_errno(rc);
> +}
> +
> +#ifndef REQ_FLUSH
> +#define REQ_FLUSH REQ_PREFLUSH
> +#endif
> +
> +static blk_qc_t virtio_pmem_make_request(struct request_queue *q,
> +                       struct bio *bio)
> +{
> +       blk_status_t rc = 0;
> +       struct bio_vec bvec;
> +       struct bvec_iter iter;
> +       struct virtio_pmem *pmem = q->queuedata;
> +
> +       if (bio->bi_opf & REQ_FLUSH)
> +               //todo host flush command
> +
> +       bio_for_each_segment(bvec, bio, iter) {
> +               rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
> +                               bvec.bv_offset, op_is_write(bio_op(bio)),
> +                               iter.bi_sector);
> +               if (rc) {
> +                       bio->bi_status = rc;
> +                       break;
> +               }
> +       }
> +
> +       bio_endio(bio);
> +       return BLK_QC_T_NONE;
> +}

Again, the above could be shared by both drivers.

> +
> +static const struct block_device_operations pmem_fops = {
> +       .owner =                THIS_MODULE,
> +       .rw_page =              vpmem_rw_page,
> +       //.revalidate_disk =    nvdimm_revalidate_disk,
> +};
> +
> +static int virtio_pmem_probe(struct virtio_device *vdev)
> +{
> +       struct virtio_pmem *vpmem;
> +       int err = 0;
> +       void *addr;
> +       struct resource *res, res_pfn;
> +       struct request_queue *q;
> +       struct vmem_altmap __altmap, *altmap = NULL;
> +       struct gendisk *disk;
> +       struct device *gendev;
> +       int nid = dev_to_node(&vdev->dev);
> +
> +       if (!vdev->config->get) {
> +               dev_err(&vdev->dev, "%s failure: config disabled\n",
> +                       __func__);
> +               return -EINVAL;
> +       }
> +
> +       vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem),
> +                       GFP_KERNEL);
> +
> +       if (!vpmem) {
> +               err = -ENOMEM;
> +               goto out;
> +       }
> +
> +       dev_set_drvdata(&vdev->dev, vpmem);
> +
> +       vpmem->vdev = vdev;
> +       err = init_vq(vpmem);
> +       if (err)
> +               goto out;
> +
> +       if (!virtio_has_feature(vdev, VIRTIO_PMEM_PLUG)) {
> +               dev_err(&vdev->dev, "%s : pmem not supported\n",
> +                       __func__);
> +               goto out;
> +       }
> +
> +       virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> +                       start, &vpmem->start);
> +       virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> +                       size, &vpmem->size);
> +
> +       res_pfn.start = vpmem->start;
> +       res_pfn.end   = vpmem->start + vpmem->size-1;
> +
> +       /* used for allocating memmap in the pmem device */
> +       altmap        = setup_pmem_pfn(vpmem, &res_pfn, &__altmap);
> +
> +       res = devm_request_mem_region(&vdev->dev,
> +                       res_pfn.start, resource_size(&res_pfn), "virtio-pmem");
> +
> +       if (!res) {
> +               dev_warn(&vdev->dev, "could not reserve region ");
> +               return -EBUSY;
> +       }
> +
> +       q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(&vdev->dev));
> +
> +       if (!q)
> +               return -ENOMEM;
> +
> +       if (devm_add_action_or_reset(&vdev->dev,
> +                               virtio_pmem_release_queue, q))
> +               return -ENOMEM;
> +
> +       vpmem->pfn_flags = PFN_DEV;
> +
> +       /* allocate memap in pmem device itself */
> +       if (IS_ENABLED(CONFIG_ZONE_DEVICE)) {
> +
> +               addr = devm_memremap_pages(&vdev->dev, res,
> +                               &q->q_usage_counter, altmap);
> +               vpmem->pfn_flags |= PFN_MAP;
> +       } else
> +               addr = devm_memremap(&vdev->dev, vpmem->start,
> +                               vpmem->size, ARCH_MEMREMAP_PMEM);
> +
> +        /*
> +         * At release time the queue must be frozen before
> +         * devm_memremap_pages is unwound
> +         */
> +       if (devm_add_action_or_reset(&vdev->dev,
> +                               virtio_pmem_freeze_queue, q))
> +               return -ENOMEM;
> +
> +       if (IS_ERR(addr))
> +               return PTR_ERR(addr);
> +
> +       vpmem->virt_addr = addr;
> +       blk_queue_write_cache(q, 0, 0);
> +       blk_queue_make_request(q, virtio_pmem_make_request);
> +       blk_queue_physical_block_size(q, PAGE_SIZE);
> +       blk_queue_logical_block_size(q, 512);
> +       blk_queue_max_hw_sectors(q, UINT_MAX);
> +       queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
> +       queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
> +       q->queuedata = vpmem;
> +
> +       disk = alloc_disk_node(0, nid);
> +       if (!disk)
> +               return -ENOMEM;
> +       vpmem->disk = disk;
> +
> +       disk->fops                = &pmem_fops;
> +       disk->queue               = q;
> +       disk->flags               = GENHD_FL_EXT_DEVT;
> +       strcpy(disk->disk_name, "vpmem");
> +       set_capacity(disk, vpmem->size/512);
> +       gendev = disk_to_dev(disk);
> +
> +       virtio_device_ready(vdev);
> +       device_add_disk(&vdev->dev, disk);
> +
> +       if (devm_add_action_or_reset(&vdev->dev,
> +                       virtio_pmem_release_disk, vpmem))
> +               return -ENOMEM;
> +
> +       revalidate_disk(disk);
> +       return 0;
> +out:
> +       vdev->config->del_vqs(vdev);
> +       return err;
> +}

Here we have a mix of code that is common and some that is virtio
specific, the shared code should be factored out into a common helper
that both drivers call.
Pankaj Gupta Oct. 12, 2017, 9:25 p.m. | #2
> >   This patch adds virtio-pmem driver for KVM guest.
> >   Guest reads the persistent memory range information
> >   over virtio bus from Qemu and reserves the range
> >   as persistent memory. Guest also allocates a block
> >   device corresponding to the pmem range which later
> >   can be accessed with DAX compatible file systems.
> >   Idea is to use the virtio channel between guest and
> >   host to perform the block device flush for guest pmem
> >   DAX device.
> >
> >   There is work to do including DAX file system support
> >   and other advanced features.
> >
> > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > ---
> >  drivers/virtio/Kconfig           |  10 ++
> >  drivers/virtio/Makefile          |   1 +
> >  drivers/virtio/virtio_pmem.c     | 322
> >  +++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/virtio_pmem.h |  55 +++++++
> >  4 files changed, 388 insertions(+)
> >  create mode 100644 drivers/virtio/virtio_pmem.c
> >  create mode 100644 include/uapi/linux/virtio_pmem.h
> >
> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > index cff773f15b7e..0192c4bda54b 100644
> > --- a/drivers/virtio/Kconfig
> > +++ b/drivers/virtio/Kconfig
> > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY
> >
> >           If unsure, say Y.
> >
> > +config VIRTIO_PMEM
> > +       tristate "Virtio pmem driver"
> > +       depends on VIRTIO
> > +       ---help---
> > +        This driver adds persistent memory range within a KVM guest.
> 
> I think we need to call this something other than persistent memory to
> make it clear that this not memory where the persistence can be
> managed from userspace. The persistence point always requires a driver
> call, so this is something distinctly different than "persistent
> memory". For example, it's a bug if this memory range ends up backing
> a device-dax range in the guest where there is no such thing as a
> driver callback to perform the flushing. How does this solution
> protect against that scenario?

yes, you are right we are not providing device_dax in this case so it should
be clear from name. Any suggestion for name?  

> 
> > +         It also associates a block device corresponding to the pmem
> > +        range.
> > +
> > +        If unsure, say M.
> > +
> >  config VIRTIO_BALLOON
> >         tristate "Virtio balloon driver"
> >         depends on VIRTIO
> > diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
> > index 41e30e3dc842..032ade725cc2 100644
> > --- a/drivers/virtio/Makefile
> > +++ b/drivers/virtio/Makefile
> > @@ -5,3 +5,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
> >  virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
> >  obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
> >  obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
> > +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o
> > diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c
> > new file mode 100644
> > index 000000000000..74e47cae0e24
> > --- /dev/null
> > +++ b/drivers/virtio/virtio_pmem.c
> > @@ -0,0 +1,322 @@
> > +/*
> > + * virtio-pmem driver
> > + */
> > +
> > +#include <linux/virtio.h>
> > +#include <linux/swap.h>
> > +#include <linux/workqueue.h>
> > +#include <linux/delay.h>
> > +#include <linux/slab.h>
> > +#include <linux/module.h>
> > +#include <linux/oom.h>
> > +#include <linux/wait.h>
> > +#include <linux/mm.h>
> > +#include <linux/mount.h>
> > +#include <linux/magic.h>
> > +#include <linux/virtio_pmem.h>
> > +
> > +void devm_vpmem_disable(struct device *dev, struct resource *res, void
> > *addr)
> > +{
> > +       devm_memunmap(dev, addr);
> > +       devm_release_mem_region(dev, res->start, resource_size(res));
> > +}
> > +
> > +static void pmem_flush_done(struct virtqueue *vq)
> > +{
> > +       return;
> > +};
> > +
> > +static void virtio_pmem_release_queue(void *q)
> > +{
> > +       blk_cleanup_queue(q);
> > +}
> > +
> > +static void virtio_pmem_freeze_queue(void *q)
> > +{
> > +       blk_freeze_queue_start(q);
> > +}
> > +
> > +static void virtio_pmem_release_disk(void *__pmem)
> > +{
> > +       struct virtio_pmem *pmem = __pmem;
> > +
> > +       del_gendisk(pmem->disk);
> > +       put_disk(pmem->disk);
> > +}
> 
> This code seems identical to the base pmem case, it should move to the
> shared code object.

Sure!
> 
> > +
> > +static int init_vq(struct virtio_pmem *vpmem)
> > +{
> > +       struct virtqueue *vq;
> > +
> > +       /* single vq */
> > +       vq = virtio_find_single_vq(vpmem->vdev, pmem_flush_done,
> > "flush_queue");
> > +
> > +       if (IS_ERR(vq))
> > +               return PTR_ERR(vq);
> > +
> > +       return 0;
> > +}
> > +
> > +static struct vmem_altmap *setup_pmem_pfn(struct virtio_pmem *vpmem,
> > +                       struct resource *res, struct vmem_altmap *altmap)
> > +{
> > +       u32 start_pad = 0, end_trunc = 0;
> > +       resource_size_t start, size;
> > +       unsigned long npfns;
> > +       phys_addr_t offset;
> > +
> > +       size = resource_size(res);
> > +       start = PHYS_SECTION_ALIGN_DOWN(res->start);
> > +
> > +       if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> > +               IORES_DESC_NONE) == REGION_MIXED) {
> > +
> > +               start = res->start;
> > +               start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
> > +       }
> > +       start = res->start;
> > +       size = PHYS_SECTION_ALIGN_UP(start + size) - start;
> > +
> > +       if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> > +               IORES_DESC_NONE) == REGION_MIXED) {
> > +
> > +               size = resource_size(res);
> > +               end_trunc = start + size -
> > +                               PHYS_SECTION_ALIGN_DOWN(start + size);
> > +       }
> > +
> > +       start += start_pad;
> > +       size = resource_size(res);
> > +       npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K)
> > +                                               / PAGE_SIZE);
> > +
> > +      /*
> > +       * vmemmap_populate_hugepages() allocates the memmap array in
> > +       * HPAGE_SIZE chunks.
> > +       */
> > +       offset = ALIGN(start + SZ_8K + 64 * npfns, HPAGE_SIZE) - start;
> > +       vpmem->data_offset = offset;
> > +
> > +       struct vmem_altmap __altmap = {
> > +               .base_pfn = init_altmap_base(start+start_pad),
> > +               .reserve = init_altmap_reserve(start+start_pad),
> > +       };
> > +
> > +       res->start += start_pad;
> > +       res->end -= end_trunc;
> > +       memcpy(altmap, &__altmap, sizeof(*altmap));
> > +       altmap->free = PHYS_PFN(offset - SZ_8K);
> > +       altmap->alloc = 0;
> > +
> > +       return altmap;
> > +}
> > +
> > +static blk_status_t pmem_do_bvec(struct virtio_pmem *pmem, struct page
> > *page,
> > +                       unsigned int len, unsigned int off, bool is_write,
> > +                       sector_t sector)
> > +{
> > +       blk_status_t rc = BLK_STS_OK;
> > +       phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
> > +       void *pmem_addr = pmem->virt_addr + pmem_off;
> > +
> > +       if (!is_write) {
> > +               rc = read_pmem(page, off, pmem_addr, len);
> > +                       flush_dcache_page(page);
> > +       } else {
> > +               flush_dcache_page(page);
> > +               write_pmem(pmem_addr, page, off, len);
> > +       }
> > +
> > +       return rc;
> > +}
> > +
> > +static int vpmem_rw_page(struct block_device *bdev, sector_t sector,
> > +                      struct page *page, bool is_write)
> > +{
> > +       struct virtio_pmem  *pmem = bdev->bd_queue->queuedata;
> > +       blk_status_t rc;
> > +
> > +       rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
> > +                         0, is_write, sector);
> > +
> > +       if (rc == 0)
> > +               page_endio(page, is_write, 0);
> > +
> > +       return blk_status_to_errno(rc);
> > +}
> > +
> > +#ifndef REQ_FLUSH
> > +#define REQ_FLUSH REQ_PREFLUSH
> > +#endif
> > +
> > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q,
> > +                       struct bio *bio)
> > +{
> > +       blk_status_t rc = 0;
> > +       struct bio_vec bvec;
> > +       struct bvec_iter iter;
> > +       struct virtio_pmem *pmem = q->queuedata;
> > +
> > +       if (bio->bi_opf & REQ_FLUSH)
> > +               //todo host flush command
> > +
> > +       bio_for_each_segment(bvec, bio, iter) {
> > +               rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
> > +                               bvec.bv_offset, op_is_write(bio_op(bio)),
> > +                               iter.bi_sector);
> > +               if (rc) {
> > +                       bio->bi_status = rc;
> > +                       break;
> > +               }
> > +       }
> > +
> > +       bio_endio(bio);
> > +       return BLK_QC_T_NONE;
> > +}
> 
> Again, the above could be shared by both drivers.

yes, I will do that.
> 
> > +
> > +static const struct block_device_operations pmem_fops = {
> > +       .owner =                THIS_MODULE,
> > +       .rw_page =              vpmem_rw_page,
> > +       //.revalidate_disk =    nvdimm_revalidate_disk,
> > +};
> > +
> > +static int virtio_pmem_probe(struct virtio_device *vdev)
> > +{
> > +       struct virtio_pmem *vpmem;
> > +       int err = 0;
> > +       void *addr;
> > +       struct resource *res, res_pfn;
> > +       struct request_queue *q;
> > +       struct vmem_altmap __altmap, *altmap = NULL;
> > +       struct gendisk *disk;
> > +       struct device *gendev;
> > +       int nid = dev_to_node(&vdev->dev);
> > +
> > +       if (!vdev->config->get) {
> > +               dev_err(&vdev->dev, "%s failure: config disabled\n",
> > +                       __func__);
> > +               return -EINVAL;
> > +       }
> > +
> > +       vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem),
> > +                       GFP_KERNEL);
> > +
> > +       if (!vpmem) {
> > +               err = -ENOMEM;
> > +               goto out;
> > +       }
> > +
> > +       dev_set_drvdata(&vdev->dev, vpmem);
> > +
> > +       vpmem->vdev = vdev;
> > +       err = init_vq(vpmem);
> > +       if (err)
> > +               goto out;
> > +
> > +       if (!virtio_has_feature(vdev, VIRTIO_PMEM_PLUG)) {
> > +               dev_err(&vdev->dev, "%s : pmem not supported\n",
> > +                       __func__);
> > +               goto out;
> > +       }
> > +
> > +       virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +                       start, &vpmem->start);
> > +       virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +                       size, &vpmem->size);
> > +
> > +       res_pfn.start = vpmem->start;
> > +       res_pfn.end   = vpmem->start + vpmem->size-1;
> > +
> > +       /* used for allocating memmap in the pmem device */
> > +       altmap        = setup_pmem_pfn(vpmem, &res_pfn, &__altmap);
> > +
> > +       res = devm_request_mem_region(&vdev->dev,
> > +                       res_pfn.start, resource_size(&res_pfn),
> > "virtio-pmem");
> > +
> > +       if (!res) {
> > +               dev_warn(&vdev->dev, "could not reserve region ");
> > +               return -EBUSY;
> > +       }
> > +
> > +       q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(&vdev->dev));
> > +
> > +       if (!q)
> > +               return -ENOMEM;
> > +
> > +       if (devm_add_action_or_reset(&vdev->dev,
> > +                               virtio_pmem_release_queue, q))
> > +               return -ENOMEM;
> > +
> > +       vpmem->pfn_flags = PFN_DEV;
> > +
> > +       /* allocate memap in pmem device itself */
> > +       if (IS_ENABLED(CONFIG_ZONE_DEVICE)) {
> > +
> > +               addr = devm_memremap_pages(&vdev->dev, res,
> > +                               &q->q_usage_counter, altmap);
> > +               vpmem->pfn_flags |= PFN_MAP;
> > +       } else
> > +               addr = devm_memremap(&vdev->dev, vpmem->start,
> > +                               vpmem->size, ARCH_MEMREMAP_PMEM);
> > +
> > +        /*
> > +         * At release time the queue must be frozen before
> > +         * devm_memremap_pages is unwound
> > +         */
> > +       if (devm_add_action_or_reset(&vdev->dev,
> > +                               virtio_pmem_freeze_queue, q))
> > +               return -ENOMEM;
> > +
> > +       if (IS_ERR(addr))
> > +               return PTR_ERR(addr);
> > +
> > +       vpmem->virt_addr = addr;
> > +       blk_queue_write_cache(q, 0, 0);
> > +       blk_queue_make_request(q, virtio_pmem_make_request);
> > +       blk_queue_physical_block_size(q, PAGE_SIZE);
> > +       blk_queue_logical_block_size(q, 512);
> > +       blk_queue_max_hw_sectors(q, UINT_MAX);
> > +       queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
> > +       queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
> > +       q->queuedata = vpmem;
> > +
> > +       disk = alloc_disk_node(0, nid);
> > +       if (!disk)
> > +               return -ENOMEM;
> > +       vpmem->disk = disk;
> > +
> > +       disk->fops                = &pmem_fops;
> > +       disk->queue               = q;
> > +       disk->flags               = GENHD_FL_EXT_DEVT;
> > +       strcpy(disk->disk_name, "vpmem");
> > +       set_capacity(disk, vpmem->size/512);
> > +       gendev = disk_to_dev(disk);
> > +
> > +       virtio_device_ready(vdev);
> > +       device_add_disk(&vdev->dev, disk);
> > +
> > +       if (devm_add_action_or_reset(&vdev->dev,
> > +                       virtio_pmem_release_disk, vpmem))
> > +               return -ENOMEM;
> > +
> > +       revalidate_disk(disk);
> > +       return 0;
> > +out:
> > +       vdev->config->del_vqs(vdev);
> > +       return err;
> > +}
> 
> Here we have a mix of code that is common and some that is virtio
> specific, the shared code should be factored out into a common helper
> that both drivers call.

yes, i will factor this as well.
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>
Dan Williams Oct. 12, 2017, 9:54 p.m. | #3
On Thu, Oct 12, 2017 at 2:25 PM, Pankaj Gupta <pagupta@redhat.com> wrote:
>
>> >   This patch adds virtio-pmem driver for KVM guest.
>> >   Guest reads the persistent memory range information
>> >   over virtio bus from Qemu and reserves the range
>> >   as persistent memory. Guest also allocates a block
>> >   device corresponding to the pmem range which later
>> >   can be accessed with DAX compatible file systems.
>> >   Idea is to use the virtio channel between guest and
>> >   host to perform the block device flush for guest pmem
>> >   DAX device.
>> >
>> >   There is work to do including DAX file system support
>> >   and other advanced features.
>> >
>> > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
>> > ---
>> >  drivers/virtio/Kconfig           |  10 ++
>> >  drivers/virtio/Makefile          |   1 +
>> >  drivers/virtio/virtio_pmem.c     | 322
>> >  +++++++++++++++++++++++++++++++++++++++
>> >  include/uapi/linux/virtio_pmem.h |  55 +++++++
>> >  4 files changed, 388 insertions(+)
>> >  create mode 100644 drivers/virtio/virtio_pmem.c
>> >  create mode 100644 include/uapi/linux/virtio_pmem.h
>> >
>> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
>> > index cff773f15b7e..0192c4bda54b 100644
>> > --- a/drivers/virtio/Kconfig
>> > +++ b/drivers/virtio/Kconfig
>> > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY
>> >
>> >           If unsure, say Y.
>> >
>> > +config VIRTIO_PMEM
>> > +       tristate "Virtio pmem driver"
>> > +       depends on VIRTIO
>> > +       ---help---
>> > +        This driver adds persistent memory range within a KVM guest.
>>
>> I think we need to call this something other than persistent memory to
>> make it clear that this not memory where the persistence can be
>> managed from userspace. The persistence point always requires a driver
>> call, so this is something distinctly different than "persistent
>> memory". For example, it's a bug if this memory range ends up backing
>> a device-dax range in the guest where there is no such thing as a
>> driver callback to perform the flushing. How does this solution
>> protect against that scenario?
>
> yes, you are right we are not providing device_dax in this case so it should
> be clear from name. Any suggestion for name?

So currently /proc/iomem in a guest with a pmem device attached to a
namespace looks like this:

    c00000000-13bfffffff : Persistent Memory
       c00000000-13bfffffff : namespace2.0

Can we call it "Virtio Shared Memory" to make it clear it is a
different beast than typical "Persistent Memory"?  You can likely
inject your own name into the resource tree the same way we do in the
NFIT driver. See acpi_nfit_insert_resource().
Pankaj Gupta Oct. 12, 2017, 10:18 p.m. | #4
> 
> On Thu, Oct 12, 2017 at 2:25 PM, Pankaj Gupta <pagupta@redhat.com> wrote:
> >
> >> >   This patch adds virtio-pmem driver for KVM guest.
> >> >   Guest reads the persistent memory range information
> >> >   over virtio bus from Qemu and reserves the range
> >> >   as persistent memory. Guest also allocates a block
> >> >   device corresponding to the pmem range which later
> >> >   can be accessed with DAX compatible file systems.
> >> >   Idea is to use the virtio channel between guest and
> >> >   host to perform the block device flush for guest pmem
> >> >   DAX device.
> >> >
> >> >   There is work to do including DAX file system support
> >> >   and other advanced features.
> >> >
> >> > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> >> > ---
> >> >  drivers/virtio/Kconfig           |  10 ++
> >> >  drivers/virtio/Makefile          |   1 +
> >> >  drivers/virtio/virtio_pmem.c     | 322
> >> >  +++++++++++++++++++++++++++++++++++++++
> >> >  include/uapi/linux/virtio_pmem.h |  55 +++++++
> >> >  4 files changed, 388 insertions(+)
> >> >  create mode 100644 drivers/virtio/virtio_pmem.c
> >> >  create mode 100644 include/uapi/linux/virtio_pmem.h
> >> >
> >> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> >> > index cff773f15b7e..0192c4bda54b 100644
> >> > --- a/drivers/virtio/Kconfig
> >> > +++ b/drivers/virtio/Kconfig
> >> > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY
> >> >
> >> >           If unsure, say Y.
> >> >
> >> > +config VIRTIO_PMEM
> >> > +       tristate "Virtio pmem driver"
> >> > +       depends on VIRTIO
> >> > +       ---help---
> >> > +        This driver adds persistent memory range within a KVM guest.
> >>
> >> I think we need to call this something other than persistent memory to
> >> make it clear that this not memory where the persistence can be
> >> managed from userspace. The persistence point always requires a driver
> >> call, so this is something distinctly different than "persistent
> >> memory". For example, it's a bug if this memory range ends up backing
> >> a device-dax range in the guest where there is no such thing as a
> >> driver callback to perform the flushing. How does this solution
> >> protect against that scenario?
> >
> > yes, you are right we are not providing device_dax in this case so it
> > should
> > be clear from name. Any suggestion for name?
> 
> So currently /proc/iomem in a guest with a pmem device attached to a
> namespace looks like this:
> 
>     c00000000-13bfffffff : Persistent Memory
>        c00000000-13bfffffff : namespace2.0
> 
> Can we call it "Virtio Shared Memory" to make it clear it is a
> different beast than typical "Persistent Memory"?  You can likely

I think somewhere we need persistent keyword 'Virtio Persistent Memory' or 
so.

> inject your own name into the resource tree the same way we do in the
> NFIT driver. See acpi_nfit_insert_resource().

Sure! thank you.
Rik van Riel Oct. 12, 2017, 10:27 p.m. | #5
On Thu, 2017-10-12 at 18:18 -0400, Pankaj Gupta wrote:
> > 
> > On Thu, Oct 12, 2017 at 2:25 PM, Pankaj Gupta <pagupta@redhat.com>
> > wrote:
> > > 
> > > > >   This patch adds virtio-pmem driver for KVM guest.
> > > > >   Guest reads the persistent memory range information
> > > > >   over virtio bus from Qemu and reserves the range
> > > > >   as persistent memory. Guest also allocates a block
> > > > >   device corresponding to the pmem range which later
> > > > >   can be accessed with DAX compatible file systems.
> > > > >   Idea is to use the virtio channel between guest and
> > > > >   host to perform the block device flush for guest pmem
> > > > >   DAX device.
> > > > > 
> > > > >   There is work to do including DAX file system support
> > > > >   and other advanced features.
> > > > > 
> > > > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > > > > ---
> > > > >  drivers/virtio/Kconfig           |  10 ++
> > > > >  drivers/virtio/Makefile          |   1 +
> > > > >  drivers/virtio/virtio_pmem.c     | 322
> > > > >  +++++++++++++++++++++++++++++++++++++++
> > > > >  include/uapi/linux/virtio_pmem.h |  55 +++++++
> > > > >  4 files changed, 388 insertions(+)
> > > > >  create mode 100644 drivers/virtio/virtio_pmem.c
> > > > >  create mode 100644 include/uapi/linux/virtio_pmem.h
> > > > > 
> > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > index cff773f15b7e..0192c4bda54b 100644
> > > > > --- a/drivers/virtio/Kconfig
> > > > > +++ b/drivers/virtio/Kconfig
> > > > > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY
> > > > > 
> > > > >           If unsure, say Y.
> > > > > 
> > > > > +config VIRTIO_PMEM
> > > > > +       tristate "Virtio pmem driver"
> > > > > +       depends on VIRTIO
> > > > > +       ---help---
> > > > > +        This driver adds persistent memory range within a
> > > > > KVM guest.

With "Virtio Block Backed Pmem" we could name the config
option VIRTIO_BLOCK_PMEM

The documentation text could make it clear to people that the
image shows up as a disk image on the host, but as a pmem
memory range in the guest.

> > > > I think we need to call this something other than persistent
> > > > memory to
> > > > make it clear that this not memory where the persistence can be
> > > > managed from userspace. The persistence point always requires
> > > > 
> > So currently /proc/iomem in a guest with a pmem device attached to
> > a
> > namespace looks like this:
> > 
> >     c00000000-13bfffffff : Persistent Memory
> >        c00000000-13bfffffff : namespace2.0
> > 
> > Can we call it "Virtio Shared Memory" to make it clear it is a
> > different beast than typical "Persistent Memory"?  You can likely
> 
> I think somewhere we need persistent keyword 'Virtio Persistent
> Memory' or 
> so.

Still hoping for better ideas than "Virtio Block Backed Pmem" :)
Pankaj Gupta Oct. 12, 2017, 10:39 p.m. | #6
> > > > 
> > > > > >   This patch adds virtio-pmem driver for KVM guest.
> > > > > >   Guest reads the persistent memory range information
> > > > > >   over virtio bus from Qemu and reserves the range
> > > > > >   as persistent memory. Guest also allocates a block
> > > > > >   device corresponding to the pmem range which later
> > > > > >   can be accessed with DAX compatible file systems.
> > > > > >   Idea is to use the virtio channel between guest and
> > > > > >   host to perform the block device flush for guest pmem
> > > > > >   DAX device.
> > > > > > 
> > > > > >   There is work to do including DAX file system support
> > > > > >   and other advanced features.
> > > > > > 
> > > > > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > > > > > ---
> > > > > >  drivers/virtio/Kconfig           |  10 ++
> > > > > >  drivers/virtio/Makefile          |   1 +
> > > > > >  drivers/virtio/virtio_pmem.c     | 322
> > > > > >  +++++++++++++++++++++++++++++++++++++++
> > > > > >  include/uapi/linux/virtio_pmem.h |  55 +++++++
> > > > > >  4 files changed, 388 insertions(+)
> > > > > >  create mode 100644 drivers/virtio/virtio_pmem.c
> > > > > >  create mode 100644 include/uapi/linux/virtio_pmem.h
> > > > > > 
> > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > > index cff773f15b7e..0192c4bda54b 100644
> > > > > > --- a/drivers/virtio/Kconfig
> > > > > > +++ b/drivers/virtio/Kconfig
> > > > > > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY
> > > > > > 
> > > > > >           If unsure, say Y.
> > > > > > 
> > > > > > +config VIRTIO_PMEM
> > > > > > +       tristate "Virtio pmem driver"
> > > > > > +       depends on VIRTIO
> > > > > > +       ---help---
> > > > > > +        This driver adds persistent memory range within a
> > > > > > KVM guest.
> 
> With "Virtio Block Backed Pmem" we could name the config
> option VIRTIO_BLOCK_PMEM
> 
> The documentation text could make it clear to people that the
> image shows up as a disk image on the host, but as a pmem
> memory range in the guest.

yes, this looks better. 
thank you.

> 
> > > > > I think we need to call this something other than persistent
> > > > > memory to
> > > > > make it clear that this not memory where the persistence can be
> > > > > managed from userspace. The persistence point always requires
> > > > > 
> > > So currently /proc/iomem in a guest with a pmem device attached to
> > > a
> > > namespace looks like this:
> > > 
> > >     c00000000-13bfffffff : Persistent Memory
> > >        c00000000-13bfffffff : namespace2.0
> > > 
> > > Can we call it "Virtio Shared Memory" to make it clear it is a
> > > different beast than typical "Persistent Memory"?  You can likely
> > 
> > I think somewhere we need persistent keyword 'Virtio Persistent
> > Memory' or
> > so.
> 
> Still hoping for better ideas than "Virtio Block Backed Pmem" :)

:-)
>
Pankaj Gupta Oct. 12, 2017, 10:52 p.m. | #7
> > > wrote:
> > > > 
> > > > > >   This patch adds virtio-pmem driver for KVM guest.
> > > > > >   Guest reads the persistent memory range information
> > > > > >   over virtio bus from Qemu and reserves the range
> > > > > >   as persistent memory. Guest also allocates a block
> > > > > >   device corresponding to the pmem range which later
> > > > > >   can be accessed with DAX compatible file systems.
> > > > > >   Idea is to use the virtio channel between guest and
> > > > > >   host to perform the block device flush for guest pmem
> > > > > >   DAX device.
> > > > > > 
> > > > > >   There is work to do including DAX file system support
> > > > > >   and other advanced features.
> > > > > > 
> > > > > > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > > > > > ---
> > > > > >  drivers/virtio/Kconfig           |  10 ++
> > > > > >  drivers/virtio/Makefile          |   1 +
> > > > > >  drivers/virtio/virtio_pmem.c     | 322
> > > > > >  +++++++++++++++++++++++++++++++++++++++
> > > > > >  include/uapi/linux/virtio_pmem.h |  55 +++++++
> > > > > >  4 files changed, 388 insertions(+)
> > > > > >  create mode 100644 drivers/virtio/virtio_pmem.c
> > > > > >  create mode 100644 include/uapi/linux/virtio_pmem.h
> > > > > > 
> > > > > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > > > > > index cff773f15b7e..0192c4bda54b 100644
> > > > > > --- a/drivers/virtio/Kconfig
> > > > > > +++ b/drivers/virtio/Kconfig
> > > > > > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY
> > > > > > 
> > > > > >           If unsure, say Y.
> > > > > > 
> > > > > > +config VIRTIO_PMEM
> > > > > > +       tristate "Virtio pmem driver"
> > > > > > +       depends on VIRTIO
> > > > > > +       ---help---
> > > > > > +        This driver adds persistent memory range within a
> > > > > > KVM guest.
> 
> With "Virtio Block Backed Pmem" we could name the config
> option VIRTIO_BLOCK_PMEM
> 
> The documentation text could make it clear to people that the
> image shows up as a disk image on the host, but as a pmem
> memory range in the guest.
> 
> > > > > I think we need to call this something other than persistent
> > > > > memory to
> > > > > make it clear that this not memory where the persistence can be
> > > > > managed from userspace. The persistence point always requires
> > > > > 
> > > So currently /proc/iomem in a guest with a pmem device attached to
> > > a
> > > namespace looks like this:
> > > 
> > >     c00000000-13bfffffff : Persistent Memory
> > >        c00000000-13bfffffff : namespace2.0
> > > 
> > > Can we call it "Virtio Shared Memory" to make it clear it is a
> > > different beast than typical "Persistent Memory"?  You can likely
> > 
> > I think somewhere we need persistent keyword 'Virtio Persistent
> > Memory' or
> > so.
> 
> Still hoping for better ideas than "Virtio Block Backed Pmem" :)
> 

Dan,

I have a query regarding below patch [*]. My assumption is its halted
because of memory hotplug restructuring work? Anything I am missing 
here?

[*] https://www.mail-archive.com/linux-nvdimm@lists.01.org/msg02978.html
Dan Williams Oct. 12, 2017, 10:59 p.m. | #8
On Thu, Oct 12, 2017 at 3:52 PM, Pankaj Gupta <pagupta@redhat.com> wrote:
> Dan,
>
> I have a query regarding below patch [*]. My assumption is its halted
> because of memory hotplug restructuring work? Anything I am missing
> here?
>
> [*] https://www.mail-archive.com/linux-nvdimm@lists.01.org/msg02978.html

It's fallen to the back of my queue since the original driving need of
platform firmware changing offsets from boot-to-boot is no longer an
issue. However, it does mean that you need to arrange for 128MB
aligned devm_memremap_pages() ranges for the foreseeable future.
Pankaj Gupta Oct. 12, 2017, 11:07 p.m. | #9
> > Dan,
> >
> > I have a query regarding below patch [*]. My assumption is its halted
> > because of memory hotplug restructuring work? Anything I am missing
> > here?
> >
> > [*] https://www.mail-archive.com/linux-nvdimm@lists.01.org/msg02978.html
> 
> It's fallen to the back of my queue since the original driving need of
> platform firmware changing offsets from boot-to-boot is no longer an
> issue. However, it does mean that you need to arrange for 128MB
> aligned devm_memremap_pages() ranges for the foreseeable future.

o.k I will provide 128MB aligned pages to devm_memremap_pages() function.

Thanks,
Pankaj
Stefan Hajnoczi Oct. 13, 2017, 9:44 a.m. | #10
On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote:
>   This patch adds virtio-pmem driver for KVM guest.
>   Guest reads the persistent memory range information
>   over virtio bus from Qemu and reserves the range
>   as persistent memory. Guest also allocates a block
>   device corresponding to the pmem range which later
>   can be accessed with DAX compatible file systems.
>   Idea is to use the virtio channel between guest and
>   host to perform the block device flush for guest pmem 
>   DAX device.
> 
>   There is work to do including DAX file system support 
>   and other advanced features.
> 
> Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> ---
>  drivers/virtio/Kconfig           |  10 ++
>  drivers/virtio/Makefile          |   1 +
>  drivers/virtio/virtio_pmem.c     | 322 +++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/virtio_pmem.h |  55 +++++++
>  4 files changed, 388 insertions(+)
>  create mode 100644 drivers/virtio/virtio_pmem.c
>  create mode 100644 include/uapi/linux/virtio_pmem.h
> 
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index cff773f15b7e..0192c4bda54b 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY
>  
>  	  If unsure, say Y.
>  
> +config VIRTIO_PMEM
> +	tristate "Virtio pmem driver"
> +	depends on VIRTIO
> +	---help---
> +	 This driver adds persistent memory range within a KVM guest.
> +         It also associates a block device corresponding to the pmem
> +	 range.
> +
> +	 If unsure, say M.
> +
>  config VIRTIO_BALLOON
>  	tristate "Virtio balloon driver"
>  	depends on VIRTIO
> diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
> index 41e30e3dc842..032ade725cc2 100644
> --- a/drivers/virtio/Makefile
> +++ b/drivers/virtio/Makefile
> @@ -5,3 +5,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
>  virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
>  obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
>  obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
> +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o
> diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c
> new file mode 100644
> index 000000000000..74e47cae0e24
> --- /dev/null
> +++ b/drivers/virtio/virtio_pmem.c
> @@ -0,0 +1,322 @@
> +/*
> + * virtio-pmem driver
> + */
> +
> +#include <linux/virtio.h>
> +#include <linux/swap.h>
> +#include <linux/workqueue.h>
> +#include <linux/delay.h>
> +#include <linux/slab.h>
> +#include <linux/module.h>
> +#include <linux/oom.h>
> +#include <linux/wait.h>
> +#include <linux/mm.h>
> +#include <linux/mount.h>
> +#include <linux/magic.h>
> +#include <linux/virtio_pmem.h>
> +
> +void devm_vpmem_disable(struct device *dev, struct resource *res, void *addr)
> +{
> +	devm_memunmap(dev, addr);
> +	devm_release_mem_region(dev, res->start, resource_size(res));
> +}
> +
> +static void pmem_flush_done(struct virtqueue *vq)
> +{
> +	return;
> +};
> +
> +static void virtio_pmem_release_queue(void *q)
> +{
> +	blk_cleanup_queue(q);
> +}
> +
> +static void virtio_pmem_freeze_queue(void *q)
> +{
> +	blk_freeze_queue_start(q);
> +}
> +
> +static void virtio_pmem_release_disk(void *__pmem)
> +{
> +	struct virtio_pmem *pmem = __pmem;
> +
> +	del_gendisk(pmem->disk);
> +	put_disk(pmem->disk);
> +}
> +
> +static int init_vq(struct virtio_pmem *vpmem)
> +{
> +	struct virtqueue *vq;
> +
> +	/* single vq */
> +	vq = virtio_find_single_vq(vpmem->vdev, pmem_flush_done, "flush_queue");
> +
> +	if (IS_ERR(vq))
> +		return PTR_ERR(vq);
> +
> +	return 0;
> +}
> +
> +static struct vmem_altmap *setup_pmem_pfn(struct virtio_pmem *vpmem,
> +			struct resource *res, struct vmem_altmap *altmap)
> +{
> +	u32 start_pad = 0, end_trunc = 0;
> +	resource_size_t start, size;
> +	unsigned long npfns;
> +	phys_addr_t offset;
> +
> +	size = resource_size(res);
> +	start = PHYS_SECTION_ALIGN_DOWN(res->start);
> +
> +	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> +		IORES_DESC_NONE) == REGION_MIXED) {
> +
> +		start = res->start;
> +		start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
> +	}
> +	start = res->start;
> +	size = PHYS_SECTION_ALIGN_UP(start + size) - start;
> +
> +	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> +		IORES_DESC_NONE) == REGION_MIXED) {
> +
> +		size = resource_size(res);
> +		end_trunc = start + size -
> +				PHYS_SECTION_ALIGN_DOWN(start + size);
> +	}
> +
> +	start += start_pad;
> +	size = resource_size(res);
> +	npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K)
> +						/ PAGE_SIZE);
> +
> +      /*
> +       * vmemmap_populate_hugepages() allocates the memmap array in
> +       * HPAGE_SIZE chunks.
> +       */
> +	offset = ALIGN(start + SZ_8K + 64 * npfns, HPAGE_SIZE) - start;
> +	vpmem->data_offset = offset;
> +
> +	struct vmem_altmap __altmap = {
> +		.base_pfn = init_altmap_base(start+start_pad),
> +		.reserve = init_altmap_reserve(start+start_pad),
> +	};
> +
> +	res->start += start_pad;
> +	res->end -= end_trunc;
> +	memcpy(altmap, &__altmap, sizeof(*altmap));
> +	altmap->free = PHYS_PFN(offset - SZ_8K);
> +	altmap->alloc = 0;

The __altmap part can be rewritten using compound literal syntax:

  *altmap = (struct vmem_altmap) {
      .base_pfn = init_altmap_base(start+start_pad),
      .reserve = init_altmap_reserve(start+start_pad),
      .free = PHYS_PFN(offset - SZ_8K),
  };

This eliminates the temporary variable, memcpy, and explicit alloc = 0
initialization.

> +
> +	return altmap;
> +}
> +
> +static blk_status_t pmem_do_bvec(struct virtio_pmem *pmem, struct page *page,
> +			unsigned int len, unsigned int off, bool is_write,
> +			sector_t sector)
> +{
> +	blk_status_t rc = BLK_STS_OK;
> +	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
> +	void *pmem_addr = pmem->virt_addr + pmem_off;
> +
> +	if (!is_write) {
> +		rc = read_pmem(page, off, pmem_addr, len);
> +			flush_dcache_page(page);
> +	} else {
> +		flush_dcache_page(page);

What are the semantics of this device?  Why flush the dcache here if an
explicit flush command needs to be sent over the virtqueue?

> +		write_pmem(pmem_addr, page, off, len);
> +	}
> +
> +	return rc;
> +}
> +
> +static int vpmem_rw_page(struct block_device *bdev, sector_t sector,
> +		       struct page *page, bool is_write)
> +{
> +	struct virtio_pmem  *pmem = bdev->bd_queue->queuedata;
> +	blk_status_t rc;
> +
> +	rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
> +			  0, is_write, sector);
> +
> +	if (rc == 0)
> +		page_endio(page, is_write, 0);
> +
> +	return blk_status_to_errno(rc);
> +}
> +
> +#ifndef REQ_FLUSH
> +#define REQ_FLUSH REQ_PREFLUSH
> +#endif

Is this out-of-tree kernel module compatibility stuff that can be
removed?

> +
> +static blk_qc_t virtio_pmem_make_request(struct request_queue *q,
> +			struct bio *bio)
> +{
> +	blk_status_t rc = 0;
> +	struct bio_vec bvec;
> +	struct bvec_iter iter;
> +	struct virtio_pmem *pmem = q->queuedata;
> +
> +	if (bio->bi_opf & REQ_FLUSH)
> +		//todo host flush command

This detail is critical to the device design.  What is the plan?

> +
> +	bio_for_each_segment(bvec, bio, iter) {
> +		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
> +				bvec.bv_offset, op_is_write(bio_op(bio)),
> +				iter.bi_sector);
> +		if (rc) {
> +			bio->bi_status = rc;
> +			break;
> +		}
> +	}
> +
> +	bio_endio(bio);
> +	return BLK_QC_T_NONE;
> +}
> +
> +static const struct block_device_operations pmem_fops = {
> +	.owner =		THIS_MODULE,
> +	.rw_page =		vpmem_rw_page,
> +	//.revalidate_disk =	nvdimm_revalidate_disk,
> +};
> +
> +static int virtio_pmem_probe(struct virtio_device *vdev)
> +{
> +	struct virtio_pmem *vpmem;
> +	int err = 0;
> +	void *addr;
> +	struct resource *res, res_pfn;
> +	struct request_queue *q;
> +	struct vmem_altmap __altmap, *altmap = NULL;
> +	struct gendisk *disk;
> +	struct device *gendev;
> +	int nid = dev_to_node(&vdev->dev);
> +
> +	if (!vdev->config->get) {
> +		dev_err(&vdev->dev, "%s failure: config disabled\n",
> +			__func__);
> +		return -EINVAL;
> +	}
> +
> +	vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem),
> +			GFP_KERNEL);
> +
> +	if (!vpmem) {
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +
> +	dev_set_drvdata(&vdev->dev, vpmem);
> +
> +	vpmem->vdev = vdev;
> +	err = init_vq(vpmem);
> +	if (err)
> +		goto out;
> +
> +	if (!virtio_has_feature(vdev, VIRTIO_PMEM_PLUG)) {
> +		dev_err(&vdev->dev, "%s : pmem not supported\n",
> +			__func__);
> +		goto out;
> +	}

Why is VIRTIO_PMEM_PLUG optional for this new device type if it's
already required by the first version of the driver?

err is not set when the error path is taken.

> +
> +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> +			start, &vpmem->start);
> +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> +			size, &vpmem->size);

The struct resource start and size fields can vary between
architectures.  Virtio device configuration space layout must not vary
between architectures.  Please use u64.

> +
> +	res_pfn.start = vpmem->start;
> +	res_pfn.end   = vpmem->start + vpmem->size-1;
> +
> +	/* used for allocating memmap in the pmem device */
> +	altmap	      = setup_pmem_pfn(vpmem, &res_pfn, &__altmap);
> +
> +	res = devm_request_mem_region(&vdev->dev,
> +			res_pfn.start, resource_size(&res_pfn), "virtio-pmem");
> +
> +	if (!res) {
> +		dev_warn(&vdev->dev, "could not reserve region ");
> +		return -EBUSY;
> +	}
> +
> +	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(&vdev->dev));
> +
> +	if (!q)
> +		return -ENOMEM;
> +
> +	if (devm_add_action_or_reset(&vdev->dev,
> +				virtio_pmem_release_queue, q))
> +		return -ENOMEM;
> +
> +	vpmem->pfn_flags = PFN_DEV;
> +
> +	/* allocate memap in pmem device itself */
> +	if (IS_ENABLED(CONFIG_ZONE_DEVICE)) {
> +
> +		addr = devm_memremap_pages(&vdev->dev, res,
> +				&q->q_usage_counter, altmap);
> +		vpmem->pfn_flags |= PFN_MAP;
> +	} else
> +		addr = devm_memremap(&vdev->dev, vpmem->start,
> +				vpmem->size, ARCH_MEMREMAP_PMEM);
> +
> +        /*
> +         * At release time the queue must be frozen before
> +         * devm_memremap_pages is unwound
> +         */
> +	if (devm_add_action_or_reset(&vdev->dev,
> +				virtio_pmem_freeze_queue, q))
> +		return -ENOMEM;
> +
> +	if (IS_ERR(addr))
> +		return PTR_ERR(addr);
> +
> +	vpmem->virt_addr = addr;
> +	blk_queue_write_cache(q, 0, 0);
> +	blk_queue_make_request(q, virtio_pmem_make_request);
> +	blk_queue_physical_block_size(q, PAGE_SIZE);
> +	blk_queue_logical_block_size(q, 512);
> +	blk_queue_max_hw_sectors(q, UINT_MAX);
> +	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
> +	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
> +	q->queuedata = vpmem;
> +
> +	disk = alloc_disk_node(0, nid);
> +	if (!disk)
> +		return -ENOMEM;

The return statements in this function look suspicious.  There probably
needs to be a cleanup to avoid memory leaks.

> +	vpmem->disk = disk;
> +
> +	disk->fops                = &pmem_fops;
> +	disk->queue               = q;
> +	disk->flags               = GENHD_FL_EXT_DEVT;
> +	strcpy(disk->disk_name, "vpmem");
> +	set_capacity(disk, vpmem->size/512);
> +	gendev = disk_to_dev(disk);
> +
> +	virtio_device_ready(vdev);
> +	device_add_disk(&vdev->dev, disk);
> +
> +	if (devm_add_action_or_reset(&vdev->dev,
> +			virtio_pmem_release_disk, vpmem))
> +		return -ENOMEM;
> +
> +	revalidate_disk(disk);
> +	return 0;
> +out:
> +	vdev->config->del_vqs(vdev);
> +	return err;
> +}
> +
> +static struct virtio_driver virtio_pmem_driver = {
> +	.feature_table		= features,
> +	.feature_table_size	= ARRAY_SIZE(features),
> +	.driver.name		= KBUILD_MODNAME,
> +	.driver.owner		= THIS_MODULE,
> +	.id_table		= id_table,
> +	.probe			= virtio_pmem_probe,
> +	//.remove		= virtio_pmem_remove,
> +};
> +
> +module_virtio_driver(virtio_pmem_driver);
> +MODULE_DEVICE_TABLE(virtio, id_table);
> +MODULE_DESCRIPTION("Virtio pmem driver");
> +MODULE_LICENSE("GPL");
> diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h
> new file mode 100644
> index 000000000000..ec0c728c79ba
> --- /dev/null
> +++ b/include/uapi/linux/virtio_pmem.h
> @@ -0,0 +1,55 @@
> +/*
> + * Virtio pmem Device
> + *
> + *
> + */
> +
> +#ifndef _LINUX_VIRTIO_PMEM_H
> +#define _LINUX_VIRTIO_PMEM_H
> +
> +#include <linux/types.h>
> +#include <linux/virtio_types.h>
> +#include <linux/virtio_ids.h>
> +#include <linux/virtio_config.h>
> +#include <linux/pfn_t.h>
> +#include <linux/fs.h>
> +#include <linux/blk-mq.h>
> +#include <linux/pmem_common.h>

include/uapi/ headers must compile when included from userspace
applications.  The header should define userspace APIs only.

If you want to define kernel-internal APIs, please add them to
include/linux/ or similar.

This also means that include/uapi/ headers cannot include
include/linux/pmem_common.h or other kernel headers outside #ifdef
__KERNEL__.

This header file should only contain struct virtio_pmem_config,
VIRTIO_PMEM_PLUG, and other virtio device definitions.

> +
> +bool pmem_should_map_pages(struct device *dev);
> +
> +#define VIRTIO_PMEM_PLUG 0
> +
> +struct virtio_pmem_config {
> +
> +uint64_t start;
> +uint64_t size;
> +uint64_t align;
> +uint64_t data_offset;
> +};
> +
> +struct virtio_pmem {
> +
> +	struct virtio_device *vdev;
> +	struct virtqueue *req_vq;
> +
> +	uint64_t start;
> +	uint64_t size;
> +	uint64_t align;
> +	uint64_t data_offset;
> +	u64 pfn_flags;
> +	void *virt_addr;
> +	struct gendisk *disk;
> +} __packed;
> +
> +static struct virtio_device_id id_table[] = {
> +	{ VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID },
> +	{ 0 },
> +};
> +
> +static unsigned int features[] = {
> +	VIRTIO_PMEM_PLUG,
> +};
> +
> +#endif
> +
> -- 
> 2.13.0
>
Pankaj Gupta Oct. 13, 2017, 10:48 a.m. | #11
> 
> On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote:
> >   This patch adds virtio-pmem driver for KVM guest.
> >   Guest reads the persistent memory range information
> >   over virtio bus from Qemu and reserves the range
> >   as persistent memory. Guest also allocates a block
> >   device corresponding to the pmem range which later
> >   can be accessed with DAX compatible file systems.
> >   Idea is to use the virtio channel between guest and
> >   host to perform the block device flush for guest pmem
> >   DAX device.
> > 
> >   There is work to do including DAX file system support
> >   and other advanced features.
> > 
> > Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
> > ---
> >  drivers/virtio/Kconfig           |  10 ++
> >  drivers/virtio/Makefile          |   1 +
> >  drivers/virtio/virtio_pmem.c     | 322
> >  +++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/virtio_pmem.h |  55 +++++++
> >  4 files changed, 388 insertions(+)
> >  create mode 100644 drivers/virtio/virtio_pmem.c
> >  create mode 100644 include/uapi/linux/virtio_pmem.h
> > 
> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > index cff773f15b7e..0192c4bda54b 100644
> > --- a/drivers/virtio/Kconfig
> > +++ b/drivers/virtio/Kconfig
> > @@ -38,6 +38,16 @@ config VIRTIO_PCI_LEGACY
> >  
> >  	  If unsure, say Y.
> >  
> > +config VIRTIO_PMEM
> > +	tristate "Virtio pmem driver"
> > +	depends on VIRTIO
> > +	---help---
> > +	 This driver adds persistent memory range within a KVM guest.
> > +         It also associates a block device corresponding to the pmem
> > +	 range.
> > +
> > +	 If unsure, say M.
> > +
> >  config VIRTIO_BALLOON
> >  	tristate "Virtio balloon driver"
> >  	depends on VIRTIO
> > diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
> > index 41e30e3dc842..032ade725cc2 100644
> > --- a/drivers/virtio/Makefile
> > +++ b/drivers/virtio/Makefile
> > @@ -5,3 +5,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
> >  virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
> >  obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
> >  obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
> > +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o
> > diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c
> > new file mode 100644
> > index 000000000000..74e47cae0e24
> > --- /dev/null
> > +++ b/drivers/virtio/virtio_pmem.c
> > @@ -0,0 +1,322 @@
> > +/*
> > + * virtio-pmem driver
> > + */
> > +
> > +#include <linux/virtio.h>
> > +#include <linux/swap.h>
> > +#include <linux/workqueue.h>
> > +#include <linux/delay.h>
> > +#include <linux/slab.h>
> > +#include <linux/module.h>
> > +#include <linux/oom.h>
> > +#include <linux/wait.h>
> > +#include <linux/mm.h>
> > +#include <linux/mount.h>
> > +#include <linux/magic.h>
> > +#include <linux/virtio_pmem.h>
> > +
> > +void devm_vpmem_disable(struct device *dev, struct resource *res, void
> > *addr)
> > +{
> > +	devm_memunmap(dev, addr);
> > +	devm_release_mem_region(dev, res->start, resource_size(res));
> > +}
> > +
> > +static void pmem_flush_done(struct virtqueue *vq)
> > +{
> > +	return;
> > +};
> > +
> > +static void virtio_pmem_release_queue(void *q)
> > +{
> > +	blk_cleanup_queue(q);
> > +}
> > +
> > +static void virtio_pmem_freeze_queue(void *q)
> > +{
> > +	blk_freeze_queue_start(q);
> > +}
> > +
> > +static void virtio_pmem_release_disk(void *__pmem)
> > +{
> > +	struct virtio_pmem *pmem = __pmem;
> > +
> > +	del_gendisk(pmem->disk);
> > +	put_disk(pmem->disk);
> > +}
> > +
> > +static int init_vq(struct virtio_pmem *vpmem)
> > +{
> > +	struct virtqueue *vq;
> > +
> > +	/* single vq */
> > +	vq = virtio_find_single_vq(vpmem->vdev, pmem_flush_done, "flush_queue");
> > +
> > +	if (IS_ERR(vq))
> > +		return PTR_ERR(vq);
> > +
> > +	return 0;
> > +}
> > +
> > +static struct vmem_altmap *setup_pmem_pfn(struct virtio_pmem *vpmem,
> > +			struct resource *res, struct vmem_altmap *altmap)
> > +{
> > +	u32 start_pad = 0, end_trunc = 0;
> > +	resource_size_t start, size;
> > +	unsigned long npfns;
> > +	phys_addr_t offset;
> > +
> > +	size = resource_size(res);
> > +	start = PHYS_SECTION_ALIGN_DOWN(res->start);
> > +
> > +	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> > +		IORES_DESC_NONE) == REGION_MIXED) {
> > +
> > +		start = res->start;
> > +		start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
> > +	}
> > +	start = res->start;
> > +	size = PHYS_SECTION_ALIGN_UP(start + size) - start;
> > +
> > +	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
> > +		IORES_DESC_NONE) == REGION_MIXED) {
> > +
> > +		size = resource_size(res);
> > +		end_trunc = start + size -
> > +				PHYS_SECTION_ALIGN_DOWN(start + size);
> > +	}
> > +
> > +	start += start_pad;
> > +	size = resource_size(res);
> > +	npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K)
> > +						/ PAGE_SIZE);
> > +
> > +      /*
> > +       * vmemmap_populate_hugepages() allocates the memmap array in
> > +       * HPAGE_SIZE chunks.
> > +       */
> > +	offset = ALIGN(start + SZ_8K + 64 * npfns, HPAGE_SIZE) - start;
> > +	vpmem->data_offset = offset;
> > +
> > +	struct vmem_altmap __altmap = {
> > +		.base_pfn = init_altmap_base(start+start_pad),
> > +		.reserve = init_altmap_reserve(start+start_pad),
> > +	};
> > +
> > +	res->start += start_pad;
> > +	res->end -= end_trunc;
> > +	memcpy(altmap, &__altmap, sizeof(*altmap));
> > +	altmap->free = PHYS_PFN(offset - SZ_8K);
> > +	altmap->alloc = 0;
> 
> The __altmap part can be rewritten using compound literal syntax:
> 
>   *altmap = (struct vmem_altmap) {
>       .base_pfn = init_altmap_base(start+start_pad),
>       .reserve = init_altmap_reserve(start+start_pad),
>       .free = PHYS_PFN(offset - SZ_8K),
>   };
> 
> This eliminates the temporary variable, memcpy, and explicit alloc = 0
> initialization.

o.k will use it.
  
> 
> > +
> > +	return altmap;
> > +}
> > +
> > +static blk_status_t pmem_do_bvec(struct virtio_pmem *pmem, struct page
> > *page,
> > +			unsigned int len, unsigned int off, bool is_write,
> > +			sector_t sector)
> > +{
> > +	blk_status_t rc = BLK_STS_OK;
> > +	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
> > +	void *pmem_addr = pmem->virt_addr + pmem_off;
> > +
> > +	if (!is_write) {
> > +		rc = read_pmem(page, off, pmem_addr, len);
> > +			flush_dcache_page(page);
> > +	} else {
> > +		flush_dcache_page(page);
> 
> What are the semantics of this device?  Why flush the dcache here if an
> explicit flush command needs to be sent over the virtqueue?

yes, we don't need it in this case. I yet have to think more
about flushing with block and file-system level support :)

> 
> > +		write_pmem(pmem_addr, page, off, len);
> > +	}
> > +
> > +	return rc;
> > +}
> > +
> > +static int vpmem_rw_page(struct block_device *bdev, sector_t sector,
> > +		       struct page *page, bool is_write)
> > +{
> > +	struct virtio_pmem  *pmem = bdev->bd_queue->queuedata;
> > +	blk_status_t rc;
> > +
> > +	rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
> > +			  0, is_write, sector);
> > +
> > +	if (rc == 0)
> > +		page_endio(page, is_write, 0);
> > +
> > +	return blk_status_to_errno(rc);
> > +}
> > +
> > +#ifndef REQ_FLUSH
> > +#define REQ_FLUSH REQ_PREFLUSH
> > +#endif
> 
> Is this out-of-tree kernel module compatibility stuff that can be
> removed?

yes, will check what filesystem expects for flush if not used then
remove.

> 
> > +
> > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q,
> > +			struct bio *bio)
> > +{
> > +	blk_status_t rc = 0;
> > +	struct bio_vec bvec;
> > +	struct bvec_iter iter;
> > +	struct virtio_pmem *pmem = q->queuedata;
> > +
> > +	if (bio->bi_opf & REQ_FLUSH)
> > +		//todo host flush command
> 
> This detail is critical to the device design.  What is the plan?

yes, this is good point.

was thinking of guest sending a flush command to Qemu which
will do a fsync on file fd.

If we do a async flush and move the task to wait queue till we receive 
flush complete reply from host we can allow other tasks to execute
in current cpu.

Any suggestions you have or anything I am not foreseeing here?

> 
> > +
> > +	bio_for_each_segment(bvec, bio, iter) {
> > +		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
> > +				bvec.bv_offset, op_is_write(bio_op(bio)),
> > +				iter.bi_sector);
> > +		if (rc) {
> > +			bio->bi_status = rc;
> > +			break;
> > +		}
> > +	}
> > +
> > +	bio_endio(bio);
> > +	return BLK_QC_T_NONE;
> > +}
> > +
> > +static const struct block_device_operations pmem_fops = {
> > +	.owner =		THIS_MODULE,
> > +	.rw_page =		vpmem_rw_page,
> > +	//.revalidate_disk =	nvdimm_revalidate_disk,
> > +};
> > +
> > +static int virtio_pmem_probe(struct virtio_device *vdev)
> > +{
> > +	struct virtio_pmem *vpmem;
> > +	int err = 0;
> > +	void *addr;
> > +	struct resource *res, res_pfn;
> > +	struct request_queue *q;
> > +	struct vmem_altmap __altmap, *altmap = NULL;
> > +	struct gendisk *disk;
> > +	struct device *gendev;
> > +	int nid = dev_to_node(&vdev->dev);
> > +
> > +	if (!vdev->config->get) {
> > +		dev_err(&vdev->dev, "%s failure: config disabled\n",
> > +			__func__);
> > +		return -EINVAL;
> > +	}
> > +
> > +	vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem),
> > +			GFP_KERNEL);
> > +
> > +	if (!vpmem) {
> > +		err = -ENOMEM;
> > +		goto out;
> > +	}
> > +
> > +	dev_set_drvdata(&vdev->dev, vpmem);
> > +
> > +	vpmem->vdev = vdev;
> > +	err = init_vq(vpmem);
> > +	if (err)
> > +		goto out;
> > +
> > +	if (!virtio_has_feature(vdev, VIRTIO_PMEM_PLUG)) {
> > +		dev_err(&vdev->dev, "%s : pmem not supported\n",
> > +			__func__);
> > +		goto out;
> > +	}
> 
> Why is VIRTIO_PMEM_PLUG optional for this new device type if it's
> already required by the first version of the driver?

I just added it, initially kept it a place holder for any feature bit use.
I will remove it if we are not using it.

> 
> err is not set when the error path is taken.
> 
> > +
> > +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +			start, &vpmem->start);
> > +	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +			size, &vpmem->size);
> 
> The struct resource start and size fields can vary between
> architectures.  Virtio device configuration space layout must not vary
> between architectures.  Please use u64.

sure.

> 
> > +
> > +	res_pfn.start = vpmem->start;
> > +	res_pfn.end   = vpmem->start + vpmem->size-1;
> > +
> > +	/* used for allocating memmap in the pmem device */
> > +	altmap	      = setup_pmem_pfn(vpmem, &res_pfn, &__altmap);
> > +
> > +	res = devm_request_mem_region(&vdev->dev,
> > +			res_pfn.start, resource_size(&res_pfn), "virtio-pmem");
> > +
> > +	if (!res) {
> > +		dev_warn(&vdev->dev, "could not reserve region ");
> > +		return -EBUSY;
> > +	}
> > +
> > +	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(&vdev->dev));
> > +
> > +	if (!q)
> > +		return -ENOMEM;
> > +
> > +	if (devm_add_action_or_reset(&vdev->dev,
> > +				virtio_pmem_release_queue, q))
> > +		return -ENOMEM;
> > +
> > +	vpmem->pfn_flags = PFN_DEV;
> > +
> > +	/* allocate memap in pmem device itself */
> > +	if (IS_ENABLED(CONFIG_ZONE_DEVICE)) {
> > +
> > +		addr = devm_memremap_pages(&vdev->dev, res,
> > +				&q->q_usage_counter, altmap);
> > +		vpmem->pfn_flags |= PFN_MAP;
> > +	} else
> > +		addr = devm_memremap(&vdev->dev, vpmem->start,
> > +				vpmem->size, ARCH_MEMREMAP_PMEM);
> > +
> > +        /*
> > +         * At release time the queue must be frozen before
> > +         * devm_memremap_pages is unwound
> > +         */
> > +	if (devm_add_action_or_reset(&vdev->dev,
> > +				virtio_pmem_freeze_queue, q))
> > +		return -ENOMEM;
> > +
> > +	if (IS_ERR(addr))
> > +		return PTR_ERR(addr);
> > +
> > +	vpmem->virt_addr = addr;
> > +	blk_queue_write_cache(q, 0, 0);
> > +	blk_queue_make_request(q, virtio_pmem_make_request);
> > +	blk_queue_physical_block_size(q, PAGE_SIZE);
> > +	blk_queue_logical_block_size(q, 512);
> > +	blk_queue_max_hw_sectors(q, UINT_MAX);
> > +	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
> > +	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
> > +	q->queuedata = vpmem;
> > +
> > +	disk = alloc_disk_node(0, nid);
> > +	if (!disk)
> > +		return -ENOMEM;
> 
> The return statements in this function look suspicious.  There probably
> needs to be a cleanup to avoid memory leaks.

Sure! valid point.
> 
> > +	vpmem->disk = disk;
> > +
> > +	disk->fops                = &pmem_fops;
> > +	disk->queue               = q;
> > +	disk->flags               = GENHD_FL_EXT_DEVT;
> > +	strcpy(disk->disk_name, "vpmem");
> > +	set_capacity(disk, vpmem->size/512);
> > +	gendev = disk_to_dev(disk);
> > +
> > +	virtio_device_ready(vdev);
> > +	device_add_disk(&vdev->dev, disk);
> > +
> > +	if (devm_add_action_or_reset(&vdev->dev,
> > +			virtio_pmem_release_disk, vpmem))
> > +		return -ENOMEM;
> > +
> > +	revalidate_disk(disk);
> > +	return 0;
> > +out:
> > +	vdev->config->del_vqs(vdev);
> > +	return err;
> > +}
> > +
> > +static struct virtio_driver virtio_pmem_driver = {
> > +	.feature_table		= features,
> > +	.feature_table_size	= ARRAY_SIZE(features),
> > +	.driver.name		= KBUILD_MODNAME,
> > +	.driver.owner		= THIS_MODULE,
> > +	.id_table		= id_table,
> > +	.probe			= virtio_pmem_probe,
> > +	//.remove		= virtio_pmem_remove,
> > +};
> > +
> > +module_virtio_driver(virtio_pmem_driver);
> > +MODULE_DEVICE_TABLE(virtio, id_table);
> > +MODULE_DESCRIPTION("Virtio pmem driver");
> > +MODULE_LICENSE("GPL");
> > diff --git a/include/uapi/linux/virtio_pmem.h
> > b/include/uapi/linux/virtio_pmem.h
> > new file mode 100644
> > index 000000000000..ec0c728c79ba
> > --- /dev/null
> > +++ b/include/uapi/linux/virtio_pmem.h
> > @@ -0,0 +1,55 @@
> > +/*
> > + * Virtio pmem Device
> > + *
> > + *
> > + */
> > +
> > +#ifndef _LINUX_VIRTIO_PMEM_H
> > +#define _LINUX_VIRTIO_PMEM_H
> > +
> > +#include <linux/types.h>
> > +#include <linux/virtio_types.h>
> > +#include <linux/virtio_ids.h>
> > +#include <linux/virtio_config.h>
> > +#include <linux/pfn_t.h>
> > +#include <linux/fs.h>
> > +#include <linux/blk-mq.h>
> > +#include <linux/pmem_common.h>
> 
> include/uapi/ headers must compile when included from userspace
> applications.  The header should define userspace APIs only.
> 
> If you want to define kernel-internal APIs, please add them to
> include/linux/ or similar.
> 
> This also means that include/uapi/ headers cannot include
> include/linux/pmem_common.h or other kernel headers outside #ifdef
> __KERNEL__.

o.k

> 
> This header file should only contain struct virtio_pmem_config,
> VIRTIO_PMEM_PLUG, and other virtio device definitions.

Sure!
> 
> > +
> > +bool pmem_should_map_pages(struct device *dev);
> > +
> > +#define VIRTIO_PMEM_PLUG 0
> > +
> > +struct virtio_pmem_config {
> > +
> > +uint64_t start;
> > +uint64_t size;
> > +uint64_t align;
> > +uint64_t data_offset;
> > +};
> > +
> > +struct virtio_pmem {
> > +
> > +	struct virtio_device *vdev;
> > +	struct virtqueue *req_vq;
> > +
> > +	uint64_t start;
> > +	uint64_t size;
> > +	uint64_t align;
> > +	uint64_t data_offset;
> > +	u64 pfn_flags;
> > +	void *virt_addr;
> > +	struct gendisk *disk;
> > +} __packed;
> > +
> > +static struct virtio_device_id id_table[] = {
> > +	{ VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID },
> > +	{ 0 },
> > +};
> > +
> > +static unsigned int features[] = {
> > +	VIRTIO_PMEM_PLUG,
> > +};
> > +
> > +#endif
> > +
> > --
> > 2.13.0
> > 
> 

Thanks,
Pankaj
Dan Williams Oct. 13, 2017, 3:25 p.m. | #12
On Fri, Oct 13, 2017 at 2:44 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote:
[..]
>> +#ifndef REQ_FLUSH
>> +#define REQ_FLUSH REQ_PREFLUSH
>> +#endif
>
> Is this out-of-tree kernel module compatibility stuff that can be
> removed?

Yes, this was copied from the pmem driver where it can also be
removed, it was used to workaround a merge order problem in linux-next
when these definitions were changed several kernel releases back.
Stefan Hajnoczi Oct. 16, 2017, 2:47 p.m. | #13
On Fri, Oct 13, 2017 at 06:48:15AM -0400, Pankaj Gupta wrote:
> > On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote:
> > > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q,
> > > +			struct bio *bio)
> > > +{
> > > +	blk_status_t rc = 0;
> > > +	struct bio_vec bvec;
> > > +	struct bvec_iter iter;
> > > +	struct virtio_pmem *pmem = q->queuedata;
> > > +
> > > +	if (bio->bi_opf & REQ_FLUSH)
> > > +		//todo host flush command
> > 
> > This detail is critical to the device design.  What is the plan?
> 
> yes, this is good point.
> 
> was thinking of guest sending a flush command to Qemu which
> will do a fsync on file fd.

Previously there was discussion about fsyncing a specific file range
instead of the whole file.  This could perform better in cases where
only a subset of dirty pages need to be flushed.

One possibility is to design the virtio interface to communicate ranges
but the emulation code simply fsyncs the fd for the time being.  Later
on, if the necessary kernel and userspace interfaces are added, we can
make use of the interface.

> If we do a async flush and move the task to wait queue till we receive 
> flush complete reply from host we can allow other tasks to execute
> in current cpu.
> 
> Any suggestions you have or anything I am not foreseeing here?

My main thought about this patch series is whether pmem should be a
virtio-blk feature bit instead of a whole new device.  There is quite a
bit of overlap between the two.

Stefan
Dan Williams Oct. 16, 2017, 3:58 p.m. | #14
On Mon, Oct 16, 2017 at 7:47 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Fri, Oct 13, 2017 at 06:48:15AM -0400, Pankaj Gupta wrote:
>> > On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote:
>> > > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q,
>> > > +                 struct bio *bio)
>> > > +{
>> > > + blk_status_t rc = 0;
>> > > + struct bio_vec bvec;
>> > > + struct bvec_iter iter;
>> > > + struct virtio_pmem *pmem = q->queuedata;
>> > > +
>> > > + if (bio->bi_opf & REQ_FLUSH)
>> > > +         //todo host flush command
>> >
>> > This detail is critical to the device design.  What is the plan?
>>
>> yes, this is good point.
>>
>> was thinking of guest sending a flush command to Qemu which
>> will do a fsync on file fd.
>
> Previously there was discussion about fsyncing a specific file range
> instead of the whole file.  This could perform better in cases where
> only a subset of dirty pages need to be flushed.
>
> One possibility is to design the virtio interface to communicate ranges
> but the emulation code simply fsyncs the fd for the time being.  Later
> on, if the necessary kernel and userspace interfaces are added, we can
> make use of the interface.

Range based is not a natural storage cache management mechanism. All
that is it available typically is a full write-cache-flush mechanism
and upper layers would need to customized for range-based flushing.

>> If we do a async flush and move the task to wait queue till we receive
>> flush complete reply from host we can allow other tasks to execute
>> in current cpu.
>>
>> Any suggestions you have or anything I am not foreseeing here?
>
> My main thought about this patch series is whether pmem should be a
> virtio-blk feature bit instead of a whole new device.  There is quite a
> bit of overlap between the two.

I'd be open to that... there's already provisions in the pmem driver
for platforms where cpu caches are flushed on power-loss, a virtio
mode for this shared-memory case seems reasonable.
Pankaj Gupta Oct. 16, 2017, 5:04 p.m. | #15
> 
> On Fri, Oct 13, 2017 at 06:48:15AM -0400, Pankaj Gupta wrote:
> > > On Thu, Oct 12, 2017 at 09:20:26PM +0530, Pankaj Gupta wrote:
> > > > +static blk_qc_t virtio_pmem_make_request(struct request_queue *q,
> > > > +			struct bio *bio)
> > > > +{
> > > > +	blk_status_t rc = 0;
> > > > +	struct bio_vec bvec;
> > > > +	struct bvec_iter iter;
> > > > +	struct virtio_pmem *pmem = q->queuedata;
> > > > +
> > > > +	if (bio->bi_opf & REQ_FLUSH)
> > > > +		//todo host flush command
> > > 
> > > This detail is critical to the device design.  What is the plan?
> > 
> > yes, this is good point.
> > 
> > was thinking of guest sending a flush command to Qemu which
> > will do a fsync on file fd.
> 
> Previously there was discussion about fsyncing a specific file range
> instead of the whole file.  This could perform better in cases where
> only a subset of dirty pages need to be flushed.

yes, We had discussion about this and decided to do entire block flush
then to range level flush.

> 
> One possibility is to design the virtio interface to communicate ranges
> but the emulation code simply fsyncs the fd for the time being.  Later
> on, if the necessary kernel and userspace interfaces are added, we can
> make use of the interface.
> 
> > If we do a async flush and move the task to wait queue till we receive
> > flush complete reply from host we can allow other tasks to execute
> > in current cpu.
> > 
> > Any suggestions you have or anything I am not foreseeing here?
> 
> My main thought about this patch series is whether pmem should be a
> virtio-blk feature bit instead of a whole new device.  There is quite a
> bit of overlap between the two.

Exposing options with existing virtio-blk device to be used as persistent memory
range at high level would require additional below features:

- Use a persistent memory range with an option to allocate memmap array in the device
  itself for .

- Block operations for DAX and persistent memory range.

- Bifurcation at filesystem level based on type of virtio-blk device selected.

- Bifurcation of flushing interface and communication channel between guest & host.

But yes these features can be dynamically configured based on type of device
added? What if we have virtio-blk:virtio-pmem (m:n) devices ratio?And scale involved? 

If i understand correctly virtio-blk is high performance interface with multiqueue support 
and additional features at host side like data-plane mode etc. If we bloat it with additional
stuff(even when we need them) and provide locking with additional features both at guest as 
well as host side we will get a hit in performance? Also as requirement of both the interfaces
would grow it will be more difficult to maintain? I would prefer more simpler interfaces with
defined functionality but yes common code can be shared and used using well defined wrappers. 

> 
> Stefan
>
Christoph Hellwig Oct. 17, 2017, 7:16 a.m. | #16
I think this driver is at entirely the wrong level.

If you want to expose pmem to a guest with flushing assist do it
as pmem, and not a block driver.
Pankaj Gupta Oct. 17, 2017, 7:40 a.m. | #17
> 
> I think this driver is at entirely the wrong level.
> 
> If you want to expose pmem to a guest with flushing assist do it
> as pmem, and not a block driver.

Are you saying do it as existing i.e ACPI pmem like interface?
The reason we have created this new driver is exiting pmem driver
does not define proper semantics for guest flushing requests.

Regarding block support of driver, we want to achieve DAX support
to bypass guest page cache. Also, we want to utilize existing DAX
capable file-system interfaces(e.g fsync) from userspace file API's
to trigger the host side flush request.

Below link has details of previous discussion:
https://marc.info/?l=kvm&m=150091133700361&w=2

Thanks,
Pankaj
Christoph Hellwig Oct. 17, 2017, 8:02 a.m. | #18
On Tue, Oct 17, 2017 at 03:40:56AM -0400, Pankaj Gupta wrote:
> Are you saying do it as existing i.e ACPI pmem like interface?
> The reason we have created this new driver is exiting pmem driver
> does not define proper semantics for guest flushing requests.

At this point I'm caring about the Linux-internal interface, and
for that it should be integrated into the nvdimm subsystem and not
a block driver.  How the host <-> guest interface looks is a different
idea.

> 
> Regarding block support of driver, we want to achieve DAX support
> to bypass guest page cache. Also, we want to utilize existing DAX
> capable file-system interfaces(e.g fsync) from userspace file API's
> to trigger the host side flush request.

Well, if you want to support XFS+DAX better don't make it a block
devices, because I'll post patches soon to stop using the block device
entirely for the DAX case.
Pankaj Gupta Oct. 17, 2017, 8:30 a.m. | #19
> > Are you saying do it as existing i.e ACPI pmem like interface?
> > The reason we have created this new driver is exiting pmem driver
> > does not define proper semantics for guest flushing requests.
> 
> At this point I'm caring about the Linux-internal interface, and
> for that it should be integrated into the nvdimm subsystem and not
> a block driver.  How the host <-> guest interface looks is a different
> idea.
> 
> > 
> > Regarding block support of driver, we want to achieve DAX support
> > to bypass guest page cache. Also, we want to utilize existing DAX
> > capable file-system interfaces(e.g fsync) from userspace file API's
> > to trigger the host side flush request.
> 
> Well, if you want to support XFS+DAX better don't make it a block
> devices, because I'll post patches soon to stop using the block device
> entirely for the DAX case.

o.k I will look at your patches once they are in mailing list.
Thanks for the heads up.

If I am guessing it right, we don't need block device additional features
for pmem? We can bypass block device features like blk device cache flush etc.
Also, still we would be supporting ext4 & XFS filesystem with pmem?

If there is time to your patches can you please elaborate on this a bit.

Thanks,
Pankaj
Stefan Hajnoczi Oct. 18, 2017, 1:03 p.m. | #20
On Tue, Oct 17, 2017 at 04:30:41AM -0400, Pankaj Gupta wrote:
> 
> > > Are you saying do it as existing i.e ACPI pmem like interface?
> > > The reason we have created this new driver is exiting pmem driver
> > > does not define proper semantics for guest flushing requests.
> > 
> > At this point I'm caring about the Linux-internal interface, and
> > for that it should be integrated into the nvdimm subsystem and not
> > a block driver.  How the host <-> guest interface looks is a different
> > idea.
> > 
> > > 
> > > Regarding block support of driver, we want to achieve DAX support
> > > to bypass guest page cache. Also, we want to utilize existing DAX
> > > capable file-system interfaces(e.g fsync) from userspace file API's
> > > to trigger the host side flush request.
> > 
> > Well, if you want to support XFS+DAX better don't make it a block
> > devices, because I'll post patches soon to stop using the block device
> > entirely for the DAX case.
> 
> o.k I will look at your patches once they are in mailing list.
> Thanks for the heads up.
> 
> If I am guessing it right, we don't need block device additional features
> for pmem? We can bypass block device features like blk device cache flush etc.
> Also, still we would be supporting ext4 & XFS filesystem with pmem?
> 
> If there is time to your patches can you please elaborate on this a bit.

I think the idea is that the nvdimm subsystem already adds block device
semantics on top of the struct nvdimms that it manages.  See
drivers/nvdimm/blk.c.

So it would be cleaner to make virtio-pmem an nvdimm bus.  This will
eliminate the duplication between your driver and drivers/nvdimm/ code.
Try "git grep nvdimm_bus_register" to find drivers that use the nvdimm
subsystem.

The virtio-pmem driver could register itself similar to the existing
ACPI NFIT and BIOS e820 pmem drivers.

Stefan
Dan Williams Oct. 18, 2017, 3:51 p.m. | #21
On Wed, Oct 18, 2017 at 6:03 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Tue, Oct 17, 2017 at 04:30:41AM -0400, Pankaj Gupta wrote:
>>
>> > > Are you saying do it as existing i.e ACPI pmem like interface?
>> > > The reason we have created this new driver is exiting pmem driver
>> > > does not define proper semantics for guest flushing requests.
>> >
>> > At this point I'm caring about the Linux-internal interface, and
>> > for that it should be integrated into the nvdimm subsystem and not
>> > a block driver.  How the host <-> guest interface looks is a different
>> > idea.
>> >
>> > >
>> > > Regarding block support of driver, we want to achieve DAX support
>> > > to bypass guest page cache. Also, we want to utilize existing DAX
>> > > capable file-system interfaces(e.g fsync) from userspace file API's
>> > > to trigger the host side flush request.
>> >
>> > Well, if you want to support XFS+DAX better don't make it a block
>> > devices, because I'll post patches soon to stop using the block device
>> > entirely for the DAX case.
>>
>> o.k I will look at your patches once they are in mailing list.
>> Thanks for the heads up.
>>
>> If I am guessing it right, we don't need block device additional features
>> for pmem? We can bypass block device features like blk device cache flush etc.
>> Also, still we would be supporting ext4 & XFS filesystem with pmem?
>>
>> If there is time to your patches can you please elaborate on this a bit.
>
> I think the idea is that the nvdimm subsystem already adds block device
> semantics on top of the struct nvdimms that it manages.  See
> drivers/nvdimm/blk.c.
>
> So it would be cleaner to make virtio-pmem an nvdimm bus.  This will
> eliminate the duplication between your driver and drivers/nvdimm/ code.
> Try "git grep nvdimm_bus_register" to find drivers that use the nvdimm
> subsystem.

This use case is not "Persistent Memory". Persistent Memory is
something you can map and make persistent with CPU instructions.
Anything that requires a driver call is device driver managed "Shared
Memory".
Stefan Hajnoczi Oct. 19, 2017, 8:01 a.m. | #22
On Wed, Oct 18, 2017 at 08:51:37AM -0700, Dan Williams wrote:
> On Wed, Oct 18, 2017 at 6:03 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > On Tue, Oct 17, 2017 at 04:30:41AM -0400, Pankaj Gupta wrote:
> >>
> >> > > Are you saying do it as existing i.e ACPI pmem like interface?
> >> > > The reason we have created this new driver is exiting pmem driver
> >> > > does not define proper semantics for guest flushing requests.
> >> >
> >> > At this point I'm caring about the Linux-internal interface, and
> >> > for that it should be integrated into the nvdimm subsystem and not
> >> > a block driver.  How the host <-> guest interface looks is a different
> >> > idea.
> >> >
> >> > >
> >> > > Regarding block support of driver, we want to achieve DAX support
> >> > > to bypass guest page cache. Also, we want to utilize existing DAX
> >> > > capable file-system interfaces(e.g fsync) from userspace file API's
> >> > > to trigger the host side flush request.
> >> >
> >> > Well, if you want to support XFS+DAX better don't make it a block
> >> > devices, because I'll post patches soon to stop using the block device
> >> > entirely for the DAX case.
> >>
> >> o.k I will look at your patches once they are in mailing list.
> >> Thanks for the heads up.
> >>
> >> If I am guessing it right, we don't need block device additional features
> >> for pmem? We can bypass block device features like blk device cache flush etc.
> >> Also, still we would be supporting ext4 & XFS filesystem with pmem?
> >>
> >> If there is time to your patches can you please elaborate on this a bit.
> >
> > I think the idea is that the nvdimm subsystem already adds block device
> > semantics on top of the struct nvdimms that it manages.  See
> > drivers/nvdimm/blk.c.
> >
> > So it would be cleaner to make virtio-pmem an nvdimm bus.  This will
> > eliminate the duplication between your driver and drivers/nvdimm/ code.
> > Try "git grep nvdimm_bus_register" to find drivers that use the nvdimm
> > subsystem.
> 
> This use case is not "Persistent Memory". Persistent Memory is
> something you can map and make persistent with CPU instructions.
> Anything that requires a driver call is device driver managed "Shared
> Memory".

Dan, in that case do you have ideas regarding Christoph Hellwig's
comment that this driver should be integrated into the nvdimm subsystem
instead of a new block driver?

Stefan
Christoph Hellwig Oct. 19, 2017, 8:01 a.m. | #23
On Wed, Oct 18, 2017 at 08:51:37AM -0700, Dan Williams wrote:
> This use case is not "Persistent Memory". Persistent Memory is
> something you can map and make persistent with CPU instructions.
> Anything that requires a driver call is device driver managed "Shared
> Memory".

How is this any different than the existing nvdimm_flush()? If you
really care about the not driver thing it could easily be a write
to a doorbell page or a hypercall, but in the end that's just semantics.
Dan Williams Oct. 19, 2017, 6:21 p.m. | #24
On Thu, Oct 19, 2017 at 1:01 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Wed, Oct 18, 2017 at 08:51:37AM -0700, Dan Williams wrote:
>> This use case is not "Persistent Memory". Persistent Memory is
>> something you can map and make persistent with CPU instructions.
>> Anything that requires a driver call is device driver managed "Shared
>> Memory".
>
> How is this any different than the existing nvdimm_flush()? If you
> really care about the not driver thing it could easily be a write
> to a doorbell page or a hypercall, but in the end that's just semantics.

The difference is that nvdimm_flush() is not mandatory, and that the
platform will automatically perform the same flush at power-fail.
Applications should be able to assume that if they are using MAP_SYNC
that no other coordination with the kernel or the hypervisor is
necessary.

Advertising this as a generic Persistent Memory range to the guest
means that the guest could theoretically use it with device-dax where
there is no driver or filesystem sync interface. The hypervisor will
be waiting for flush notifications and the guest will just issue cache
flushes and sfence instructions. So, as far as I can see we need to
differentiate this virtio-model from standard "Persistent Memory" to
the guest and remove the possibility of guests/applications making the
wrong assumption.

Non-ODP RDMA in a guest comes to mind...
Christoph Hellwig Oct. 20, 2017, 8 a.m. | #25
On Thu, Oct 19, 2017 at 11:21:26AM -0700, Dan Williams wrote:
> The difference is that nvdimm_flush() is not mandatory, and that the
> platform will automatically perform the same flush at power-fail.
> Applications should be able to assume that if they are using MAP_SYNC
> that no other coordination with the kernel or the hypervisor is
> necessary.
> 
> Advertising this as a generic Persistent Memory range to the guest
> means that the guest could theoretically use it with device-dax where
> there is no driver or filesystem sync interface. The hypervisor will
> be waiting for flush notifications and the guest will just issue cache
> flushes and sfence instructions. So, as far as I can see we need to
> differentiate this virtio-model from standard "Persistent Memory" to
> the guest and remove the possibility of guests/applications making the
> wrong assumption.

So add a flag that it is not.  We already have the nd_volatile type,
that is special.  For now only in Linux, but I think adding this type
to the spec eventually would be very useful for efficiently exposing
directly mappable device to VM guests.
Dan Williams Oct. 20, 2017, 3:05 p.m. | #26
On Fri, Oct 20, 2017 at 1:00 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Thu, Oct 19, 2017 at 11:21:26AM -0700, Dan Williams wrote:
>> The difference is that nvdimm_flush() is not mandatory, and that the
>> platform will automatically perform the same flush at power-fail.
>> Applications should be able to assume that if they are using MAP_SYNC
>> that no other coordination with the kernel or the hypervisor is
>> necessary.
>>
>> Advertising this as a generic Persistent Memory range to the guest
>> means that the guest could theoretically use it with device-dax where
>> there is no driver or filesystem sync interface. The hypervisor will
>> be waiting for flush notifications and the guest will just issue cache
>> flushes and sfence instructions. So, as far as I can see we need to
>> differentiate this virtio-model from standard "Persistent Memory" to
>> the guest and remove the possibility of guests/applications making the
>> wrong assumption.
>
> So add a flag that it is not.  We already have the nd_volatile type,
> that is special.  For now only in Linux, but I think adding this type
> to the spec eventually would be very useful for efficiently exposing
> directly mappable device to VM guests.

Right, that's the same recommendation I gave.

    https://lists.gnu.org/archive/html/qemu-devel/2017-07/msg08404.html

...so maybe I'm misunderstanding your concern? It sounds like we're on
the same page.
Christoph Hellwig Oct. 20, 2017, 4:06 p.m. | #27
On Fri, Oct 20, 2017 at 08:05:09AM -0700, Dan Williams wrote:
> Right, that's the same recommendation I gave.
> 
>     https://lists.gnu.org/archive/html/qemu-devel/2017-07/msg08404.html
> 
> ...so maybe I'm misunderstanding your concern? It sounds like we're on
> the same page.

Yes, the above is exactly what I think we should do it.  And in many
ways virtio seems overkill if we could just have a hypercall or doorbell
page as the queueing infrastructure in virtio shouldn't really be
needed.
Dan Williams Oct. 20, 2017, 4:11 p.m. | #28
On Fri, Oct 20, 2017 at 9:06 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Fri, Oct 20, 2017 at 08:05:09AM -0700, Dan Williams wrote:
>> Right, that's the same recommendation I gave.
>>
>>     https://lists.gnu.org/archive/html/qemu-devel/2017-07/msg08404.html
>>
>> ...so maybe I'm misunderstanding your concern? It sounds like we're on
>> the same page.
>
> Yes, the above is exactly what I think we should do it.  And in many
> ways virtio seems overkill if we could just have a hypercall or doorbell
> page as the queueing infrastructure in virtio shouldn't really be
> needed.

Ah ok, yes, get rid of the virtio-pmem driver and just make
nvdimm_flush() do a hypercall based on region-type flag.

Patch

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index cff773f15b7e..0192c4bda54b 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -38,6 +38,16 @@  config VIRTIO_PCI_LEGACY
 
 	  If unsure, say Y.
 
+config VIRTIO_PMEM
+	tristate "Virtio pmem driver"
+	depends on VIRTIO
+	---help---
+	 This driver adds persistent memory range within a KVM guest.
+         It also associates a block device corresponding to the pmem
+	 range.
+
+	 If unsure, say M.
+
 config VIRTIO_BALLOON
 	tristate "Virtio balloon driver"
 	depends on VIRTIO
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 41e30e3dc842..032ade725cc2 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -5,3 +5,4 @@  virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
 virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
 obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
+obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o
diff --git a/drivers/virtio/virtio_pmem.c b/drivers/virtio/virtio_pmem.c
new file mode 100644
index 000000000000..74e47cae0e24
--- /dev/null
+++ b/drivers/virtio/virtio_pmem.c
@@ -0,0 +1,322 @@ 
+/*
+ * virtio-pmem driver
+ */
+
+#include <linux/virtio.h>
+#include <linux/swap.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/oom.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include <linux/virtio_pmem.h>
+
+void devm_vpmem_disable(struct device *dev, struct resource *res, void *addr)
+{
+	devm_memunmap(dev, addr);
+	devm_release_mem_region(dev, res->start, resource_size(res));
+}
+
+static void pmem_flush_done(struct virtqueue *vq)
+{
+	return;
+};
+
+static void virtio_pmem_release_queue(void *q)
+{
+	blk_cleanup_queue(q);
+}
+
+static void virtio_pmem_freeze_queue(void *q)
+{
+	blk_freeze_queue_start(q);
+}
+
+static void virtio_pmem_release_disk(void *__pmem)
+{
+	struct virtio_pmem *pmem = __pmem;
+
+	del_gendisk(pmem->disk);
+	put_disk(pmem->disk);
+}
+
+static int init_vq(struct virtio_pmem *vpmem)
+{
+	struct virtqueue *vq;
+
+	/* single vq */
+	vq = virtio_find_single_vq(vpmem->vdev, pmem_flush_done, "flush_queue");
+
+	if (IS_ERR(vq))
+		return PTR_ERR(vq);
+
+	return 0;
+}
+
+static struct vmem_altmap *setup_pmem_pfn(struct virtio_pmem *vpmem,
+			struct resource *res, struct vmem_altmap *altmap)
+{
+	u32 start_pad = 0, end_trunc = 0;
+	resource_size_t start, size;
+	unsigned long npfns;
+	phys_addr_t offset;
+
+	size = resource_size(res);
+	start = PHYS_SECTION_ALIGN_DOWN(res->start);
+
+	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
+		IORES_DESC_NONE) == REGION_MIXED) {
+
+		start = res->start;
+		start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
+	}
+	start = res->start;
+	size = PHYS_SECTION_ALIGN_UP(start + size) - start;
+
+	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
+		IORES_DESC_NONE) == REGION_MIXED) {
+
+		size = resource_size(res);
+		end_trunc = start + size -
+				PHYS_SECTION_ALIGN_DOWN(start + size);
+	}
+
+	start += start_pad;
+	size = resource_size(res);
+	npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K)
+						/ PAGE_SIZE);
+
+      /*
+       * vmemmap_populate_hugepages() allocates the memmap array in
+       * HPAGE_SIZE chunks.
+       */
+	offset = ALIGN(start + SZ_8K + 64 * npfns, HPAGE_SIZE) - start;
+	vpmem->data_offset = offset;
+
+	struct vmem_altmap __altmap = {
+		.base_pfn = init_altmap_base(start+start_pad),
+		.reserve = init_altmap_reserve(start+start_pad),
+	};
+
+	res->start += start_pad;
+	res->end -= end_trunc;
+	memcpy(altmap, &__altmap, sizeof(*altmap));
+	altmap->free = PHYS_PFN(offset - SZ_8K);
+	altmap->alloc = 0;
+
+	return altmap;
+}
+
+static blk_status_t pmem_do_bvec(struct virtio_pmem *pmem, struct page *page,
+			unsigned int len, unsigned int off, bool is_write,
+			sector_t sector)
+{
+	blk_status_t rc = BLK_STS_OK;
+	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+	void *pmem_addr = pmem->virt_addr + pmem_off;
+
+	if (!is_write) {
+		rc = read_pmem(page, off, pmem_addr, len);
+			flush_dcache_page(page);
+	} else {
+		flush_dcache_page(page);
+		write_pmem(pmem_addr, page, off, len);
+	}
+
+	return rc;
+}
+
+static int vpmem_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, bool is_write)
+{
+	struct virtio_pmem  *pmem = bdev->bd_queue->queuedata;
+	blk_status_t rc;
+
+	rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
+			  0, is_write, sector);
+
+	if (rc == 0)
+		page_endio(page, is_write, 0);
+
+	return blk_status_to_errno(rc);
+}
+
+#ifndef REQ_FLUSH
+#define REQ_FLUSH REQ_PREFLUSH
+#endif
+
+static blk_qc_t virtio_pmem_make_request(struct request_queue *q,
+			struct bio *bio)
+{
+	blk_status_t rc = 0;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+	struct virtio_pmem *pmem = q->queuedata;
+
+	if (bio->bi_opf & REQ_FLUSH)
+		//todo host flush command
+
+	bio_for_each_segment(bvec, bio, iter) {
+		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
+				bvec.bv_offset, op_is_write(bio_op(bio)),
+				iter.bi_sector);
+		if (rc) {
+			bio->bi_status = rc;
+			break;
+		}
+	}
+
+	bio_endio(bio);
+	return BLK_QC_T_NONE;
+}
+
+static const struct block_device_operations pmem_fops = {
+	.owner =		THIS_MODULE,
+	.rw_page =		vpmem_rw_page,
+	//.revalidate_disk =	nvdimm_revalidate_disk,
+};
+
+static int virtio_pmem_probe(struct virtio_device *vdev)
+{
+	struct virtio_pmem *vpmem;
+	int err = 0;
+	void *addr;
+	struct resource *res, res_pfn;
+	struct request_queue *q;
+	struct vmem_altmap __altmap, *altmap = NULL;
+	struct gendisk *disk;
+	struct device *gendev;
+	int nid = dev_to_node(&vdev->dev);
+
+	if (!vdev->config->get) {
+		dev_err(&vdev->dev, "%s failure: config disabled\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	vdev->priv = vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem),
+			GFP_KERNEL);
+
+	if (!vpmem) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	dev_set_drvdata(&vdev->dev, vpmem);
+
+	vpmem->vdev = vdev;
+	err = init_vq(vpmem);
+	if (err)
+		goto out;
+
+	if (!virtio_has_feature(vdev, VIRTIO_PMEM_PLUG)) {
+		dev_err(&vdev->dev, "%s : pmem not supported\n",
+			__func__);
+		goto out;
+	}
+
+	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
+			start, &vpmem->start);
+	virtio_cread(vpmem->vdev, struct virtio_pmem_config,
+			size, &vpmem->size);
+
+	res_pfn.start = vpmem->start;
+	res_pfn.end   = vpmem->start + vpmem->size-1;
+
+	/* used for allocating memmap in the pmem device */
+	altmap	      = setup_pmem_pfn(vpmem, &res_pfn, &__altmap);
+
+	res = devm_request_mem_region(&vdev->dev,
+			res_pfn.start, resource_size(&res_pfn), "virtio-pmem");
+
+	if (!res) {
+		dev_warn(&vdev->dev, "could not reserve region ");
+		return -EBUSY;
+	}
+
+	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(&vdev->dev));
+
+	if (!q)
+		return -ENOMEM;
+
+	if (devm_add_action_or_reset(&vdev->dev,
+				virtio_pmem_release_queue, q))
+		return -ENOMEM;
+
+	vpmem->pfn_flags = PFN_DEV;
+
+	/* allocate memap in pmem device itself */
+	if (IS_ENABLED(CONFIG_ZONE_DEVICE)) {
+
+		addr = devm_memremap_pages(&vdev->dev, res,
+				&q->q_usage_counter, altmap);
+		vpmem->pfn_flags |= PFN_MAP;
+	} else
+		addr = devm_memremap(&vdev->dev, vpmem->start,
+				vpmem->size, ARCH_MEMREMAP_PMEM);
+
+        /*
+         * At release time the queue must be frozen before
+         * devm_memremap_pages is unwound
+         */
+	if (devm_add_action_or_reset(&vdev->dev,
+				virtio_pmem_freeze_queue, q))
+		return -ENOMEM;
+
+	if (IS_ERR(addr))
+		return PTR_ERR(addr);
+
+	vpmem->virt_addr = addr;
+	blk_queue_write_cache(q, 0, 0);
+	blk_queue_make_request(q, virtio_pmem_make_request);
+	blk_queue_physical_block_size(q, PAGE_SIZE);
+	blk_queue_logical_block_size(q, 512);
+	blk_queue_max_hw_sectors(q, UINT_MAX);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
+	q->queuedata = vpmem;
+
+	disk = alloc_disk_node(0, nid);
+	if (!disk)
+		return -ENOMEM;
+	vpmem->disk = disk;
+
+	disk->fops                = &pmem_fops;
+	disk->queue               = q;
+	disk->flags               = GENHD_FL_EXT_DEVT;
+	strcpy(disk->disk_name, "vpmem");
+	set_capacity(disk, vpmem->size/512);
+	gendev = disk_to_dev(disk);
+
+	virtio_device_ready(vdev);
+	device_add_disk(&vdev->dev, disk);
+
+	if (devm_add_action_or_reset(&vdev->dev,
+			virtio_pmem_release_disk, vpmem))
+		return -ENOMEM;
+
+	revalidate_disk(disk);
+	return 0;
+out:
+	vdev->config->del_vqs(vdev);
+	return err;
+}
+
+static struct virtio_driver virtio_pmem_driver = {
+	.feature_table		= features,
+	.feature_table_size	= ARRAY_SIZE(features),
+	.driver.name		= KBUILD_MODNAME,
+	.driver.owner		= THIS_MODULE,
+	.id_table		= id_table,
+	.probe			= virtio_pmem_probe,
+	//.remove		= virtio_pmem_remove,
+};
+
+module_virtio_driver(virtio_pmem_driver);
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio pmem driver");
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h
new file mode 100644
index 000000000000..ec0c728c79ba
--- /dev/null
+++ b/include/uapi/linux/virtio_pmem.h
@@ -0,0 +1,55 @@ 
+/*
+ * Virtio pmem Device
+ *
+ *
+ */
+
+#ifndef _LINUX_VIRTIO_PMEM_H
+#define _LINUX_VIRTIO_PMEM_H
+
+#include <linux/types.h>
+#include <linux/virtio_types.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/pfn_t.h>
+#include <linux/fs.h>
+#include <linux/blk-mq.h>
+#include <linux/pmem_common.h>
+
+bool pmem_should_map_pages(struct device *dev);
+
+#define VIRTIO_PMEM_PLUG 0
+
+struct virtio_pmem_config {
+
+uint64_t start;
+uint64_t size;
+uint64_t align;
+uint64_t data_offset;
+};
+
+struct virtio_pmem {
+
+	struct virtio_device *vdev;
+	struct virtqueue *req_vq;
+
+	uint64_t start;
+	uint64_t size;
+	uint64_t align;
+	uint64_t data_offset;
+	u64 pfn_flags;
+	void *virt_addr;
+	struct gendisk *disk;
+} __packed;
+
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+static unsigned int features[] = {
+	VIRTIO_PMEM_PLUG,
+};
+
+#endif
+