diff mbox

[RFC,3/3] x86: Create dma_mark_dirty to dirty pages used for DMA by VM guest

Message ID 20151213212831.5410.84365.stgit@localhost.localdomain
State Not Applicable
Headers show

Commit Message

Alexander Duyck Dec. 13, 2015, 9:28 p.m. UTC
This patch is meant to provide the guest with a way of flagging DMA pages
as being dirty to the host when using a direct-assign device within a
guest.  The advantage to this approach is that it is fairly simple, however
It currently has a singificant impact on device performance in all the
scenerios where it won't be needed.

As such this is really meant only as a proof of concept and to get the ball
rolling in terms of figuring out how best to approach the issue of dirty
page tracking for a guest that is using a direct assigned device.  In
addition with just this patch it should be possible to modify current
migration approaches so that instead of having to hot-remove the device
before starting the migration this can instead be delayed until the period
before the final stop and copy.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
---
 arch/arm/include/asm/dma-mapping.h       |    3 ++-
 arch/arm64/include/asm/dma-mapping.h     |    5 ++---
 arch/ia64/include/asm/dma.h              |    1 +
 arch/mips/include/asm/dma-mapping.h      |    1 +
 arch/powerpc/include/asm/swiotlb.h       |    1 +
 arch/tile/include/asm/dma-mapping.h      |    1 +
 arch/unicore32/include/asm/dma-mapping.h |    1 +
 arch/x86/Kconfig                         |   11 +++++++++++
 arch/x86/include/asm/swiotlb.h           |   26 ++++++++++++++++++++++++++
 drivers/xen/swiotlb-xen.c                |    6 ++++++
 lib/swiotlb.c                            |    6 ++++++
 11 files changed, 58 insertions(+), 4 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Michael S. Tsirkin Dec. 14, 2015, 2 p.m. UTC | #1
On Sun, Dec 13, 2015 at 01:28:31PM -0800, Alexander Duyck wrote:
> This patch is meant to provide the guest with a way of flagging DMA pages
> as being dirty to the host when using a direct-assign device within a
> guest.  The advantage to this approach is that it is fairly simple, however
> It currently has a singificant impact on device performance in all the
> scenerios where it won't be needed.
> 
> As such this is really meant only as a proof of concept and to get the ball
> rolling in terms of figuring out how best to approach the issue of dirty
> page tracking for a guest that is using a direct assigned device.  In
> addition with just this patch it should be possible to modify current
> migration approaches so that instead of having to hot-remove the device
> before starting the migration this can instead be delayed until the period
> before the final stop and copy.
> 
> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
> ---
>  arch/arm/include/asm/dma-mapping.h       |    3 ++-
>  arch/arm64/include/asm/dma-mapping.h     |    5 ++---
>  arch/ia64/include/asm/dma.h              |    1 +
>  arch/mips/include/asm/dma-mapping.h      |    1 +
>  arch/powerpc/include/asm/swiotlb.h       |    1 +
>  arch/tile/include/asm/dma-mapping.h      |    1 +
>  arch/unicore32/include/asm/dma-mapping.h |    1 +
>  arch/x86/Kconfig                         |   11 +++++++++++
>  arch/x86/include/asm/swiotlb.h           |   26 ++++++++++++++++++++++++++
>  drivers/xen/swiotlb-xen.c                |    6 ++++++
>  lib/swiotlb.c                            |    6 ++++++
>  11 files changed, 58 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
> index ccb3aa64640d..1962d7b471c7 100644
> --- a/arch/arm/include/asm/dma-mapping.h
> +++ b/arch/arm/include/asm/dma-mapping.h
> @@ -167,7 +167,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
>  	return 1;
>  }
>  
> -static inline void dma_mark_clean(void *addr, size_t size) { }
> +static inline void dma_mark_clean(void *addr, size_t size) {}
> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>  
>  extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
>  
> diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
> index 61e08f360e31..8d24fe11c8a3 100644
> --- a/arch/arm64/include/asm/dma-mapping.h
> +++ b/arch/arm64/include/asm/dma-mapping.h
> @@ -84,9 +84,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
>  	return addr + size - 1 <= *dev->dma_mask;
>  }
>  
> -static inline void dma_mark_clean(void *addr, size_t size)
> -{
> -}
> +static inline void dma_mark_clean(void *addr, size_t size) {}
> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>  
>  #endif	/* __KERNEL__ */
>  #endif	/* __ASM_DMA_MAPPING_H */
> diff --git a/arch/ia64/include/asm/dma.h b/arch/ia64/include/asm/dma.h
> index 4d97f60f1ef5..d92ebeb2758e 100644
> --- a/arch/ia64/include/asm/dma.h
> +++ b/arch/ia64/include/asm/dma.h
> @@ -20,5 +20,6 @@ extern unsigned long MAX_DMA_ADDRESS;
>  #define free_dma(x)
>  
>  void dma_mark_clean(void *addr, size_t size);
> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>  
>  #endif /* _ASM_IA64_DMA_H */
> diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
> index e604f760c4a0..567f6e03e337 100644
> --- a/arch/mips/include/asm/dma-mapping.h
> +++ b/arch/mips/include/asm/dma-mapping.h
> @@ -28,6 +28,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
>  }
>  
>  static inline void dma_mark_clean(void *addr, size_t size) {}
> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>  
>  #include <asm-generic/dma-mapping-common.h>
>  
> diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h
> index de99d6e29430..b694e8399e28 100644
> --- a/arch/powerpc/include/asm/swiotlb.h
> +++ b/arch/powerpc/include/asm/swiotlb.h
> @@ -16,6 +16,7 @@
>  extern struct dma_map_ops swiotlb_dma_ops;
>  
>  static inline void dma_mark_clean(void *addr, size_t size) {}
> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>  
>  extern unsigned int ppc_swiotlb_enable;
>  int __init swiotlb_setup_bus_notifier(void);
> diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
> index 96ac6cce4a32..79953f09e938 100644
> --- a/arch/tile/include/asm/dma-mapping.h
> +++ b/arch/tile/include/asm/dma-mapping.h
> @@ -58,6 +58,7 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
>  }
>  
>  static inline void dma_mark_clean(void *addr, size_t size) {}
> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>  
>  static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
>  {
> diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h
> index 8140e053ccd3..b9d357ab122d 100644
> --- a/arch/unicore32/include/asm/dma-mapping.h
> +++ b/arch/unicore32/include/asm/dma-mapping.h
> @@ -49,6 +49,7 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
>  }
>  
>  static inline void dma_mark_clean(void *addr, size_t size) {}
> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>  
>  static inline void dma_cache_sync(struct device *dev, void *vaddr,
>  		size_t size, enum dma_data_direction direction)
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index db3622f22b61..f0b09156d7d8 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -841,6 +841,17 @@ config SWIOTLB
>  	  with more than 3 GB of memory.
>  	  If unsure, say Y.
>  
> +config SWIOTLB_PAGE_DIRTYING
> +	bool "SWIOTLB page dirtying"
> +	depends on SWIOTLB
> +	default n
> +	---help---
> +	  SWIOTLB page dirtying support provides a means for the guest to
> +	  trigger write faults on pages which received DMA from the device
> +	  without changing the data contained within.  By doing this the
> +	  guest can then support migration assuming the device and any
> +	  remaining pages are unmapped prior to the CPU itself being halted.
> +
>  config IOMMU_HELPER
>  	def_bool y
>  	depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU
> diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
> index ab05d73e2bb7..7f9f2e76d081 100644
> --- a/arch/x86/include/asm/swiotlb.h
> +++ b/arch/x86/include/asm/swiotlb.h
> @@ -29,6 +29,32 @@ static inline void pci_swiotlb_late_init(void)
>  
>  static inline void dma_mark_clean(void *addr, size_t size) {}
>  
> +/*
> + * Make certain that the pages get marked as dirty
> + * now that the device has completed the DMA transaction.
> + *
> + * Without this we run the risk of a guest migration missing
> + * the pages that the device has written to as they are not
> + * tracked as a part of the dirty page tracking.
> + */
> +static inline void dma_mark_dirty(void *addr, size_t size)
> +{
> +#ifdef CONFIG_SWIOTLB_PAGE_DIRTYING

I like where this is going. However
as distributions don't like shipping multiple kernels,
I think we also need a way to configure this
at runtime, even if enabled at build time.

How about
- mark dirty is enabled at boot if requested (e.g. by kernel command line)
- mark dirty can later be disabled/enabled by sysctl

(Enabling at runtime might be a bit tricky as it has to
 sync with all CPUs - use e.g. RCU for this?).

This way distro can use a guest agent to disable
dirtying until before migration starts.

> +	unsigned long pg_addr, start;
> +
> +	start = (unsigned long)addr;
> +	pg_addr = PAGE_ALIGN(start + size);
> +	start &= ~(sizeof(atomic_t) - 1);
> +
> +	/* trigger a write fault on each page, excluding first page */
> +	while ((pg_addr -= PAGE_SIZE) > start)
> +		atomic_add(0, (atomic_t *)pg_addr);
> +
> +	/* trigger a write fault on first word of DMA */
> +	atomic_add(0, (atomic_t *)start);

start might not be aligned correctly for a cast to atomic_t.
It's harmless to do this for any memory, so I think you should
just do this for 1st byte of all pages including the first one.


> +#endif /* CONFIG_SWIOTLB_PAGE_DIRTYING */
> +}
> +
>  extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
>  					dma_addr_t *dma_handle, gfp_t flags,
>  					struct dma_attrs *attrs);
> diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
> index 2154c70e47da..1533b3eefb67 100644
> --- a/drivers/xen/swiotlb-xen.c
> +++ b/drivers/xen/swiotlb-xen.c
> @@ -456,6 +456,9 @@ void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
>  	 */
>  	if (dir == DMA_FROM_DEVICE)
>  		dma_mark_clean(phys_to_virt(paddr), size);
> +
> +	if (dir != DMA_TO_DEVICE)
> +		dma_mark_dirty(phys_to_virt(paddr), size);
>  }
>  EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
>  
> @@ -485,6 +488,9 @@ xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
>  
>  	if (dir == DMA_FROM_DEVICE)
>  		dma_mark_clean(phys_to_virt(paddr), size);
> +
> +	if (dir != DMA_TO_DEVICE)
> +		dma_mark_dirty(phys_to_virt(paddr), size);
>  }
>  EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_cpu);
>  
> diff --git a/lib/swiotlb.c b/lib/swiotlb.c
> index 384ac06217b2..4223d6c54724 100644
> --- a/lib/swiotlb.c
> +++ b/lib/swiotlb.c
> @@ -802,6 +802,9 @@ void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
>  	 */
>  	if (dir == DMA_FROM_DEVICE)
>  		dma_mark_clean(phys_to_virt(paddr), size);
> +
> +	if (dir != DMA_TO_DEVICE)
> +		dma_mark_dirty(phys_to_virt(paddr), size);
>  }
>  EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
>  
> @@ -830,6 +833,9 @@ swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
>  
>  	if (dir == DMA_FROM_DEVICE)
>  		dma_mark_clean(phys_to_virt(paddr), size);
> +
> +	if (dir != DMA_TO_DEVICE)
> +		dma_mark_dirty(phys_to_virt(paddr), size);
>  }
>  EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
>  
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander H Duyck Dec. 14, 2015, 4:34 p.m. UTC | #2
On Mon, Dec 14, 2015 at 6:00 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Sun, Dec 13, 2015 at 01:28:31PM -0800, Alexander Duyck wrote:
>> This patch is meant to provide the guest with a way of flagging DMA pages
>> as being dirty to the host when using a direct-assign device within a
>> guest.  The advantage to this approach is that it is fairly simple, however
>> It currently has a singificant impact on device performance in all the
>> scenerios where it won't be needed.
>>
>> As such this is really meant only as a proof of concept and to get the ball
>> rolling in terms of figuring out how best to approach the issue of dirty
>> page tracking for a guest that is using a direct assigned device.  In
>> addition with just this patch it should be possible to modify current
>> migration approaches so that instead of having to hot-remove the device
>> before starting the migration this can instead be delayed until the period
>> before the final stop and copy.
>>
>> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
>> ---
>>  arch/arm/include/asm/dma-mapping.h       |    3 ++-
>>  arch/arm64/include/asm/dma-mapping.h     |    5 ++---
>>  arch/ia64/include/asm/dma.h              |    1 +
>>  arch/mips/include/asm/dma-mapping.h      |    1 +
>>  arch/powerpc/include/asm/swiotlb.h       |    1 +
>>  arch/tile/include/asm/dma-mapping.h      |    1 +
>>  arch/unicore32/include/asm/dma-mapping.h |    1 +
>>  arch/x86/Kconfig                         |   11 +++++++++++
>>  arch/x86/include/asm/swiotlb.h           |   26 ++++++++++++++++++++++++++
>>  drivers/xen/swiotlb-xen.c                |    6 ++++++
>>  lib/swiotlb.c                            |    6 ++++++
>>  11 files changed, 58 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
>> index ccb3aa64640d..1962d7b471c7 100644
>> --- a/arch/arm/include/asm/dma-mapping.h
>> +++ b/arch/arm/include/asm/dma-mapping.h
>> @@ -167,7 +167,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
>>       return 1;
>>  }
>>
>> -static inline void dma_mark_clean(void *addr, size_t size) { }
>> +static inline void dma_mark_clean(void *addr, size_t size) {}
>> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>>
>>  extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
>>
>> diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
>> index 61e08f360e31..8d24fe11c8a3 100644
>> --- a/arch/arm64/include/asm/dma-mapping.h
>> +++ b/arch/arm64/include/asm/dma-mapping.h
>> @@ -84,9 +84,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
>>       return addr + size - 1 <= *dev->dma_mask;
>>  }
>>
>> -static inline void dma_mark_clean(void *addr, size_t size)
>> -{
>> -}
>> +static inline void dma_mark_clean(void *addr, size_t size) {}
>> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>>
>>  #endif       /* __KERNEL__ */
>>  #endif       /* __ASM_DMA_MAPPING_H */
>> diff --git a/arch/ia64/include/asm/dma.h b/arch/ia64/include/asm/dma.h
>> index 4d97f60f1ef5..d92ebeb2758e 100644
>> --- a/arch/ia64/include/asm/dma.h
>> +++ b/arch/ia64/include/asm/dma.h
>> @@ -20,5 +20,6 @@ extern unsigned long MAX_DMA_ADDRESS;
>>  #define free_dma(x)
>>
>>  void dma_mark_clean(void *addr, size_t size);
>> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>>
>>  #endif /* _ASM_IA64_DMA_H */
>> diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
>> index e604f760c4a0..567f6e03e337 100644
>> --- a/arch/mips/include/asm/dma-mapping.h
>> +++ b/arch/mips/include/asm/dma-mapping.h
>> @@ -28,6 +28,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
>>  }
>>
>>  static inline void dma_mark_clean(void *addr, size_t size) {}
>> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>>
>>  #include <asm-generic/dma-mapping-common.h>
>>
>> diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h
>> index de99d6e29430..b694e8399e28 100644
>> --- a/arch/powerpc/include/asm/swiotlb.h
>> +++ b/arch/powerpc/include/asm/swiotlb.h
>> @@ -16,6 +16,7 @@
>>  extern struct dma_map_ops swiotlb_dma_ops;
>>
>>  static inline void dma_mark_clean(void *addr, size_t size) {}
>> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>>
>>  extern unsigned int ppc_swiotlb_enable;
>>  int __init swiotlb_setup_bus_notifier(void);
>> diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
>> index 96ac6cce4a32..79953f09e938 100644
>> --- a/arch/tile/include/asm/dma-mapping.h
>> +++ b/arch/tile/include/asm/dma-mapping.h
>> @@ -58,6 +58,7 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
>>  }
>>
>>  static inline void dma_mark_clean(void *addr, size_t size) {}
>> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>>
>>  static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
>>  {
>> diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h
>> index 8140e053ccd3..b9d357ab122d 100644
>> --- a/arch/unicore32/include/asm/dma-mapping.h
>> +++ b/arch/unicore32/include/asm/dma-mapping.h
>> @@ -49,6 +49,7 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
>>  }
>>
>>  static inline void dma_mark_clean(void *addr, size_t size) {}
>> +static inline void dma_mark_dirty(void *addr, size_t size) {}
>>
>>  static inline void dma_cache_sync(struct device *dev, void *vaddr,
>>               size_t size, enum dma_data_direction direction)
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index db3622f22b61..f0b09156d7d8 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -841,6 +841,17 @@ config SWIOTLB
>>         with more than 3 GB of memory.
>>         If unsure, say Y.
>>
>> +config SWIOTLB_PAGE_DIRTYING
>> +     bool "SWIOTLB page dirtying"
>> +     depends on SWIOTLB
>> +     default n
>> +     ---help---
>> +       SWIOTLB page dirtying support provides a means for the guest to
>> +       trigger write faults on pages which received DMA from the device
>> +       without changing the data contained within.  By doing this the
>> +       guest can then support migration assuming the device and any
>> +       remaining pages are unmapped prior to the CPU itself being halted.
>> +
>>  config IOMMU_HELPER
>>       def_bool y
>>       depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU
>> diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
>> index ab05d73e2bb7..7f9f2e76d081 100644
>> --- a/arch/x86/include/asm/swiotlb.h
>> +++ b/arch/x86/include/asm/swiotlb.h
>> @@ -29,6 +29,32 @@ static inline void pci_swiotlb_late_init(void)
>>
>>  static inline void dma_mark_clean(void *addr, size_t size) {}
>>
>> +/*
>> + * Make certain that the pages get marked as dirty
>> + * now that the device has completed the DMA transaction.
>> + *
>> + * Without this we run the risk of a guest migration missing
>> + * the pages that the device has written to as they are not
>> + * tracked as a part of the dirty page tracking.
>> + */
>> +static inline void dma_mark_dirty(void *addr, size_t size)
>> +{
>> +#ifdef CONFIG_SWIOTLB_PAGE_DIRTYING
>
> I like where this is going. However
> as distributions don't like shipping multiple kernels,
> I think we also need a way to configure this
> at runtime, even if enabled at build time.

Agreed.  Like I sad in the cover page this is just needed until we can
come up with a way to limit the scope.  Then we could probably default
this to Y and distributions can have it enabled by default.

> How about
> - mark dirty is enabled at boot if requested (e.g. by kernel command line)
> - mark dirty can later be disabled/enabled by sysctl
>
> (Enabling at runtime might be a bit tricky as it has to
>  sync with all CPUs - use e.g. RCU for this?).

I was considering RCU but I am still not sure it is the best way to go
since all we essentially need to do is swap a couple of function
pointers.  I was thinking of making use of the dma_ops pointer
contained in dev_archdata.  If I were to create two dma_ops setups,
one with standard swiotlb and one with a dirty page pointer version
for the unmap and sync calls then it is just a matter of assigning a
pointer to enable the DMA page dirtying, and clearing the pointer to
disable it.  An alternative might be to just add a device specific
flag and then pass the device to the dma_mark_dirty function.  I'm
still debating the possible options.

> This way distro can use a guest agent to disable
> dirtying until before migration starts.

Right.  For a v2 version I would definitely want to have some way to
limit the scope of this.  My main reason for putting this out here is
to start altering the course of discussions since it seems like were
weren't getting anywhere with the ixgbevf migration changes that were
being proposed.

>> +     unsigned long pg_addr, start;
>> +
>> +     start = (unsigned long)addr;
>> +     pg_addr = PAGE_ALIGN(start + size);
>> +     start &= ~(sizeof(atomic_t) - 1);
>> +
>> +     /* trigger a write fault on each page, excluding first page */
>> +     while ((pg_addr -= PAGE_SIZE) > start)
>> +             atomic_add(0, (atomic_t *)pg_addr);
>> +
>> +     /* trigger a write fault on first word of DMA */
>> +     atomic_add(0, (atomic_t *)start);
>
> start might not be aligned correctly for a cast to atomic_t.
> It's harmless to do this for any memory, so I think you should
> just do this for 1st byte of all pages including the first one.

You may not have noticed it but I actually aligned start in the line
after pg_addr.  However instead of aligning to the start of the next
atomic_t I just masked off the lower bits so that we start at the
DWORD that contains the first byte of the starting address.  The
assumption here is that I cannot trigger any sort of fault since if I
have access to a given byte within a DWORD I will have access to the
entire DWORD.  I coded this up so that the spots where we touch the
memory should match up with addresses provided by the hardware to
perform the DMA over the PCI bus.

Also I intentionally ran from highest address to lowest since that way
we don't risk pushing the first cache line of the DMA buffer out of
the L1 cache due to the PAGE_SIZE stride.

- Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Dec. 14, 2015, 5:20 p.m. UTC | #3
On Mon, Dec 14, 2015 at 08:34:00AM -0800, Alexander Duyck wrote:
> > This way distro can use a guest agent to disable
> > dirtying until before migration starts.
> 
> Right.  For a v2 version I would definitely want to have some way to
> limit the scope of this.  My main reason for putting this out here is
> to start altering the course of discussions since it seems like were
> weren't getting anywhere with the ixgbevf migration changes that were
> being proposed.

Absolutely, thanks for working on this.

> >> +     unsigned long pg_addr, start;
> >> +
> >> +     start = (unsigned long)addr;
> >> +     pg_addr = PAGE_ALIGN(start + size);
> >> +     start &= ~(sizeof(atomic_t) - 1);
> >> +
> >> +     /* trigger a write fault on each page, excluding first page */
> >> +     while ((pg_addr -= PAGE_SIZE) > start)
> >> +             atomic_add(0, (atomic_t *)pg_addr);
> >> +
> >> +     /* trigger a write fault on first word of DMA */
> >> +     atomic_add(0, (atomic_t *)start);
> >
> > start might not be aligned correctly for a cast to atomic_t.
> > It's harmless to do this for any memory, so I think you should
> > just do this for 1st byte of all pages including the first one.
> 
> You may not have noticed it but I actually aligned start in the line
> after pg_addr.

Yes you did. alignof would make it a bit more noticeable.

>  However instead of aligning to the start of the next
> atomic_t I just masked off the lower bits so that we start at the
> DWORD that contains the first byte of the starting address.  The
> assumption here is that I cannot trigger any sort of fault since if I
> have access to a given byte within a DWORD I will have access to the
> entire DWORD.

I'm curious where does this come from.  Isn't it true that access is
controlled at page granularity normally, so you can touch beginning of
page just as well?

>  I coded this up so that the spots where we touch the
> memory should match up with addresses provided by the hardware to
> perform the DMA over the PCI bus.

Yes but there's no requirement to do it like this from
virt POV. You just need to touch each page.

> Also I intentionally ran from highest address to lowest since that way
> we don't risk pushing the first cache line of the DMA buffer out of
> the L1 cache due to the PAGE_SIZE stride.
> 
> - Alex

Interesting. How does order of access help with this?

By the way, if you are into these micro-optimizations you might want to
limit prefetch, to this end you want to access the last line of the
page.  And it's probably worth benchmarking a bit and not doing it all just
based on theory, keep code simple in v1 otherwise.
Alexander H Duyck Dec. 14, 2015, 5:59 p.m. UTC | #4
On Mon, Dec 14, 2015 at 9:20 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Dec 14, 2015 at 08:34:00AM -0800, Alexander Duyck wrote:
>> > This way distro can use a guest agent to disable
>> > dirtying until before migration starts.
>>
>> Right.  For a v2 version I would definitely want to have some way to
>> limit the scope of this.  My main reason for putting this out here is
>> to start altering the course of discussions since it seems like were
>> weren't getting anywhere with the ixgbevf migration changes that were
>> being proposed.
>
> Absolutely, thanks for working on this.
>
>> >> +     unsigned long pg_addr, start;
>> >> +
>> >> +     start = (unsigned long)addr;
>> >> +     pg_addr = PAGE_ALIGN(start + size);
>> >> +     start &= ~(sizeof(atomic_t) - 1);
>> >> +
>> >> +     /* trigger a write fault on each page, excluding first page */
>> >> +     while ((pg_addr -= PAGE_SIZE) > start)
>> >> +             atomic_add(0, (atomic_t *)pg_addr);
>> >> +
>> >> +     /* trigger a write fault on first word of DMA */
>> >> +     atomic_add(0, (atomic_t *)start);
>> >
>> > start might not be aligned correctly for a cast to atomic_t.
>> > It's harmless to do this for any memory, so I think you should
>> > just do this for 1st byte of all pages including the first one.
>>
>> You may not have noticed it but I actually aligned start in the line
>> after pg_addr.
>
> Yes you did. alignof would make it a bit more noticeable.
>
>>  However instead of aligning to the start of the next
>> atomic_t I just masked off the lower bits so that we start at the
>> DWORD that contains the first byte of the starting address.  The
>> assumption here is that I cannot trigger any sort of fault since if I
>> have access to a given byte within a DWORD I will have access to the
>> entire DWORD.
>
> I'm curious where does this come from.  Isn't it true that access is
> controlled at page granularity normally, so you can touch beginning of
> page just as well?

Yeah, I am pretty sure it probably is page granularity.  However my
thought was to try and stick to the start of the DMA as the last
access.  That way we don't pull in any more cache lines than we need
to in order to dirty the pages.  Usually the start of the DMA region
will contain some sort of headers or something that needs to be
accessed with the highest priority so I wanted to make certain that we
were forcing usable data into the L1 cache rather than just the first
cache line of the page where the DMA started.  If however the start of
a DMA was the start of the page there is nothing there to prevent
that.

>>  I coded this up so that the spots where we touch the
>> memory should match up with addresses provided by the hardware to
>> perform the DMA over the PCI bus.
>
> Yes but there's no requirement to do it like this from
> virt POV. You just need to touch each page.

I know, but at the same time if we match up with the DMA then it is
more likely that we avoid grabbing unneeded cache lines.  In the case
of most drivers the data for headers and start is at the start of the
DMA.  So if we dirty the cache line associated with the start of the
DMA it will be pulled into the L1 cache and there is a greater chance
that it may already be prefetched as well.

>> Also I intentionally ran from highest address to lowest since that way
>> we don't risk pushing the first cache line of the DMA buffer out of
>> the L1 cache due to the PAGE_SIZE stride.
>
> Interesting. How does order of access help with this?

If you use a PAGE_SIZE stride you will start evicting things from L1
cache after something like 8 accesses on an x86 processor as most of
the recent ones have a 32K 8 way associative L1 cache.  So if I go
from back to front then I evict the stuff that would likely be in the
data portion of a buffer instead of headers which are usually located
at the front.

> By the way, if you are into these micro-optimizations you might want to
> limit prefetch, to this end you want to access the last line of the
> page.  And it's probably worth benchmarking a bit and not doing it all just
> based on theory, keep code simple in v1 otherwise.

My main goal for now is functional code over high performance code.
That is why I have kept this code fairly simple.  I might have done
some optimization but it was as much about the optimization as keeping
the code simple.  For example by using the start of the page instead
of the end I could easily do the comparison against start and avoid
doing more than one write per page.

The issue for me doing performance testing is that I don't have
anything that uses DMA blocks that are actually big enough to make use
of the PAGE_SIZE stride.  That is why the PAGE_SIZE stride portion is
mostly just theoretical.  I just have a few NICs and most of them only
allocate 1 page or so for DMA buffers.  What little benchmarking I
have done with netperf only showed a ~1% CPU penalty for the page
dirtying code.  For setups where we did more with the DMA such as
small packet handling I would expect that value to increase, but I
still wouldn't expect to see a penalty of much more than ~5% most
likely since there are still a number of other items that are calling
atomic operations as well such as the code for releasing pages.

- Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Dec. 14, 2015, 8:52 p.m. UTC | #5
On Mon, Dec 14, 2015 at 09:59:13AM -0800, Alexander Duyck wrote:
> On Mon, Dec 14, 2015 at 9:20 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Dec 14, 2015 at 08:34:00AM -0800, Alexander Duyck wrote:
> >> > This way distro can use a guest agent to disable
> >> > dirtying until before migration starts.
> >>
> >> Right.  For a v2 version I would definitely want to have some way to
> >> limit the scope of this.  My main reason for putting this out here is
> >> to start altering the course of discussions since it seems like were
> >> weren't getting anywhere with the ixgbevf migration changes that were
> >> being proposed.
> >
> > Absolutely, thanks for working on this.
> >
> >> >> +     unsigned long pg_addr, start;
> >> >> +
> >> >> +     start = (unsigned long)addr;
> >> >> +     pg_addr = PAGE_ALIGN(start + size);
> >> >> +     start &= ~(sizeof(atomic_t) - 1);
> >> >> +
> >> >> +     /* trigger a write fault on each page, excluding first page */
> >> >> +     while ((pg_addr -= PAGE_SIZE) > start)
> >> >> +             atomic_add(0, (atomic_t *)pg_addr);
> >> >> +
> >> >> +     /* trigger a write fault on first word of DMA */
> >> >> +     atomic_add(0, (atomic_t *)start);

Actually, I have second thoughts about using atomic_add here,
especially for _sync.

Many architectures do

#define ATOMIC_OP_RETURN(op, c_op)                                      \
static inline int atomic_##op##_return(int i, atomic_t *v)              \
{                                                                       \
        unsigned long flags;                                            \
        int ret;                                                        \
                                                                        \
        raw_local_irq_save(flags);                                      \
        ret = (v->counter = v->counter c_op i);                         \
        raw_local_irq_restore(flags);                                   \
                                                                        \
        return ret;                                                     \
}

and this is not safe if device is still doing DMA to/from
this memory.

Generally, atomic_t is there for SMP effects, not for sync
with devices.

This is why I said you should do
	cmpxchg(pg_addr, 0xdead, 0xdead); 

Yes, we probably never actually want to run m68k within a VM,
but let's not misuse interfaces like this.


> >> >
> >> > start might not be aligned correctly for a cast to atomic_t.
> >> > It's harmless to do this for any memory, so I think you should
> >> > just do this for 1st byte of all pages including the first one.
> >>
> >> You may not have noticed it but I actually aligned start in the line
> >> after pg_addr.
> >
> > Yes you did. alignof would make it a bit more noticeable.
> >
> >>  However instead of aligning to the start of the next
> >> atomic_t I just masked off the lower bits so that we start at the
> >> DWORD that contains the first byte of the starting address.  The
> >> assumption here is that I cannot trigger any sort of fault since if I
> >> have access to a given byte within a DWORD I will have access to the
> >> entire DWORD.
> >
> > I'm curious where does this come from.  Isn't it true that access is
> > controlled at page granularity normally, so you can touch beginning of
> > page just as well?
> 
> Yeah, I am pretty sure it probably is page granularity.  However my
> thought was to try and stick to the start of the DMA as the last
> access.  That way we don't pull in any more cache lines than we need
> to in order to dirty the pages.  Usually the start of the DMA region
> will contain some sort of headers or something that needs to be
> accessed with the highest priority so I wanted to make certain that we
> were forcing usable data into the L1 cache rather than just the first
> cache line of the page where the DMA started.  If however the start of
> a DMA was the start of the page there is nothing there to prevent
> that.

OK, maybe this helps. You should document all these tricks
in code comments.

> >>  I coded this up so that the spots where we touch the
> >> memory should match up with addresses provided by the hardware to
> >> perform the DMA over the PCI bus.
> >
> > Yes but there's no requirement to do it like this from
> > virt POV. You just need to touch each page.
> 
> I know, but at the same time if we match up with the DMA then it is
> more likely that we avoid grabbing unneeded cache lines.  In the case
> of most drivers the data for headers and start is at the start of the
> DMA.  So if we dirty the cache line associated with the start of the
> DMA it will be pulled into the L1 cache and there is a greater chance
> that it may already be prefetched as well.
> 
> >> Also I intentionally ran from highest address to lowest since that way
> >> we don't risk pushing the first cache line of the DMA buffer out of
> >> the L1 cache due to the PAGE_SIZE stride.
> >
> > Interesting. How does order of access help with this?
> 
> If you use a PAGE_SIZE stride you will start evicting things from L1
> cache after something like 8 accesses on an x86 processor as most of
> the recent ones have a 32K 8 way associative L1 cache.  So if I go
> from back to front then I evict the stuff that would likely be in the
> data portion of a buffer instead of headers which are usually located
> at the front.

I see, interesting.

> > By the way, if you are into these micro-optimizations you might want to
> > limit prefetch, to this end you want to access the last line of the
> > page.  And it's probably worth benchmarking a bit and not doing it all just
> > based on theory, keep code simple in v1 otherwise.
> 
> My main goal for now is functional code over high performance code.
> That is why I have kept this code fairly simple.  I might have done
> some optimization but it was as much about the optimization as keeping
> the code simple.

Well you were trying to avoid putting extra stress on
the cache, and it seems clear to me that prefetch
is not your friend here. So
-             atomic_add(0, (atomic_t *)pg_addr);
+             atomic_add(0, (atomic_t *)(pg_addr + PAGE_SIZE - sizeof(atomic_t));
(or whatever we change atomic_t to) is probably a win.

>  For example by using the start of the page instead
> of the end I could easily do the comparison against start and avoid
> doing more than one write per page.

That's probably worth fixing, we don't want two atomics
if we can help it.

-     while ((pg_addr -= PAGE_SIZE) > start)
+     while ((pg_addr -= PAGE_SIZE) >= PAGE_ALIGN(start + PAGE_SIZE))

will do it with no fuss.

> The issue for me doing performance testing is that I don't have
> anything that uses DMA blocks that are actually big enough to make use
> of the PAGE_SIZE stride.  That is why the PAGE_SIZE stride portion is
> mostly just theoretical.  I just have a few NICs and most of them only
> allocate 1 page or so for DMA buffers.  What little benchmarking I
> have done with netperf only showed a ~1% CPU penalty for the page
> dirtying code.  For setups where we did more with the DMA such as
> small packet handling I would expect that value to increase, but I
> still wouldn't expect to see a penalty of much more than ~5% most
> likely since there are still a number of other items that are calling
> atomic operations as well such as the code for releasing pages.
> 
> - Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander H Duyck Dec. 14, 2015, 10:32 p.m. UTC | #6
On Mon, Dec 14, 2015 at 12:52 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Dec 14, 2015 at 09:59:13AM -0800, Alexander Duyck wrote:
>> On Mon, Dec 14, 2015 at 9:20 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> > On Mon, Dec 14, 2015 at 08:34:00AM -0800, Alexander Duyck wrote:
>> >> > This way distro can use a guest agent to disable
>> >> > dirtying until before migration starts.
>> >>
>> >> Right.  For a v2 version I would definitely want to have some way to
>> >> limit the scope of this.  My main reason for putting this out here is
>> >> to start altering the course of discussions since it seems like were
>> >> weren't getting anywhere with the ixgbevf migration changes that were
>> >> being proposed.
>> >
>> > Absolutely, thanks for working on this.
>> >
>> >> >> +     unsigned long pg_addr, start;
>> >> >> +
>> >> >> +     start = (unsigned long)addr;
>> >> >> +     pg_addr = PAGE_ALIGN(start + size);
>> >> >> +     start &= ~(sizeof(atomic_t) - 1);
>> >> >> +
>> >> >> +     /* trigger a write fault on each page, excluding first page */
>> >> >> +     while ((pg_addr -= PAGE_SIZE) > start)
>> >> >> +             atomic_add(0, (atomic_t *)pg_addr);
>> >> >> +
>> >> >> +     /* trigger a write fault on first word of DMA */
>> >> >> +     atomic_add(0, (atomic_t *)start);
>
> Actually, I have second thoughts about using atomic_add here,
> especially for _sync.
>
> Many architectures do
>
> #define ATOMIC_OP_RETURN(op, c_op)                                      \
> static inline int atomic_##op##_return(int i, atomic_t *v)              \
> {                                                                       \
>         unsigned long flags;                                            \
>         int ret;                                                        \
>                                                                         \
>         raw_local_irq_save(flags);                                      \
>         ret = (v->counter = v->counter c_op i);                         \
>         raw_local_irq_restore(flags);                                   \
>                                                                         \
>         return ret;                                                     \
> }
>
> and this is not safe if device is still doing DMA to/from
> this memory.
>
> Generally, atomic_t is there for SMP effects, not for sync
> with devices.
>
> This is why I said you should do
>         cmpxchg(pg_addr, 0xdead, 0xdead);
>
> Yes, we probably never actually want to run m68k within a VM,
> but let's not misuse interfaces like this.

Right now this implementation is for x86 only.  Any other architecture
currently reports dma_mark_dirty as an empty inline function.  The
reason why I chose the atomic_add for x86 is simply because it is
guaranteed dirty the cache line with relatively few instructions and
operands as all I have to have is the pointer and 0.

For the m68k we could implement it as a cmpxchg instead.  The general
thought here is that each architecture is probably going to have to do
it a little bit differently.

>> >> >
>> >> > start might not be aligned correctly for a cast to atomic_t.
>> >> > It's harmless to do this for any memory, so I think you should
>> >> > just do this for 1st byte of all pages including the first one.
>> >>
>> >> You may not have noticed it but I actually aligned start in the line
>> >> after pg_addr.
>> >
>> > Yes you did. alignof would make it a bit more noticeable.
>> >
>> >>  However instead of aligning to the start of the next
>> >> atomic_t I just masked off the lower bits so that we start at the
>> >> DWORD that contains the first byte of the starting address.  The
>> >> assumption here is that I cannot trigger any sort of fault since if I
>> >> have access to a given byte within a DWORD I will have access to the
>> >> entire DWORD.
>> >
>> > I'm curious where does this come from.  Isn't it true that access is
>> > controlled at page granularity normally, so you can touch beginning of
>> > page just as well?
>>
>> Yeah, I am pretty sure it probably is page granularity.  However my
>> thought was to try and stick to the start of the DMA as the last
>> access.  That way we don't pull in any more cache lines than we need
>> to in order to dirty the pages.  Usually the start of the DMA region
>> will contain some sort of headers or something that needs to be
>> accessed with the highest priority so I wanted to make certain that we
>> were forcing usable data into the L1 cache rather than just the first
>> cache line of the page where the DMA started.  If however the start of
>> a DMA was the start of the page there is nothing there to prevent
>> that.
>
> OK, maybe this helps. You should document all these tricks
> in code comments.

I'll try to get that taken care of for v2.

>> >>  I coded this up so that the spots where we touch the
>> >> memory should match up with addresses provided by the hardware to
>> >> perform the DMA over the PCI bus.
>> >
>> > Yes but there's no requirement to do it like this from
>> > virt POV. You just need to touch each page.
>>
>> I know, but at the same time if we match up with the DMA then it is
>> more likely that we avoid grabbing unneeded cache lines.  In the case
>> of most drivers the data for headers and start is at the start of the
>> DMA.  So if we dirty the cache line associated with the start of the
>> DMA it will be pulled into the L1 cache and there is a greater chance
>> that it may already be prefetched as well.
>>
>> >> Also I intentionally ran from highest address to lowest since that way
>> >> we don't risk pushing the first cache line of the DMA buffer out of
>> >> the L1 cache due to the PAGE_SIZE stride.
>> >
>> > Interesting. How does order of access help with this?
>>
>> If you use a PAGE_SIZE stride you will start evicting things from L1
>> cache after something like 8 accesses on an x86 processor as most of
>> the recent ones have a 32K 8 way associative L1 cache.  So if I go
>> from back to front then I evict the stuff that would likely be in the
>> data portion of a buffer instead of headers which are usually located
>> at the front.
>
> I see, interesting.
>
>> > By the way, if you are into these micro-optimizations you might want to
>> > limit prefetch, to this end you want to access the last line of the
>> > page.  And it's probably worth benchmarking a bit and not doing it all just
>> > based on theory, keep code simple in v1 otherwise.
>>
>> My main goal for now is functional code over high performance code.
>> That is why I have kept this code fairly simple.  I might have done
>> some optimization but it was as much about the optimization as keeping
>> the code simple.
>
> Well you were trying to avoid putting extra stress on
> the cache, and it seems clear to me that prefetch
> is not your friend here. So
> -             atomic_add(0, (atomic_t *)pg_addr);
> +             atomic_add(0, (atomic_t *)(pg_addr + PAGE_SIZE - sizeof(atomic_t));
> (or whatever we change atomic_t to) is probably a win.

What is the advantage to writing to the last field in the page versus
the first?  I think that is the part I am not getting.

>>  For example by using the start of the page instead
>> of the end I could easily do the comparison against start and avoid
>> doing more than one write per page.
>
> That's probably worth fixing, we don't want two atomics
> if we can help it.
>
> -     while ((pg_addr -= PAGE_SIZE) > start)
> +     while ((pg_addr -= PAGE_SIZE) >= PAGE_ALIGN(start + PAGE_SIZE))
>
> will do it with no fuss.

I'm still not seeing what the gain here is.  It just seems like it is
making things more complicated.

The main goal of keeping things inside the DMA is to keep us from
doing too much cache bouncing.  Us reaching out and dirtying cache
lines that we aren't actually using seems to be really wasteful.  If
for example a page was split between two CPUs with one doing DMA on
one half, and one doing DMA on another I wouldn't want to have both
devices dirtying the same cache line, I would rather have them each
marking a separate cache line in order to avoid cache thrash.  By
having the start aligned with the start of the DMA, and all of the
other entries aligned with the start of pages contained within the DMA
we can avoid that since devices are generally working with at least
cache aligned buffers.

- Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index ccb3aa64640d..1962d7b471c7 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -167,7 +167,8 @@  static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 	return 1;
 }
 
-static inline void dma_mark_clean(void *addr, size_t size) { }
+static inline void dma_mark_clean(void *addr, size_t size) {}
+static inline void dma_mark_dirty(void *addr, size_t size) {}
 
 extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
 
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index 61e08f360e31..8d24fe11c8a3 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -84,9 +84,8 @@  static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 	return addr + size - 1 <= *dev->dma_mask;
 }
 
-static inline void dma_mark_clean(void *addr, size_t size)
-{
-}
+static inline void dma_mark_clean(void *addr, size_t size) {}
+static inline void dma_mark_dirty(void *addr, size_t size) {}
 
 #endif	/* __KERNEL__ */
 #endif	/* __ASM_DMA_MAPPING_H */
diff --git a/arch/ia64/include/asm/dma.h b/arch/ia64/include/asm/dma.h
index 4d97f60f1ef5..d92ebeb2758e 100644
--- a/arch/ia64/include/asm/dma.h
+++ b/arch/ia64/include/asm/dma.h
@@ -20,5 +20,6 @@  extern unsigned long MAX_DMA_ADDRESS;
 #define free_dma(x)
 
 void dma_mark_clean(void *addr, size_t size);
+static inline void dma_mark_dirty(void *addr, size_t size) {}
 
 #endif /* _ASM_IA64_DMA_H */
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index e604f760c4a0..567f6e03e337 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -28,6 +28,7 @@  static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 }
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
+static inline void dma_mark_dirty(void *addr, size_t size) {}
 
 #include <asm-generic/dma-mapping-common.h>
 
diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h
index de99d6e29430..b694e8399e28 100644
--- a/arch/powerpc/include/asm/swiotlb.h
+++ b/arch/powerpc/include/asm/swiotlb.h
@@ -16,6 +16,7 @@ 
 extern struct dma_map_ops swiotlb_dma_ops;
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
+static inline void dma_mark_dirty(void *addr, size_t size) {}
 
 extern unsigned int ppc_swiotlb_enable;
 int __init swiotlb_setup_bus_notifier(void);
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index 96ac6cce4a32..79953f09e938 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -58,6 +58,7 @@  static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 }
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
+static inline void dma_mark_dirty(void *addr, size_t size) {}
 
 static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
 {
diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h
index 8140e053ccd3..b9d357ab122d 100644
--- a/arch/unicore32/include/asm/dma-mapping.h
+++ b/arch/unicore32/include/asm/dma-mapping.h
@@ -49,6 +49,7 @@  static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 }
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
+static inline void dma_mark_dirty(void *addr, size_t size) {}
 
 static inline void dma_cache_sync(struct device *dev, void *vaddr,
 		size_t size, enum dma_data_direction direction)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db3622f22b61..f0b09156d7d8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -841,6 +841,17 @@  config SWIOTLB
 	  with more than 3 GB of memory.
 	  If unsure, say Y.
 
+config SWIOTLB_PAGE_DIRTYING
+	bool "SWIOTLB page dirtying"
+	depends on SWIOTLB
+	default n
+	---help---
+	  SWIOTLB page dirtying support provides a means for the guest to
+	  trigger write faults on pages which received DMA from the device
+	  without changing the data contained within.  By doing this the
+	  guest can then support migration assuming the device and any
+	  remaining pages are unmapped prior to the CPU itself being halted.
+
 config IOMMU_HELPER
 	def_bool y
 	depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index ab05d73e2bb7..7f9f2e76d081 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -29,6 +29,32 @@  static inline void pci_swiotlb_late_init(void)
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
+/*
+ * Make certain that the pages get marked as dirty
+ * now that the device has completed the DMA transaction.
+ *
+ * Without this we run the risk of a guest migration missing
+ * the pages that the device has written to as they are not
+ * tracked as a part of the dirty page tracking.
+ */
+static inline void dma_mark_dirty(void *addr, size_t size)
+{
+#ifdef CONFIG_SWIOTLB_PAGE_DIRTYING
+	unsigned long pg_addr, start;
+
+	start = (unsigned long)addr;
+	pg_addr = PAGE_ALIGN(start + size);
+	start &= ~(sizeof(atomic_t) - 1);
+
+	/* trigger a write fault on each page, excluding first page */
+	while ((pg_addr -= PAGE_SIZE) > start)
+		atomic_add(0, (atomic_t *)pg_addr);
+
+	/* trigger a write fault on first word of DMA */
+	atomic_add(0, (atomic_t *)start);
+#endif /* CONFIG_SWIOTLB_PAGE_DIRTYING */
+}
+
 extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 					dma_addr_t *dma_handle, gfp_t flags,
 					struct dma_attrs *attrs);
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 2154c70e47da..1533b3eefb67 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -456,6 +456,9 @@  void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
 	 */
 	if (dir == DMA_FROM_DEVICE)
 		dma_mark_clean(phys_to_virt(paddr), size);
+
+	if (dir != DMA_TO_DEVICE)
+		dma_mark_dirty(phys_to_virt(paddr), size);
 }
 EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
 
@@ -485,6 +488,9 @@  xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
 
 	if (dir == DMA_FROM_DEVICE)
 		dma_mark_clean(phys_to_virt(paddr), size);
+
+	if (dir != DMA_TO_DEVICE)
+		dma_mark_dirty(phys_to_virt(paddr), size);
 }
 EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_cpu);
 
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 384ac06217b2..4223d6c54724 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -802,6 +802,9 @@  void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
 	 */
 	if (dir == DMA_FROM_DEVICE)
 		dma_mark_clean(phys_to_virt(paddr), size);
+
+	if (dir != DMA_TO_DEVICE)
+		dma_mark_dirty(phys_to_virt(paddr), size);
 }
 EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
 
@@ -830,6 +833,9 @@  swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
 
 	if (dir == DMA_FROM_DEVICE)
 		dma_mark_clean(phys_to_virt(paddr), size);
+
+	if (dir != DMA_TO_DEVICE)
+		dma_mark_dirty(phys_to_virt(paddr), size);
 }
 EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);