diff mbox series

[v5,06/27] drm/amdgpu: Handle IOMMU enabled case.

Message ID 20210428151207.1212258-7-andrey.grodzovsky@amd.com
State New
Headers show
Series RFC Support hot device unplug in amdgpu | expand

Commit Message

Andrey Grodzovsky April 28, 2021, 3:11 p.m. UTC
Handle all DMA IOMMU gropup related dependencies before the
group is removed.

v5: Drop IOMMU notifier and switch to lockless call to ttm_tt_unpopulate

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 ++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   |  3 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  9 +++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 13 ++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/cik_ih.c        |  1 -
 drivers/gpu/drm/amd/amdgpu/cz_ih.c         |  1 -
 drivers/gpu/drm/amd/amdgpu/iceland_ih.c    |  1 -
 drivers/gpu/drm/amd/amdgpu/navi10_ih.c     |  3 ---
 drivers/gpu/drm/amd/amdgpu/si_ih.c         |  1 -
 drivers/gpu/drm/amd/amdgpu/tonga_ih.c      |  1 -
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c     |  3 ---
 14 files changed, 56 insertions(+), 16 deletions(-)

Comments

Christian König April 29, 2021, 7:08 a.m. UTC | #1
Am 28.04.21 um 17:11 schrieb Andrey Grodzovsky:
> Handle all DMA IOMMU gropup related dependencies before the
> group is removed.
>
> v5: Drop IOMMU notifier and switch to lockless call to ttm_tt_unpopulate

Maybe split that up into more patches.

>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 ++++++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   |  3 +--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h   |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  9 +++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 13 ++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
>   drivers/gpu/drm/amd/amdgpu/cik_ih.c        |  1 -
>   drivers/gpu/drm/amd/amdgpu/cz_ih.c         |  1 -
>   drivers/gpu/drm/amd/amdgpu/iceland_ih.c    |  1 -
>   drivers/gpu/drm/amd/amdgpu/navi10_ih.c     |  3 ---
>   drivers/gpu/drm/amd/amdgpu/si_ih.c         |  1 -
>   drivers/gpu/drm/amd/amdgpu/tonga_ih.c      |  1 -
>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c     |  3 ---
>   14 files changed, 56 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index fddb82897e5d..30a24db5f4d1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1054,6 +1054,8 @@ struct amdgpu_device {
>   
>   	bool                            in_pci_err_recovery;
>   	struct pci_saved_state          *pci_state;
> +
> +	struct list_head                device_bo_list;
>   };
>   
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 46d646c40338..91594ddc2459 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -70,6 +70,7 @@
>   #include <drm/task_barrier.h>
>   #include <linux/pm_runtime.h>
>   
> +
>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>   MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
> @@ -3211,7 +3212,6 @@ static const struct attribute *amdgpu_dev_attributes[] = {
>   	NULL
>   };
>   
> -
>   /**
>    * amdgpu_device_init - initialize the driver
>    *
> @@ -3316,6 +3316,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   
>   	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>   
> +	INIT_LIST_HEAD(&adev->device_bo_list);
> +
>   	adev->gfx.gfx_off_req_count = 1;
>   	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
>   
> @@ -3601,6 +3603,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	return r;
>   }
>   
> +static void amdgpu_clear_dma_mappings(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_bo *bo = NULL;
> +
> +	/*
> +	 * Unmaps all DMA mappings before device will be removed from it's
> +	 * IOMMU group otherwise in case of IOMMU enabled system a crash
> +	 * will happen.
> +	 */
> +
> +	spin_lock(&adev->mman.bdev.lru_lock);
> +	while (!list_empty(&adev->device_bo_list)) {
> +		bo = list_first_entry(&adev->device_bo_list, struct amdgpu_bo, bo);
> +		list_del_init(&bo->bo);
> +		spin_unlock(&adev->mman.bdev.lru_lock);
> +		if (bo->tbo.ttm)
> +			ttm_tt_unpopulate(bo->tbo.bdev, bo->tbo.ttm);
> +		spin_lock(&adev->mman.bdev.lru_lock);
> +	}
> +	spin_unlock(&adev->mman.bdev.lru_lock);

Can you try to use the same approach as amdgpu_gtt_mgr_recover() instead 
of adding something to the BO?

Christian.

> +}
> +
>   /**
>    * amdgpu_device_fini - tear down the driver
>    *
> @@ -3639,12 +3663,15 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
>   		amdgpu_ucode_sysfs_fini(adev);
>   	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
>   
> -
>   	amdgpu_fbdev_fini(adev);
>   
>   	amdgpu_irq_fini_hw(adev);
>   
>   	amdgpu_device_ip_fini_early(adev);
> +
> +	amdgpu_clear_dma_mappings(adev);
> +
> +	amdgpu_gart_dummy_page_fini(adev);
>   }
>   
>   void amdgpu_device_fini_sw(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> index fde2d899b2c4..49cdcaf8512d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> @@ -92,7 +92,7 @@ static int amdgpu_gart_dummy_page_init(struct amdgpu_device *adev)
>    *
>    * Frees the dummy page used by the driver (all asics).
>    */
> -static void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>   {
>   	if (!adev->dummy_page_addr)
>   		return;
> @@ -397,5 +397,4 @@ void amdgpu_gart_fini(struct amdgpu_device *adev)
>   	vfree(adev->gart.pages);
>   	adev->gart.pages = NULL;
>   #endif
> -	amdgpu_gart_dummy_page_fini(adev);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> index afa2e2877d87..5678d9c105ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> @@ -61,6 +61,7 @@ int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev);
>   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>   int amdgpu_gart_init(struct amdgpu_device *adev);
>   void amdgpu_gart_fini(struct amdgpu_device *adev);
> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev);
>   int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>   		       int pages);
>   int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 63e815c27585..a922154953a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -326,6 +326,15 @@ void amdgpu_irq_fini_hw(struct amdgpu_device *adev)
>   		if (!amdgpu_device_has_dc_support(adev))
>   			flush_work(&adev->hotplug_work);
>   	}
> +
> +	if (adev->irq.ih_soft.ring)
> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
> +	if (adev->irq.ih.ring)
> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih);
> +	if (adev->irq.ih1.ring)
> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
> +	if (adev->irq.ih2.ring)
> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 485f249d063a..62d829f5e62c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -68,8 +68,13 @@ static void amdgpu_bo_destroy(struct ttm_buffer_object *tbo)
>   		list_del_init(&bo->shadow_list);
>   		mutex_unlock(&adev->shadow_list_lock);
>   	}
> -	amdgpu_bo_unref(&bo->parent);
>   
> +
> +	spin_lock(&adev->mman.bdev.lru_lock);
> +	list_del(&bo->bo);
> +	spin_unlock(&adev->mman.bdev.lru_lock);
> +
> +	amdgpu_bo_unref(&bo->parent);
>   	kfree(bo->metadata);
>   	kfree(bo);
>   }
> @@ -585,6 +590,12 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>   	if (bp->type == ttm_bo_type_device)
>   		bo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>   
> +	INIT_LIST_HEAD(&bo->bo);
> +
> +	spin_lock(&adev->mman.bdev.lru_lock);
> +	list_add_tail(&bo->bo, &adev->device_bo_list);
> +	spin_unlock(&adev->mman.bdev.lru_lock);
> +
>   	return 0;
>   
>   fail_unreserve:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 9ac37569823f..5ae8555ef275 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -110,6 +110,8 @@ struct amdgpu_bo {
>   	struct list_head		shadow_list;
>   
>   	struct kgd_mem                  *kfd_bo;
> +
> +	struct list_head		bo;
>   };
>   
>   static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> index 183d44a6583c..df385ffc9768 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> @@ -310,7 +310,6 @@ static int cik_ih_sw_fini(void *handle)
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   	amdgpu_irq_fini_sw(adev);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>   	amdgpu_irq_remove_domain(adev);
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> index d32743949003..b8c47e0cf37a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> @@ -302,7 +302,6 @@ static int cz_ih_sw_fini(void *handle)
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   	amdgpu_irq_fini_sw(adev);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>   	amdgpu_irq_remove_domain(adev);
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> index da96c6013477..ddfe4eaeea05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> @@ -301,7 +301,6 @@ static int iceland_ih_sw_fini(void *handle)
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   	amdgpu_irq_fini_sw(adev);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>   	amdgpu_irq_remove_domain(adev);
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> index 5eea4550b856..e171a9e78544 100644
> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> @@ -571,9 +571,6 @@ static int navi10_ih_sw_fini(void *handle)
>   
>   	amdgpu_irq_fini_sw(adev);
>   	amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>   
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c
> index 751307f3252c..9a24f17a5750 100644
> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
> @@ -176,7 +176,6 @@ static int si_ih_sw_fini(void *handle)
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   	amdgpu_irq_fini_sw(adev);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>   
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> index 973d80ec7f6c..b08905d1c00f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> @@ -313,7 +313,6 @@ static int tonga_ih_sw_fini(void *handle)
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   	amdgpu_irq_fini_sw(adev);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>   	amdgpu_irq_remove_domain(adev);
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> index 2d0094c276ca..8c8abc00f710 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> @@ -525,9 +525,6 @@ static int vega10_ih_sw_fini(void *handle)
>   
>   	amdgpu_irq_fini_sw(adev);
>   	amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>   
>   	return 0;
>   }
Alex Deucher April 30, 2021, 3:13 a.m. UTC | #2
On Wed, Apr 28, 2021 at 11:13 AM Andrey Grodzovsky
<andrey.grodzovsky@amd.com> wrote:
>
> Handle all DMA IOMMU gropup related dependencies before the
> group is removed.
>
> v5: Drop IOMMU notifier and switch to lockless call to ttm_tt_unpopulate
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 ++++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   |  3 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h   |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  9 +++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 13 ++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
>  drivers/gpu/drm/amd/amdgpu/cik_ih.c        |  1 -
>  drivers/gpu/drm/amd/amdgpu/cz_ih.c         |  1 -
>  drivers/gpu/drm/amd/amdgpu/iceland_ih.c    |  1 -
>  drivers/gpu/drm/amd/amdgpu/navi10_ih.c     |  3 ---
>  drivers/gpu/drm/amd/amdgpu/si_ih.c         |  1 -
>  drivers/gpu/drm/amd/amdgpu/tonga_ih.c      |  1 -
>  drivers/gpu/drm/amd/amdgpu/vega10_ih.c     |  3 ---
>  14 files changed, 56 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index fddb82897e5d..30a24db5f4d1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1054,6 +1054,8 @@ struct amdgpu_device {
>
>         bool                            in_pci_err_recovery;
>         struct pci_saved_state          *pci_state;
> +
> +       struct list_head                device_bo_list;
>  };
>
>  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 46d646c40338..91594ddc2459 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -70,6 +70,7 @@
>  #include <drm/task_barrier.h>
>  #include <linux/pm_runtime.h>
>
> +
>  MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>  MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>  MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
> @@ -3211,7 +3212,6 @@ static const struct attribute *amdgpu_dev_attributes[] = {
>         NULL
>  };
>
> -
>  /**
>   * amdgpu_device_init - initialize the driver
>   *
> @@ -3316,6 +3316,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>
>         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>
> +       INIT_LIST_HEAD(&adev->device_bo_list);
> +
>         adev->gfx.gfx_off_req_count = 1;
>         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
>
> @@ -3601,6 +3603,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         return r;
>  }
>
> +static void amdgpu_clear_dma_mappings(struct amdgpu_device *adev)

Prefix this with amdgpu_device for consistency.  E.g.,
amdgpu_device_clear_dma_mappings()

> +{
> +       struct amdgpu_bo *bo = NULL;
> +
> +       /*
> +        * Unmaps all DMA mappings before device will be removed from it's
> +        * IOMMU group otherwise in case of IOMMU enabled system a crash
> +        * will happen.
> +        */
> +
> +       spin_lock(&adev->mman.bdev.lru_lock);
> +       while (!list_empty(&adev->device_bo_list)) {
> +               bo = list_first_entry(&adev->device_bo_list, struct amdgpu_bo, bo);
> +               list_del_init(&bo->bo);
> +               spin_unlock(&adev->mman.bdev.lru_lock);
> +               if (bo->tbo.ttm)
> +                       ttm_tt_unpopulate(bo->tbo.bdev, bo->tbo.ttm);
> +               spin_lock(&adev->mman.bdev.lru_lock);
> +       }
> +       spin_unlock(&adev->mman.bdev.lru_lock);
> +}
> +
>  /**
>   * amdgpu_device_fini - tear down the driver
>   *
> @@ -3639,12 +3663,15 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
>                 amdgpu_ucode_sysfs_fini(adev);
>         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
>
> -
>         amdgpu_fbdev_fini(adev);
>
>         amdgpu_irq_fini_hw(adev);
>
>         amdgpu_device_ip_fini_early(adev);
> +
> +       amdgpu_clear_dma_mappings(adev);
> +
> +       amdgpu_gart_dummy_page_fini(adev);
>  }
>
>  void amdgpu_device_fini_sw(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> index fde2d899b2c4..49cdcaf8512d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> @@ -92,7 +92,7 @@ static int amdgpu_gart_dummy_page_init(struct amdgpu_device *adev)
>   *
>   * Frees the dummy page used by the driver (all asics).
>   */
> -static void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>  {
>         if (!adev->dummy_page_addr)
>                 return;
> @@ -397,5 +397,4 @@ void amdgpu_gart_fini(struct amdgpu_device *adev)
>         vfree(adev->gart.pages);
>         adev->gart.pages = NULL;
>  #endif
> -       amdgpu_gart_dummy_page_fini(adev);
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> index afa2e2877d87..5678d9c105ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> @@ -61,6 +61,7 @@ int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev);
>  void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>  int amdgpu_gart_init(struct amdgpu_device *adev);
>  void amdgpu_gart_fini(struct amdgpu_device *adev);
> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev);
>  int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>                        int pages);
>  int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 63e815c27585..a922154953a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -326,6 +326,15 @@ void amdgpu_irq_fini_hw(struct amdgpu_device *adev)
>                 if (!amdgpu_device_has_dc_support(adev))
>                         flush_work(&adev->hotplug_work);
>         }
> +
> +       if (adev->irq.ih_soft.ring)
> +               amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
> +       if (adev->irq.ih.ring)
> +               amdgpu_ih_ring_fini(adev, &adev->irq.ih);
> +       if (adev->irq.ih1.ring)
> +               amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
> +       if (adev->irq.ih2.ring)
> +               amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>  }
>
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 485f249d063a..62d829f5e62c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -68,8 +68,13 @@ static void amdgpu_bo_destroy(struct ttm_buffer_object *tbo)
>                 list_del_init(&bo->shadow_list);
>                 mutex_unlock(&adev->shadow_list_lock);
>         }
> -       amdgpu_bo_unref(&bo->parent);
>
> +
> +       spin_lock(&adev->mman.bdev.lru_lock);
> +       list_del(&bo->bo);
> +       spin_unlock(&adev->mman.bdev.lru_lock);
> +
> +       amdgpu_bo_unref(&bo->parent);
>         kfree(bo->metadata);
>         kfree(bo);
>  }
> @@ -585,6 +590,12 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>         if (bp->type == ttm_bo_type_device)
>                 bo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>
> +       INIT_LIST_HEAD(&bo->bo);
> +
> +       spin_lock(&adev->mman.bdev.lru_lock);
> +       list_add_tail(&bo->bo, &adev->device_bo_list);
> +       spin_unlock(&adev->mman.bdev.lru_lock);
> +
>         return 0;
>
>  fail_unreserve:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 9ac37569823f..5ae8555ef275 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -110,6 +110,8 @@ struct amdgpu_bo {
>         struct list_head                shadow_list;
>
>         struct kgd_mem                  *kfd_bo;
> +
> +       struct list_head                bo;
>  };
>
>  static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> index 183d44a6583c..df385ffc9768 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> @@ -310,7 +310,6 @@ static int cik_ih_sw_fini(void *handle)
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>
>         amdgpu_irq_fini_sw(adev);
> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>         amdgpu_irq_remove_domain(adev);
>
>         return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> index d32743949003..b8c47e0cf37a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> @@ -302,7 +302,6 @@ static int cz_ih_sw_fini(void *handle)
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>
>         amdgpu_irq_fini_sw(adev);
> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>         amdgpu_irq_remove_domain(adev);
>
>         return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> index da96c6013477..ddfe4eaeea05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> @@ -301,7 +301,6 @@ static int iceland_ih_sw_fini(void *handle)
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>
>         amdgpu_irq_fini_sw(adev);
> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>         amdgpu_irq_remove_domain(adev);
>
>         return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> index 5eea4550b856..e171a9e78544 100644
> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> @@ -571,9 +571,6 @@ static int navi10_ih_sw_fini(void *handle)
>
>         amdgpu_irq_fini_sw(adev);
>         amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);

Shouldn't the soft ring be removed as well?

> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>
>         return 0;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c
> index 751307f3252c..9a24f17a5750 100644
> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
> @@ -176,7 +176,6 @@ static int si_ih_sw_fini(void *handle)
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>
>         amdgpu_irq_fini_sw(adev);
> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>
>         return 0;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> index 973d80ec7f6c..b08905d1c00f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> @@ -313,7 +313,6 @@ static int tonga_ih_sw_fini(void *handle)
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>
>         amdgpu_irq_fini_sw(adev);
> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>         amdgpu_irq_remove_domain(adev);
>
>         return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> index 2d0094c276ca..8c8abc00f710 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> @@ -525,9 +525,6 @@ static int vega10_ih_sw_fini(void *handle)
>
>         amdgpu_irq_fini_sw(adev);
>         amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);

Same here?

> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>
>         return 0;
>  }
> --
> 2.25.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Andrey Grodzovsky May 3, 2021, 6 p.m. UTC | #3
On 2021-04-29 11:13 p.m., Alex Deucher wrote:
> On Wed, Apr 28, 2021 at 11:13 AM Andrey Grodzovsky
> <andrey.grodzovsky@amd.com> wrote:
>>
>> Handle all DMA IOMMU gropup related dependencies before the
>> group is removed.
>>
>> v5: Drop IOMMU notifier and switch to lockless call to ttm_tt_unpopulate
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 ++++++++++++++++++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   |  3 +--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h   |  1 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  9 +++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 13 ++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/cik_ih.c        |  1 -
>>   drivers/gpu/drm/amd/amdgpu/cz_ih.c         |  1 -
>>   drivers/gpu/drm/amd/amdgpu/iceland_ih.c    |  1 -
>>   drivers/gpu/drm/amd/amdgpu/navi10_ih.c     |  3 ---
>>   drivers/gpu/drm/amd/amdgpu/si_ih.c         |  1 -
>>   drivers/gpu/drm/amd/amdgpu/tonga_ih.c      |  1 -
>>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c     |  3 ---
>>   14 files changed, 56 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index fddb82897e5d..30a24db5f4d1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1054,6 +1054,8 @@ struct amdgpu_device {
>>
>>          bool                            in_pci_err_recovery;
>>          struct pci_saved_state          *pci_state;
>> +
>> +       struct list_head                device_bo_list;
>>   };
>>
>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 46d646c40338..91594ddc2459 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -70,6 +70,7 @@
>>   #include <drm/task_barrier.h>
>>   #include <linux/pm_runtime.h>
>>
>> +
>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>>   MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
>> @@ -3211,7 +3212,6 @@ static const struct attribute *amdgpu_dev_attributes[] = {
>>          NULL
>>   };
>>
>> -
>>   /**
>>    * amdgpu_device_init - initialize the driver
>>    *
>> @@ -3316,6 +3316,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>
>>          INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>>
>> +       INIT_LIST_HEAD(&adev->device_bo_list);
>> +
>>          adev->gfx.gfx_off_req_count = 1;
>>          adev->pm.ac_power = power_supply_is_system_supplied() > 0;
>>
>> @@ -3601,6 +3603,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>          return r;
>>   }
>>
>> +static void amdgpu_clear_dma_mappings(struct amdgpu_device *adev)
> 
> Prefix this with amdgpu_device for consistency.  E.g.,
> amdgpu_device_clear_dma_mappings()
> 
>> +{
>> +       struct amdgpu_bo *bo = NULL;
>> +
>> +       /*
>> +        * Unmaps all DMA mappings before device will be removed from it's
>> +        * IOMMU group otherwise in case of IOMMU enabled system a crash
>> +        * will happen.
>> +        */
>> +
>> +       spin_lock(&adev->mman.bdev.lru_lock);
>> +       while (!list_empty(&adev->device_bo_list)) {
>> +               bo = list_first_entry(&adev->device_bo_list, struct amdgpu_bo, bo);
>> +               list_del_init(&bo->bo);
>> +               spin_unlock(&adev->mman.bdev.lru_lock);
>> +               if (bo->tbo.ttm)
>> +                       ttm_tt_unpopulate(bo->tbo.bdev, bo->tbo.ttm);
>> +               spin_lock(&adev->mman.bdev.lru_lock);
>> +       }
>> +       spin_unlock(&adev->mman.bdev.lru_lock);
>> +}
>> +
>>   /**
>>    * amdgpu_device_fini - tear down the driver
>>    *
>> @@ -3639,12 +3663,15 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
>>                  amdgpu_ucode_sysfs_fini(adev);
>>          sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
>>
>> -
>>          amdgpu_fbdev_fini(adev);
>>
>>          amdgpu_irq_fini_hw(adev);
>>
>>          amdgpu_device_ip_fini_early(adev);
>> +
>> +       amdgpu_clear_dma_mappings(adev);
>> +
>> +       amdgpu_gart_dummy_page_fini(adev);
>>   }
>>
>>   void amdgpu_device_fini_sw(struct amdgpu_device *adev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> index fde2d899b2c4..49cdcaf8512d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> @@ -92,7 +92,7 @@ static int amdgpu_gart_dummy_page_init(struct amdgpu_device *adev)
>>    *
>>    * Frees the dummy page used by the driver (all asics).
>>    */
>> -static void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>>   {
>>          if (!adev->dummy_page_addr)
>>                  return;
>> @@ -397,5 +397,4 @@ void amdgpu_gart_fini(struct amdgpu_device *adev)
>>          vfree(adev->gart.pages);
>>          adev->gart.pages = NULL;
>>   #endif
>> -       amdgpu_gart_dummy_page_fini(adev);
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>> index afa2e2877d87..5678d9c105ab 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>> @@ -61,6 +61,7 @@ int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev);
>>   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>>   int amdgpu_gart_init(struct amdgpu_device *adev);
>>   void amdgpu_gart_fini(struct amdgpu_device *adev);
>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev);
>>   int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>                         int pages);
>>   int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> index 63e815c27585..a922154953a7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> @@ -326,6 +326,15 @@ void amdgpu_irq_fini_hw(struct amdgpu_device *adev)
>>                  if (!amdgpu_device_has_dc_support(adev))
>>                          flush_work(&adev->hotplug_work);
>>          }
>> +
>> +       if (adev->irq.ih_soft.ring)
>> +               amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>> +       if (adev->irq.ih.ring)
>> +               amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>> +       if (adev->irq.ih1.ring)
>> +               amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>> +       if (adev->irq.ih2.ring)
>> +               amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>   }
>>
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index 485f249d063a..62d829f5e62c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -68,8 +68,13 @@ static void amdgpu_bo_destroy(struct ttm_buffer_object *tbo)
>>                  list_del_init(&bo->shadow_list);
>>                  mutex_unlock(&adev->shadow_list_lock);
>>          }
>> -       amdgpu_bo_unref(&bo->parent);
>>
>> +
>> +       spin_lock(&adev->mman.bdev.lru_lock);
>> +       list_del(&bo->bo);
>> +       spin_unlock(&adev->mman.bdev.lru_lock);
>> +
>> +       amdgpu_bo_unref(&bo->parent);
>>          kfree(bo->metadata);
>>          kfree(bo);
>>   }
>> @@ -585,6 +590,12 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>>          if (bp->type == ttm_bo_type_device)
>>                  bo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>>
>> +       INIT_LIST_HEAD(&bo->bo);
>> +
>> +       spin_lock(&adev->mman.bdev.lru_lock);
>> +       list_add_tail(&bo->bo, &adev->device_bo_list);
>> +       spin_unlock(&adev->mman.bdev.lru_lock);
>> +
>>          return 0;
>>
>>   fail_unreserve:
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> index 9ac37569823f..5ae8555ef275 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> @@ -110,6 +110,8 @@ struct amdgpu_bo {
>>          struct list_head                shadow_list;
>>
>>          struct kgd_mem                  *kfd_bo;
>> +
>> +       struct list_head                bo;
>>   };
>>
>>   static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> index 183d44a6583c..df385ffc9768 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> @@ -310,7 +310,6 @@ static int cik_ih_sw_fini(void *handle)
>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>
>>          amdgpu_irq_fini_sw(adev);
>> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>          amdgpu_irq_remove_domain(adev);
>>
>>          return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> index d32743949003..b8c47e0cf37a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> @@ -302,7 +302,6 @@ static int cz_ih_sw_fini(void *handle)
>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>
>>          amdgpu_irq_fini_sw(adev);
>> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>          amdgpu_irq_remove_domain(adev);
>>
>>          return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> index da96c6013477..ddfe4eaeea05 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> @@ -301,7 +301,6 @@ static int iceland_ih_sw_fini(void *handle)
>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>
>>          amdgpu_irq_fini_sw(adev);
>> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>          amdgpu_irq_remove_domain(adev);
>>
>>          return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> index 5eea4550b856..e171a9e78544 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> @@ -571,9 +571,6 @@ static int navi10_ih_sw_fini(void *handle)
>>
>>          amdgpu_irq_fini_sw(adev);
>>          amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
> 
> Shouldn't the soft ring be removed as well?

Right, thanks for noticing.

> 
>> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>
>>          return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> index 751307f3252c..9a24f17a5750 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> @@ -176,7 +176,6 @@ static int si_ih_sw_fini(void *handle)
>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>
>>          amdgpu_irq_fini_sw(adev);
>> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>
>>          return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> index 973d80ec7f6c..b08905d1c00f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> @@ -313,7 +313,6 @@ static int tonga_ih_sw_fini(void *handle)
>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>
>>          amdgpu_irq_fini_sw(adev);
>> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>          amdgpu_irq_remove_domain(adev);
>>
>>          return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> index 2d0094c276ca..8c8abc00f710 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> @@ -525,9 +525,6 @@ static int vega10_ih_sw_fini(void *handle)
>>
>>          amdgpu_irq_fini_sw(adev);
>>          amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
> 
> Same here?

Yep.

Andrey

> 
>> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>> -       amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>
>>          return 0;
>>   }
>> --
>> 2.25.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Candrey.grodzovsky%40amd.com%7C6b1f10b7bfb5491e025008d90b86044a%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637553492499443932%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=WagbcvZG9xsaX6cKlMG%2FzMD%2FlYQlUSgxSQjnUHB8Myc%3D&amp;reserved=0
Andrey Grodzovsky May 3, 2021, 8:43 p.m. UTC | #4
On 2021-04-29 3:08 a.m., Christian König wrote:
> Am 28.04.21 um 17:11 schrieb Andrey Grodzovsky:
>> Handle all DMA IOMMU gropup related dependencies before the
>> group is removed.
>>
>> v5: Drop IOMMU notifier and switch to lockless call to ttm_tt_unpopulate
> 
> Maybe split that up into more patches.
> 
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 ++++++++++++++++++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   |  3 +--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h   |  1 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  9 +++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 13 ++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/cik_ih.c        |  1 -
>>   drivers/gpu/drm/amd/amdgpu/cz_ih.c         |  1 -
>>   drivers/gpu/drm/amd/amdgpu/iceland_ih.c    |  1 -
>>   drivers/gpu/drm/amd/amdgpu/navi10_ih.c     |  3 ---
>>   drivers/gpu/drm/amd/amdgpu/si_ih.c         |  1 -
>>   drivers/gpu/drm/amd/amdgpu/tonga_ih.c      |  1 -
>>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c     |  3 ---
>>   14 files changed, 56 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index fddb82897e5d..30a24db5f4d1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1054,6 +1054,8 @@ struct amdgpu_device {
>>       bool                            in_pci_err_recovery;
>>       struct pci_saved_state          *pci_state;
>> +
>> +    struct list_head                device_bo_list;
>>   };
>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>> *ddev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 46d646c40338..91594ddc2459 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -70,6 +70,7 @@
>>   #include <drm/task_barrier.h>
>>   #include <linux/pm_runtime.h>
>> +
>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>>   MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
>> @@ -3211,7 +3212,6 @@ static const struct attribute 
>> *amdgpu_dev_attributes[] = {
>>       NULL
>>   };
>> -
>>   /**
>>    * amdgpu_device_init - initialize the driver
>>    *
>> @@ -3316,6 +3316,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>> +    INIT_LIST_HEAD(&adev->device_bo_list);
>> +
>>       adev->gfx.gfx_off_req_count = 1;
>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;
>> @@ -3601,6 +3603,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>       return r;
>>   }
>> +static void amdgpu_clear_dma_mappings(struct amdgpu_device *adev)
>> +{
>> +    struct amdgpu_bo *bo = NULL;
>> +
>> +    /*
>> +     * Unmaps all DMA mappings before device will be removed from it's
>> +     * IOMMU group otherwise in case of IOMMU enabled system a crash
>> +     * will happen.
>> +     */
>> +
>> +    spin_lock(&adev->mman.bdev.lru_lock);
>> +    while (!list_empty(&adev->device_bo_list)) {
>> +        bo = list_first_entry(&adev->device_bo_list, struct 
>> amdgpu_bo, bo);
>> +        list_del_init(&bo->bo);
>> +        spin_unlock(&adev->mman.bdev.lru_lock);
>> +        if (bo->tbo.ttm)
>> +            ttm_tt_unpopulate(bo->tbo.bdev, bo->tbo.ttm);
>> +        spin_lock(&adev->mman.bdev.lru_lock);
>> +    }
>> +    spin_unlock(&adev->mman.bdev.lru_lock);
> 
> Can you try to use the same approach as amdgpu_gtt_mgr_recover() instead 
> of adding something to the BO?
> 
> Christian.

Are you sure that dma mappings limit themself only to GTT BOs
which have allocated mm nodes ? Otherwsie we will crash and burn
on missing IOMMU group when unampping post device remove.
Problem for me to test this as in 5.12 kernel I don't crash even
when removing this entire patch.  Looks like iommu_dma_unmap_page
was changed since 5.9 when I introdiced this patch.

Andrey

> 
>> +}
>> +
>>   /**
>>    * amdgpu_device_fini - tear down the driver
>>    *
>> @@ -3639,12 +3663,15 @@ void amdgpu_device_fini_hw(struct 
>> amdgpu_device *adev)
>>           amdgpu_ucode_sysfs_fini(adev);
>>       sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
>> -
>>       amdgpu_fbdev_fini(adev);
>>       amdgpu_irq_fini_hw(adev);
>>       amdgpu_device_ip_fini_early(adev);
>> +
>> +    amdgpu_clear_dma_mappings(adev);
>> +
>> +    amdgpu_gart_dummy_page_fini(adev);
>>   }
>>   void amdgpu_device_fini_sw(struct amdgpu_device *adev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> index fde2d899b2c4..49cdcaf8512d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> @@ -92,7 +92,7 @@ static int amdgpu_gart_dummy_page_init(struct 
>> amdgpu_device *adev)
>>    *
>>    * Frees the dummy page used by the driver (all asics).
>>    */
>> -static void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>>   {
>>       if (!adev->dummy_page_addr)
>>           return;
>> @@ -397,5 +397,4 @@ void amdgpu_gart_fini(struct amdgpu_device *adev)
>>       vfree(adev->gart.pages);
>>       adev->gart.pages = NULL;
>>   #endif
>> -    amdgpu_gart_dummy_page_fini(adev);
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>> index afa2e2877d87..5678d9c105ab 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>> @@ -61,6 +61,7 @@ int amdgpu_gart_table_vram_pin(struct amdgpu_device 
>> *adev);
>>   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>>   int amdgpu_gart_init(struct amdgpu_device *adev);
>>   void amdgpu_gart_fini(struct amdgpu_device *adev);
>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev);
>>   int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>                  int pages);
>>   int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> index 63e815c27585..a922154953a7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> @@ -326,6 +326,15 @@ void amdgpu_irq_fini_hw(struct amdgpu_device *adev)
>>           if (!amdgpu_device_has_dc_support(adev))
>>               flush_work(&adev->hotplug_work);
>>       }
>> +
>> +    if (adev->irq.ih_soft.ring)
>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>> +    if (adev->irq.ih.ring)
>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>> +    if (adev->irq.ih1.ring)
>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>> +    if (adev->irq.ih2.ring)
>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>   }
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index 485f249d063a..62d829f5e62c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -68,8 +68,13 @@ static void amdgpu_bo_destroy(struct 
>> ttm_buffer_object *tbo)
>>           list_del_init(&bo->shadow_list);
>>           mutex_unlock(&adev->shadow_list_lock);
>>       }
>> -    amdgpu_bo_unref(&bo->parent);
>> +
>> +    spin_lock(&adev->mman.bdev.lru_lock);
>> +    list_del(&bo->bo);
>> +    spin_unlock(&adev->mman.bdev.lru_lock);
>> +
>> +    amdgpu_bo_unref(&bo->parent);
>>       kfree(bo->metadata);
>>       kfree(bo);
>>   }
>> @@ -585,6 +590,12 @@ static int amdgpu_bo_do_create(struct 
>> amdgpu_device *adev,
>>       if (bp->type == ttm_bo_type_device)
>>           bo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>> +    INIT_LIST_HEAD(&bo->bo);
>> +
>> +    spin_lock(&adev->mman.bdev.lru_lock);
>> +    list_add_tail(&bo->bo, &adev->device_bo_list);
>> +    spin_unlock(&adev->mman.bdev.lru_lock);
>> +
>>       return 0;
>>   fail_unreserve:
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> index 9ac37569823f..5ae8555ef275 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> @@ -110,6 +110,8 @@ struct amdgpu_bo {
>>       struct list_head        shadow_list;
>>       struct kgd_mem                  *kfd_bo;
>> +
>> +    struct list_head        bo;
>>   };
>>   static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct 
>> ttm_buffer_object *tbo)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c 
>> b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> index 183d44a6583c..df385ffc9768 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> @@ -310,7 +310,6 @@ static int cik_ih_sw_fini(void *handle)
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>       amdgpu_irq_fini_sw(adev);
>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>       amdgpu_irq_remove_domain(adev);
>>       return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c 
>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> index d32743949003..b8c47e0cf37a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> @@ -302,7 +302,6 @@ static int cz_ih_sw_fini(void *handle)
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>       amdgpu_irq_fini_sw(adev);
>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>       amdgpu_irq_remove_domain(adev);
>>       return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c 
>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> index da96c6013477..ddfe4eaeea05 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> @@ -301,7 +301,6 @@ static int iceland_ih_sw_fini(void *handle)
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>       amdgpu_irq_fini_sw(adev);
>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>       amdgpu_irq_remove_domain(adev);
>>       return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c 
>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> index 5eea4550b856..e171a9e78544 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> @@ -571,9 +571,6 @@ static int navi10_ih_sw_fini(void *handle)
>>       amdgpu_irq_fini_sw(adev);
>>       amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>       return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c 
>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> index 751307f3252c..9a24f17a5750 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> @@ -176,7 +176,6 @@ static int si_ih_sw_fini(void *handle)
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>       amdgpu_irq_fini_sw(adev);
>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>       return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c 
>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> index 973d80ec7f6c..b08905d1c00f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> @@ -313,7 +313,6 @@ static int tonga_ih_sw_fini(void *handle)
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>       amdgpu_irq_fini_sw(adev);
>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>       amdgpu_irq_remove_domain(adev);
>>       return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c 
>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> index 2d0094c276ca..8c8abc00f710 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> @@ -525,9 +525,6 @@ static int vega10_ih_sw_fini(void *handle)
>>       amdgpu_irq_fini_sw(adev);
>>       amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>       return 0;
>>   }
>
Christian König May 4, 2021, 7:03 a.m. UTC | #5
Am 03.05.21 um 22:43 schrieb Andrey Grodzovsky:
>
>
> On 2021-04-29 3:08 a.m., Christian König wrote:
>> Am 28.04.21 um 17:11 schrieb Andrey Grodzovsky:
>>> Handle all DMA IOMMU gropup related dependencies before the
>>> group is removed.
>>>
>>> v5: Drop IOMMU notifier and switch to lockless call to 
>>> ttm_tt_unpopulate
>>
>> Maybe split that up into more patches.
>>
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 
>>> ++++++++++++++++++++--
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   |  3 +--
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h   |  1 +
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  9 +++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 13 ++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
>>>   drivers/gpu/drm/amd/amdgpu/cik_ih.c        |  1 -
>>>   drivers/gpu/drm/amd/amdgpu/cz_ih.c         |  1 -
>>>   drivers/gpu/drm/amd/amdgpu/iceland_ih.c    |  1 -
>>>   drivers/gpu/drm/amd/amdgpu/navi10_ih.c     |  3 ---
>>>   drivers/gpu/drm/amd/amdgpu/si_ih.c         |  1 -
>>>   drivers/gpu/drm/amd/amdgpu/tonga_ih.c      |  1 -
>>>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c     |  3 ---
>>>   14 files changed, 56 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index fddb82897e5d..30a24db5f4d1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1054,6 +1054,8 @@ struct amdgpu_device {
>>>       bool                            in_pci_err_recovery;
>>>       struct pci_saved_state          *pci_state;
>>> +
>>> +    struct list_head                device_bo_list;
>>>   };
>>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>>> *ddev)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 46d646c40338..91594ddc2459 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -70,6 +70,7 @@
>>>   #include <drm/task_barrier.h>
>>>   #include <linux/pm_runtime.h>
>>> +
>>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>>>   MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
>>> @@ -3211,7 +3212,6 @@ static const struct attribute 
>>> *amdgpu_dev_attributes[] = {
>>>       NULL
>>>   };
>>> -
>>>   /**
>>>    * amdgpu_device_init - initialize the driver
>>>    *
>>> @@ -3316,6 +3316,8 @@ int amdgpu_device_init(struct amdgpu_device 
>>> *adev,
>>>       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>>> +    INIT_LIST_HEAD(&adev->device_bo_list);
>>> +
>>>       adev->gfx.gfx_off_req_count = 1;
>>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;
>>> @@ -3601,6 +3603,28 @@ int amdgpu_device_init(struct amdgpu_device 
>>> *adev,
>>>       return r;
>>>   }
>>> +static void amdgpu_clear_dma_mappings(struct amdgpu_device *adev)
>>> +{
>>> +    struct amdgpu_bo *bo = NULL;
>>> +
>>> +    /*
>>> +     * Unmaps all DMA mappings before device will be removed from it's
>>> +     * IOMMU group otherwise in case of IOMMU enabled system a crash
>>> +     * will happen.
>>> +     */
>>> +
>>> +    spin_lock(&adev->mman.bdev.lru_lock);
>>> +    while (!list_empty(&adev->device_bo_list)) {
>>> +        bo = list_first_entry(&adev->device_bo_list, struct 
>>> amdgpu_bo, bo);
>>> +        list_del_init(&bo->bo);
>>> +        spin_unlock(&adev->mman.bdev.lru_lock);
>>> +        if (bo->tbo.ttm)
>>> +            ttm_tt_unpopulate(bo->tbo.bdev, bo->tbo.ttm);
>>> +        spin_lock(&adev->mman.bdev.lru_lock);
>>> +    }
>>> +    spin_unlock(&adev->mman.bdev.lru_lock);
>>
>> Can you try to use the same approach as amdgpu_gtt_mgr_recover() 
>> instead of adding something to the BO?
>>
>> Christian.
>
> Are you sure that dma mappings limit themself only to GTT BOs
> which have allocated mm nodes ?

Yes, you would also need the system domain BOs. But those can be put on 
a similar list.

> Otherwsie we will crash and burn
> on missing IOMMU group when unampping post device remove.
> Problem for me to test this as in 5.12 kernel I don't crash even
> when removing this entire patch.  Looks like iommu_dma_unmap_page
> was changed since 5.9 when I introdiced this patch.

Do we really still need that stuff then? What exactly has changed?

Christian.

>
> Andrey
>
>>
>>> +}
>>> +
>>>   /**
>>>    * amdgpu_device_fini - tear down the driver
>>>    *
>>> @@ -3639,12 +3663,15 @@ void amdgpu_device_fini_hw(struct 
>>> amdgpu_device *adev)
>>>           amdgpu_ucode_sysfs_fini(adev);
>>>       sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
>>> -
>>>       amdgpu_fbdev_fini(adev);
>>>       amdgpu_irq_fini_hw(adev);
>>>       amdgpu_device_ip_fini_early(adev);
>>> +
>>> +    amdgpu_clear_dma_mappings(adev);
>>> +
>>> +    amdgpu_gart_dummy_page_fini(adev);
>>>   }
>>>   void amdgpu_device_fini_sw(struct amdgpu_device *adev)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>> index fde2d899b2c4..49cdcaf8512d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>> @@ -92,7 +92,7 @@ static int amdgpu_gart_dummy_page_init(struct 
>>> amdgpu_device *adev)
>>>    *
>>>    * Frees the dummy page used by the driver (all asics).
>>>    */
>>> -static void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>>>   {
>>>       if (!adev->dummy_page_addr)
>>>           return;
>>> @@ -397,5 +397,4 @@ void amdgpu_gart_fini(struct amdgpu_device *adev)
>>>       vfree(adev->gart.pages);
>>>       adev->gart.pages = NULL;
>>>   #endif
>>> -    amdgpu_gart_dummy_page_fini(adev);
>>>   }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>>> index afa2e2877d87..5678d9c105ab 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>>> @@ -61,6 +61,7 @@ int amdgpu_gart_table_vram_pin(struct 
>>> amdgpu_device *adev);
>>>   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>>>   int amdgpu_gart_init(struct amdgpu_device *adev);
>>>   void amdgpu_gart_fini(struct amdgpu_device *adev);
>>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev);
>>>   int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>>                  int pages);
>>>   int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>>> index 63e815c27585..a922154953a7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>>> @@ -326,6 +326,15 @@ void amdgpu_irq_fini_hw(struct amdgpu_device 
>>> *adev)
>>>           if (!amdgpu_device_has_dc_support(adev))
>>>               flush_work(&adev->hotplug_work);
>>>       }
>>> +
>>> +    if (adev->irq.ih_soft.ring)
>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>>> +    if (adev->irq.ih.ring)
>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>> +    if (adev->irq.ih1.ring)
>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>>> +    if (adev->irq.ih2.ring)
>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>>   }
>>>   /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> index 485f249d063a..62d829f5e62c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> @@ -68,8 +68,13 @@ static void amdgpu_bo_destroy(struct 
>>> ttm_buffer_object *tbo)
>>>           list_del_init(&bo->shadow_list);
>>>           mutex_unlock(&adev->shadow_list_lock);
>>>       }
>>> -    amdgpu_bo_unref(&bo->parent);
>>> +
>>> +    spin_lock(&adev->mman.bdev.lru_lock);
>>> +    list_del(&bo->bo);
>>> +    spin_unlock(&adev->mman.bdev.lru_lock);
>>> +
>>> +    amdgpu_bo_unref(&bo->parent);
>>>       kfree(bo->metadata);
>>>       kfree(bo);
>>>   }
>>> @@ -585,6 +590,12 @@ static int amdgpu_bo_do_create(struct 
>>> amdgpu_device *adev,
>>>       if (bp->type == ttm_bo_type_device)
>>>           bo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>>> +    INIT_LIST_HEAD(&bo->bo);
>>> +
>>> +    spin_lock(&adev->mman.bdev.lru_lock);
>>> +    list_add_tail(&bo->bo, &adev->device_bo_list);
>>> +    spin_unlock(&adev->mman.bdev.lru_lock);
>>> +
>>>       return 0;
>>>   fail_unreserve:
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> index 9ac37569823f..5ae8555ef275 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> @@ -110,6 +110,8 @@ struct amdgpu_bo {
>>>       struct list_head        shadow_list;
>>>       struct kgd_mem                  *kfd_bo;
>>> +
>>> +    struct list_head        bo;
>>>   };
>>>   static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct 
>>> ttm_buffer_object *tbo)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c 
>>> b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>> index 183d44a6583c..df385ffc9768 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>> @@ -310,7 +310,6 @@ static int cik_ih_sw_fini(void *handle)
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>       amdgpu_irq_fini_sw(adev);
>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>       amdgpu_irq_remove_domain(adev);
>>>       return 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c 
>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>> index d32743949003..b8c47e0cf37a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>> @@ -302,7 +302,6 @@ static int cz_ih_sw_fini(void *handle)
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>       amdgpu_irq_fini_sw(adev);
>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>       amdgpu_irq_remove_domain(adev);
>>>       return 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c 
>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>> index da96c6013477..ddfe4eaeea05 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>> @@ -301,7 +301,6 @@ static int iceland_ih_sw_fini(void *handle)
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>       amdgpu_irq_fini_sw(adev);
>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>       amdgpu_irq_remove_domain(adev);
>>>       return 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c 
>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>> index 5eea4550b856..e171a9e78544 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>> @@ -571,9 +571,6 @@ static int navi10_ih_sw_fini(void *handle)
>>>       amdgpu_irq_fini_sw(adev);
>>>       amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>       return 0;
>>>   }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c 
>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>> index 751307f3252c..9a24f17a5750 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>> @@ -176,7 +176,6 @@ static int si_ih_sw_fini(void *handle)
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>       amdgpu_irq_fini_sw(adev);
>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>       return 0;
>>>   }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c 
>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>> index 973d80ec7f6c..b08905d1c00f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>> @@ -313,7 +313,6 @@ static int tonga_ih_sw_fini(void *handle)
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>       amdgpu_irq_fini_sw(adev);
>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>       amdgpu_irq_remove_domain(adev);
>>>       return 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c 
>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>> index 2d0094c276ca..8c8abc00f710 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>> @@ -525,9 +525,6 @@ static int vega10_ih_sw_fini(void *handle)
>>>       amdgpu_irq_fini_sw(adev);
>>>       amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>       return 0;
>>>   }
>>
Andrey Grodzovsky May 4, 2021, 3:43 p.m. UTC | #6
On 2021-05-04 3:03 a.m., Christian König wrote:
> Am 03.05.21 um 22:43 schrieb Andrey Grodzovsky:
>>
>>
>> On 2021-04-29 3:08 a.m., Christian König wrote:
>>> Am 28.04.21 um 17:11 schrieb Andrey Grodzovsky:
>>>> Handle all DMA IOMMU gropup related dependencies before the
>>>> group is removed.
>>>>
>>>> v5: Drop IOMMU notifier and switch to lockless call to 
>>>> ttm_tt_unpopulate
>>>
>>> Maybe split that up into more patches.
>>>
>>>>
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 
>>>> ++++++++++++++++++++--
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   |  3 +--
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h   |  1 +
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  9 +++++++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 13 ++++++++-
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
>>>>   drivers/gpu/drm/amd/amdgpu/cik_ih.c        |  1 -
>>>>   drivers/gpu/drm/amd/amdgpu/cz_ih.c         |  1 -
>>>>   drivers/gpu/drm/amd/amdgpu/iceland_ih.c    |  1 -
>>>>   drivers/gpu/drm/amd/amdgpu/navi10_ih.c     |  3 ---
>>>>   drivers/gpu/drm/amd/amdgpu/si_ih.c         |  1 -
>>>>   drivers/gpu/drm/amd/amdgpu/tonga_ih.c      |  1 -
>>>>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c     |  3 ---
>>>>   14 files changed, 56 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index fddb82897e5d..30a24db5f4d1 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -1054,6 +1054,8 @@ struct amdgpu_device {
>>>>       bool                            in_pci_err_recovery;
>>>>       struct pci_saved_state          *pci_state;
>>>> +
>>>> +    struct list_head                device_bo_list;
>>>>   };
>>>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>>>> *ddev)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 46d646c40338..91594ddc2459 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -70,6 +70,7 @@
>>>>   #include <drm/task_barrier.h>
>>>>   #include <linux/pm_runtime.h>
>>>> +
>>>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>>>>   MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
>>>> @@ -3211,7 +3212,6 @@ static const struct attribute 
>>>> *amdgpu_dev_attributes[] = {
>>>>       NULL
>>>>   };
>>>> -
>>>>   /**
>>>>    * amdgpu_device_init - initialize the driver
>>>>    *
>>>> @@ -3316,6 +3316,8 @@ int amdgpu_device_init(struct amdgpu_device 
>>>> *adev,
>>>>       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>>>> +    INIT_LIST_HEAD(&adev->device_bo_list);
>>>> +
>>>>       adev->gfx.gfx_off_req_count = 1;
>>>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;
>>>> @@ -3601,6 +3603,28 @@ int amdgpu_device_init(struct amdgpu_device 
>>>> *adev,
>>>>       return r;
>>>>   }
>>>> +static void amdgpu_clear_dma_mappings(struct amdgpu_device *adev)
>>>> +{
>>>> +    struct amdgpu_bo *bo = NULL;
>>>> +
>>>> +    /*
>>>> +     * Unmaps all DMA mappings before device will be removed from it's
>>>> +     * IOMMU group otherwise in case of IOMMU enabled system a crash
>>>> +     * will happen.
>>>> +     */
>>>> +
>>>> +    spin_lock(&adev->mman.bdev.lru_lock);
>>>> +    while (!list_empty(&adev->device_bo_list)) {
>>>> +        bo = list_first_entry(&adev->device_bo_list, struct 
>>>> amdgpu_bo, bo);
>>>> +        list_del_init(&bo->bo);
>>>> +        spin_unlock(&adev->mman.bdev.lru_lock);
>>>> +        if (bo->tbo.ttm)
>>>> +            ttm_tt_unpopulate(bo->tbo.bdev, bo->tbo.ttm);
>>>> +        spin_lock(&adev->mman.bdev.lru_lock);
>>>> +    }
>>>> +    spin_unlock(&adev->mman.bdev.lru_lock);
>>>
>>> Can you try to use the same approach as amdgpu_gtt_mgr_recover() 
>>> instead of adding something to the BO?
>>>
>>> Christian.
>>
>> Are you sure that dma mappings limit themself only to GTT BOs
>> which have allocated mm nodes ?
> 
> Yes, you would also need the system domain BOs. But those can be put on 
> a similar list.

What list ? Those BOs don't have ttm_resource_manager and so no
drm_mm_node list they all bound to. Should I maintain a list for them
spcifically for the unmap purpuse ?

> 
>> Otherwsie we will crash and burn
>> on missing IOMMU group when unampping post device remove.
>> Problem for me to test this as in 5.12 kernel I don't crash even
>> when removing this entire patch.  Looks like iommu_dma_unmap_page
>> was changed since 5.9 when I introdiced this patch.
> 
> Do we really still need that stuff then? What exactly has changed?

At first I assumed that because of this change 'iommu: Allow the 
dma-iommu api to use bounce buffers'
Which changed iommu_dma_unmap_page to call __iommu_dma_unmap_swiotlb
instead if __iommu_dma_unmap directly. But then i looked inside
__iommu_dma_unmap_swiotlb and it still calls __iommu_dma_unmap
evenetually. So maybe the fact that I moved the amd_ip_funcs.hw_fini
call to inside amdgpu_pci_remove helps.

Andrey


> 
> Christian.
> 
>>
>> Andrey
>>
>>>
>>>> +}
>>>> +
>>>>   /**
>>>>    * amdgpu_device_fini - tear down the driver
>>>>    *
>>>> @@ -3639,12 +3663,15 @@ void amdgpu_device_fini_hw(struct 
>>>> amdgpu_device *adev)
>>>>           amdgpu_ucode_sysfs_fini(adev);
>>>>       sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
>>>> -
>>>>       amdgpu_fbdev_fini(adev);
>>>>       amdgpu_irq_fini_hw(adev);
>>>>       amdgpu_device_ip_fini_early(adev);
>>>> +
>>>> +    amdgpu_clear_dma_mappings(adev);
>>>> +
>>>> +    amdgpu_gart_dummy_page_fini(adev);
>>>>   }
>>>>   void amdgpu_device_fini_sw(struct amdgpu_device *adev)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>> index fde2d899b2c4..49cdcaf8512d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>> @@ -92,7 +92,7 @@ static int amdgpu_gart_dummy_page_init(struct 
>>>> amdgpu_device *adev)
>>>>    *
>>>>    * Frees the dummy page used by the driver (all asics).
>>>>    */
>>>> -static void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>>>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>>>>   {
>>>>       if (!adev->dummy_page_addr)
>>>>           return;
>>>> @@ -397,5 +397,4 @@ void amdgpu_gart_fini(struct amdgpu_device *adev)
>>>>       vfree(adev->gart.pages);
>>>>       adev->gart.pages = NULL;
>>>>   #endif
>>>> -    amdgpu_gart_dummy_page_fini(adev);
>>>>   }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>>>> index afa2e2877d87..5678d9c105ab 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>>>> @@ -61,6 +61,7 @@ int amdgpu_gart_table_vram_pin(struct 
>>>> amdgpu_device *adev);
>>>>   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>>>>   int amdgpu_gart_init(struct amdgpu_device *adev);
>>>>   void amdgpu_gart_fini(struct amdgpu_device *adev);
>>>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev);
>>>>   int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>>>                  int pages);
>>>>   int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>>>> index 63e815c27585..a922154953a7 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>>>> @@ -326,6 +326,15 @@ void amdgpu_irq_fini_hw(struct amdgpu_device 
>>>> *adev)
>>>>           if (!amdgpu_device_has_dc_support(adev))
>>>>               flush_work(&adev->hotplug_work);
>>>>       }
>>>> +
>>>> +    if (adev->irq.ih_soft.ring)
>>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>>>> +    if (adev->irq.ih.ring)
>>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>> +    if (adev->irq.ih1.ring)
>>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>>>> +    if (adev->irq.ih2.ring)
>>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>>>   }
>>>>   /**
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> index 485f249d063a..62d829f5e62c 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> @@ -68,8 +68,13 @@ static void amdgpu_bo_destroy(struct 
>>>> ttm_buffer_object *tbo)
>>>>           list_del_init(&bo->shadow_list);
>>>>           mutex_unlock(&adev->shadow_list_lock);
>>>>       }
>>>> -    amdgpu_bo_unref(&bo->parent);
>>>> +
>>>> +    spin_lock(&adev->mman.bdev.lru_lock);
>>>> +    list_del(&bo->bo);
>>>> +    spin_unlock(&adev->mman.bdev.lru_lock);
>>>> +
>>>> +    amdgpu_bo_unref(&bo->parent);
>>>>       kfree(bo->metadata);
>>>>       kfree(bo);
>>>>   }
>>>> @@ -585,6 +590,12 @@ static int amdgpu_bo_do_create(struct 
>>>> amdgpu_device *adev,
>>>>       if (bp->type == ttm_bo_type_device)
>>>>           bo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>>>> +    INIT_LIST_HEAD(&bo->bo);
>>>> +
>>>> +    spin_lock(&adev->mman.bdev.lru_lock);
>>>> +    list_add_tail(&bo->bo, &adev->device_bo_list);
>>>> +    spin_unlock(&adev->mman.bdev.lru_lock);
>>>> +
>>>>       return 0;
>>>>   fail_unreserve:
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> index 9ac37569823f..5ae8555ef275 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> @@ -110,6 +110,8 @@ struct amdgpu_bo {
>>>>       struct list_head        shadow_list;
>>>>       struct kgd_mem                  *kfd_bo;
>>>> +
>>>> +    struct list_head        bo;
>>>>   };
>>>>   static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct 
>>>> ttm_buffer_object *tbo)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>>> index 183d44a6583c..df385ffc9768 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>>> @@ -310,7 +310,6 @@ static int cik_ih_sw_fini(void *handle)
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>       amdgpu_irq_fini_sw(adev);
>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>       amdgpu_irq_remove_domain(adev);
>>>>       return 0;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>> index d32743949003..b8c47e0cf37a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>> @@ -302,7 +302,6 @@ static int cz_ih_sw_fini(void *handle)
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>       amdgpu_irq_fini_sw(adev);
>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>       amdgpu_irq_remove_domain(adev);
>>>>       return 0;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>> index da96c6013477..ddfe4eaeea05 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>> @@ -301,7 +301,6 @@ static int iceland_ih_sw_fini(void *handle)
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>       amdgpu_irq_fini_sw(adev);
>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>       amdgpu_irq_remove_domain(adev);
>>>>       return 0;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>> index 5eea4550b856..e171a9e78544 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>> @@ -571,9 +571,6 @@ static int navi10_ih_sw_fini(void *handle)
>>>>       amdgpu_irq_fini_sw(adev);
>>>>       amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>       return 0;
>>>>   }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>> index 751307f3252c..9a24f17a5750 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>> @@ -176,7 +176,6 @@ static int si_ih_sw_fini(void *handle)
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>       amdgpu_irq_fini_sw(adev);
>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>       return 0;
>>>>   }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>> index 973d80ec7f6c..b08905d1c00f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>> @@ -313,7 +313,6 @@ static int tonga_ih_sw_fini(void *handle)
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>       amdgpu_irq_fini_sw(adev);
>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>       amdgpu_irq_remove_domain(adev);
>>>>       return 0;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>> index 2d0094c276ca..8c8abc00f710 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>> @@ -525,9 +525,6 @@ static int vega10_ih_sw_fini(void *handle)
>>>>       amdgpu_irq_fini_sw(adev);
>>>>       amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>       return 0;
>>>>   }
>>>
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Candrey.grodzovsky%40amd.com%7C1cee392c0b934cda6c7608d90ecabc41%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637557086175078458%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=C8QBsUQhJa1eWV1YYdQaykUVQGwmCn6OIoWQSrDkWoU%3D&amp;reserved=0 
>
Felix Kuehling May 4, 2021, 5:05 p.m. UTC | #7
Am 2021-04-28 um 11:11 a.m. schrieb Andrey Grodzovsky:
> Handle all DMA IOMMU gropup related dependencies before the
> group is removed.
>
> v5: Drop IOMMU notifier and switch to lockless call to ttm_tt_unpopulate
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 ++++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   |  3 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h   |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  9 +++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 13 ++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
>  drivers/gpu/drm/amd/amdgpu/cik_ih.c        |  1 -
>  drivers/gpu/drm/amd/amdgpu/cz_ih.c         |  1 -
>  drivers/gpu/drm/amd/amdgpu/iceland_ih.c    |  1 -
>  drivers/gpu/drm/amd/amdgpu/navi10_ih.c     |  3 ---
>  drivers/gpu/drm/amd/amdgpu/si_ih.c         |  1 -
>  drivers/gpu/drm/amd/amdgpu/tonga_ih.c      |  1 -
>  drivers/gpu/drm/amd/amdgpu/vega10_ih.c     |  3 ---
>  14 files changed, 56 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index fddb82897e5d..30a24db5f4d1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1054,6 +1054,8 @@ struct amdgpu_device {
>  
>  	bool                            in_pci_err_recovery;
>  	struct pci_saved_state          *pci_state;
> +
> +	struct list_head                device_bo_list;
>  };
>  
>  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 46d646c40338..91594ddc2459 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -70,6 +70,7 @@
>  #include <drm/task_barrier.h>
>  #include <linux/pm_runtime.h>
>  
> +
>  MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>  MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>  MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
> @@ -3211,7 +3212,6 @@ static const struct attribute *amdgpu_dev_attributes[] = {
>  	NULL
>  };
>  
> -
>  /**
>   * amdgpu_device_init - initialize the driver
>   *
> @@ -3316,6 +3316,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>  
>  	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>  
> +	INIT_LIST_HEAD(&adev->device_bo_list);
> +
>  	adev->gfx.gfx_off_req_count = 1;
>  	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
>  
> @@ -3601,6 +3603,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>  	return r;
>  }
>  
> +static void amdgpu_clear_dma_mappings(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_bo *bo = NULL;
> +
> +	/*
> +	 * Unmaps all DMA mappings before device will be removed from it's
> +	 * IOMMU group otherwise in case of IOMMU enabled system a crash
> +	 * will happen.
> +	 */
> +
> +	spin_lock(&adev->mman.bdev.lru_lock);
> +	while (!list_empty(&adev->device_bo_list)) {
> +		bo = list_first_entry(&adev->device_bo_list, struct amdgpu_bo, bo);
> +		list_del_init(&bo->bo);
> +		spin_unlock(&adev->mman.bdev.lru_lock);
> +		if (bo->tbo.ttm)
> +			ttm_tt_unpopulate(bo->tbo.bdev, bo->tbo.ttm);

I have a patch pending (reviewed by Christian) that moves the
dma-unmapping to amdgpu_ttm_backend_unbind. With that patch,
ttm_tt_unpopulate would no longer be the right way to remove the DMA
mapping.

Maybe I'd need to add a check in ttm_tt_unpopulate to call
backend_unbind first, if necessary. Or is there some other mechanism
that moves the BO to the CPU domain before unpopulating it?

Regards,
  Felix


> +		spin_lock(&adev->mman.bdev.lru_lock);
> +	}
> +	spin_unlock(&adev->mman.bdev.lru_lock);
> +}
> +
>  /**
>   * amdgpu_device_fini - tear down the driver
>   *
> @@ -3639,12 +3663,15 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
>  		amdgpu_ucode_sysfs_fini(adev);
>  	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
>  
> -
>  	amdgpu_fbdev_fini(adev);
>  
>  	amdgpu_irq_fini_hw(adev);
>  
>  	amdgpu_device_ip_fini_early(adev);
> +
> +	amdgpu_clear_dma_mappings(adev);
> +
> +	amdgpu_gart_dummy_page_fini(adev);
>  }
>  
>  void amdgpu_device_fini_sw(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> index fde2d899b2c4..49cdcaf8512d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> @@ -92,7 +92,7 @@ static int amdgpu_gart_dummy_page_init(struct amdgpu_device *adev)
>   *
>   * Frees the dummy page used by the driver (all asics).
>   */
> -static void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>  {
>  	if (!adev->dummy_page_addr)
>  		return;
> @@ -397,5 +397,4 @@ void amdgpu_gart_fini(struct amdgpu_device *adev)
>  	vfree(adev->gart.pages);
>  	adev->gart.pages = NULL;
>  #endif
> -	amdgpu_gart_dummy_page_fini(adev);
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> index afa2e2877d87..5678d9c105ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> @@ -61,6 +61,7 @@ int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev);
>  void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>  int amdgpu_gart_init(struct amdgpu_device *adev);
>  void amdgpu_gart_fini(struct amdgpu_device *adev);
> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev);
>  int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>  		       int pages);
>  int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 63e815c27585..a922154953a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -326,6 +326,15 @@ void amdgpu_irq_fini_hw(struct amdgpu_device *adev)
>  		if (!amdgpu_device_has_dc_support(adev))
>  			flush_work(&adev->hotplug_work);
>  	}
> +
> +	if (adev->irq.ih_soft.ring)
> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
> +	if (adev->irq.ih.ring)
> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih);
> +	if (adev->irq.ih1.ring)
> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
> +	if (adev->irq.ih2.ring)
> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 485f249d063a..62d829f5e62c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -68,8 +68,13 @@ static void amdgpu_bo_destroy(struct ttm_buffer_object *tbo)
>  		list_del_init(&bo->shadow_list);
>  		mutex_unlock(&adev->shadow_list_lock);
>  	}
> -	amdgpu_bo_unref(&bo->parent);
>  
> +
> +	spin_lock(&adev->mman.bdev.lru_lock);
> +	list_del(&bo->bo);
> +	spin_unlock(&adev->mman.bdev.lru_lock);
> +
> +	amdgpu_bo_unref(&bo->parent);
>  	kfree(bo->metadata);
>  	kfree(bo);
>  }
> @@ -585,6 +590,12 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>  	if (bp->type == ttm_bo_type_device)
>  		bo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>  
> +	INIT_LIST_HEAD(&bo->bo);
> +
> +	spin_lock(&adev->mman.bdev.lru_lock);
> +	list_add_tail(&bo->bo, &adev->device_bo_list);
> +	spin_unlock(&adev->mman.bdev.lru_lock);
> +
>  	return 0;
>  
>  fail_unreserve:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 9ac37569823f..5ae8555ef275 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -110,6 +110,8 @@ struct amdgpu_bo {
>  	struct list_head		shadow_list;
>  
>  	struct kgd_mem                  *kfd_bo;
> +
> +	struct list_head		bo;
>  };
>  
>  static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> index 183d44a6583c..df385ffc9768 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> @@ -310,7 +310,6 @@ static int cik_ih_sw_fini(void *handle)
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  
>  	amdgpu_irq_fini_sw(adev);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>  	amdgpu_irq_remove_domain(adev);
>  
>  	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> index d32743949003..b8c47e0cf37a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> @@ -302,7 +302,6 @@ static int cz_ih_sw_fini(void *handle)
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  
>  	amdgpu_irq_fini_sw(adev);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>  	amdgpu_irq_remove_domain(adev);
>  
>  	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> index da96c6013477..ddfe4eaeea05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> @@ -301,7 +301,6 @@ static int iceland_ih_sw_fini(void *handle)
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  
>  	amdgpu_irq_fini_sw(adev);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>  	amdgpu_irq_remove_domain(adev);
>  
>  	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> index 5eea4550b856..e171a9e78544 100644
> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> @@ -571,9 +571,6 @@ static int navi10_ih_sw_fini(void *handle)
>  
>  	amdgpu_irq_fini_sw(adev);
>  	amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>  
>  	return 0;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c
> index 751307f3252c..9a24f17a5750 100644
> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
> @@ -176,7 +176,6 @@ static int si_ih_sw_fini(void *handle)
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  
>  	amdgpu_irq_fini_sw(adev);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>  
>  	return 0;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> index 973d80ec7f6c..b08905d1c00f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> @@ -313,7 +313,6 @@ static int tonga_ih_sw_fini(void *handle)
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>  
>  	amdgpu_irq_fini_sw(adev);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>  	amdgpu_irq_remove_domain(adev);
>  
>  	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> index 2d0094c276ca..8c8abc00f710 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> @@ -525,9 +525,6 @@ static int vega10_ih_sw_fini(void *handle)
>  
>  	amdgpu_irq_fini_sw(adev);
>  	amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>  
>  	return 0;
>  }
Andrey Grodzovsky May 5, 2021, 2:05 p.m. UTC | #8
On 2021-05-04 1:05 p.m., Felix Kuehling wrote:
> 
> Am 2021-04-28 um 11:11 a.m. schrieb Andrey Grodzovsky:
>> Handle all DMA IOMMU gropup related dependencies before the
>> group is removed.
>>
>> v5: Drop IOMMU notifier and switch to lockless call to ttm_tt_unpopulate
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 ++++++++++++++++++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   |  3 +--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h   |  1 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  9 +++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 13 ++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/cik_ih.c        |  1 -
>>   drivers/gpu/drm/amd/amdgpu/cz_ih.c         |  1 -
>>   drivers/gpu/drm/amd/amdgpu/iceland_ih.c    |  1 -
>>   drivers/gpu/drm/amd/amdgpu/navi10_ih.c     |  3 ---
>>   drivers/gpu/drm/amd/amdgpu/si_ih.c         |  1 -
>>   drivers/gpu/drm/amd/amdgpu/tonga_ih.c      |  1 -
>>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c     |  3 ---
>>   14 files changed, 56 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index fddb82897e5d..30a24db5f4d1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1054,6 +1054,8 @@ struct amdgpu_device {
>>   
>>   	bool                            in_pci_err_recovery;
>>   	struct pci_saved_state          *pci_state;
>> +
>> +	struct list_head                device_bo_list;
>>   };
>>   
>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 46d646c40338..91594ddc2459 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -70,6 +70,7 @@
>>   #include <drm/task_barrier.h>
>>   #include <linux/pm_runtime.h>
>>   
>> +
>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>>   MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
>> @@ -3211,7 +3212,6 @@ static const struct attribute *amdgpu_dev_attributes[] = {
>>   	NULL
>>   };
>>   
>> -
>>   /**
>>    * amdgpu_device_init - initialize the driver
>>    *
>> @@ -3316,6 +3316,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>   
>>   	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>>   
>> +	INIT_LIST_HEAD(&adev->device_bo_list);
>> +
>>   	adev->gfx.gfx_off_req_count = 1;
>>   	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
>>   
>> @@ -3601,6 +3603,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>   	return r;
>>   }
>>   
>> +static void amdgpu_clear_dma_mappings(struct amdgpu_device *adev)
>> +{
>> +	struct amdgpu_bo *bo = NULL;
>> +
>> +	/*
>> +	 * Unmaps all DMA mappings before device will be removed from it's
>> +	 * IOMMU group otherwise in case of IOMMU enabled system a crash
>> +	 * will happen.
>> +	 */
>> +
>> +	spin_lock(&adev->mman.bdev.lru_lock);
>> +	while (!list_empty(&adev->device_bo_list)) {
>> +		bo = list_first_entry(&adev->device_bo_list, struct amdgpu_bo, bo);
>> +		list_del_init(&bo->bo);
>> +		spin_unlock(&adev->mman.bdev.lru_lock);
>> +		if (bo->tbo.ttm)
>> +			ttm_tt_unpopulate(bo->tbo.bdev, bo->tbo.ttm);
> 
> I have a patch pending (reviewed by Christian) that moves the
> dma-unmapping to amdgpu_ttm_backend_unbind. With that patch,
> ttm_tt_unpopulate would no longer be the right way to remove the DMA
> mapping.
> 
> Maybe I'd need to add a check in ttm_tt_unpopulate to call
> backend_unbind first, if necessary. Or is there some other mechanism
> that moves the BO to the CPU domain before unpopulating it?
> 
> Regards,
>    Felix

At least in the context of this patch we don't move the BO to system
domain but rather preemptively remove DMA mappings before IOMMU grpoup
is gone post pci_remove. So yes, I would say yes, we need to check here
for backend_unbind first.

Andrey

> 
> 
>> +		spin_lock(&adev->mman.bdev.lru_lock);
>> +	}
>> +	spin_unlock(&adev->mman.bdev.lru_lock);
>> +}
>> +
>>   /**
>>    * amdgpu_device_fini - tear down the driver
>>    *
>> @@ -3639,12 +3663,15 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
>>   		amdgpu_ucode_sysfs_fini(adev);
>>   	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
>>   
>> -
>>   	amdgpu_fbdev_fini(adev);
>>   
>>   	amdgpu_irq_fini_hw(adev);
>>   
>>   	amdgpu_device_ip_fini_early(adev);
>> +
>> +	amdgpu_clear_dma_mappings(adev);
>> +
>> +	amdgpu_gart_dummy_page_fini(adev);
>>   }
>>   
>>   void amdgpu_device_fini_sw(struct amdgpu_device *adev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> index fde2d899b2c4..49cdcaf8512d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> @@ -92,7 +92,7 @@ static int amdgpu_gart_dummy_page_init(struct amdgpu_device *adev)
>>    *
>>    * Frees the dummy page used by the driver (all asics).
>>    */
>> -static void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>>   {
>>   	if (!adev->dummy_page_addr)
>>   		return;
>> @@ -397,5 +397,4 @@ void amdgpu_gart_fini(struct amdgpu_device *adev)
>>   	vfree(adev->gart.pages);
>>   	adev->gart.pages = NULL;
>>   #endif
>> -	amdgpu_gart_dummy_page_fini(adev);
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>> index afa2e2877d87..5678d9c105ab 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>> @@ -61,6 +61,7 @@ int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev);
>>   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>>   int amdgpu_gart_init(struct amdgpu_device *adev);
>>   void amdgpu_gart_fini(struct amdgpu_device *adev);
>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev);
>>   int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>   		       int pages);
>>   int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> index 63e815c27585..a922154953a7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>> @@ -326,6 +326,15 @@ void amdgpu_irq_fini_hw(struct amdgpu_device *adev)
>>   		if (!amdgpu_device_has_dc_support(adev))
>>   			flush_work(&adev->hotplug_work);
>>   	}
>> +
>> +	if (adev->irq.ih_soft.ring)
>> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>> +	if (adev->irq.ih.ring)
>> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>> +	if (adev->irq.ih1.ring)
>> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>> +	if (adev->irq.ih2.ring)
>> +		amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>   }
>>   
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index 485f249d063a..62d829f5e62c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -68,8 +68,13 @@ static void amdgpu_bo_destroy(struct ttm_buffer_object *tbo)
>>   		list_del_init(&bo->shadow_list);
>>   		mutex_unlock(&adev->shadow_list_lock);
>>   	}
>> -	amdgpu_bo_unref(&bo->parent);
>>   
>> +
>> +	spin_lock(&adev->mman.bdev.lru_lock);
>> +	list_del(&bo->bo);
>> +	spin_unlock(&adev->mman.bdev.lru_lock);
>> +
>> +	amdgpu_bo_unref(&bo->parent);
>>   	kfree(bo->metadata);
>>   	kfree(bo);
>>   }
>> @@ -585,6 +590,12 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>>   	if (bp->type == ttm_bo_type_device)
>>   		bo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>>   
>> +	INIT_LIST_HEAD(&bo->bo);
>> +
>> +	spin_lock(&adev->mman.bdev.lru_lock);
>> +	list_add_tail(&bo->bo, &adev->device_bo_list);
>> +	spin_unlock(&adev->mman.bdev.lru_lock);
>> +
>>   	return 0;
>>   
>>   fail_unreserve:
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> index 9ac37569823f..5ae8555ef275 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> @@ -110,6 +110,8 @@ struct amdgpu_bo {
>>   	struct list_head		shadow_list;
>>   
>>   	struct kgd_mem                  *kfd_bo;
>> +
>> +	struct list_head		bo;
>>   };
>>   
>>   static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> index 183d44a6583c..df385ffc9768 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>> @@ -310,7 +310,6 @@ static int cik_ih_sw_fini(void *handle)
>>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   
>>   	amdgpu_irq_fini_sw(adev);
>> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>   	amdgpu_irq_remove_domain(adev);
>>   
>>   	return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> index d32743949003..b8c47e0cf37a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>> @@ -302,7 +302,6 @@ static int cz_ih_sw_fini(void *handle)
>>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   
>>   	amdgpu_irq_fini_sw(adev);
>> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>   	amdgpu_irq_remove_domain(adev);
>>   
>>   	return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> index da96c6013477..ddfe4eaeea05 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>> @@ -301,7 +301,6 @@ static int iceland_ih_sw_fini(void *handle)
>>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   
>>   	amdgpu_irq_fini_sw(adev);
>> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>   	amdgpu_irq_remove_domain(adev);
>>   
>>   	return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> index 5eea4550b856..e171a9e78544 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>> @@ -571,9 +571,6 @@ static int navi10_ih_sw_fini(void *handle)
>>   
>>   	amdgpu_irq_fini_sw(adev);
>>   	amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>   
>>   	return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> index 751307f3252c..9a24f17a5750 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>> @@ -176,7 +176,6 @@ static int si_ih_sw_fini(void *handle)
>>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   
>>   	amdgpu_irq_fini_sw(adev);
>> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>   
>>   	return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> index 973d80ec7f6c..b08905d1c00f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>> @@ -313,7 +313,6 @@ static int tonga_ih_sw_fini(void *handle)
>>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   
>>   	amdgpu_irq_fini_sw(adev);
>> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>   	amdgpu_irq_remove_domain(adev);
>>   
>>   	return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> index 2d0094c276ca..8c8abc00f710 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>> @@ -525,9 +525,6 @@ static int vega10_ih_sw_fini(void *handle)
>>   
>>   	amdgpu_irq_fini_sw(adev);
>>   	amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>> -	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>   
>>   	return 0;
>>   }
Andrey Grodzovsky May 5, 2021, 2:51 p.m. UTC | #9
Ping

Andrey

On 2021-05-04 11:43 a.m., Andrey Grodzovsky wrote:
> 
> 
> On 2021-05-04 3:03 a.m., Christian König wrote:
>> Am 03.05.21 um 22:43 schrieb Andrey Grodzovsky:
>>>
>>>
>>> On 2021-04-29 3:08 a.m., Christian König wrote:
>>>> Am 28.04.21 um 17:11 schrieb Andrey Grodzovsky:
>>>>> Handle all DMA IOMMU gropup related dependencies before the
>>>>> group is removed.
>>>>>
>>>>> v5: Drop IOMMU notifier and switch to lockless call to 
>>>>> ttm_tt_unpopulate
>>>>
>>>> Maybe split that up into more patches.
>>>>
>>>>>
>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 
>>>>> ++++++++++++++++++++--
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   |  3 +--
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h   |  1 +
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  9 +++++++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 13 ++++++++-
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
>>>>>   drivers/gpu/drm/amd/amdgpu/cik_ih.c        |  1 -
>>>>>   drivers/gpu/drm/amd/amdgpu/cz_ih.c         |  1 -
>>>>>   drivers/gpu/drm/amd/amdgpu/iceland_ih.c    |  1 -
>>>>>   drivers/gpu/drm/amd/amdgpu/navi10_ih.c     |  3 ---
>>>>>   drivers/gpu/drm/amd/amdgpu/si_ih.c         |  1 -
>>>>>   drivers/gpu/drm/amd/amdgpu/tonga_ih.c      |  1 -
>>>>>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c     |  3 ---
>>>>>   14 files changed, 56 insertions(+), 16 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> index fddb82897e5d..30a24db5f4d1 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> @@ -1054,6 +1054,8 @@ struct amdgpu_device {
>>>>>       bool                            in_pci_err_recovery;
>>>>>       struct pci_saved_state          *pci_state;
>>>>> +
>>>>> +    struct list_head                device_bo_list;
>>>>>   };
>>>>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>>>>> *ddev)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index 46d646c40338..91594ddc2459 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -70,6 +70,7 @@
>>>>>   #include <drm/task_barrier.h>
>>>>>   #include <linux/pm_runtime.h>
>>>>> +
>>>>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>>>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>>>>>   MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
>>>>> @@ -3211,7 +3212,6 @@ static const struct attribute 
>>>>> *amdgpu_dev_attributes[] = {
>>>>>       NULL
>>>>>   };
>>>>> -
>>>>>   /**
>>>>>    * amdgpu_device_init - initialize the driver
>>>>>    *
>>>>> @@ -3316,6 +3316,8 @@ int amdgpu_device_init(struct amdgpu_device 
>>>>> *adev,
>>>>>       INIT_WORK(&adev->xgmi_reset_work, 
>>>>> amdgpu_device_xgmi_reset_func);
>>>>> +    INIT_LIST_HEAD(&adev->device_bo_list);
>>>>> +
>>>>>       adev->gfx.gfx_off_req_count = 1;
>>>>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;
>>>>> @@ -3601,6 +3603,28 @@ int amdgpu_device_init(struct amdgpu_device 
>>>>> *adev,
>>>>>       return r;
>>>>>   }
>>>>> +static void amdgpu_clear_dma_mappings(struct amdgpu_device *adev)
>>>>> +{
>>>>> +    struct amdgpu_bo *bo = NULL;
>>>>> +
>>>>> +    /*
>>>>> +     * Unmaps all DMA mappings before device will be removed from 
>>>>> it's
>>>>> +     * IOMMU group otherwise in case of IOMMU enabled system a crash
>>>>> +     * will happen.
>>>>> +     */
>>>>> +
>>>>> +    spin_lock(&adev->mman.bdev.lru_lock);
>>>>> +    while (!list_empty(&adev->device_bo_list)) {
>>>>> +        bo = list_first_entry(&adev->device_bo_list, struct 
>>>>> amdgpu_bo, bo);
>>>>> +        list_del_init(&bo->bo);
>>>>> +        spin_unlock(&adev->mman.bdev.lru_lock);
>>>>> +        if (bo->tbo.ttm)
>>>>> +            ttm_tt_unpopulate(bo->tbo.bdev, bo->tbo.ttm);
>>>>> +        spin_lock(&adev->mman.bdev.lru_lock);
>>>>> +    }
>>>>> +    spin_unlock(&adev->mman.bdev.lru_lock);
>>>>
>>>> Can you try to use the same approach as amdgpu_gtt_mgr_recover() 
>>>> instead of adding something to the BO?
>>>>
>>>> Christian.
>>>
>>> Are you sure that dma mappings limit themself only to GTT BOs
>>> which have allocated mm nodes ?
>>
>> Yes, you would also need the system domain BOs. But those can be put 
>> on a similar list.
> 
> What list ? Those BOs don't have ttm_resource_manager and so no
> drm_mm_node list they all bound to. Should I maintain a list for them
> spcifically for the unmap purpuse ?
> 
>>
>>> Otherwsie we will crash and burn
>>> on missing IOMMU group when unampping post device remove.
>>> Problem for me to test this as in 5.12 kernel I don't crash even
>>> when removing this entire patch.  Looks like iommu_dma_unmap_page
>>> was changed since 5.9 when I introdiced this patch.
>>
>> Do we really still need that stuff then? What exactly has changed?
> 
> At first I assumed that because of this change 'iommu: Allow the 
> dma-iommu api to use bounce buffers'
> Which changed iommu_dma_unmap_page to call __iommu_dma_unmap_swiotlb
> instead if __iommu_dma_unmap directly. But then i looked inside
> __iommu_dma_unmap_swiotlb and it still calls __iommu_dma_unmap
> evenetually. So maybe the fact that I moved the amd_ip_funcs.hw_fini
> call to inside amdgpu_pci_remove helps.
> 
> Andrey
> 
> 
>>
>> Christian.
>>
>>>
>>> Andrey
>>>
>>>>
>>>>> +}
>>>>> +
>>>>>   /**
>>>>>    * amdgpu_device_fini - tear down the driver
>>>>>    *
>>>>> @@ -3639,12 +3663,15 @@ void amdgpu_device_fini_hw(struct 
>>>>> amdgpu_device *adev)
>>>>>           amdgpu_ucode_sysfs_fini(adev);
>>>>>       sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
>>>>> -
>>>>>       amdgpu_fbdev_fini(adev);
>>>>>       amdgpu_irq_fini_hw(adev);
>>>>>       amdgpu_device_ip_fini_early(adev);
>>>>> +
>>>>> +    amdgpu_clear_dma_mappings(adev);
>>>>> +
>>>>> +    amdgpu_gart_dummy_page_fini(adev);
>>>>>   }
>>>>>   void amdgpu_device_fini_sw(struct amdgpu_device *adev)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>> index fde2d899b2c4..49cdcaf8512d 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>> @@ -92,7 +92,7 @@ static int amdgpu_gart_dummy_page_init(struct 
>>>>> amdgpu_device *adev)
>>>>>    *
>>>>>    * Frees the dummy page used by the driver (all asics).
>>>>>    */
>>>>> -static void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>>>>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>>>>>   {
>>>>>       if (!adev->dummy_page_addr)
>>>>>           return;
>>>>> @@ -397,5 +397,4 @@ void amdgpu_gart_fini(struct amdgpu_device *adev)
>>>>>       vfree(adev->gart.pages);
>>>>>       adev->gart.pages = NULL;
>>>>>   #endif
>>>>> -    amdgpu_gart_dummy_page_fini(adev);
>>>>>   }
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>>>>> index afa2e2877d87..5678d9c105ab 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
>>>>> @@ -61,6 +61,7 @@ int amdgpu_gart_table_vram_pin(struct 
>>>>> amdgpu_device *adev);
>>>>>   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>>>>>   int amdgpu_gart_init(struct amdgpu_device *adev);
>>>>>   void amdgpu_gart_fini(struct amdgpu_device *adev);
>>>>> +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev);
>>>>>   int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>>>>                  int pages);
>>>>>   int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>>>>> index 63e815c27585..a922154953a7 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
>>>>> @@ -326,6 +326,15 @@ void amdgpu_irq_fini_hw(struct amdgpu_device 
>>>>> *adev)
>>>>>           if (!amdgpu_device_has_dc_support(adev))
>>>>>               flush_work(&adev->hotplug_work);
>>>>>       }
>>>>> +
>>>>> +    if (adev->irq.ih_soft.ring)
>>>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>>>>> +    if (adev->irq.ih.ring)
>>>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>> +    if (adev->irq.ih1.ring)
>>>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>>>>> +    if (adev->irq.ih2.ring)
>>>>> +        amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>>>>   }
>>>>>   /**
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>>> index 485f249d063a..62d829f5e62c 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>>> @@ -68,8 +68,13 @@ static void amdgpu_bo_destroy(struct 
>>>>> ttm_buffer_object *tbo)
>>>>>           list_del_init(&bo->shadow_list);
>>>>>           mutex_unlock(&adev->shadow_list_lock);
>>>>>       }
>>>>> -    amdgpu_bo_unref(&bo->parent);
>>>>> +
>>>>> +    spin_lock(&adev->mman.bdev.lru_lock);
>>>>> +    list_del(&bo->bo);
>>>>> +    spin_unlock(&adev->mman.bdev.lru_lock);
>>>>> +
>>>>> +    amdgpu_bo_unref(&bo->parent);
>>>>>       kfree(bo->metadata);
>>>>>       kfree(bo);
>>>>>   }
>>>>> @@ -585,6 +590,12 @@ static int amdgpu_bo_do_create(struct 
>>>>> amdgpu_device *adev,
>>>>>       if (bp->type == ttm_bo_type_device)
>>>>>           bo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>>>>> +    INIT_LIST_HEAD(&bo->bo);
>>>>> +
>>>>> +    spin_lock(&adev->mman.bdev.lru_lock);
>>>>> +    list_add_tail(&bo->bo, &adev->device_bo_list);
>>>>> +    spin_unlock(&adev->mman.bdev.lru_lock);
>>>>> +
>>>>>       return 0;
>>>>>   fail_unreserve:
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>>> index 9ac37569823f..5ae8555ef275 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>>> @@ -110,6 +110,8 @@ struct amdgpu_bo {
>>>>>       struct list_head        shadow_list;
>>>>>       struct kgd_mem                  *kfd_bo;
>>>>> +
>>>>> +    struct list_head        bo;
>>>>>   };
>>>>>   static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct 
>>>>> ttm_buffer_object *tbo)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>>>> index 183d44a6583c..df385ffc9768 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
>>>>> @@ -310,7 +310,6 @@ static int cik_ih_sw_fini(void *handle)
>>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>>       amdgpu_irq_fini_sw(adev);
>>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>>       amdgpu_irq_remove_domain(adev);
>>>>>       return 0;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>> index d32743949003..b8c47e0cf37a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
>>>>> @@ -302,7 +302,6 @@ static int cz_ih_sw_fini(void *handle)
>>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>>       amdgpu_irq_fini_sw(adev);
>>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>>       amdgpu_irq_remove_domain(adev);
>>>>>       return 0;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>> index da96c6013477..ddfe4eaeea05 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
>>>>> @@ -301,7 +301,6 @@ static int iceland_ih_sw_fini(void *handle)
>>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>>       amdgpu_irq_fini_sw(adev);
>>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>>       amdgpu_irq_remove_domain(adev);
>>>>>       return 0;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>> index 5eea4550b856..e171a9e78544 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
>>>>> @@ -571,9 +571,6 @@ static int navi10_ih_sw_fini(void *handle)
>>>>>       amdgpu_irq_fini_sw(adev);
>>>>>       amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>>       return 0;
>>>>>   }
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>> index 751307f3252c..9a24f17a5750 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
>>>>> @@ -176,7 +176,6 @@ static int si_ih_sw_fini(void *handle)
>>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>>       amdgpu_irq_fini_sw(adev);
>>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>>       return 0;
>>>>>   }
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>> index 973d80ec7f6c..b08905d1c00f 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
>>>>> @@ -313,7 +313,6 @@ static int tonga_ih_sw_fini(void *handle)
>>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>>       amdgpu_irq_fini_sw(adev);
>>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>>       amdgpu_irq_remove_domain(adev);
>>>>>       return 0;
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>> index 2d0094c276ca..8c8abc00f710 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
>>>>> @@ -525,9 +525,6 @@ static int vega10_ih_sw_fini(void *handle)
>>>>>       amdgpu_irq_fini_sw(adev);
>>>>>       amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
>>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
>>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
>>>>> -    amdgpu_ih_ring_fini(adev, &adev->irq.ih);
>>>>>       return 0;
>>>>>   }
>>>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Candrey.grodzovsky%40amd.com%7C1cee392c0b934cda6c7608d90ecabc41%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637557086175078458%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=C8QBsUQhJa1eWV1YYdQaykUVQGwmCn6OIoWQSrDkWoU%3D&amp;reserved=0 
>>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index fddb82897e5d..30a24db5f4d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1054,6 +1054,8 @@  struct amdgpu_device {
 
 	bool                            in_pci_err_recovery;
 	struct pci_saved_state          *pci_state;
+
+	struct list_head                device_bo_list;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 46d646c40338..91594ddc2459 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -70,6 +70,7 @@ 
 #include <drm/task_barrier.h>
 #include <linux/pm_runtime.h>
 
+
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
@@ -3211,7 +3212,6 @@  static const struct attribute *amdgpu_dev_attributes[] = {
 	NULL
 };
 
-
 /**
  * amdgpu_device_init - initialize the driver
  *
@@ -3316,6 +3316,8 @@  int amdgpu_device_init(struct amdgpu_device *adev,
 
 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
 
+	INIT_LIST_HEAD(&adev->device_bo_list);
+
 	adev->gfx.gfx_off_req_count = 1;
 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
 
@@ -3601,6 +3603,28 @@  int amdgpu_device_init(struct amdgpu_device *adev,
 	return r;
 }
 
+static void amdgpu_clear_dma_mappings(struct amdgpu_device *adev)
+{
+	struct amdgpu_bo *bo = NULL;
+
+	/*
+	 * Unmaps all DMA mappings before device will be removed from it's
+	 * IOMMU group otherwise in case of IOMMU enabled system a crash
+	 * will happen.
+	 */
+
+	spin_lock(&adev->mman.bdev.lru_lock);
+	while (!list_empty(&adev->device_bo_list)) {
+		bo = list_first_entry(&adev->device_bo_list, struct amdgpu_bo, bo);
+		list_del_init(&bo->bo);
+		spin_unlock(&adev->mman.bdev.lru_lock);
+		if (bo->tbo.ttm)
+			ttm_tt_unpopulate(bo->tbo.bdev, bo->tbo.ttm);
+		spin_lock(&adev->mman.bdev.lru_lock);
+	}
+	spin_unlock(&adev->mman.bdev.lru_lock);
+}
+
 /**
  * amdgpu_device_fini - tear down the driver
  *
@@ -3639,12 +3663,15 @@  void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 		amdgpu_ucode_sysfs_fini(adev);
 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
 
-
 	amdgpu_fbdev_fini(adev);
 
 	amdgpu_irq_fini_hw(adev);
 
 	amdgpu_device_ip_fini_early(adev);
+
+	amdgpu_clear_dma_mappings(adev);
+
+	amdgpu_gart_dummy_page_fini(adev);
 }
 
 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index fde2d899b2c4..49cdcaf8512d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -92,7 +92,7 @@  static int amdgpu_gart_dummy_page_init(struct amdgpu_device *adev)
  *
  * Frees the dummy page used by the driver (all asics).
  */
-static void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
+void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
 {
 	if (!adev->dummy_page_addr)
 		return;
@@ -397,5 +397,4 @@  void amdgpu_gart_fini(struct amdgpu_device *adev)
 	vfree(adev->gart.pages);
 	adev->gart.pages = NULL;
 #endif
-	amdgpu_gart_dummy_page_fini(adev);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
index afa2e2877d87..5678d9c105ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
@@ -61,6 +61,7 @@  int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev);
 void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
 int amdgpu_gart_init(struct amdgpu_device *adev);
 void amdgpu_gart_fini(struct amdgpu_device *adev);
+void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev);
 int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
 		       int pages);
 int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 63e815c27585..a922154953a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -326,6 +326,15 @@  void amdgpu_irq_fini_hw(struct amdgpu_device *adev)
 		if (!amdgpu_device_has_dc_support(adev))
 			flush_work(&adev->hotplug_work);
 	}
+
+	if (adev->irq.ih_soft.ring)
+		amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
+	if (adev->irq.ih.ring)
+		amdgpu_ih_ring_fini(adev, &adev->irq.ih);
+	if (adev->irq.ih1.ring)
+		amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
+	if (adev->irq.ih2.ring)
+		amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 485f249d063a..62d829f5e62c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -68,8 +68,13 @@  static void amdgpu_bo_destroy(struct ttm_buffer_object *tbo)
 		list_del_init(&bo->shadow_list);
 		mutex_unlock(&adev->shadow_list_lock);
 	}
-	amdgpu_bo_unref(&bo->parent);
 
+
+	spin_lock(&adev->mman.bdev.lru_lock);
+	list_del(&bo->bo);
+	spin_unlock(&adev->mman.bdev.lru_lock);
+
+	amdgpu_bo_unref(&bo->parent);
 	kfree(bo->metadata);
 	kfree(bo);
 }
@@ -585,6 +590,12 @@  static int amdgpu_bo_do_create(struct amdgpu_device *adev,
 	if (bp->type == ttm_bo_type_device)
 		bo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
 
+	INIT_LIST_HEAD(&bo->bo);
+
+	spin_lock(&adev->mman.bdev.lru_lock);
+	list_add_tail(&bo->bo, &adev->device_bo_list);
+	spin_unlock(&adev->mman.bdev.lru_lock);
+
 	return 0;
 
 fail_unreserve:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 9ac37569823f..5ae8555ef275 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -110,6 +110,8 @@  struct amdgpu_bo {
 	struct list_head		shadow_list;
 
 	struct kgd_mem                  *kfd_bo;
+
+	struct list_head		bo;
 };
 
 static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
index 183d44a6583c..df385ffc9768 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
@@ -310,7 +310,6 @@  static int cik_ih_sw_fini(void *handle)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	amdgpu_irq_fini_sw(adev);
-	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
 	amdgpu_irq_remove_domain(adev);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
index d32743949003..b8c47e0cf37a 100644
--- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
@@ -302,7 +302,6 @@  static int cz_ih_sw_fini(void *handle)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	amdgpu_irq_fini_sw(adev);
-	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
 	amdgpu_irq_remove_domain(adev);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
index da96c6013477..ddfe4eaeea05 100644
--- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
@@ -301,7 +301,6 @@  static int iceland_ih_sw_fini(void *handle)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	amdgpu_irq_fini_sw(adev);
-	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
 	amdgpu_irq_remove_domain(adev);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
index 5eea4550b856..e171a9e78544 100644
--- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
@@ -571,9 +571,6 @@  static int navi10_ih_sw_fini(void *handle)
 
 	amdgpu_irq_fini_sw(adev);
 	amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
-	amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
-	amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
-	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c
index 751307f3252c..9a24f17a5750 100644
--- a/drivers/gpu/drm/amd/amdgpu/si_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c
@@ -176,7 +176,6 @@  static int si_ih_sw_fini(void *handle)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	amdgpu_irq_fini_sw(adev);
-	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
index 973d80ec7f6c..b08905d1c00f 100644
--- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
@@ -313,7 +313,6 @@  static int tonga_ih_sw_fini(void *handle)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	amdgpu_irq_fini_sw(adev);
-	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
 	amdgpu_irq_remove_domain(adev);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
index 2d0094c276ca..8c8abc00f710 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
@@ -525,9 +525,6 @@  static int vega10_ih_sw_fini(void *handle)
 
 	amdgpu_irq_fini_sw(adev);
 	amdgpu_ih_ring_fini(adev, &adev->irq.ih_soft);
-	amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
-	amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
-	amdgpu_ih_ring_fini(adev, &adev->irq.ih);
 
 	return 0;
 }