diff mbox series

[v2,01/10] iommu/vt-d: add wrapper functions for page allocations

Message ID 20231130201504.2322355-2-pasha.tatashin@soleen.com
State Handled Elsewhere
Headers show
Series IOMMU memory observability | expand

Commit Message

Pasha Tatashin Nov. 30, 2023, 8:14 p.m. UTC
In order to improve observability and accountability of IOMMU layer, we
must account the number of pages that are allocated by functions that
are calling directly into buddy allocator.

This is achieved by first wrapping the allocation related functions into a
separate inline functions in new file:

drivers/iommu/iommu-pages.h

Convert all page allocation calls under iommu/intel to use these new
functions.

Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
 drivers/iommu/intel/dmar.c          |  10 +-
 drivers/iommu/intel/iommu.c         |  47 +++----
 drivers/iommu/intel/iommu.h         |   2 -
 drivers/iommu/intel/irq_remapping.c |  10 +-
 drivers/iommu/intel/pasid.c         |  12 +-
 drivers/iommu/intel/svm.c           |   7 +-
 drivers/iommu/iommu-pages.h         | 199 ++++++++++++++++++++++++++++
 7 files changed, 236 insertions(+), 51 deletions(-)
 create mode 100644 drivers/iommu/iommu-pages.h

Comments

David Rientjes Dec. 14, 2023, 5:58 p.m. UTC | #1
On Thu, 30 Nov 2023, Pasha Tatashin wrote:

> diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
> new file mode 100644
> index 000000000000..2332f807d514
> --- /dev/null
> +++ b/drivers/iommu/iommu-pages.h
> @@ -0,0 +1,199 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2023, Google LLC.
> + * Pasha Tatashin <pasha.tatashin@soleen.com>
> + */
> +
> +#ifndef __IOMMU_PAGES_H
> +#define __IOMMU_PAGES_H
> +
> +#include <linux/vmstat.h>
> +#include <linux/gfp.h>
> +#include <linux/mm.h>
> +
> +/*
> + * All page allocation that are performed in the IOMMU subsystem must use one of
> + * the functions below.  This is necessary for the proper accounting as IOMMU
> + * state can be rather large, i.e. multiple gigabytes in size.
> + */
> +
> +/**
> + * __iommu_alloc_pages_node - allocate a zeroed page of a given order from
> + * specific NUMA node.
> + * @nid: memory NUMA node id

NUMA_NO_NODE if no locality requirements?

> + * @gfp: buddy allocator flags
> + * @order: page order
> + *
> + * returns the head struct page of the allocated page.
> + */
> +static inline struct page *__iommu_alloc_pages_node(int nid, gfp_t gfp,
> +						    int order)
> +{
> +	struct page *pages;

s/pages/page/ here and later in this file.

> +
> +	pages = alloc_pages_node(nid, gfp | __GFP_ZERO, order);
> +	if (!pages)

unlikely()?

> +		return NULL;
> +
> +	return pages;
> +}
> +
> +/**
> + * __iommu_alloc_pages - allocate a zeroed page of a given order.
> + * @gfp: buddy allocator flags
> + * @order: page order
> + *
> + * returns the head struct page of the allocated page.
> + */
> +static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order)
> +{
> +	struct page *pages;
> +
> +	pages = alloc_pages(gfp | __GFP_ZERO, order);
> +	if (!pages)
> +		return NULL;
> +
> +	return pages;
> +}
> +
> +/**
> + * __iommu_alloc_page_node - allocate a zeroed page at specific NUMA node.
> + * @nid: memory NUMA node id
> + * @gfp: buddy allocator flags
> + *
> + * returns the struct page of the allocated page.
> + */
> +static inline struct page *__iommu_alloc_page_node(int nid, gfp_t gfp)
> +{
> +	return __iommu_alloc_pages_node(nid, gfp, 0);
> +}
> +
> +/**
> + * __iommu_alloc_page - allocate a zeroed page
> + * @gfp: buddy allocator flags
> + *
> + * returns the struct page of the allocated page.
> + */
> +static inline struct page *__iommu_alloc_page(gfp_t gfp)
> +{
> +	return __iommu_alloc_pages(gfp, 0);
> +}
> +
> +/**
> + * __iommu_free_pages - free page of a given order
> + * @pages: head struct page of the page

I think "pages" implies more than one page, this is just a (potentially 
compound) page?

> + * @order: page order
> + */
> +static inline void __iommu_free_pages(struct page *pages, int order)
> +{
> +	if (!pages)
> +		return;
> +
> +	__free_pages(pages, order);
> +}
> +
> +/**
> + * __iommu_free_page - free page
> + * @page: struct page of the page
> + */
> +static inline void __iommu_free_page(struct page *page)
> +{
> +	__iommu_free_pages(page, 0);
> +}
> +
> +/**
> + * iommu_alloc_pages_node - allocate a zeroed page of a given order from
> + * specific NUMA node.
> + * @nid: memory NUMA node id
> + * @gfp: buddy allocator flags
> + * @order: page order
> + *
> + * returns the virtual address of the allocated page
> + */
> +static inline void *iommu_alloc_pages_node(int nid, gfp_t gfp, int order)
> +{
> +	struct page *pages = __iommu_alloc_pages_node(nid, gfp, order);
> +
> +	if (!pages)
> +		return NULL;
> +
> +	return page_address(pages);
> +}
> +
> +/**
> + * iommu_alloc_pages - allocate a zeroed page of a given order
> + * @gfp: buddy allocator flags
> + * @order: page order
> + *
> + * returns the virtual address of the allocated page
> + */
> +static inline void *iommu_alloc_pages(gfp_t gfp, int order)
> +{
> +	struct page *pages = __iommu_alloc_pages(gfp, order);
> +
> +	if (!pages)
> +		return NULL;
> +
> +	return page_address(pages);
> +}
> +
> +/**
> + * iommu_alloc_page_node - allocate a zeroed page at specific NUMA node.
> + * @nid: memory NUMA node id
> + * @gfp: buddy allocator flags
> + *
> + * returns the virtual address of the allocated page
> + */
> +static inline void *iommu_alloc_page_node(int nid, gfp_t gfp)
> +{
> +	return iommu_alloc_pages_node(nid, gfp, 0);
> +}
> +
> +/**
> + * iommu_alloc_page - allocate a zeroed page
> + * @gfp: buddy allocator flags
> + *
> + * returns the virtual address of the allocated page
> + */
> +static inline void *iommu_alloc_page(gfp_t gfp)
> +{
> +	return iommu_alloc_pages(gfp, 0);
> +}
> +
> +/**
> + * iommu_free_pages - free page of a given order
> + * @virt: virtual address of the page to be freed.
> + * @order: page order
> + */
> +static inline void iommu_free_pages(void *virt, int order)
> +{
> +	if (!virt)
> +		return;
> +
> +	__iommu_free_pages(virt_to_page(virt), order);
> +}
> +
> +/**
> + * iommu_free_page - free page
> + * @virt: virtual address of the page to be freed.
> + */
> +static inline void iommu_free_page(void *virt)
> +{
> +	iommu_free_pages(virt, 0);
> +}
> +
> +/**
> + * iommu_free_pages_list - free a list of pages.
> + * @pages: the head of the lru list to be freed.

Document the locking requirements for this?

> + */
> +static inline void iommu_free_pages_list(struct list_head *pages)
> +{
> +	while (!list_empty(pages)) {
> +		struct page *p = list_entry(pages->prev, struct page, lru);
> +
> +		list_del(&p->lru);
> +		put_page(p);
> +	}
> +}
> +
> +#endif	/* __IOMMU_PAGES_H */
> -- 
> 2.43.0.rc2.451.g8631bc7472-goog
> 
> 
>
Pasha Tatashin Dec. 14, 2023, 7:16 p.m. UTC | #2
On Thu, Dec 14, 2023 at 12:58 PM David Rientjes <rientjes@google.com> wrote:
>
> On Thu, 30 Nov 2023, Pasha Tatashin wrote:
>
> > diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
> > new file mode 100644
> > index 000000000000..2332f807d514
> > --- /dev/null
> > +++ b/drivers/iommu/iommu-pages.h
> > @@ -0,0 +1,199 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * Copyright (c) 2023, Google LLC.
> > + * Pasha Tatashin <pasha.tatashin@soleen.com>
> > + */
> > +
> > +#ifndef __IOMMU_PAGES_H
> > +#define __IOMMU_PAGES_H
> > +
> > +#include <linux/vmstat.h>
> > +#include <linux/gfp.h>
> > +#include <linux/mm.h>
> > +
> > +/*
> > + * All page allocation that are performed in the IOMMU subsystem must use one of
> > + * the functions below.  This is necessary for the proper accounting as IOMMU
> > + * state can be rather large, i.e. multiple gigabytes in size.
> > + */
> > +
> > +/**
> > + * __iommu_alloc_pages_node - allocate a zeroed page of a given order from
> > + * specific NUMA node.
> > + * @nid: memory NUMA node id
>
> NUMA_NO_NODE if no locality requirements?

If no locality is required, there is a better interface:
__iommu_alloc_pages(). That one will also take a look at the calling
process policies to determine the proper NUMA node when nothing is
specified. However, when policies should be ignored, and no locality
required, NUMA_NO_NODE can be passed.

>
> > + * @gfp: buddy allocator flags
> > + * @order: page order
> > + *
> > + * returns the head struct page of the allocated page.
> > + */
> > +static inline struct page *__iommu_alloc_pages_node(int nid, gfp_t gfp,
> > +                                                 int order)
> > +{
> > +     struct page *pages;
>
> s/pages/page/ here and later in this file.

In this file, where there a page with an "order", I reference it with
"pages", when no order (i.e. order = 0), I reference it with "page"

I.e.: __iommu_alloc_page vs. __iommu_alloc_pages

>
> > +
> > +     pages = alloc_pages_node(nid, gfp | __GFP_ZERO, order);
> > +     if (!pages)
>
> unlikely()?

Will add it.

>
> > +             return NULL;
> > +
> > +     return pages;
> > +}
> > +
> > +/**
> > + * __iommu_alloc_pages - allocate a zeroed page of a given order.
> > + * @gfp: buddy allocator flags
> > + * @order: page order
> > + *
> > + * returns the head struct page of the allocated page.
> > + */
> > +static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order)
> > +{
> > +     struct page *pages;
> > +
> > +     pages = alloc_pages(gfp | __GFP_ZERO, order);
> > +     if (!pages)
> > +             return NULL;
> > +
> > +     return pages;
> > +}
> > +
> > +/**
> > + * __iommu_alloc_page_node - allocate a zeroed page at specific NUMA node.
> > + * @nid: memory NUMA node id
> > + * @gfp: buddy allocator flags
> > + *
> > + * returns the struct page of the allocated page.
> > + */
> > +static inline struct page *__iommu_alloc_page_node(int nid, gfp_t gfp)
> > +{
> > +     return __iommu_alloc_pages_node(nid, gfp, 0);
> > +}
> > +
> > +/**
> > + * __iommu_alloc_page - allocate a zeroed page
> > + * @gfp: buddy allocator flags
> > + *
> > + * returns the struct page of the allocated page.
> > + */
> > +static inline struct page *__iommu_alloc_page(gfp_t gfp)
> > +{
> > +     return __iommu_alloc_pages(gfp, 0);
> > +}
> > +
> > +/**
> > + * __iommu_free_pages - free page of a given order
> > + * @pages: head struct page of the page
>
> I think "pages" implies more than one page, this is just a (potentially
> compound) page?

Yes, more than one page, basically, when order may be > 0.

> > +/**
> > + * iommu_free_page - free page
> > + * @virt: virtual address of the page to be freed.
> > + */
> > +static inline void iommu_free_page(void *virt)
> > +{
> > +     iommu_free_pages(virt, 0);
> > +}
> > +
> > +/**
> > + * iommu_free_pages_list - free a list of pages.
> > + * @pages: the head of the lru list to be freed.
>
> Document the locking requirements for this?

Thank you for the review. I will add info about locking requirements,
in fact they are very relaxed.

These pages are added to the list by unmaps or remaps operation in
Intel IOMMU implementation. These calls assume that whoever is doing
those operations has exclusive access to the VA range in the page
table of that operation. The pages in this freelist only belong to the
former page-tables from the IOVA range for those operations.

> > + */
> > +static inline void iommu_free_pages_list(struct list_head *pages)
> > +{
> > +     while (!list_empty(pages)) {
> > +             struct page *p = list_entry(pages->prev, struct page, lru);
> > +
> > +             list_del(&p->lru);
> > +             put_page(p);
> > +     }
> > +}
David Rientjes Dec. 24, 2023, 9:30 p.m. UTC | #3
On Thu, 14 Dec 2023, Pasha Tatashin wrote:

> On Thu, Dec 14, 2023 at 12:58 PM David Rientjes <rientjes@google.com> wrote:
> >
> > On Thu, 30 Nov 2023, Pasha Tatashin wrote:
> >
> > > diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
> > > new file mode 100644
> > > index 000000000000..2332f807d514
> > > --- /dev/null
> > > +++ b/drivers/iommu/iommu-pages.h
> > > @@ -0,0 +1,199 @@
> > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > +/*
> > > + * Copyright (c) 2023, Google LLC.
> > > + * Pasha Tatashin <pasha.tatashin@soleen.com>
> > > + */
> > > +
> > > +#ifndef __IOMMU_PAGES_H
> > > +#define __IOMMU_PAGES_H
> > > +
> > > +#include <linux/vmstat.h>
> > > +#include <linux/gfp.h>
> > > +#include <linux/mm.h>
> > > +
> > > +/*
> > > + * All page allocation that are performed in the IOMMU subsystem must use one of
> > > + * the functions below.  This is necessary for the proper accounting as IOMMU
> > > + * state can be rather large, i.e. multiple gigabytes in size.
> > > + */
> > > +
> > > +/**
> > > + * __iommu_alloc_pages_node - allocate a zeroed page of a given order from
> > > + * specific NUMA node.
> > > + * @nid: memory NUMA node id
> >
> > NUMA_NO_NODE if no locality requirements?
> 
> If no locality is required, there is a better interface:
> __iommu_alloc_pages(). That one will also take a look at the calling
> process policies to determine the proper NUMA node when nothing is
> specified. However, when policies should be ignored, and no locality
> required, NUMA_NO_NODE can be passed.
> 

Gotcha, thanks!

> >
> > > + * @gfp: buddy allocator flags
> > > + * @order: page order
> > > + *
> > > + * returns the head struct page of the allocated page.
> > > + */
> > > +static inline struct page *__iommu_alloc_pages_node(int nid, gfp_t gfp,
> > > +                                                 int order)
> > > +{
> > > +     struct page *pages;
> >
> > s/pages/page/ here and later in this file.
> 
> In this file, where there a page with an "order", I reference it with
> "pages", when no order (i.e. order = 0), I reference it with "page"
> 
> I.e.: __iommu_alloc_page vs. __iommu_alloc_pages
> 

Eh, the struct page points to a (potentially compound) page, not a set or 
list of pages.  I won't bikeshed on it, but "struct page *pages" never 
makes sense unless it's **pages or *pages[] :)
Matthew Wilcox Dec. 24, 2023, 9:48 p.m. UTC | #4
On Sun, Dec 24, 2023 at 01:30:50PM -0800, David Rientjes wrote:
> > > s/pages/page/ here and later in this file.
> > 
> > In this file, where there a page with an "order", I reference it with
> > "pages", when no order (i.e. order = 0), I reference it with "page"
> > 
> > I.e.: __iommu_alloc_page vs. __iommu_alloc_pages
> > 
> 
> Eh, the struct page points to a (potentially compound) page, not a set or 
> list of pages.  I won't bikeshed on it, but "struct page *pages" never 
> makes sense unless it's **pages or *pages[] :)

I'd suggest that 'pages' also makes sense when _not_ using __GFP_COMP,
as we do in fact allocate an array of pages in that case.

That said, we shouldn't encourage the use of non-compound allocations.
It would also be good for someone to define a memdesc for iommu memory
like we have already for slab.  We'll need it eventually, and it'll work
out better if someone who understands iommus (ie not me) does it.
Liu, Jingqi Dec. 26, 2023, 6:09 a.m. UTC | #5
On 12/15/2023 3:16 AM, Pasha Tatashin wrote:
> On Thu, Dec 14, 2023 at 12:58 PM David Rientjes <rientjes@google.com> wrote:
>>
>> On Thu, 30 Nov 2023, Pasha Tatashin wrote:
>>
>>> diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
>>> new file mode 100644
>>> index 000000000000..2332f807d514
>>> --- /dev/null
>>> +++ b/drivers/iommu/iommu-pages.h
>>> @@ -0,0 +1,199 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +/*
>>> + * Copyright (c) 2023, Google LLC.
>>> + * Pasha Tatashin <pasha.tatashin@soleen.com>
>>> + */
>>> +
>>> +#ifndef __IOMMU_PAGES_H
>>> +#define __IOMMU_PAGES_H
>>> +
>>> +#include <linux/vmstat.h>
>>> +#include <linux/gfp.h>
>>> +#include <linux/mm.h>
>>> +
>>> +/*
>>> + * All page allocation that are performed in the IOMMU subsystem must use one of
>>> + * the functions below.  This is necessary for the proper accounting as IOMMU
>>> + * state can be rather large, i.e. multiple gigabytes in size.
>>> + */
>>> +
>>> +/**
>>> + * __iommu_alloc_pages_node - allocate a zeroed page of a given order from
>>> + * specific NUMA node.
>>> + * @nid: memory NUMA node id
>>
>> NUMA_NO_NODE if no locality requirements?
> 
> If no locality is required, there is a better interface:
> __iommu_alloc_pages(). That one will also take a look at the calling
> process policies to determine the proper NUMA node when nothing is
> specified. However, when policies should be ignored, and no locality
> required, NUMA_NO_NODE can be passed.
> 
>>
>>> + * @gfp: buddy allocator flags
>>> + * @order: page order
>>> + *
>>> + * returns the head struct page of the allocated page.
>>> + */
>>> +static inline struct page *__iommu_alloc_pages_node(int nid, gfp_t gfp,
>>> +                                                 int order)
>>> +{
>>> +     struct page *pages;
>>
>> s/pages/page/ here and later in this file.
> 
> In this file, where there a page with an "order", I reference it with
> "pages", when no order (i.e. order = 0), I reference it with "page"
> 
> I.e.: __iommu_alloc_page vs. __iommu_alloc_pages
> 
>>
>>> +
>>> +     pages = alloc_pages_node(nid, gfp | __GFP_ZERO, order);
>>> +     if (!pages)
>>
>> unlikely()?
> 
> Will add it.
> 
>>
>>> +             return NULL;
>>> +
>>> +     return pages;
>>> +}
>>> +
>>> +/**
>>> + * __iommu_alloc_pages - allocate a zeroed page of a given order.
>>> + * @gfp: buddy allocator flags
>>> + * @order: page order
>>> + *
>>> + * returns the head struct page of the allocated page.
>>> + */
>>> +static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order)
>>> +{
>>> +     struct page *pages;
>>> +
>>> +     pages = alloc_pages(gfp | __GFP_ZERO, order);
>>> +     if (!pages)
>>> +             return NULL;
>>> +
>>> +     return pages;
>>> +}
>>> +
>>> +/**
>>> + * __iommu_alloc_page_node - allocate a zeroed page at specific NUMA node.
>>> + * @nid: memory NUMA node id
>>> + * @gfp: buddy allocator flags
>>> + *
>>> + * returns the struct page of the allocated page.
>>> + */
>>> +static inline struct page *__iommu_alloc_page_node(int nid, gfp_t gfp)
>>> +{
>>> +     return __iommu_alloc_pages_node(nid, gfp, 0);
>>> +}
>>> +
>>> +/**
>>> + * __iommu_alloc_page - allocate a zeroed page
>>> + * @gfp: buddy allocator flags
>>> + *
>>> + * returns the struct page of the allocated page.
>>> + */
>>> +static inline struct page *__iommu_alloc_page(gfp_t gfp)
>>> +{
>>> +     return __iommu_alloc_pages(gfp, 0);
>>> +}
>>> +
>>> +/**
>>> + * __iommu_free_pages - free page of a given order
>>> + * @pages: head struct page of the page
>>
>> I think "pages" implies more than one page, this is just a (potentially
>> compound) page?
> 
> Yes, more than one page, basically, when order may be > 0.
> 
>>> +/**
>>> + * iommu_free_page - free page
>>> + * @virt: virtual address of the page to be freed.
>>> + */
>>> +static inline void iommu_free_page(void *virt)
>>> +{
>>> +     iommu_free_pages(virt, 0);
>>> +}
>>> +
>>> +/**
>>> + * iommu_free_pages_list - free a list of pages.
>>> + * @pages: the head of the lru list to be freed.
>>
>> Document the locking requirements for this?
> 
> Thank you for the review. I will add info about locking requirements,
> in fact they are very relaxed.
> 
> These pages are added to the list by unmaps or remaps operation in
> Intel IOMMU implementation. These calls assume that whoever is doing
> those operations has exclusive access to the VA range in the page
> table of that operation. The pages in this freelist only belong to the
> former page-tables from the IOVA range for those operations.

These pages maybe be accessed concurrently by thread contexts other than 
the IOMMU driver (such as debugfs).

Thanks,
Jingqi
> 
>>> + */
>>> +static inline void iommu_free_pages_list(struct list_head *pages)
>>> +{
>>> +     while (!list_empty(pages)) {
>>> +             struct page *p = list_entry(pages->prev, struct page, lru);
>>> +
>>> +             list_del(&p->lru);
>>> +             put_page(p);
>>> +     }
>>> +}
>
Pasha Tatashin Dec. 26, 2023, 5:14 p.m. UTC | #6
> >> Document the locking requirements for this?
> >
> > Thank you for the review. I will add info about locking requirements,
> > in fact they are very relaxed.
> >
> > These pages are added to the list by unmaps or remaps operation in
> > Intel IOMMU implementation. These calls assume that whoever is doing
> > those operations has exclusive access to the VA range in the page
> > table of that operation. The pages in this freelist only belong to the
> > former page-tables from the IOVA range for those operations.
>
> These pages maybe be accessed concurrently by thread contexts other than
> the IOMMU driver (such as debugfs).

Good point regarding debugfs. While, it does not change any locking
assumptions, for this series. It might change some design decisions
that we need to make when freeing pages on unmaps (a separate RFC
series that I sent). I will have to study how debugfs affect refcnt
and mapcount, and whether we could use per-page page table locking if
needed.

Thanks,
Pasha
Pasha Tatashin Dec. 26, 2023, 5:57 p.m. UTC | #7
On Sun, Dec 24, 2023 at 4:48 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Sun, Dec 24, 2023 at 01:30:50PM -0800, David Rientjes wrote:
> > > > s/pages/page/ here and later in this file.
> > >
> > > In this file, where there a page with an "order", I reference it with
> > > "pages", when no order (i.e. order = 0), I reference it with "page"
> > >
> > > I.e.: __iommu_alloc_page vs. __iommu_alloc_pages
> > >
> >
> > Eh, the struct page points to a (potentially compound) page, not a set or
> > list of pages.  I won't bikeshed on it, but "struct page *pages" never
> > makes sense unless it's **pages or *pages[] :)
>
> I'd suggest that 'pages' also makes sense when _not_ using __GFP_COMP,
> as we do in fact allocate an array of pages in that case.
>
> That said, we shouldn't encourage the use of non-compound allocations.
> It would also be good for someone to define a memdesc for iommu memory
> like we have already for slab.  We'll need it eventually, and it'll work
> out better if someone who understands iommus (ie not me) does it.

I was thinking of adding an IOMMU page table memdesc, at least by
starting with Intel implementation.

- For efficient freeing on page-unmap we need a counter.
- We might also need a per-page page table locking (aka PTL type
lock), if the current approach I am proposing is not scalable enough.
- Access to debugfs (I am studying this now).

However, iommu memdesc would really make sense, once all the various
page table management IOMMU implementations are unified.

Pasha
diff mbox series

Patch

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index a3414afe11b0..a6937e1e20a5 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -32,6 +32,7 @@ 
 
 #include "iommu.h"
 #include "../irq_remapping.h"
+#include "../iommu-pages.h"
 #include "perf.h"
 #include "trace.h"
 #include "perfmon.h"
@@ -1185,7 +1186,7 @@  static void free_iommu(struct intel_iommu *iommu)
 	}
 
 	if (iommu->qi) {
-		free_page((unsigned long)iommu->qi->desc);
+		iommu_free_page(iommu->qi->desc);
 		kfree(iommu->qi->desc_status);
 		kfree(iommu->qi);
 	}
@@ -1714,6 +1715,7 @@  int dmar_enable_qi(struct intel_iommu *iommu)
 {
 	struct q_inval *qi;
 	struct page *desc_page;
+	int order;
 
 	if (!ecap_qis(iommu->ecap))
 		return -ENOENT;
@@ -1734,8 +1736,8 @@  int dmar_enable_qi(struct intel_iommu *iommu)
 	 * Need two pages to accommodate 256 descriptors of 256 bits each
 	 * if the remapping hardware supports scalable mode translation.
 	 */
-	desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO,
-				     !!ecap_smts(iommu->ecap));
+	order = ecap_smts(iommu->ecap) ? 1 : 0;
+	desc_page = __iommu_alloc_pages_node(iommu->node, GFP_ATOMIC, order);
 	if (!desc_page) {
 		kfree(qi);
 		iommu->qi = NULL;
@@ -1746,7 +1748,7 @@  int dmar_enable_qi(struct intel_iommu *iommu)
 
 	qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC);
 	if (!qi->desc_status) {
-		free_page((unsigned long) qi->desc);
+		iommu_free_page(qi->desc);
 		kfree(qi);
 		iommu->qi = NULL;
 		return -ENOMEM;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 3531b956556c..04f852175cbe 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -28,6 +28,7 @@ 
 #include "../dma-iommu.h"
 #include "../irq_remapping.h"
 #include "../iommu-sva.h"
+#include "../iommu-pages.h"
 #include "pasid.h"
 #include "cap_audit.h"
 #include "perfmon.h"
@@ -367,22 +368,6 @@  static int __init intel_iommu_setup(char *str)
 }
 __setup("intel_iommu=", intel_iommu_setup);
 
-void *alloc_pgtable_page(int node, gfp_t gfp)
-{
-	struct page *page;
-	void *vaddr = NULL;
-
-	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
-	if (page)
-		vaddr = page_address(page);
-	return vaddr;
-}
-
-void free_pgtable_page(void *vaddr)
-{
-	free_page((unsigned long)vaddr);
-}
-
 static inline int domain_type_is_si(struct dmar_domain *domain)
 {
 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
@@ -617,7 +602,7 @@  struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 		if (!alloc)
 			return NULL;
 
-		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
+		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
 		if (!context)
 			return NULL;
 
@@ -791,17 +776,17 @@  static void free_context_table(struct intel_iommu *iommu)
 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
 		context = iommu_context_addr(iommu, i, 0, 0);
 		if (context)
-			free_pgtable_page(context);
+			iommu_free_page(context);
 
 		if (!sm_supported(iommu))
 			continue;
 
 		context = iommu_context_addr(iommu, i, 0x80, 0);
 		if (context)
-			free_pgtable_page(context);
+			iommu_free_page(context);
 	}
 
-	free_pgtable_page(iommu->root_entry);
+	iommu_free_page(iommu->root_entry);
 	iommu->root_entry = NULL;
 }
 
@@ -939,7 +924,7 @@  static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 		if (!dma_pte_present(pte)) {
 			uint64_t pteval;
 
-			tmp_page = alloc_pgtable_page(domain->nid, gfp);
+			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
 
 			if (!tmp_page)
 				return NULL;
@@ -951,7 +936,7 @@  static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 
 			if (cmpxchg64(&pte->val, 0ULL, pteval))
 				/* Someone else set it while we were thinking; use theirs. */
-				free_pgtable_page(tmp_page);
+				iommu_free_page(tmp_page);
 			else
 				domain_flush_cache(domain, pte, sizeof(*pte));
 		}
@@ -1064,7 +1049,7 @@  static void dma_pte_free_level(struct dmar_domain *domain, int level,
 		      last_pfn < level_pfn + level_size(level) - 1)) {
 			dma_clear_pte(pte);
 			domain_flush_cache(domain, pte, sizeof(*pte));
-			free_pgtable_page(level_pte);
+			iommu_free_page(level_pte);
 		}
 next:
 		pfn += level_size(level);
@@ -1088,7 +1073,7 @@  static void dma_pte_free_pagetable(struct dmar_domain *domain,
 
 	/* free pgd */
 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
-		free_pgtable_page(domain->pgd);
+		iommu_free_page(domain->pgd);
 		domain->pgd = NULL;
 	}
 }
@@ -1190,7 +1175,7 @@  static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 {
 	struct root_entry *root;
 
-	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
+	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
 	if (!root) {
 		pr_err("Allocating root entry for %s failed\n",
 			iommu->name);
@@ -1863,7 +1848,7 @@  static void domain_exit(struct dmar_domain *domain)
 		LIST_HEAD(freelist);
 
 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
-		put_pages_list(&freelist);
+		iommu_free_pages_list(&freelist);
 	}
 
 	if (WARN_ON(!list_empty(&domain->devices)))
@@ -2637,7 +2622,7 @@  static int copy_context_table(struct intel_iommu *iommu,
 			if (!old_ce)
 				goto out;
 
-			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
+			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
 			if (!new_ce)
 				goto out_unmap;
 
@@ -3570,7 +3555,7 @@  static int intel_iommu_memory_notifier(struct notifier_block *nb,
 					start_vpfn, mhp->nr_pages,
 					list_empty(&freelist), 0);
 			rcu_read_unlock();
-			put_pages_list(&freelist);
+			iommu_free_pages_list(&freelist);
 		}
 		break;
 	}
@@ -4001,7 +3986,7 @@  static int md_domain_init(struct dmar_domain *domain, int guest_width)
 	domain->max_addr = 0;
 
 	/* always allocate the top pgd */
-	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
+	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
 	if (!domain->pgd)
 		return -ENOMEM;
 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
@@ -4148,7 +4133,7 @@  int prepare_domain_attach_device(struct iommu_domain *domain,
 		pte = dmar_domain->pgd;
 		if (dma_pte_present(pte)) {
 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
-			free_pgtable_page(pte);
+			iommu_free_page(pte);
 		}
 		dmar_domain->agaw--;
 	}
@@ -4295,7 +4280,7 @@  static void intel_iommu_tlb_sync(struct iommu_domain *domain,
 				      start_pfn, nrpages,
 				      list_empty(&gather->freelist), 0);
 
-	put_pages_list(&gather->freelist);
+	iommu_free_pages_list(&gather->freelist);
 }
 
 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 65d37a138c75..b505f3f44d0a 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -894,8 +894,6 @@  void domain_update_iommu_cap(struct dmar_domain *domain);
 
 int dmar_ir_support(void);
 
-void *alloc_pgtable_page(int node, gfp_t gfp);
-void free_pgtable_page(void *vaddr);
 void iommu_flush_write_buffer(struct intel_iommu *iommu);
 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
 struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 29b9e55dcf26..72e1c1342c13 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -22,6 +22,7 @@ 
 
 #include "iommu.h"
 #include "../irq_remapping.h"
+#include "../iommu-pages.h"
 #include "cap_audit.h"
 
 enum irq_mode {
@@ -536,8 +537,8 @@  static int intel_setup_irq_remapping(struct intel_iommu *iommu)
 	if (!ir_table)
 		return -ENOMEM;
 
-	pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO,
-				 INTR_REMAP_PAGE_ORDER);
+	pages = __iommu_alloc_pages_node(iommu->node, GFP_KERNEL,
+					 INTR_REMAP_PAGE_ORDER);
 	if (!pages) {
 		pr_err("IR%d: failed to allocate pages of order %d\n",
 		       iommu->seq_id, INTR_REMAP_PAGE_ORDER);
@@ -622,7 +623,7 @@  static int intel_setup_irq_remapping(struct intel_iommu *iommu)
 out_free_bitmap:
 	bitmap_free(bitmap);
 out_free_pages:
-	__free_pages(pages, INTR_REMAP_PAGE_ORDER);
+	__iommu_free_pages(pages, INTR_REMAP_PAGE_ORDER);
 out_free_table:
 	kfree(ir_table);
 
@@ -643,8 +644,7 @@  static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
 			irq_domain_free_fwnode(fn);
 			iommu->ir_domain = NULL;
 		}
-		free_pages((unsigned long)iommu->ir_table->base,
-			   INTR_REMAP_PAGE_ORDER);
+		iommu_free_pages(iommu->ir_table->base, INTR_REMAP_PAGE_ORDER);
 		bitmap_free(iommu->ir_table->bitmap);
 		kfree(iommu->ir_table);
 		iommu->ir_table = NULL;
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 74e8e4c17e81..1856e74bba78 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -20,6 +20,7 @@ 
 
 #include "iommu.h"
 #include "pasid.h"
+#include "../iommu-pages.h"
 
 /*
  * Intel IOMMU system wide PASID name space:
@@ -116,8 +117,7 @@  int intel_pasid_alloc_table(struct device *dev)
 
 	size = max_pasid >> (PASID_PDE_SHIFT - 3);
 	order = size ? get_order(size) : 0;
-	pages = alloc_pages_node(info->iommu->node,
-				 GFP_KERNEL | __GFP_ZERO, order);
+	pages = __iommu_alloc_pages_node(info->iommu->node, GFP_KERNEL, order);
 	if (!pages) {
 		kfree(pasid_table);
 		return -ENOMEM;
@@ -154,10 +154,10 @@  void intel_pasid_free_table(struct device *dev)
 	max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT;
 	for (i = 0; i < max_pde; i++) {
 		table = get_pasid_table_from_pde(&dir[i]);
-		free_pgtable_page(table);
+		iommu_free_page(table);
 	}
 
-	free_pages((unsigned long)pasid_table->table, pasid_table->order);
+	iommu_free_pages(pasid_table->table, pasid_table->order);
 	kfree(pasid_table);
 }
 
@@ -203,7 +203,7 @@  static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
 retry:
 	entries = get_pasid_table_from_pde(&dir[dir_index]);
 	if (!entries) {
-		entries = alloc_pgtable_page(info->iommu->node, GFP_ATOMIC);
+		entries = iommu_alloc_page_node(info->iommu->node, GFP_ATOMIC);
 		if (!entries)
 			return NULL;
 
@@ -215,7 +215,7 @@  static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
 		 */
 		if (cmpxchg64(&dir[dir_index].val, 0ULL,
 			      (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) {
-			free_pgtable_page(entries);
+			iommu_free_page(entries);
 			goto retry;
 		}
 		if (!ecap_coherent(info->iommu->ecap)) {
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 50a481c895b8..4cf8826b30e1 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -23,6 +23,7 @@ 
 #include "pasid.h"
 #include "perf.h"
 #include "../iommu-sva.h"
+#include "../iommu-pages.h"
 #include "trace.h"
 
 static irqreturn_t prq_event_thread(int irq, void *d);
@@ -67,7 +68,7 @@  int intel_svm_enable_prq(struct intel_iommu *iommu)
 	struct page *pages;
 	int irq, ret;
 
-	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
+	pages = __iommu_alloc_pages(GFP_KERNEL, PRQ_ORDER);
 	if (!pages) {
 		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
 			iommu->name);
@@ -118,7 +119,7 @@  int intel_svm_enable_prq(struct intel_iommu *iommu)
 	dmar_free_hwirq(irq);
 	iommu->pr_irq = 0;
 free_prq:
-	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+	iommu_free_pages(iommu->prq, PRQ_ORDER);
 	iommu->prq = NULL;
 
 	return ret;
@@ -141,7 +142,7 @@  int intel_svm_finish_prq(struct intel_iommu *iommu)
 		iommu->iopf_queue = NULL;
 	}
 
-	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+	iommu_free_pages(iommu->prq, PRQ_ORDER);
 	iommu->prq = NULL;
 
 	return 0;
diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
new file mode 100644
index 000000000000..2332f807d514
--- /dev/null
+++ b/drivers/iommu/iommu-pages.h
@@ -0,0 +1,199 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef __IOMMU_PAGES_H
+#define __IOMMU_PAGES_H
+
+#include <linux/vmstat.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+
+/*
+ * All page allocation that are performed in the IOMMU subsystem must use one of
+ * the functions below.  This is necessary for the proper accounting as IOMMU
+ * state can be rather large, i.e. multiple gigabytes in size.
+ */
+
+/**
+ * __iommu_alloc_pages_node - allocate a zeroed page of a given order from
+ * specific NUMA node.
+ * @nid: memory NUMA node id
+ * @gfp: buddy allocator flags
+ * @order: page order
+ *
+ * returns the head struct page of the allocated page.
+ */
+static inline struct page *__iommu_alloc_pages_node(int nid, gfp_t gfp,
+						    int order)
+{
+	struct page *pages;
+
+	pages = alloc_pages_node(nid, gfp | __GFP_ZERO, order);
+	if (!pages)
+		return NULL;
+
+	return pages;
+}
+
+/**
+ * __iommu_alloc_pages - allocate a zeroed page of a given order.
+ * @gfp: buddy allocator flags
+ * @order: page order
+ *
+ * returns the head struct page of the allocated page.
+ */
+static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order)
+{
+	struct page *pages;
+
+	pages = alloc_pages(gfp | __GFP_ZERO, order);
+	if (!pages)
+		return NULL;
+
+	return pages;
+}
+
+/**
+ * __iommu_alloc_page_node - allocate a zeroed page at specific NUMA node.
+ * @nid: memory NUMA node id
+ * @gfp: buddy allocator flags
+ *
+ * returns the struct page of the allocated page.
+ */
+static inline struct page *__iommu_alloc_page_node(int nid, gfp_t gfp)
+{
+	return __iommu_alloc_pages_node(nid, gfp, 0);
+}
+
+/**
+ * __iommu_alloc_page - allocate a zeroed page
+ * @gfp: buddy allocator flags
+ *
+ * returns the struct page of the allocated page.
+ */
+static inline struct page *__iommu_alloc_page(gfp_t gfp)
+{
+	return __iommu_alloc_pages(gfp, 0);
+}
+
+/**
+ * __iommu_free_pages - free page of a given order
+ * @pages: head struct page of the page
+ * @order: page order
+ */
+static inline void __iommu_free_pages(struct page *pages, int order)
+{
+	if (!pages)
+		return;
+
+	__free_pages(pages, order);
+}
+
+/**
+ * __iommu_free_page - free page
+ * @page: struct page of the page
+ */
+static inline void __iommu_free_page(struct page *page)
+{
+	__iommu_free_pages(page, 0);
+}
+
+/**
+ * iommu_alloc_pages_node - allocate a zeroed page of a given order from
+ * specific NUMA node.
+ * @nid: memory NUMA node id
+ * @gfp: buddy allocator flags
+ * @order: page order
+ *
+ * returns the virtual address of the allocated page
+ */
+static inline void *iommu_alloc_pages_node(int nid, gfp_t gfp, int order)
+{
+	struct page *pages = __iommu_alloc_pages_node(nid, gfp, order);
+
+	if (!pages)
+		return NULL;
+
+	return page_address(pages);
+}
+
+/**
+ * iommu_alloc_pages - allocate a zeroed page of a given order
+ * @gfp: buddy allocator flags
+ * @order: page order
+ *
+ * returns the virtual address of the allocated page
+ */
+static inline void *iommu_alloc_pages(gfp_t gfp, int order)
+{
+	struct page *pages = __iommu_alloc_pages(gfp, order);
+
+	if (!pages)
+		return NULL;
+
+	return page_address(pages);
+}
+
+/**
+ * iommu_alloc_page_node - allocate a zeroed page at specific NUMA node.
+ * @nid: memory NUMA node id
+ * @gfp: buddy allocator flags
+ *
+ * returns the virtual address of the allocated page
+ */
+static inline void *iommu_alloc_page_node(int nid, gfp_t gfp)
+{
+	return iommu_alloc_pages_node(nid, gfp, 0);
+}
+
+/**
+ * iommu_alloc_page - allocate a zeroed page
+ * @gfp: buddy allocator flags
+ *
+ * returns the virtual address of the allocated page
+ */
+static inline void *iommu_alloc_page(gfp_t gfp)
+{
+	return iommu_alloc_pages(gfp, 0);
+}
+
+/**
+ * iommu_free_pages - free page of a given order
+ * @virt: virtual address of the page to be freed.
+ * @order: page order
+ */
+static inline void iommu_free_pages(void *virt, int order)
+{
+	if (!virt)
+		return;
+
+	__iommu_free_pages(virt_to_page(virt), order);
+}
+
+/**
+ * iommu_free_page - free page
+ * @virt: virtual address of the page to be freed.
+ */
+static inline void iommu_free_page(void *virt)
+{
+	iommu_free_pages(virt, 0);
+}
+
+/**
+ * iommu_free_pages_list - free a list of pages.
+ * @pages: the head of the lru list to be freed.
+ */
+static inline void iommu_free_pages_list(struct list_head *pages)
+{
+	while (!list_empty(pages)) {
+		struct page *p = list_entry(pages->prev, struct page, lru);
+
+		list_del(&p->lru);
+		put_page(p);
+	}
+}
+
+#endif	/* __IOMMU_PAGES_H */