diff mbox series

[v4] misc: pci_endpoint_test: Use memcpy_toio()/memcpy_fromio() for BAR tests

Message ID 20240322164139.678228-1-cassel@kernel.org
State New
Headers show
Series [v4] misc: pci_endpoint_test: Use memcpy_toio()/memcpy_fromio() for BAR tests | expand

Commit Message

Niklas Cassel March 22, 2024, 4:41 p.m. UTC
The current code uses writel()/readl(), which has an implicit memory
barrier for every single readl()/writel().

Additionally, reading 4 bytes at a time over the PCI bus is not really
optimal, considering that this code is running in an ioctl handler.

Use memcpy_toio()/memcpy_fromio() for BAR tests.

Before patch with a 4MB BAR:
$ time /usr/bin/pcitest -b 1
BAR1:           OKAY
real    0m 1.56s

After patch with a 4MB BAR:
$ time /usr/bin/pcitest -b 1
BAR1:           OKAY
real    0m 0.54s

Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
Changes since v3:
-Use scope-based resource management __free attribute from cleanup.h to
 avoid overly verbose gotos and labels for error handling.
-Added a comment related to why we allocate a buffer of max 1MB.
 (kmalloc() default upper limit is usually 4 MB on ARM and x86.)

 drivers/misc/pci_endpoint_test.c | 54 +++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 12 deletions(-)

Comments

Kuppuswamy Sathyanarayanan March 22, 2024, 5:03 p.m. UTC | #1
On 3/22/24 9:41 AM, Niklas Cassel wrote:
> The current code uses writel()/readl(), which has an implicit memory
> barrier for every single readl()/writel().
>
> Additionally, reading 4 bytes at a time over the PCI bus is not really
> optimal, considering that this code is running in an ioctl handler.
>
> Use memcpy_toio()/memcpy_fromio() for BAR tests.
>
> Before patch with a 4MB BAR:
> $ time /usr/bin/pcitest -b 1
> BAR1:           OKAY
> real    0m 1.56s
>
> After patch with a 4MB BAR:
> $ time /usr/bin/pcitest -b 1
> BAR1:           OKAY
> real    0m 0.54s
>
> Signed-off-by: Niklas Cassel <cassel@kernel.org>
> ---
> Changes since v3:
> -Use scope-based resource management __free attribute from cleanup.h to
>  avoid overly verbose gotos and labels for error handling.
> -Added a comment related to why we allocate a buffer of max 1MB.
>  (kmalloc() default upper limit is usually 4 MB on ARM and x86.)
>
>  drivers/misc/pci_endpoint_test.c | 54 +++++++++++++++++++++++++-------
>  1 file changed, 42 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
> index 705029ad8eb5..bf64d3aff7d8 100644
> --- a/drivers/misc/pci_endpoint_test.c
> +++ b/drivers/misc/pci_endpoint_test.c
> @@ -7,6 +7,7 @@
>   */
>  
>  #include <linux/crc32.h>
> +#include <linux/cleanup.h>
>  #include <linux/delay.h>
>  #include <linux/fs.h>
>  #include <linux/io.h>
> @@ -272,31 +273,60 @@ static const u32 bar_test_pattern[] = {
>  	0xA5A5A5A5,
>  };
>  
> +static int pci_endpoint_test_bar_memcmp(struct pci_endpoint_test *test,
> +					enum pci_barno barno, int offset,
> +					void *write_buf, void *read_buf,
> +					int size)
> +{
> +	memset(write_buf, bar_test_pattern[barno], size);
> +	memcpy_toio(test->bar[barno] + offset, write_buf, size);
> +
> +	memcpy_fromio(read_buf, test->bar[barno] + offset, size);
> +
> +	return memcmp(write_buf, read_buf, size);
> +}
> +
>  static bool pci_endpoint_test_bar(struct pci_endpoint_test *test,
>  				  enum pci_barno barno)
>  {
> -	int j;
> -	u32 val;
> -	int size;
> +	int j, bar_size, buf_size, iters, remain;
> +	void *write_buf __free(kfree) = NULL;
> +	void *read_buf __free(kfree) = NULL;

Please check the following cleanup doc. Recommendation is to avoid above __free(kfree) = NULL declarations and directly define write_buf/read_buf.

https://lore.kernel.org/lkml/171097196970.1011049.9726486429680041876.stgit@dwillia2-xfh.jf.intel.com/T/#m05d71a668ff0fad46c2055dbdcde55d7381780b7

>  	struct pci_dev *pdev = test->pdev;
>  
>  	if (!test->bar[barno])
>  		return false;
>  
> -	size = pci_resource_len(pdev, barno);
> +	bar_size = pci_resource_len(pdev, barno);
>  
>  	if (barno == test->test_reg_bar)
> -		size = 0x4;
> +		bar_size = 0x4;
>  
> -	for (j = 0; j < size; j += 4)
> -		pci_endpoint_test_bar_writel(test, barno, j,
> -					     bar_test_pattern[barno]);
> +	/*
> +	 * Allocate a buffer of max size 1MB, and reuse that buffer while
> +	 * iterating over the whole BAR size (which might be much larger).
> +	 */
> +	buf_size = min(SZ_1M, bar_size);
>  
> -	for (j = 0; j < size; j += 4) {
> -		val = pci_endpoint_test_bar_readl(test, barno, j);
> -		if (val != bar_test_pattern[barno])
> +	write_buf = kmalloc(buf_size, GFP_KERNEL);
> +	if (!write_buf)
> +		return false;
> +
> +	read_buf = kmalloc(buf_size, GFP_KERNEL);
> +	if (!read_buf)
> +		return false;
> +
> +	iters = bar_size / buf_size;
> +	for (j = 0; j < iters; j++)
> +		if (pci_endpoint_test_bar_memcmp(test, barno, buf_size * j,
> +						 write_buf, read_buf, buf_size))
> +			return false;
> +
> +	remain = bar_size % buf_size;
> +	if (remain)
> +		if (pci_endpoint_test_bar_memcmp(test, barno, buf_size * iters,
> +						 write_buf, read_buf, remain))
>  			return false;
> -	}
>  
>  	return true;
>  }
Niklas Cassel March 25, 2024, 7:58 a.m. UTC | #2
Hello Kuppuswamy, Dan,

On Fri, Mar 22, 2024 at 10:03:12AM -0700, Kuppuswamy Sathyanarayanan wrote:
> > +
> >  static bool pci_endpoint_test_bar(struct pci_endpoint_test *test,
> >  				  enum pci_barno barno)
> >  {
> > -	int j;
> > -	u32 val;
> > -	int size;
> > +	int j, bar_size, buf_size, iters, remain;
> > +	void *write_buf __free(kfree) = NULL;
> > +	void *read_buf __free(kfree) = NULL;
> 
> Please check the following cleanup doc. Recommendation is to avoid above __free(kfree) = NULL declarations and directly define write_buf/read_buf.
> 
> https://lore.kernel.org/lkml/171097196970.1011049.9726486429680041876.stgit@dwillia2-xfh.jf.intel.com/T/#m05d71a668ff0fad46c2055dbdcde55d7381780b7

I don't think that the docs say that using:
void *ptr __free(kfree) = NULL;
is always an anti-pattern.

"If other cleanup helpers appeared in such a function that depended on
@dev being live to complete their unwind then using the
"struct obj_type *obj __free(...) = NULL" style is an anti-pattern that
potentially causes a use-after-free bug."

I think it says that it is an anti-pattern IFF there are two cleanup
helpers in a function AND they have have a inter-dependency.

This patch just adds a single cleanup helper, so there can be no
inter-dependency problem.

This pattern is common in Linus's current tree, see e.g.
$ git grep -C 3 -E "__free(.*) = NULL"

If this was a problem, I don't think we would have seen
100 different instances of this pattern.

In other words, I think this patch is fine.

Dan, please correct me if I'm wrong.


Kind regards,
Niklas
Niklas Cassel April 18, 2024, 9:08 a.m. UTC | #3
On Fri, Mar 22, 2024 at 05:41:38PM +0100, Niklas Cassel wrote:
> The current code uses writel()/readl(), which has an implicit memory
> barrier for every single readl()/writel().
> 
> Additionally, reading 4 bytes at a time over the PCI bus is not really
> optimal, considering that this code is running in an ioctl handler.
> 
> Use memcpy_toio()/memcpy_fromio() for BAR tests.
> 
> Before patch with a 4MB BAR:
> $ time /usr/bin/pcitest -b 1
> BAR1:           OKAY
> real    0m 1.56s
> 
> After patch with a 4MB BAR:
> $ time /usr/bin/pcitest -b 1
> BAR1:           OKAY
> real    0m 0.54s
> 
> Signed-off-by: Niklas Cassel <cassel@kernel.org>
> ---
> Changes since v3:
> -Use scope-based resource management __free attribute from cleanup.h to
>  avoid overly verbose gotos and labels for error handling.
> -Added a comment related to why we allocate a buffer of max 1MB.
>  (kmalloc() default upper limit is usually 4 MB on ARM and x86.)
> 
>  drivers/misc/pci_endpoint_test.c | 54 +++++++++++++++++++++++++-------
>  1 file changed, 42 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
> index 705029ad8eb5..bf64d3aff7d8 100644
> --- a/drivers/misc/pci_endpoint_test.c
> +++ b/drivers/misc/pci_endpoint_test.c
> @@ -7,6 +7,7 @@
>   */
>  
>  #include <linux/crc32.h>
> +#include <linux/cleanup.h>
>  #include <linux/delay.h>
>  #include <linux/fs.h>
>  #include <linux/io.h>
> @@ -272,31 +273,60 @@ static const u32 bar_test_pattern[] = {
>  	0xA5A5A5A5,
>  };
>  
> +static int pci_endpoint_test_bar_memcmp(struct pci_endpoint_test *test,
> +					enum pci_barno barno, int offset,
> +					void *write_buf, void *read_buf,
> +					int size)
> +{
> +	memset(write_buf, bar_test_pattern[barno], size);
> +	memcpy_toio(test->bar[barno] + offset, write_buf, size);
> +
> +	memcpy_fromio(read_buf, test->bar[barno] + offset, size);
> +
> +	return memcmp(write_buf, read_buf, size);
> +}
> +
>  static bool pci_endpoint_test_bar(struct pci_endpoint_test *test,
>  				  enum pci_barno barno)
>  {
> -	int j;
> -	u32 val;
> -	int size;
> +	int j, bar_size, buf_size, iters, remain;
> +	void *write_buf __free(kfree) = NULL;
> +	void *read_buf __free(kfree) = NULL;
>  	struct pci_dev *pdev = test->pdev;
>  
>  	if (!test->bar[barno])
>  		return false;
>  
> -	size = pci_resource_len(pdev, barno);
> +	bar_size = pci_resource_len(pdev, barno);
>  
>  	if (barno == test->test_reg_bar)
> -		size = 0x4;
> +		bar_size = 0x4;
>  
> -	for (j = 0; j < size; j += 4)
> -		pci_endpoint_test_bar_writel(test, barno, j,
> -					     bar_test_pattern[barno]);
> +	/*
> +	 * Allocate a buffer of max size 1MB, and reuse that buffer while
> +	 * iterating over the whole BAR size (which might be much larger).
> +	 */
> +	buf_size = min(SZ_1M, bar_size);
>  
> -	for (j = 0; j < size; j += 4) {
> -		val = pci_endpoint_test_bar_readl(test, barno, j);
> -		if (val != bar_test_pattern[barno])
> +	write_buf = kmalloc(buf_size, GFP_KERNEL);
> +	if (!write_buf)
> +		return false;
> +
> +	read_buf = kmalloc(buf_size, GFP_KERNEL);
> +	if (!read_buf)
> +		return false;
> +
> +	iters = bar_size / buf_size;
> +	for (j = 0; j < iters; j++)
> +		if (pci_endpoint_test_bar_memcmp(test, barno, buf_size * j,
> +						 write_buf, read_buf, buf_size))
> +			return false;
> +
> +	remain = bar_size % buf_size;
> +	if (remain)
> +		if (pci_endpoint_test_bar_memcmp(test, barno, buf_size * iters,
> +						 write_buf, read_buf, remain))
>  			return false;
> -	}
>  
>  	return true;
>  }
> -- 
> 2.44.0
> 

Gentle ping :)


Kind regards,
Niklas
Kuppuswamy Sathyanarayanan April 18, 2024, 5:49 p.m. UTC | #4
On 3/22/24 9:41 AM, Niklas Cassel wrote:
> The current code uses writel()/readl(), which has an implicit memory
> barrier for every single readl()/writel().
>
> Additionally, reading 4 bytes at a time over the PCI bus is not really
> optimal, considering that this code is running in an ioctl handler.
>
> Use memcpy_toio()/memcpy_fromio() for BAR tests.
>
> Before patch with a 4MB BAR:
> $ time /usr/bin/pcitest -b 1
> BAR1:           OKAY
> real    0m 1.56s
>
> After patch with a 4MB BAR:
> $ time /usr/bin/pcitest -b 1
> BAR1:           OKAY
> real    0m 0.54s
>
> Signed-off-by: Niklas Cassel <cassel@kernel.org>
> ---
LGTM

Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> Changes since v3:
> -Use scope-based resource management __free attribute from cleanup.h to
>  avoid overly verbose gotos and labels for error handling.
> -Added a comment related to why we allocate a buffer of max 1MB.
>  (kmalloc() default upper limit is usually 4 MB on ARM and x86.)
>
>  drivers/misc/pci_endpoint_test.c | 54 +++++++++++++++++++++++++-------
>  1 file changed, 42 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
> index 705029ad8eb5..bf64d3aff7d8 100644
> --- a/drivers/misc/pci_endpoint_test.c
> +++ b/drivers/misc/pci_endpoint_test.c
> @@ -7,6 +7,7 @@
>   */
>  
>  #include <linux/crc32.h>
> +#include <linux/cleanup.h>
>  #include <linux/delay.h>
>  #include <linux/fs.h>
>  #include <linux/io.h>
> @@ -272,31 +273,60 @@ static const u32 bar_test_pattern[] = {
>  	0xA5A5A5A5,
>  };
>  
> +static int pci_endpoint_test_bar_memcmp(struct pci_endpoint_test *test,
> +					enum pci_barno barno, int offset,
> +					void *write_buf, void *read_buf,
> +					int size)
> +{
> +	memset(write_buf, bar_test_pattern[barno], size);
> +	memcpy_toio(test->bar[barno] + offset, write_buf, size);
> +
> +	memcpy_fromio(read_buf, test->bar[barno] + offset, size);
> +
> +	return memcmp(write_buf, read_buf, size);
> +}
> +
>  static bool pci_endpoint_test_bar(struct pci_endpoint_test *test,
>  				  enum pci_barno barno)
>  {
> -	int j;
> -	u32 val;
> -	int size;
> +	int j, bar_size, buf_size, iters, remain;
> +	void *write_buf __free(kfree) = NULL;
> +	void *read_buf __free(kfree) = NULL;
>  	struct pci_dev *pdev = test->pdev;
>  
>  	if (!test->bar[barno])
>  		return false;
>  
> -	size = pci_resource_len(pdev, barno);
> +	bar_size = pci_resource_len(pdev, barno);
>  
>  	if (barno == test->test_reg_bar)
> -		size = 0x4;
> +		bar_size = 0x4;
>  
> -	for (j = 0; j < size; j += 4)
> -		pci_endpoint_test_bar_writel(test, barno, j,
> -					     bar_test_pattern[barno]);
> +	/*
> +	 * Allocate a buffer of max size 1MB, and reuse that buffer while
> +	 * iterating over the whole BAR size (which might be much larger).
> +	 */
> +	buf_size = min(SZ_1M, bar_size);
>  
> -	for (j = 0; j < size; j += 4) {
> -		val = pci_endpoint_test_bar_readl(test, barno, j);
> -		if (val != bar_test_pattern[barno])
> +	write_buf = kmalloc(buf_size, GFP_KERNEL);
> +	if (!write_buf)
> +		return false;
> +
> +	read_buf = kmalloc(buf_size, GFP_KERNEL);
> +	if (!read_buf)
> +		return false;
> +
> +	iters = bar_size / buf_size;
> +	for (j = 0; j < iters; j++)
> +		if (pci_endpoint_test_bar_memcmp(test, barno, buf_size * j,
> +						 write_buf, read_buf, buf_size))
> +			return false;
> +
> +	remain = bar_size % buf_size;
> +	if (remain)
> +		if (pci_endpoint_test_bar_memcmp(test, barno, buf_size * iters,
> +						 write_buf, read_buf, remain))
>  			return false;
> -	}
>  
>  	return true;
>  }
diff mbox series

Patch

diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index 705029ad8eb5..bf64d3aff7d8 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -7,6 +7,7 @@ 
  */
 
 #include <linux/crc32.h>
+#include <linux/cleanup.h>
 #include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/io.h>
@@ -272,31 +273,60 @@  static const u32 bar_test_pattern[] = {
 	0xA5A5A5A5,
 };
 
+static int pci_endpoint_test_bar_memcmp(struct pci_endpoint_test *test,
+					enum pci_barno barno, int offset,
+					void *write_buf, void *read_buf,
+					int size)
+{
+	memset(write_buf, bar_test_pattern[barno], size);
+	memcpy_toio(test->bar[barno] + offset, write_buf, size);
+
+	memcpy_fromio(read_buf, test->bar[barno] + offset, size);
+
+	return memcmp(write_buf, read_buf, size);
+}
+
 static bool pci_endpoint_test_bar(struct pci_endpoint_test *test,
 				  enum pci_barno barno)
 {
-	int j;
-	u32 val;
-	int size;
+	int j, bar_size, buf_size, iters, remain;
+	void *write_buf __free(kfree) = NULL;
+	void *read_buf __free(kfree) = NULL;
 	struct pci_dev *pdev = test->pdev;
 
 	if (!test->bar[barno])
 		return false;
 
-	size = pci_resource_len(pdev, barno);
+	bar_size = pci_resource_len(pdev, barno);
 
 	if (barno == test->test_reg_bar)
-		size = 0x4;
+		bar_size = 0x4;
 
-	for (j = 0; j < size; j += 4)
-		pci_endpoint_test_bar_writel(test, barno, j,
-					     bar_test_pattern[barno]);
+	/*
+	 * Allocate a buffer of max size 1MB, and reuse that buffer while
+	 * iterating over the whole BAR size (which might be much larger).
+	 */
+	buf_size = min(SZ_1M, bar_size);
 
-	for (j = 0; j < size; j += 4) {
-		val = pci_endpoint_test_bar_readl(test, barno, j);
-		if (val != bar_test_pattern[barno])
+	write_buf = kmalloc(buf_size, GFP_KERNEL);
+	if (!write_buf)
+		return false;
+
+	read_buf = kmalloc(buf_size, GFP_KERNEL);
+	if (!read_buf)
+		return false;
+
+	iters = bar_size / buf_size;
+	for (j = 0; j < iters; j++)
+		if (pci_endpoint_test_bar_memcmp(test, barno, buf_size * j,
+						 write_buf, read_buf, buf_size))
+			return false;
+
+	remain = bar_size % buf_size;
+	if (remain)
+		if (pci_endpoint_test_bar_memcmp(test, barno, buf_size * iters,
+						 write_buf, read_buf, remain))
 			return false;
-	}
 
 	return true;
 }