
core/pci: Introduce virtual device and filter cache

Message ID 1473319172-11289-1-git-send-email-gwshan@linux.vnet.ibm.com
State Deferred

Commit Message

Gavin Shan Sept. 8, 2016, 7:19 a.m. UTC
This introduces a virtual device cache and a filter cache to speed up the
search for the PCI virtual device and config space register filter in the
PCI config access path. With this applied, the original bandwidth is
regained during NPU1 bandwidth testing.

Original bandwidth before the PCI virtual device patchset is applied:

garrison# pwd
/home/alistair/NVIDIA_CUDA-7.5_Samples/1_Utilities/bandwidthTest
garrison# ./bandwidthTest --memory=pinned
    :
 Host to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)	Bandwidth(MB/s)
   33554432			33205.6

With the PCI virtual device patchset and this one applied:

 Host to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)	Bandwidth(MB/s)
   33554432			33080.0

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 core/pci-virt.c    | 23 +++++++++++++++++++++--
 include/pci-virt.h |  1 +
 include/pci.h      |  1 +
 3 files changed, 23 insertions(+), 2 deletions(-)

Comments

Alistair Popple Sept. 12, 2016, 12:59 a.m. UTC | #1
Hi Gavin,

On Thu, 8 Sep 2016 05:19:32 PM Gavin Shan wrote:
> This introduces virtual device and filter cache to speed up the
> searching for PCI virtual device and config space register filter
> in PCI config access path. With this applied, the original bandwidth
> is regained during the NPU1 bandwidth testing.

Have you done any profiling that shows the virtual pci filters are the source 
of the slow down? I am quite confused why this is an issue as once the links 
are setup nothing should be touching the virtual config spaces (apart from a 
watchdog thread running once a second in the nvidia device driver).

I am concerned the slow down may be due to some other reason, such as link 
training not working correctly, so I would like some confirmation that slow 
config space access is what is causing this. Also, if the slow down is due to 
frequent config space accesses then we should be looking to reduce the 
frequency of accesses. There is probably also plenty of scope to further 
optimise these code paths, as they were assumed to be on a fairly slow path 
when written.

Regards,

Alistair

> Original bandwidth before PCI virtual device patchset is applied:
> 
> garrison# pwd
> /home/alistair/NVIDIA_CUDA-7.5_Samples/1_Utilities/bandwidthTest
> garrison# ./bandwidthTest --memory=pinned
>     :
>  Host to Device Bandwidth, 1 Device(s)
>  PINNED Memory Transfers
>    Transfer Size (Bytes)	Bandwidth(MB/s)
>    33554432			33205.6
> 
> With PCI virtual device patchset and this one is applied:
> 
>  Host to Device Bandwidth, 1 Device(s)
>  PINNED Memory Transfers
>    Transfer Size (Bytes)	Bandwidth(MB/s)
>    33554432			33080.0
> 
> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> ---
>  core/pci-virt.c    | 23 +++++++++++++++++++++--
>  include/pci-virt.h |  1 +
>  include/pci.h      |  1 +
>  3 files changed, 23 insertions(+), 2 deletions(-)
> 
> diff --git a/core/pci-virt.c b/core/pci-virt.c
> index b531470..663225a 100644
> --- a/core/pci-virt.c
> +++ b/core/pci-virt.c
> @@ -55,6 +55,12 @@ static struct pci_cfg_reg_filter *pci_virt_find_filter(
>  	if (!pvd || !len || start >= pvd->cfg_size)
>  		return NULL;
>  
> +	/* Check the cache first */
> +	pcrf = pvd->cached_pcrf;
> +	if (pcrf && start < (pcrf->start + pcrf->len) &&
> +	    (start + len) > pcrf->start)
> +		return pcrf;
> +
>  	/* Return filter if there is overlapped region. We don't
>  	 * require strict matching for more flexibility. It also
>  	 * means the associated handler should validate the register
> @@ -62,8 +68,12 @@ static struct pci_cfg_reg_filter *pci_virt_find_filter(
>  	 */
>  	list_for_each(&pvd->pcrf, pcrf, link) {
>  		if (start < (pcrf->start + pcrf->len) &&
> -		    (start + len) > pcrf->start)
> +		    (start + len) > pcrf->start) {
> +			/* Update the cache */
> +			pvd->cached_pcrf = pcrf;
> +
>  			return pcrf;
> +		}
>  	}
>  
>  	return NULL;
> @@ -111,9 +121,18 @@ struct pci_virt_device *pci_virt_find_device(struct phb *phb,
>  {
>  	struct pci_virt_device *pvd;
>  
> +	/* Check the cached virtual device firstly */
> +	pvd = phb->cached_pvd;
> +	if (pvd && pvd->bdfn == bdfn)
> +		return pvd;
> +
>  	list_for_each(&phb->virt_devices, pvd, node) {
> -		if (pvd->bdfn == bdfn)
> +		if (pvd->bdfn == bdfn) {
> +			/* Update the cache */
> +			phb->cached_pvd = pvd;
> +
>  			return pvd;
> +		}
>  	}
>  
>  	return NULL;
> diff --git a/include/pci-virt.h b/include/pci-virt.h
> index 7c787cf..d1427bc 100644
> --- a/include/pci-virt.h
> +++ b/include/pci-virt.h
> @@ -30,6 +30,7 @@ struct pci_virt_device {
>  	uint32_t		bdfn;
>  	uint32_t		cfg_size;
>  	uint8_t			*config[PCI_VIRT_CFG_MAX];
> +	struct pci_cfg_reg_filter	*cached_pcrf;
>  	struct list_head	pcrf;
>  	struct list_node	node;
>  	void			*data;
> diff --git a/include/pci.h b/include/pci.h
> index 92e3dce..c326a88 100644
> --- a/include/pci.h
> +++ b/include/pci.h
> @@ -324,6 +324,7 @@ struct phb {
>  	enum phb_type		phb_type;
>  	struct lock		lock;
>  	struct list_head	devices;
> +	struct pci_virt_device	*cached_pvd;
>  	struct list_head	virt_devices;
>  	const struct phb_ops	*ops;
>  	struct pci_lsi_state	lstate;
>
Gavin Shan Sept. 13, 2016, 8:01 a.m. UTC | #2
On Mon, Sep 12, 2016 at 10:59:18AM +1000, Alistair Popple wrote:
>On Thu, 8 Sep 2016 05:19:32 PM Gavin Shan wrote:
>> This introduces virtual device and filter cache to speed up the
>> searching for PCI virtual device and config space register filter
>> in PCI config access path. With this applied, the original bandwidth
>> is regained during the NPU1 bandwidth testing.
>
>Have you done any profiling that shows the virtual pci filters are the source 
>of the slow down? I am quite confused why this is an issue as once the links 
>are setup nothing should be touching the virtual config spaces (apart from a 
>watchdog thread running once a second in the nvidia device driver).
>

Alistair, I didn't do any profiling. The code change included in this
patch just introduces a virtual PCI device cache and a filter cache,
both of which are used in the PCI config access path via the virtual
PCI filters. The fact that this helps indirectly proves we have lots
of PCI config traffic hitting the virtual PCI filters, right?

>I am concerned the slow down may be due to some other reason such as link 
>training not work correctly so would like some confirmation that slow config 
>space access is what is causing this. Also if the slow down is due to frequent 
>config space accesses then we should be looking to reduce the frequency of 
>accesses. There is probably also plenty of scope to further optimise these 
>code paths as they were assumed to be on a fairly slow path when written.
>

Yeah, I agree that PCI config access is on the slow path. I'm not sure why
we have so much PCI config traffic during the testing. I will rerun the test
with extra code to reveal how many PCI config reads/writes are issued and
their distribution when I get a chance. It's not high priority, I think :-)
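
Roughly along the lines of the sketch below (hypothetical names throughout;
uint64_t/bool/printf are assumed to be available as in the rest of skiboot,
and exactly where this hooks into the config access path is an assumption,
not something this patch defines):

/* Hypothetical sketch, not part of this patch: per-register read/write
 * counters for the virtual config space.  pci_virt_count_access() would be
 * called once per access from the config access path, and the dump routine
 * run on demand to show which registers are being hammered.
 */
#define CFG_WORDS	(4096 / 4)	/* extended config space, in 32-bit words */

static uint64_t cfg_reads[CFG_WORDS], cfg_writes[CFG_WORDS];

static void pci_virt_count_access(uint32_t offset, bool write)
{
	uint32_t i = (offset >> 2) % CFG_WORDS;

	if (write)
		cfg_writes[i]++;
	else
		cfg_reads[i]++;
}

static void pci_virt_dump_access_counts(void)
{
	uint32_t i;

	/* Only print the registers that were actually touched */
	for (i = 0; i < CFG_WORDS; i++)
		if (cfg_reads[i] || cfg_writes[i])
			printf("PCI-VIRT: reg 0x%03x reads=%llu writes=%llu\n",
			       (unsigned int)(i << 2),
			       (unsigned long long)cfg_reads[i],
			       (unsigned long long)cfg_writes[i]);
}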

Thanks,
Gavin
Alistair Popple Sept. 14, 2016, 3:55 a.m. UTC | #3
On Tue, 13 Sep 2016 06:01:14 PM Gavin Shan wrote:
> On Mon, Sep 12, 2016 at 10:59:18AM +1000, Alistair Popple wrote:
> >On Thu, 8 Sep 2016 05:19:32 PM Gavin Shan wrote:
> >> This introduces virtual device and filter cache to speed up the
> >> searching for PCI virtual device and config space register filter
> >> in PCI config access path. With this applied, the original bandwidth
> >> is regained during the NPU1 bandwidth testing.
> >
> >Have you done any profiling that shows the virtual pci filters are the source
> >of the slow down? I am quite confused why this is an issue as once the links
> >are setup nothing should be touching the virtual config spaces (apart from a
> >watchdog thread running once a second in the nvidia device driver).
> >
> 
> Alistair, I didn't do the profiling. As the code change included in
> this patch is just introduce a virtual PCI device and filter cache.
> Both of them are used in PCI config access with help of the virtual
> PCI filters. It proves we have lots of PCI config traffic hitting the
> virtual PCI filters indirectly, right?

Perhaps it does indirectly, but the bandwidth could also be lower due to 
incorrect implementation of config space (especially as we shouldn't be 
getting any config space traffic). So it would be good to get more direct 
proof that the slow down is due to lots of config space traffic during the 
bandwidth test so that:

1) We can be sure it's just a code performance issue and not a config space 
implementation detail.
2) We can go back to nVidia and work out why there are so many config space 
accesses.

All we really need to do is either run the bandwidth test under perf or add 
some printf's to the config space filter.
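
Something like the sketch below would do on the skiboot side (purely
hypothetical - the parameter names and types for pci_virt_find_filter() and
the use of plain printf are assumptions based on the diff in this thread,
not existing code):

/* Hypothetical instrumentation sketch, not part of the patch under review:
 * count every virtual filter lookup and dump the running total every 64K
 * calls, so the log shows how much config traffic really hits the filters.
 */
static uint64_t pci_virt_filter_lookups;

static struct pci_cfg_reg_filter *pci_virt_find_filter(
	struct pci_virt_device *pvd, uint32_t start, uint32_t len)
{
	struct pci_cfg_reg_filter *pcrf;

	if (!pvd || !len || start >= pvd->cfg_size)
		return NULL;

	if ((++pci_virt_filter_lookups & 0xffff) == 0)
		printf("PCI-VIRT: %llu filter lookups so far\n",
		       (unsigned long long)pci_virt_filter_lookups);

	list_for_each(&pvd->pcrf, pcrf, link) {
		if (start < (pcrf->start + pcrf->len) &&
		    (start + len) > pcrf->start)
			return pcrf;
	}

	return NULL;
}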

> >I am concerned the slow down may be due to some other reason such as link 
> >training not work correctly so would like some confirmation that slow config
> >space access is what is causing this. Also if the slow down is due to frequent
> >config space accesses then we should be looking to reduce the frequency of 
> >accesses. There is probably also plenty of scope to further optimise these 
> >code paths as they were assumed to be on a fairly slow path when written.
> >
> 
> Yeah, I agree the PCI config access is the slow path. I'm not sure why
> we have huge PCI config traffic during the testing. I will rerun the test
> with more code to reveal how much PCI config read/write are issued and
> their distribution when I get a chance. It's not high priority as I think :-)

What I meant is that no effort has been put into optimisation of that code 
path because it shouldn't be on a performance critical path. Given the PCI 
virtual device patches are upstream we should work this out sooner rather than 
later :-)

Thanks!

- Alistair

> Thanks,
> Gavin
>
Stewart Smith Dec. 8, 2016, 12:10 a.m. UTC | #4
Alistair Popple <alistair@popple.id.au> writes:
> On Tue, 13 Sep 2016 06:01:14 PM Gavin Shan wrote:
>> On Mon, Sep 12, 2016 at 10:59:18AM +1000, Alistair Popple wrote:
>> >On Thu, 8 Sep 2016 05:19:32 PM Gavin Shan wrote:
>> >> This introduces virtual device and filter cache to speed up the
>> >> searching for PCI virtual device and config space register filter
>> >> in PCI config access path. With this applied, the original bandwidth
>> >> is regained during the NPU1 bandwidth testing.
>> >
>> >Have you done any profiling that shows the virtual pci filters are the source
>> >of the slow down? I am quite confused why this is an issue as once the links
>> >are setup nothing should be touching the virtual config spaces (apart from a
>> >watchdog thread running once a second in the nvidia device driver).
>> >
>> 
>> Alistair, I didn't do the profiling. As the code change included in
>> this patch is just introduce a virtual PCI device and filter cache.
>> Both of them are used in PCI config access with help of the virtual
>> PCI filters. It proves we have lots of PCI config traffic hitting the
>> virtual PCI filters indirectly, right?
>
> Perhaps it does indirectly, but the bandwidth could also be lower due to 
> incorrect implementation of config space (especially as we shouldn't be 
> getting any config space traffic). So it would be good to get more direct 
> proof that the slow down is due to lots of config space traffic during the 
> bandwidth test so that:
>
> 1) We can be sure it's just a code performance issue and not a config space 
> implementation detail.
> 2) So that we can go back to nVidia and work out why there are so many config 
> space accesses.
>
> All we really need to do is either the bandwidth test under perf or add some 
> printf's to the config space filter.
>
>> >I am concerned the slow down may be due to some other reason such as link 
>> >training not work correctly so would like some confirmation that slow config
>> >space access is what is causing this. Also if the slow down is due to frequent
>> >config space accesses then we should be looking to reduce the frequency of 
>> >accesses. There is probably also plenty of scope to further optimise these 
>> >code paths as they were assumed to be on a fairly slow path when written.
>> >
>> 
>> Yeah, I agree the PCI config access is the slow path. I'm not sure why
>> we have huge PCI config traffic during the testing. I will rerun the test
>> with more code to reveal how much PCI config read/write are issued and
>> their distribution when I get a chance. It's not high priority as I think :-)
>
> What I meant is that no effort has been put into optimisation of that code 
> path because it shouldn't be on a performance critical path. Given the PCI 
> virtual device patches are upstream we should work this out sooner rather than 
> later :-)

Did we ever get a resolution on this? I was kind of holding off on
merging until some kind of consensus occurred that this was the slow path.
Alistair Popple Dec. 8, 2016, 2:25 a.m. UTC | #5
On Thu, 8 Dec 2016 11:10:39 AM Stewart Smith wrote:
> Alistair Popple <alistair@popple.id.au> writes:
> > On Tue, 13 Sep 2016 06:01:14 PM Gavin Shan wrote:
> >> On Mon, Sep 12, 2016 at 10:59:18AM +1000, Alistair Popple wrote:
> >> >On Thu, 8 Sep 2016 05:19:32 PM Gavin Shan wrote:
> >> >> This introduces virtual device and filter cache to speed up the
> >> >> searching for PCI virtual device and config space register filter
> >> >> in PCI config access path. With this applied, the original bandwidth
> >> >> is regained during the NPU1 bandwidth testing.
> >> >
> >> >Have you done any profiling that shows the virtual pci filters are the source
> >> >of the slow down? I am quite confused why this is an issue as once the links
> >> >are setup nothing should be touching the virtual config spaces (apart from a
> >> >watchdog thread running once a second in the nvidia device driver).
> >> >
> >> 
> >> Alistair, I didn't do the profiling. As the code change included in
> >> this patch is just introduce a virtual PCI device and filter cache.
> >> Both of them are used in PCI config access with help of the virtual
> >> PCI filters. It proves we have lots of PCI config traffic hitting the
> >> virtual PCI filters indirectly, right?
> >
> > Perhaps it does indirectly, but the bandwidth could also be lower due to 
> > incorrect implementation of config space (especially as we shouldn't be 
> > getting any config space traffic). So it would be good to get more direct 
> > proof that the slow down is due to lots of config space traffic during the 
> > bandwidth test so that:
> >
> > 1) We can be sure it's just a code performance issue and not a config space 
> > implementation detail.
> > 2) So that we can go back to nVidia and work out why there are so many config 
> > space accesses.
> >
> > All we really need to do is either the bandwidth test under perf or add some 
> > printf's to the config space filter.
> >
> >> >I am concerned the slow down may be due to some other reason such as link 
> >> >training not work correctly so would like some confirmation that slow config
> >> >space access is what is causing this. Also if the slow down is due to frequent
> >> >config space accesses then we should be looking to reduce the frequency of 
> >> >accesses. There is probably also plenty of scope to further optimise these 
> >> >code paths as they were assumed to be on a fairly slow path when written.
> >> >
> >> 
> >> Yeah, I agree the PCI config access is the slow path. I'm not sure why
> >> we have huge PCI config traffic during the testing. I will rerun the test
> >> with more code to reveal how much PCI config read/write are issued and
> >> their distribution when I get a chance. It's not high priority as I think :-)
> >
> > What I meant is that no effort has been put into optimisation of that code 
> > path because it shouldn't be on a performance critical path. Given the PCI 
> > virtual device patches are upstream we should work this out sooner rather than 
> > later :-)
> 
> Did we ever get a resolution on this? I was kind of holding off on
> merging until some kind of consensus occured that this was the slow path?

I haven't had a chance to look into it myself yet. Gavin, have you?
Gavin Shan Dec. 9, 2016, 12:04 a.m. UTC | #6
On Thu, Dec 08, 2016 at 01:25:42PM +1100, Alistair Popple wrote:
>On Thu, 8 Dec 2016 11:10:39 AM Stewart Smith wrote:
>> Alistair Popple <alistair@popple.id.au> writes:
>> > On Tue, 13 Sep 2016 06:01:14 PM Gavin Shan wrote:
>> >> On Mon, Sep 12, 2016 at 10:59:18AM +1000, Alistair Popple wrote:
>> >> >On Thu, 8 Sep 2016 05:19:32 PM Gavin Shan wrote:
>> >> >> This introduces virtual device and filter cache to speed up the
>> >> >> searching for PCI virtual device and config space register filter
>> >> >> in PCI config access path. With this applied, the original bandwidth
>> >> >> is regained during the NPU1 bandwidth testing.
>> >> >
>> >> >Have you done any profiling that shows the virtual pci filters are the source
>> >> >of the slow down? I am quite confused why this is an issue as once the links
>> >> >are setup nothing should be touching the virtual config spaces (apart from a
>> >> >watchdog thread running once a second in the nvidia device driver).
>> >> >
>> >> 
>> >> Alistair, I didn't do the profiling. As the code change included in
>> >> this patch is just introduce a virtual PCI device and filter cache.
>> >> Both of them are used in PCI config access with help of the virtual
>> >> PCI filters. It proves we have lots of PCI config traffic hitting the
>> >> virtual PCI filters indirectly, right?
>> >
>> > Perhaps it does indirectly, but the bandwidth could also be lower due to 
>> > incorrect implementation of config space (especially as we shouldn't be 
>> > getting any config space traffic). So it would be good to get more direct 
>> > proof that the slow down is due to lots of config space traffic during the 
>> > bandwidth test so that:
>> >
>> > 1) We can be sure it's just a code performance issue and not a config space 
>> > implementation detail.
>> > 2) So that we can go back to nVidia and work out why there are so many config 
>> > space accesses.
>> >
>> > All we really need to do is either the bandwidth test under perf or add some 
>> > printf's to the config space filter.
>> >
>> >> >I am concerned the slow down may be due to some other reason such as link 
>> >> >training not work correctly so would like some confirmation that slow config
>> >> >space access is what is causing this. Also if the slow down is due to frequent
>> >> >config space accesses then we should be looking to reduce the frequency of 
>> >> >accesses. There is probably also plenty of scope to further optimise these 
>> >> >code paths as they were assumed to be on a fairly slow path when written.
>> >> >
>> >> 
>> >> Yeah, I agree the PCI config access is the slow path. I'm not sure why
>> >> we have huge PCI config traffic during the testing. I will rerun the test
>> >> with more code to reveal how much PCI config read/write are issued and
>> >> their distribution when I get a chance. It's not high priority as I think :-)
>> >
>> > What I meant is that no effort has been put into optimisation of that code 
>> > path because it shouldn't be on a performance critical path. Given the PCI 
>> > virtual device patches are upstream we should work this out sooner rather than 
>> > later :-)
>> 
>> Did we ever get a resolution on this? I was kind of holding off on
>> merging until some kind of consensus occured that this was the slow path?
>
>I haven't had a chance to look into it myself yet. Gavin have you?
>

No, I didn't get a chance to look at it, but why not just merge it? With it,
the performance is improved. Isn't that the result we want? :-)

Thanks,
Gavin
Alistair Popple Dec. 13, 2016, 2:17 a.m. UTC | #7
On Fri, 9 Dec 2016 11:04:47 AM Gavin Shan wrote:
> On Thu, Dec 08, 2016 at 01:25:42PM +1100, Alistair Popple wrote:
> >On Thu, 8 Dec 2016 11:10:39 AM Stewart Smith wrote:
> >> Alistair Popple <alistair@popple.id.au> writes:
> >> > On Tue, 13 Sep 2016 06:01:14 PM Gavin Shan wrote:
> >> >> On Mon, Sep 12, 2016 at 10:59:18AM +1000, Alistair Popple wrote:
> >> >> >On Thu, 8 Sep 2016 05:19:32 PM Gavin Shan wrote:
> >> >> >> This introduces virtual device and filter cache to speed up the
> >> >> >> searching for PCI virtual device and config space register filter
> >> >> >> in PCI config access path. With this applied, the original bandwidth
> >> >> >> is regained during the NPU1 bandwidth testing.
> >> >> >
> >> >> >Have you done any profiling that shows the virtual pci filters are the source
> >> >> >of the slow down? I am quite confused why this is an issue as once the links
> >> >> >are setup nothing should be touching the virtual config spaces (apart from a
> >> >> >watchdog thread running once a second in the nvidia device driver).
> >> >> >
> >> >> 
> >> >> Alistair, I didn't do the profiling. As the code change included in
> >> >> this patch is just introduce a virtual PCI device and filter cache.
> >> >> Both of them are used in PCI config access with help of the virtual
> >> >> PCI filters. It proves we have lots of PCI config traffic hitting the
> >> >> virtual PCI filters indirectly, right?
> >> >
> >> > Perhaps it does indirectly, but the bandwidth could also be lower due to 
> >> > incorrect implementation of config space (especially as we shouldn't be 
> >> > getting any config space traffic). So it would be good to get more direct 
> >> > proof that the slow down is due to lots of config space traffic during the 
> >> > bandwidth test so that:
> >> >
> >> > 1) We can be sure it's just a code performance issue and not a config space 
> >> > implementation detail.
> >> > 2) So that we can go back to nVidia and work out why there are so many config 
> >> > space accesses.
> >> >
> >> > All we really need to do is either the bandwidth test under perf or add some 
> >> > printf's to the config space filter.
> >> >
> >> >> >I am concerned the slow down may be due to some other reason such as link 
> >> >> >training not work correctly so would like some confirmation that slow config
> >> >> >space access is what is causing this. Also if the slow down is due to frequent
> >> >> >config space accesses then we should be looking to reduce the frequency of 
> >> >> >accesses. There is probably also plenty of scope to further optimise these 
> >> >> >code paths as they were assumed to be on a fairly slow path when written.
> >> >> >
> >> >> 
> >> >> Yeah, I agree the PCI config access is the slow path. I'm not sure why
> >> >> we have huge PCI config traffic during the testing. I will rerun the test
> >> >> with more code to reveal how much PCI config read/write are issued and
> >> >> their distribution when I get a chance. It's not high priority as I think :-)
> >> >
> >> > What I meant is that no effort has been put into optimisation of that code 
> >> > path because it shouldn't be on a performance critical path. Given the PCI 
> >> > virtual device patches are upstream we should work this out sooner rather than 
> >> > later :-)
> >> 
> >> Did we ever get a resolution on this? I was kind of holding off on
> >> merging until some kind of consensus occured that this was the slow path?
> >
> >I haven't had a chance to look into it myself yet. Gavin have you?
> >
> 
> No, I didn't get chance looking at it, but why not just merge it?  With it,
> the performance is improved. Isn't the result we want? :-)

The only reason not to merge it would be that it increases code complexity
unnecessarily - that code path should not be anywhere near that hot for Nvidia
performance, and if it is, we need to do a better job of optimising all of it.

Regards,

Alistair

> Thanks,
> Gavin
> 
> _______________________________________________
> Skiboot mailing list
> Skiboot@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
>

Patch

diff --git a/core/pci-virt.c b/core/pci-virt.c
index b531470..663225a 100644
--- a/core/pci-virt.c
+++ b/core/pci-virt.c
@@ -55,6 +55,12 @@  static struct pci_cfg_reg_filter *pci_virt_find_filter(
 	if (!pvd || !len || start >= pvd->cfg_size)
 		return NULL;
 
+	/* Check the cache first */
+	pcrf = pvd->cached_pcrf;
+	if (pcrf && start < (pcrf->start + pcrf->len) &&
+	    (start + len) > pcrf->start)
+		return pcrf;
+
 	/* Return filter if there is overlapped region. We don't
 	 * require strict matching for more flexibility. It also
 	 * means the associated handler should validate the register
@@ -62,8 +68,12 @@  static struct pci_cfg_reg_filter *pci_virt_find_filter(
 	 */
 	list_for_each(&pvd->pcrf, pcrf, link) {
 		if (start < (pcrf->start + pcrf->len) &&
-		    (start + len) > pcrf->start)
+		    (start + len) > pcrf->start) {
+			/* Update the cache */
+			pvd->cached_pcrf = pcrf;
+
 			return pcrf;
+		}
 	}
 
 	return NULL;
@@ -111,9 +121,18 @@  struct pci_virt_device *pci_virt_find_device(struct phb *phb,
 {
 	struct pci_virt_device *pvd;
 
+	/* Check the cached virtual device firstly */
+	pvd = phb->cached_pvd;
+	if (pvd && pvd->bdfn == bdfn)
+		return pvd;
+
 	list_for_each(&phb->virt_devices, pvd, node) {
-		if (pvd->bdfn == bdfn)
+		if (pvd->bdfn == bdfn) {
+			/* Update the cache */
+			phb->cached_pvd = pvd;
+
 			return pvd;
+		}
 	}
 
 	return NULL;
diff --git a/include/pci-virt.h b/include/pci-virt.h
index 7c787cf..d1427bc 100644
--- a/include/pci-virt.h
+++ b/include/pci-virt.h
@@ -30,6 +30,7 @@  struct pci_virt_device {
 	uint32_t		bdfn;
 	uint32_t		cfg_size;
 	uint8_t			*config[PCI_VIRT_CFG_MAX];
+	struct pci_cfg_reg_filter	*cached_pcrf;
 	struct list_head	pcrf;
 	struct list_node	node;
 	void			*data;
diff --git a/include/pci.h b/include/pci.h
index 92e3dce..c326a88 100644
--- a/include/pci.h
+++ b/include/pci.h
@@ -324,6 +324,7 @@  struct phb {
 	enum phb_type		phb_type;
 	struct lock		lock;
 	struct list_head	devices;
+	struct pci_virt_device	*cached_pvd;
 	struct list_head	virt_devices;
 	const struct phb_ops	*ops;
 	struct pci_lsi_state	lstate;