diff mbox series

[kernel] prom_init: Fetch flatten device tree from the system firmware

Message ID 20190501034221.18437-1-aik@ozlabs.ru (mailing list archive)
State Rejected
Headers show
Series [kernel] prom_init: Fetch flatten device tree from the system firmware | expand

Commit Message

Alexey Kardashevskiy May 1, 2019, 3:42 a.m. UTC
At the moment, on 256CPU + 256 PCI devices guest, it takes the guest
about 8.5sec to fetch the entire device tree via the client interface
as the DT is traversed twice - for strings blob and for struct blob.
Also, "getprop" is quite slow too as SLOF stores properties in a linked
list.

However, since [1] SLOF builds flattened device tree (FDT) for another
purpose. [2] adds a new "fdt-fetch" client interface for the OS to fetch
the FDT.

This tries the new method; if not supported, this falls back to
the old method.

There is a change in the FDT layout - the old method produced
(reserved map, strings, structs), the new one receives only strings and
structs from the firmware and adds the final reserved map to the end,
so it is (fw reserved map, strings, structs, reserved map).
This still produces the same unflattened device tree.

This merges the reserved map from the firmware into the kernel's reserved
map. At the moment SLOF generates an empty reserved map so this does not
change the existing behaviour in regard of reservations.

This supports only v17 onward as only that version provides dt_struct_size
which works as "fdt-fetch" only produces v17 blobs.

If "fdt-fetch" is not available, the old method of fetching the DT is used.

[1] https://git.qemu.org/?p=SLOF.git;a=commitdiff;h=e6fc84652c9c00
[2] https://git.qemu.org/?p=SLOF.git;a=commit;h=ecda95906930b80

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kernel/prom_init.c | 43 +++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

Comments

David Gibson May 2, 2019, 4:27 a.m. UTC | #1
On Wed, May 01, 2019 at 01:42:21PM +1000, Alexey Kardashevskiy wrote:
> At the moment, on 256CPU + 256 PCI devices guest, it takes the guest
> about 8.5sec to fetch the entire device tree via the client interface
> as the DT is traversed twice - for strings blob and for struct blob.
> Also, "getprop" is quite slow too as SLOF stores properties in a linked
> list.
> 
> However, since [1] SLOF builds flattened device tree (FDT) for another
> purpose. [2] adds a new "fdt-fetch" client interface for the OS to fetch
> the FDT.
> 
> This tries the new method; if not supported, this falls back to
> the old method.
> 
> There is a change in the FDT layout - the old method produced
> (reserved map, strings, structs), the new one receives only strings and
> structs from the firmware and adds the final reserved map to the end,
> so it is (fw reserved map, strings, structs, reserved map).
> This still produces the same unflattened device tree.
> 
> This merges the reserved map from the firmware into the kernel's reserved
> map. At the moment SLOF generates an empty reserved map so this does not
> change the existing behaviour in regard of reservations.
> 
> This supports only v17 onward as only that version provides dt_struct_size
> which works as "fdt-fetch" only produces v17 blobs.
> 
> If "fdt-fetch" is not available, the old method of fetching the DT is used.
> 
> [1] https://git.qemu.org/?p=SLOF.git;a=commitdiff;h=e6fc84652c9c00
> [2] https://git.qemu.org/?p=SLOF.git;a=commit;h=ecda95906930b80
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Hrm.  I've gotta say I'm not terribly convinced that it's worth adding
a new interface we'll need to maintain to save 8s on a somewhat
contrived testcase.

> ---
>  arch/powerpc/kernel/prom_init.c | 43 +++++++++++++++++++++++++++++++++
>  1 file changed, 43 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
> index f33ff4163a51..72e7a602b68e 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -2457,6 +2457,48 @@ static void __init flatten_device_tree(void)
>  		prom_panic("Can't allocate initial device-tree chunk\n");
>  	mem_end = mem_start + room;
>  
> +	hdr = (void *) mem_start;
> +	if (!call_prom_ret("fdt-fetch", 2, 1, NULL, mem_start,
> +				room - sizeof(mem_reserve_map)) &&
> +			hdr->version >= 17) {
> +		u32 size;
> +		struct mem_map_entry *fwrmap;
> +
> +		/* Fixup the boot cpuid */
> +		hdr->boot_cpuid_phys = cpu_to_be32(prom.cpu);
> +
> +		/*
> +		 * Store the struct and strings addresses, mostly
> +		 * for consistency, only dt_header_start actually matters later.
> +		 */
> +		dt_header_start = mem_start;
> +		dt_string_start = mem_start + be32_to_cpu(hdr->off_dt_strings);
> +		dt_string_end = dt_string_start +
> +			be32_to_cpu(hdr->dt_strings_size);
> +		dt_struct_start = mem_start + be32_to_cpu(hdr->off_dt_struct);
> +		dt_struct_end = dt_struct_start +
> +			be32_to_cpu(hdr->dt_struct_size);
> +
> +		/*
> +		 * Calculate the reserved map location (which we put
> +		 * at the blob end) and update total size.
> +		 */
> +		fwrmap = (void *)(mem_start + be32_to_cpu(hdr->off_mem_rsvmap));
> +		hdr->off_mem_rsvmap = hdr->totalsize;
> +		size = be32_to_cpu(hdr->totalsize);
> +		hdr->totalsize = cpu_to_be32(size + sizeof(mem_reserve_map));
> +
> +		/* Merge reserved map from firmware to ours */
> +		for ( ; fwrmap->size; ++fwrmap)
> +			reserve_mem(be64_to_cpu(fwrmap->base),
> +					be64_to_cpu(fwrmap->size));
> +
> +		rsvmap = (u64 *)(mem_start + size);
> +
> +		prom_debug("Fetched DTB: %d bytes to @%lx\n", size, mem_start);
> +		goto finalize_exit;
> +	}
> +
>  	/* Get root of tree */
>  	root = call_prom("peer", 1, 1, (phandle)0);
>  	if (root == (phandle)0)
> @@ -2504,6 +2546,7 @@ static void __init flatten_device_tree(void)
>  	/* Version 16 is not backward compatible */
>  	hdr->last_comp_version = cpu_to_be32(0x10);
>  
> +finalize_exit:
>  	/* Copy the reserve map in */
>  	memcpy(rsvmap, mem_reserve_map, sizeof(mem_reserve_map));
>
Stewart Smith May 3, 2019, 12:10 a.m. UTC | #2
David Gibson <david@gibson.dropbear.id.au> writes:
> On Wed, May 01, 2019 at 01:42:21PM +1000, Alexey Kardashevskiy wrote:
>> At the moment, on 256CPU + 256 PCI devices guest, it takes the guest
>> about 8.5sec to fetch the entire device tree via the client interface
>> as the DT is traversed twice - for strings blob and for struct blob.
>> Also, "getprop" is quite slow too as SLOF stores properties in a linked
>> list.
>> 
>> However, since [1] SLOF builds flattened device tree (FDT) for another
>> purpose. [2] adds a new "fdt-fetch" client interface for the OS to fetch
>> the FDT.
>> 
>> This tries the new method; if not supported, this falls back to
>> the old method.
>> 
>> There is a change in the FDT layout - the old method produced
>> (reserved map, strings, structs), the new one receives only strings and
>> structs from the firmware and adds the final reserved map to the end,
>> so it is (fw reserved map, strings, structs, reserved map).
>> This still produces the same unflattened device tree.
>> 
>> This merges the reserved map from the firmware into the kernel's reserved
>> map. At the moment SLOF generates an empty reserved map so this does not
>> change the existing behaviour in regard of reservations.
>> 
>> This supports only v17 onward as only that version provides dt_struct_size
>> which works as "fdt-fetch" only produces v17 blobs.
>> 
>> If "fdt-fetch" is not available, the old method of fetching the DT is used.
>> 
>> [1] https://git.qemu.org/?p=SLOF.git;a=commitdiff;h=e6fc84652c9c00
>> [2] https://git.qemu.org/?p=SLOF.git;a=commit;h=ecda95906930b80
>> 
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> Hrm.  I've gotta say I'm not terribly convinced that it's worth adding
> a new interface we'll need to maintain to save 8s on a somewhat
> contrived testcase.

256CPUs aren't that many anymore though. Although I guess that many PCI
devices is still a little uncommon.

A 4 socket POWER8 or POWER9 can easily be that large, and a small test
kernel/userspace will boot in ~2.5-4 seconds. So it's possible that
the device tree fetch could be surprisingly non-trivial percentage of boot
time at least on some machines.
David Gibson May 3, 2019, 2:35 a.m. UTC | #3
On Fri, May 03, 2019 at 10:10:57AM +1000, Stewart Smith wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> > On Wed, May 01, 2019 at 01:42:21PM +1000, Alexey Kardashevskiy wrote:
> >> At the moment, on 256CPU + 256 PCI devices guest, it takes the guest
> >> about 8.5sec to fetch the entire device tree via the client interface
> >> as the DT is traversed twice - for strings blob and for struct blob.
> >> Also, "getprop" is quite slow too as SLOF stores properties in a linked
> >> list.
> >> 
> >> However, since [1] SLOF builds flattened device tree (FDT) for another
> >> purpose. [2] adds a new "fdt-fetch" client interface for the OS to fetch
> >> the FDT.
> >> 
> >> This tries the new method; if not supported, this falls back to
> >> the old method.
> >> 
> >> There is a change in the FDT layout - the old method produced
> >> (reserved map, strings, structs), the new one receives only strings and
> >> structs from the firmware and adds the final reserved map to the end,
> >> so it is (fw reserved map, strings, structs, reserved map).
> >> This still produces the same unflattened device tree.
> >> 
> >> This merges the reserved map from the firmware into the kernel's reserved
> >> map. At the moment SLOF generates an empty reserved map so this does not
> >> change the existing behaviour in regard of reservations.
> >> 
> >> This supports only v17 onward as only that version provides dt_struct_size
> >> which works as "fdt-fetch" only produces v17 blobs.
> >> 
> >> If "fdt-fetch" is not available, the old method of fetching the DT is used.
> >> 
> >> [1] https://git.qemu.org/?p=SLOF.git;a=commitdiff;h=e6fc84652c9c00
> >> [2] https://git.qemu.org/?p=SLOF.git;a=commit;h=ecda95906930b80
> >> 
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >
> > Hrm.  I've gotta say I'm not terribly convinced that it's worth adding
> > a new interface we'll need to maintain to save 8s on a somewhat
> > contrived testcase.
> 
> 256CPUs aren't that many anymore though. Although I guess that many PCI
> devices is still a little uncommon.

Yeah, it was the PCI devices I was meaning, not the cpus.

> A 4 socket POWER8 or POWER9 can easily be that large, and a small test
> kernel/userspace will boot in ~2.5-4 seconds. So it's possible that
> the device tree fetch could be surprisingly non-trivial percentage of boot
> time at least on some machines.
> 
>
Segher Boessenkool May 3, 2019, 3:32 p.m. UTC | #4
On Wed, May 01, 2019 at 01:42:21PM +1000, Alexey Kardashevskiy wrote:
> At the moment, on 256CPU + 256 PCI devices guest, it takes the guest
> about 8.5sec to fetch the entire device tree via the client interface
> as the DT is traversed twice - for strings blob and for struct blob.
> Also, "getprop" is quite slow too as SLOF stores properties in a linked
> list.

Most OF implementations do it that way.  An optimisation that can help
a lot is to cache the last accessed node / prop.  This of course then
requires you to invalidate that cache at many places you did not think
about :-/

> However, since [1] SLOF builds flattened device tree (FDT) for another
> purpose. [2] adds a new "fdt-fetch" client interface for the OS to fetch
> the FDT.

Since Linux does not do much more with the device tree, this should
work great for it.


Segher
Segher Boessenkool May 3, 2019, 3:35 p.m. UTC | #5
On Fri, May 03, 2019 at 10:10:57AM +1000, Stewart Smith wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> > On Wed, May 01, 2019 at 01:42:21PM +1000, Alexey Kardashevskiy wrote:
> >> At the moment, on 256CPU + 256 PCI devices guest, it takes the guest
> >> about 8.5sec to fetch the entire device tree via the client interface
> >> as the DT is traversed twice - for strings blob and for struct blob.
> >> Also, "getprop" is quite slow too as SLOF stores properties in a linked
> >> list.
> >> 
> >> However, since [1] SLOF builds flattened device tree (FDT) for another
> >> purpose. [2] adds a new "fdt-fetch" client interface for the OS to fetch
> >> the FDT.
> >> 
> >> This tries the new method; if not supported, this falls back to
> >> the old method.
> >> 
> >> There is a change in the FDT layout - the old method produced
> >> (reserved map, strings, structs), the new one receives only strings and
> >> structs from the firmware and adds the final reserved map to the end,
> >> so it is (fw reserved map, strings, structs, reserved map).
> >> This still produces the same unflattened device tree.
> >> 
> >> This merges the reserved map from the firmware into the kernel's reserved
> >> map. At the moment SLOF generates an empty reserved map so this does not
> >> change the existing behaviour in regard of reservations.
> >> 
> >> This supports only v17 onward as only that version provides dt_struct_size
> >> which works as "fdt-fetch" only produces v17 blobs.
> >> 
> >> If "fdt-fetch" is not available, the old method of fetching the DT is used.
> >> 
> >> [1] https://git.qemu.org/?p=SLOF.git;a=commitdiff;h=e6fc84652c9c00
> >> [2] https://git.qemu.org/?p=SLOF.git;a=commit;h=ecda95906930b80
> >> 
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >
> > Hrm.  I've gotta say I'm not terribly convinced that it's worth adding
> > a new interface we'll need to maintain to save 8s on a somewhat
> > contrived testcase.
> 
> 256CPUs aren't that many anymore though. Although I guess that many PCI
> devices is still a little uncommon.
> 
> A 4 socket POWER8 or POWER9 can easily be that large, and a small test
> kernel/userspace will boot in ~2.5-4 seconds. So it's possible that
> the device tree fetch could be surprisingly non-trivial percentage of boot
> time at least on some machines.

All client interface calls are really heavy, and you need to do a lot of
them if you have a big device tree.  This takes time, even if the linked
list stuff does not kill you :-)


Segher
Alexey Kardashevskiy May 6, 2019, 2:21 a.m. UTC | #6
On 03/05/2019 12:35, David Gibson wrote:
> On Fri, May 03, 2019 at 10:10:57AM +1000, Stewart Smith wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>>> On Wed, May 01, 2019 at 01:42:21PM +1000, Alexey Kardashevskiy wrote:
>>>> At the moment, on 256CPU + 256 PCI devices guest, it takes the guest
>>>> about 8.5sec to fetch the entire device tree via the client interface
>>>> as the DT is traversed twice - for strings blob and for struct blob.
>>>> Also, "getprop" is quite slow too as SLOF stores properties in a linked
>>>> list.
>>>>
>>>> However, since [1] SLOF builds flattened device tree (FDT) for another
>>>> purpose. [2] adds a new "fdt-fetch" client interface for the OS to fetch
>>>> the FDT.
>>>>
>>>> This tries the new method; if not supported, this falls back to
>>>> the old method.
>>>>
>>>> There is a change in the FDT layout - the old method produced
>>>> (reserved map, strings, structs), the new one receives only strings and
>>>> structs from the firmware and adds the final reserved map to the end,
>>>> so it is (fw reserved map, strings, structs, reserved map).
>>>> This still produces the same unflattened device tree.
>>>>
>>>> This merges the reserved map from the firmware into the kernel's reserved
>>>> map. At the moment SLOF generates an empty reserved map so this does not
>>>> change the existing behaviour in regard of reservations.
>>>>
>>>> This supports only v17 onward as only that version provides dt_struct_size
>>>> which works as "fdt-fetch" only produces v17 blobs.
>>>>
>>>> If "fdt-fetch" is not available, the old method of fetching the DT is used.
>>>>
>>>> [1] https://git.qemu.org/?p=SLOF.git;a=commitdiff;h=e6fc84652c9c00
>>>> [2] https://git.qemu.org/?p=SLOF.git;a=commit;h=ecda95906930b80
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>
>>> Hrm.  I've gotta say I'm not terribly convinced that it's worth adding
>>> a new interface we'll need to maintain to save 8s on a somewhat
>>> contrived testcase.
>>
>> 256CPUs aren't that many anymore though. Although I guess that many PCI
>> devices is still a little uncommon.
> 
> Yeah, it was the PCI devices I was meaning, not the cpus.


Each node (device, cpu, memory/numa) has a dozen of properties, so any
500 nodes will slow booting down more or less equally.


> 
>> A 4 socket POWER8 or POWER9 can easily be that large, and a small test
>> kernel/userspace will boot in ~2.5-4 seconds. So it's possible that
>> the device tree fetch could be surprisingly non-trivial percentage of boot
>> time at least on some machines.
>>
>>
>
Alexey Kardashevskiy May 30, 2019, 7:09 a.m. UTC | #7
so, it is sort-of nack from David and sort-of ack from Segher, what
happens now?



On 01/05/2019 13:42, Alexey Kardashevskiy wrote:
> At the moment, on 256CPU + 256 PCI devices guest, it takes the guest
> about 8.5sec to fetch the entire device tree via the client interface
> as the DT is traversed twice - for strings blob and for struct blob.
> Also, "getprop" is quite slow too as SLOF stores properties in a linked
> list.
> 
> However, since [1] SLOF builds flattened device tree (FDT) for another
> purpose. [2] adds a new "fdt-fetch" client interface for the OS to fetch
> the FDT.
> 
> This tries the new method; if not supported, this falls back to
> the old method.
> 
> There is a change in the FDT layout - the old method produced
> (reserved map, strings, structs), the new one receives only strings and
> structs from the firmware and adds the final reserved map to the end,
> so it is (fw reserved map, strings, structs, reserved map).
> This still produces the same unflattened device tree.
> 
> This merges the reserved map from the firmware into the kernel's reserved
> map. At the moment SLOF generates an empty reserved map so this does not
> change the existing behaviour in regard of reservations.
> 
> This supports only v17 onward as only that version provides dt_struct_size
> which works as "fdt-fetch" only produces v17 blobs.
> 
> If "fdt-fetch" is not available, the old method of fetching the DT is used.
> 
> [1] https://git.qemu.org/?p=SLOF.git;a=commitdiff;h=e6fc84652c9c00
> [2] https://git.qemu.org/?p=SLOF.git;a=commit;h=ecda95906930b80
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/kernel/prom_init.c | 43 +++++++++++++++++++++++++++++++++
>  1 file changed, 43 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
> index f33ff4163a51..72e7a602b68e 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -2457,6 +2457,48 @@ static void __init flatten_device_tree(void)
>  		prom_panic("Can't allocate initial device-tree chunk\n");
>  	mem_end = mem_start + room;
>  
> +	hdr = (void *) mem_start;
> +	if (!call_prom_ret("fdt-fetch", 2, 1, NULL, mem_start,
> +				room - sizeof(mem_reserve_map)) &&
> +			hdr->version >= 17) {
> +		u32 size;
> +		struct mem_map_entry *fwrmap;
> +
> +		/* Fixup the boot cpuid */
> +		hdr->boot_cpuid_phys = cpu_to_be32(prom.cpu);
> +
> +		/*
> +		 * Store the struct and strings addresses, mostly
> +		 * for consistency, only dt_header_start actually matters later.
> +		 */
> +		dt_header_start = mem_start;
> +		dt_string_start = mem_start + be32_to_cpu(hdr->off_dt_strings);
> +		dt_string_end = dt_string_start +
> +			be32_to_cpu(hdr->dt_strings_size);
> +		dt_struct_start = mem_start + be32_to_cpu(hdr->off_dt_struct);
> +		dt_struct_end = dt_struct_start +
> +			be32_to_cpu(hdr->dt_struct_size);
> +
> +		/*
> +		 * Calculate the reserved map location (which we put
> +		 * at the blob end) and update total size.
> +		 */
> +		fwrmap = (void *)(mem_start + be32_to_cpu(hdr->off_mem_rsvmap));
> +		hdr->off_mem_rsvmap = hdr->totalsize;
> +		size = be32_to_cpu(hdr->totalsize);
> +		hdr->totalsize = cpu_to_be32(size + sizeof(mem_reserve_map));
> +
> +		/* Merge reserved map from firmware to ours */
> +		for ( ; fwrmap->size; ++fwrmap)
> +			reserve_mem(be64_to_cpu(fwrmap->base),
> +					be64_to_cpu(fwrmap->size));
> +
> +		rsvmap = (u64 *)(mem_start + size);
> +
> +		prom_debug("Fetched DTB: %d bytes to @%lx\n", size, mem_start);
> +		goto finalize_exit;
> +	}
> +
>  	/* Get root of tree */
>  	root = call_prom("peer", 1, 1, (phandle)0);
>  	if (root == (phandle)0)
> @@ -2504,6 +2546,7 @@ static void __init flatten_device_tree(void)
>  	/* Version 16 is not backward compatible */
>  	hdr->last_comp_version = cpu_to_be32(0x10);
>  
> +finalize_exit:
>  	/* Copy the reserve map in */
>  	memcpy(rsvmap, mem_reserve_map, sizeof(mem_reserve_map));
>  
>
Segher Boessenkool May 30, 2019, 7:37 p.m. UTC | #8
On Thu, May 30, 2019 at 05:09:06PM +1000, Alexey Kardashevskiy wrote:
> so, it is sort-of nack from David and sort-of ack from Segher, what
> happens now?

Maybe what we really need just a CI call to get all properties of a node
at once?  Will that speed up things enough?

That way you need no change at all in lifetime of properties and how they
are used, etc.; just a client getting the properties is a lot faster.


Segher
Benjamin Herrenschmidt May 31, 2019, 1:03 a.m. UTC | #9
On Thu, 2019-05-30 at 14:37 -0500, Segher Boessenkool wrote:
> On Thu, May 30, 2019 at 05:09:06PM +1000, Alexey Kardashevskiy wrote:
> > so, it is sort-of nack from David and sort-of ack from Segher, what
> > happens now?
> 
> Maybe what we really need just a CI call to get all properties of a node
> at once?  Will that speed up things enough?
> 
> That way you need no change at all in lifetime of properties and how they
> are used, etc.; just a client getting the properties is a lot faster.

Hrm... if we're going to create a new interface, let's go for what we
need.

What we need is the FDT. It's a rather ubiquitous thing these days, it
makes sense to have a way to fetch an FDT directly from FW.

There is no use for the "fetch all properties" cases other than
building an FDT that any of us can think of, and it would create a more
complicated interface than just "fetch an FDT".

So I go for the simple one and agree with Alexey's idea.

Cheers,
Ben.
Segher Boessenkool June 2, 2019, 11:23 p.m. UTC | #10
Hi!

On Fri, May 31, 2019 at 11:03:26AM +1000, Benjamin Herrenschmidt wrote:
> On Thu, 2019-05-30 at 14:37 -0500, Segher Boessenkool wrote:
> > On Thu, May 30, 2019 at 05:09:06PM +1000, Alexey Kardashevskiy wrote:
> > > so, it is sort-of nack from David and sort-of ack from Segher, what
> > > happens now?
> > 
> > Maybe what we really need just a CI call to get all properties of a node
> > at once?  Will that speed up things enough?
> > 
> > That way you need no change at all in lifetime of properties and how they
> > are used, etc.; just a client getting the properties is a lot faster.
> 
> Hrm... if we're going to create a new interface, let's go for what we
> need.
> 
> What we need is the FDT. It's a rather ubiquitous thing these days, it
> makes sense to have a way to fetch an FDT directly from FW.

That is all you need if you do not want to use OF at all.

If you *do* want to keep having an Open Firmware, what we want or need
is a faster way to walk huge device trees.

> There is no use for the "fetch all properties" cases other than
> building an FDT that any of us can think of, and it would create a more
> complicated interface than just "fetch an FDT".

It is a simple way to speed up fetching the device tree enormously,
without needing big changes to either OF or the clients using it -- not
in the code, but importantly also not conceptually: everything works just
as before, just a lot faster.

> So I go for the simple one and agree with Alexey's idea.

When dealing with a whole device tree you have to know about the various
dynamically generated nodes and props, and handle each appropriately.


Segher
Alexey Kardashevskiy June 3, 2019, 2:56 a.m. UTC | #11
On 03/06/2019 09:23, Segher Boessenkool wrote:
> Hi!
> 
> On Fri, May 31, 2019 at 11:03:26AM +1000, Benjamin Herrenschmidt wrote:
>> On Thu, 2019-05-30 at 14:37 -0500, Segher Boessenkool wrote:
>>> On Thu, May 30, 2019 at 05:09:06PM +1000, Alexey Kardashevskiy wrote:
>>>> so, it is sort-of nack from David and sort-of ack from Segher, what
>>>> happens now?
>>>
>>> Maybe what we really need just a CI call to get all properties of a node
>>> at once?  Will that speed up things enough?
>>>
>>> That way you need no change at all in lifetime of properties and how they
>>> are used, etc.; just a client getting the properties is a lot faster.
>>
>> Hrm... if we're going to create a new interface, let's go for what we
>> need.
>>
>> What we need is the FDT. It's a rather ubiquitous thing these days, it
>> makes sense to have a way to fetch an FDT directly from FW.
> 
> That is all you need if you do not want to use OF at all.

? We also need OF drivers to boot grub and the system, and a default
console for early booting, but yes, we do not want to keep using slof
that much.

> If you *do* want to keep having an Open Firmware, what we want or need
> is a faster way to walk huge device trees.

Why? We do not need to _walk_ the tree at all (we _have_ to now and
while walking we do nothing but pack everything together into the fdt
blob) as slof can easily do this already.

>> There is no use for the "fetch all properties" cases other than
>> building an FDT that any of us can think of, and it would create a more
>> complicated interface than just "fetch an FDT".
> 
> It is a simple way to speed up fetching the device tree enormously,
> without needing big changes to either OF or the clients using it -- not
> in the code, but importantly also not conceptually: everything works just
> as before, just a lot faster.

I can safely presume though that this will never ever be used in
practice. And it will be still slower, a tiny bit. And require new code
in both slof and linux.

I can rather see us getting rid of SLOF in the future which in turn will
require the fdt blob pointer in r5 (or whatever it is, forgot) when the
guest starts; so "fetch-all-props" won't be used there either.

>> So I go for the simple one and agree with Alexey's idea.
> 
> When dealing with a whole device tree you have to know about the various
> dynamically generated nodes and props, and handle each appropriately.

The code I am changing fetches the device tree and build an fdt. What is
that special knowledge in this context you are talking about?
Benjamin Herrenschmidt June 3, 2019, 9:18 p.m. UTC | #12
On Mon, 2019-06-03 at 12:56 +1000, Alexey Kardashevskiy wrote:
> 
> > That is all you need if you do not want to use OF at all.
> 
> ? We also need OF drivers to boot grub and the system, and a default
> console for early booting, but yes, we do not want to keep using slof
> that much.
> 
> > If you *do* want to keep having an Open Firmware, what we want or need
> > is a faster way to walk huge device trees.
> 
> Why? We do not need to _walk_ the tree at all (we _have_ to now and
> while walking we do nothing but pack everything together into the fdt
> blob) as slof can easily do this already.

I agree with Alexey. In a sense this can be thought as an extension of
"quiesce", which effectively kills OF.

Yes we could make property fetching faster but mostly by creating a new
bulk interface which is quite a bit of work, a new API, and will in
practice not be used for anything other than creating the FDT. In that
case, nothing will beat in performance having OF create the FDT itself
based on its current tree.

> > > There is no use for the "fetch all properties" cases other than
> > > building an FDT that any of us can think of, and it would create a more
> > > complicated interface than just "fetch an FDT".
> > 
> > It is a simple way to speed up fetching the device tree enormously,
> > without needing big changes to either OF or the clients using it -- not
> > in the code, but importantly also not conceptually: everything works just
> > as before, just a lot faster.
> 
> I can safely presume though that this will never ever be used in
> practice. And it will be still slower, a tiny bit. And require new code
> in both slof and linux.

Correct.

> I can rather see us getting rid of SLOF in the future which in turn will
> require the fdt blob pointer in r5 (or whatever it is, forgot) when the
> guest starts; so "fetch-all-props" won't be used there either.
> 
> > > So I go for the simple one and agree with Alexey's idea.
> > 
> > When dealing with a whole device tree you have to know about the various
> > dynamically generated nodes and props, and handle each appropriately.
> 
> The code I am changing fetches the device tree and build an fdt. What is
> that special knowledge in this context you are talking about?

Ben.
Segher Boessenkool June 3, 2019, 11:42 p.m. UTC | #13
Hi!

On Mon, Jun 03, 2019 at 12:56:26PM +1000, Alexey Kardashevskiy wrote:
> On 03/06/2019 09:23, Segher Boessenkool wrote:
> >> So I go for the simple one and agree with Alexey's idea.
> > 
> > When dealing with a whole device tree you have to know about the various
> > dynamically generated nodes and props, and handle each appropriately.
> 
> The code I am changing fetches the device tree and build an fdt. What is
> that special knowledge in this context you are talking about?

Things like /options are dynamically generated.

I thought you would just be able to reuse some FDT parsing code to
implement my suggested new interface.  Maybe that was too optimistic.


Segher
Segher Boessenkool June 3, 2019, 11:49 p.m. UTC | #14
On Tue, Jun 04, 2019 at 07:18:42AM +1000, Benjamin Herrenschmidt wrote:
> Yes we could make property fetching faster but mostly by creating a new
> bulk interface which is quite a bit of work, a new API, and will in
> practice not be used for anything other than creating the FDT. In that
> case, nothing will beat in performance having OF create the FDT itself
> based on its current tree.

And that will change the whole boot protocol, the interaction between OF
and its client, which is a much bigger change, not conceptually trivial
at all.  Copying all properties at once is, which is why I suggested it.

It would take away the opposition to your patch.


Segher
Benjamin Herrenschmidt June 4, 2019, 12:18 a.m. UTC | #15
On Mon, 2019-06-03 at 18:42 -0500, Segher Boessenkool wrote:
> Hi!
> 
> On Mon, Jun 03, 2019 at 12:56:26PM +1000, Alexey Kardashevskiy wrote:
> > On 03/06/2019 09:23, Segher Boessenkool wrote:
> > > > So I go for the simple one and agree with Alexey's idea.
> > > 
> > > When dealing with a whole device tree you have to know about the
> > > various
> > > dynamically generated nodes and props, and handle each
> > > appropriately.
> > 
> > The code I am changing fetches the device tree and build an fdt.
> > What is
> > that special knowledge in this context you are talking about?
> 
> Things like /options are dynamically generated.

They are generated before we do the call to extract the fdt, what's the
problem there ?

> I thought you would just be able to reuse some FDT parsing code to
> implement my suggested new interface.  Maybe that was too optimistic.

No, the idea is to have SLOF re-flatten it's live tree and hand us a
blob. At least that's my understanding.

Ben.
David Gibson June 4, 2019, 12:32 a.m. UTC | #16
On Mon, Jun 03, 2019 at 06:49:32PM -0500, Segher Boessenkool wrote:
> On Tue, Jun 04, 2019 at 07:18:42AM +1000, Benjamin Herrenschmidt wrote:
> > Yes we could make property fetching faster but mostly by creating a new
> > bulk interface which is quite a bit of work, a new API, and will in
> > practice not be used for anything other than creating the FDT. In that
> > case, nothing will beat in performance having OF create the FDT itself
> > based on its current tree.
> 
> And that will change the whole boot protocol, the interaction between OF
> and its client, which is a much bigger change, not conceptually trivial
> at all.  Copying all properties at once is, which is why I suggested it.

Much as I wasn't terribly convinced by the original idea, I agree with
Ben and Alexey here.  Your approach has slightly more complexity for
slightly less benefit.  If it's worth doing yours it's better to do
theirs.

> It would take away the opposition to your patch.

Only from you, AFAICT...
Alexey Kardashevskiy June 4, 2019, 5 a.m. UTC | #17
On 04/06/2019 09:42, Segher Boessenkool wrote:
> Hi!
> 
> On Mon, Jun 03, 2019 at 12:56:26PM +1000, Alexey Kardashevskiy wrote:
>> On 03/06/2019 09:23, Segher Boessenkool wrote:
>>>> So I go for the simple one and agree with Alexey's idea.
>>>
>>> When dealing with a whole device tree you have to know about the various
>>> dynamically generated nodes and props, and handle each appropriately.
>>
>> The code I am changing fetches the device tree and build an fdt. What is
>> that special knowledge in this context you are talking about?
> 
> Things like /options are dynamically generated.

Generated by the guest kernel, after walking the tree and before making
the fdt blob? I cannot see it, what do I miss? Otherwise it is all in
slof and the same result is fetched one way or another.


> I thought you would just be able to reuse some FDT parsing code to
> implement my suggested new interface.  Maybe that was too optimistic.
diff mbox series

Patch

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index f33ff4163a51..72e7a602b68e 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -2457,6 +2457,48 @@  static void __init flatten_device_tree(void)
 		prom_panic("Can't allocate initial device-tree chunk\n");
 	mem_end = mem_start + room;
 
+	hdr = (void *) mem_start;
+	if (!call_prom_ret("fdt-fetch", 2, 1, NULL, mem_start,
+				room - sizeof(mem_reserve_map)) &&
+			hdr->version >= 17) {
+		u32 size;
+		struct mem_map_entry *fwrmap;
+
+		/* Fixup the boot cpuid */
+		hdr->boot_cpuid_phys = cpu_to_be32(prom.cpu);
+
+		/*
+		 * Store the struct and strings addresses, mostly
+		 * for consistency, only dt_header_start actually matters later.
+		 */
+		dt_header_start = mem_start;
+		dt_string_start = mem_start + be32_to_cpu(hdr->off_dt_strings);
+		dt_string_end = dt_string_start +
+			be32_to_cpu(hdr->dt_strings_size);
+		dt_struct_start = mem_start + be32_to_cpu(hdr->off_dt_struct);
+		dt_struct_end = dt_struct_start +
+			be32_to_cpu(hdr->dt_struct_size);
+
+		/*
+		 * Calculate the reserved map location (which we put
+		 * at the blob end) and update total size.
+		 */
+		fwrmap = (void *)(mem_start + be32_to_cpu(hdr->off_mem_rsvmap));
+		hdr->off_mem_rsvmap = hdr->totalsize;
+		size = be32_to_cpu(hdr->totalsize);
+		hdr->totalsize = cpu_to_be32(size + sizeof(mem_reserve_map));
+
+		/* Merge reserved map from firmware to ours */
+		for ( ; fwrmap->size; ++fwrmap)
+			reserve_mem(be64_to_cpu(fwrmap->base),
+					be64_to_cpu(fwrmap->size));
+
+		rsvmap = (u64 *)(mem_start + size);
+
+		prom_debug("Fetched DTB: %d bytes to @%lx\n", size, mem_start);
+		goto finalize_exit;
+	}
+
 	/* Get root of tree */
 	root = call_prom("peer", 1, 1, (phandle)0);
 	if (root == (phandle)0)
@@ -2504,6 +2546,7 @@  static void __init flatten_device_tree(void)
 	/* Version 16 is not backward compatible */
 	hdr->last_comp_version = cpu_to_be32(0x10);
 
+finalize_exit:
 	/* Copy the reserve map in */
 	memcpy(rsvmap, mem_reserve_map, sizeof(mem_reserve_map));