diff mbox series

[RFC,04/11] net: phy: Handle c22 regs presence better

Message ID 20200522213059.1535892-5-jeremy.linton@arm.com
State RFC
Delegated to: David Miller
Headers show
Series Make C45 autoprobe more robust | expand

Commit Message

Jeremy Linton May 22, 2020, 9:30 p.m. UTC
Until this point, we have been sanitizing the c22
regs presence bit out of all the MMD device lists.
This is incorrect as it causes the 0xFFFFFFFF checks
to incorrectly fail. Further, it turns out that we
want to utilize this flag to make a determination that
there is actually a phy at this location and we should
be accessing it using c22.

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
---
 drivers/net/phy/phy_device.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

Comments

Russell King (Oracle) May 23, 2020, 6:37 p.m. UTC | #1
On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
> Until this point, we have been sanitizing the c22
> regs presence bit out of all the MMD device lists.
> This is incorrect as it causes the 0xFFFFFFFF checks
> to incorrectly fail. Further, it turns out that we
> want to utilize this flag to make a determination that
> there is actually a phy at this location and we should
> be accessing it using c22.
> 
> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
> ---
>  drivers/net/phy/phy_device.c | 16 +++++++++++++---
>  1 file changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> index f0761fa5e40b..2d677490ecab 100644
> --- a/drivers/net/phy/phy_device.c
> +++ b/drivers/net/phy/phy_device.c
> @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
>  		return -EIO;
>  	*devices_in_package |= phy_reg;
>  
> -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> -	*devices_in_package &= ~BIT(0);
> -
>  	return 0;
>  }
>  
> @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>  	int i;
>  	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
>  	u32 *devs = &c45_ids->devices_in_package;
> +	bool c22_present = false;
> +	bool valid_id = false;
>  
>  	/* Find first non-zero Devices In package. Device zero is reserved
>  	 * for 802.3 c45 complied PHYs, so don't probe it at first.
> @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>  		return 0;
>  	}
>  
> +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> +	c22_present = *devs & BIT(0);
> +	*devs &= ~BIT(0);
> +
>  	/* Now probe Device Identifiers for each device present. */
>  	for (i = 1; i < num_ids; i++) {
>  		if (!(c45_ids->devices_in_package & (1 << i)))
> @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>  		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
>  		if (ret < 0)
>  			return ret;
> +		if (valid_phy_id(c45_ids->device_ids[i]))
> +			valid_id = true;

Here you are using your "devices in package" validator to validate the
PHY ID value.  One of the things it does is mask this value with
0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
looks completely wrong.

> +	}
> +
> +	if (!valid_id && c22_present) {
> +		*phy_id = 0xffffffff;
> +	        return 0;
>  	}
>  	*phy_id = 0;
>  	return 0;
> -- 
> 2.26.2
> 
>
Jeremy Linton May 25, 2020, 3:34 a.m. UTC | #2
Hi,

On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
> On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
>> Until this point, we have been sanitizing the c22
>> regs presence bit out of all the MMD device lists.
>> This is incorrect as it causes the 0xFFFFFFFF checks
>> to incorrectly fail. Further, it turns out that we
>> want to utilize this flag to make a determination that
>> there is actually a phy at this location and we should
>> be accessing it using c22.
>>
>> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
>> ---
>>   drivers/net/phy/phy_device.c | 16 +++++++++++++---
>>   1 file changed, 13 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>> index f0761fa5e40b..2d677490ecab 100644
>> --- a/drivers/net/phy/phy_device.c
>> +++ b/drivers/net/phy/phy_device.c
>> @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
>>   		return -EIO;
>>   	*devices_in_package |= phy_reg;
>>   
>> -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>> -	*devices_in_package &= ~BIT(0);
>> -
>>   	return 0;
>>   }
>>   
>> @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>   	int i;
>>   	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
>>   	u32 *devs = &c45_ids->devices_in_package;
>> +	bool c22_present = false;
>> +	bool valid_id = false;
>>   
>>   	/* Find first non-zero Devices In package. Device zero is reserved
>>   	 * for 802.3 c45 complied PHYs, so don't probe it at first.
>> @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>   		return 0;
>>   	}
>>   
>> +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>> +	c22_present = *devs & BIT(0);
>> +	*devs &= ~BIT(0);
>> +
>>   	/* Now probe Device Identifiers for each device present. */
>>   	for (i = 1; i < num_ids; i++) {
>>   		if (!(c45_ids->devices_in_package & (1 << i)))
>> @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>   		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
>>   		if (ret < 0)
>>   			return ret;
>> +		if (valid_phy_id(c45_ids->device_ids[i]))
>> +			valid_id = true;
> 
> Here you are using your "devices in package" validator to validate the
> PHY ID value.  One of the things it does is mask this value with
> 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
> looks completely wrong.

I think in this case I was just using it like the comment in 
get_phy_device() "if the phy_id is mostly F's, there is no device here".

My understanding is that the code is trying to avoid the 0xFFFFFFFF 
returns that seem to indicate "bus ok, phy didn't respond".

I just checked the OUI registration, and while there are a couple OUI's 
registered that have a number of FFF's in them, none of those cases 
seems to overlap sufficiently to cause this to throw them out. Plus a 
phy would also have to have model+revision set to 'F's. So while might 
be possible, if unlikely, at the moment I think the OUI registration 
keeps this from being a problem. Particularly, if i'm reading the 
mapping correctly, the OUI mapping guarantees that the field cannot be 
all '1's due to the OUI having X & M bits cleared. It sort of looks like 
the mapping is trying to lose those bits, by tossing bit 1 & 2, but the 
X & M are in the wrong octet (AFAIK, I just read it three times cause it 
didn't make any sense).
Russell King (Oracle) May 25, 2020, 9:53 a.m. UTC | #3
On Sun, May 24, 2020 at 10:34:13PM -0500, Jeremy Linton wrote:
> Hi,
> 
> On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
> > On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
> > > Until this point, we have been sanitizing the c22
> > > regs presence bit out of all the MMD device lists.
> > > This is incorrect as it causes the 0xFFFFFFFF checks
> > > to incorrectly fail. Further, it turns out that we
> > > want to utilize this flag to make a determination that
> > > there is actually a phy at this location and we should
> > > be accessing it using c22.
> > > 
> > > Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
> > > ---
> > >   drivers/net/phy/phy_device.c | 16 +++++++++++++---
> > >   1 file changed, 13 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> > > index f0761fa5e40b..2d677490ecab 100644
> > > --- a/drivers/net/phy/phy_device.c
> > > +++ b/drivers/net/phy/phy_device.c
> > > @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
> > >   		return -EIO;
> > >   	*devices_in_package |= phy_reg;
> > > -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > -	*devices_in_package &= ~BIT(0);
> > > -
> > >   	return 0;
> > >   }
> > > @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > >   	int i;
> > >   	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
> > >   	u32 *devs = &c45_ids->devices_in_package;
> > > +	bool c22_present = false;
> > > +	bool valid_id = false;
> > >   	/* Find first non-zero Devices In package. Device zero is reserved
> > >   	 * for 802.3 c45 complied PHYs, so don't probe it at first.
> > > @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > >   		return 0;
> > >   	}
> > > +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > +	c22_present = *devs & BIT(0);
> > > +	*devs &= ~BIT(0);
> > > +
> > >   	/* Now probe Device Identifiers for each device present. */
> > >   	for (i = 1; i < num_ids; i++) {
> > >   		if (!(c45_ids->devices_in_package & (1 << i)))
> > > @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > >   		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
> > >   		if (ret < 0)
> > >   			return ret;
> > > +		if (valid_phy_id(c45_ids->device_ids[i]))
> > > +			valid_id = true;
> > 
> > Here you are using your "devices in package" validator to validate the
> > PHY ID value.  One of the things it does is mask this value with
> > 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
> > looks completely wrong.
> 
> I think in this case I was just using it like the comment in
> get_phy_device() "if the phy_id is mostly F's, there is no device here".

Yes, that is certainly an interesting comment.  What's so magic about
this 0x1fffffff?  If it's about the time taken for the bus to rise
to logic 1 when not being actively driven by a PHY, then it actually
makes little sense, because we perform two transations to read each half
of the field, and both should have the same behaviour.  If this was the
issue, we should be masking and testing against 0x1fff1fff rather than
0x1fffffff.

> I just checked the OUI registration, and while there are a couple OUI's
> registered that have a number of FFF's in them, none of those cases seems to
> overlap sufficiently to cause this to throw them out. Plus a phy would also
> have to have model+revision set to 'F's. So while might be possible, if
> unlikely, at the moment I think the OUI registration keeps this from being a
> problem. Particularly, if i'm reading the mapping correctly, the OUI mapping
> guarantees that the field cannot be all '1's due to the OUI having X & M
> bits cleared. It sort of looks like the mapping is trying to lose those
> bits, by tossing bit 1 & 2, but the X & M are in the wrong octet (AFAIK, I
> just read it three times cause it didn't make any sense).

The most-bits-set OUI that is currently allocated is 5C-FF-FF.  This
would result in a register value of 0x73fffc00 to 0x73ffffff, so as
you say, it should be safe.
Russell King (Oracle) May 25, 2020, 10:06 a.m. UTC | #4
On Sun, May 24, 2020 at 10:34:13PM -0500, Jeremy Linton wrote:
> Hi,
> 
> On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
> > On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
> > > Until this point, we have been sanitizing the c22
> > > regs presence bit out of all the MMD device lists.
> > > This is incorrect as it causes the 0xFFFFFFFF checks
> > > to incorrectly fail. Further, it turns out that we
> > > want to utilize this flag to make a determination that
> > > there is actually a phy at this location and we should
> > > be accessing it using c22.
> > > 
> > > Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
> > > ---
> > >   drivers/net/phy/phy_device.c | 16 +++++++++++++---
> > >   1 file changed, 13 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> > > index f0761fa5e40b..2d677490ecab 100644
> > > --- a/drivers/net/phy/phy_device.c
> > > +++ b/drivers/net/phy/phy_device.c
> > > @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
> > >   		return -EIO;
> > >   	*devices_in_package |= phy_reg;
> > > -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > -	*devices_in_package &= ~BIT(0);
> > > -
> > >   	return 0;
> > >   }
> > > @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > >   	int i;
> > >   	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
> > >   	u32 *devs = &c45_ids->devices_in_package;
> > > +	bool c22_present = false;
> > > +	bool valid_id = false;
> > >   	/* Find first non-zero Devices In package. Device zero is reserved
> > >   	 * for 802.3 c45 complied PHYs, so don't probe it at first.
> > > @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > >   		return 0;
> > >   	}
> > > +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > +	c22_present = *devs & BIT(0);
> > > +	*devs &= ~BIT(0);
> > > +
> > >   	/* Now probe Device Identifiers for each device present. */
> > >   	for (i = 1; i < num_ids; i++) {
> > >   		if (!(c45_ids->devices_in_package & (1 << i)))
> > > @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > >   		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
> > >   		if (ret < 0)
> > >   			return ret;
> > > +		if (valid_phy_id(c45_ids->device_ids[i]))
> > > +			valid_id = true;
> > 
> > Here you are using your "devices in package" validator to validate the
> > PHY ID value.  One of the things it does is mask this value with
> > 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
> > looks completely wrong.
> 
> I think in this case I was just using it like the comment in
> get_phy_device() "if the phy_id is mostly F's, there is no device here".
> 
> My understanding is that the code is trying to avoid the 0xFFFFFFFF returns
> that seem to indicate "bus ok, phy didn't respond".
> 
> I just checked the OUI registration, and while there are a couple OUI's
> registered that have a number of FFF's in them, none of those cases seems to
> overlap sufficiently to cause this to throw them out. Plus a phy would also
> have to have model+revision set to 'F's. So while might be possible, if
> unlikely, at the moment I think the OUI registration keeps this from being a
> problem. Particularly, if i'm reading the mapping correctly, the OUI mapping
> guarantees that the field cannot be all '1's due to the OUI having X & M
> bits cleared. It sort of looks like the mapping is trying to lose those
> bits, by tossing bit 1 & 2, but the X & M are in the wrong octet (AFAIK, I
> just read it three times cause it didn't make any sense).

I should also note that we have at least one supported PHY where one
of the MMDs returns 0xfffe for even numbered registers and 0x0000 for
odd numbered registers in one of the vendor MMDs for addresses 0
through 0xefff - which has a bit set in the devices-in-package.

It also returns 0x0082 for almost every register in MMD 2, but MMD 2's
devices-in-package bit is clear in most of the valid MMDs, so we
shouldn't touch it.

These reveal the problem of randomly probing MMDs - they can return
unexpected values and not be as well behaved as we would like them to
be.  Using register 8 to detect presence may be beneficial, but that
may also introduce problems as we haven't used that before (and we
don't know whether any PHY that wrong.)  I know at least the 88x3310
gets it right for all except the vendor MMDs, where the low addresses
appear non-confromant to the 802.3 specs.  Both vendor MMDs are
definitely implemented, just not with anything conforming to 802.3.
Jeremy Linton May 25, 2020, 9:51 p.m. UTC | #5
Hi,

On 5/25/20 5:06 AM, Russell King - ARM Linux admin wrote:
> On Sun, May 24, 2020 at 10:34:13PM -0500, Jeremy Linton wrote:
>> Hi,
>>
>> On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
>>> On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
>>>> Until this point, we have been sanitizing the c22
>>>> regs presence bit out of all the MMD device lists.
>>>> This is incorrect as it causes the 0xFFFFFFFF checks
>>>> to incorrectly fail. Further, it turns out that we
>>>> want to utilize this flag to make a determination that
>>>> there is actually a phy at this location and we should
>>>> be accessing it using c22.
>>>>
>>>> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
>>>> ---
>>>>    drivers/net/phy/phy_device.c | 16 +++++++++++++---
>>>>    1 file changed, 13 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>>>> index f0761fa5e40b..2d677490ecab 100644
>>>> --- a/drivers/net/phy/phy_device.c
>>>> +++ b/drivers/net/phy/phy_device.c
>>>> @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
>>>>    		return -EIO;
>>>>    	*devices_in_package |= phy_reg;
>>>> -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>>>> -	*devices_in_package &= ~BIT(0);
>>>> -
>>>>    	return 0;
>>>>    }
>>>> @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>    	int i;
>>>>    	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
>>>>    	u32 *devs = &c45_ids->devices_in_package;
>>>> +	bool c22_present = false;
>>>> +	bool valid_id = false;
>>>>    	/* Find first non-zero Devices In package. Device zero is reserved
>>>>    	 * for 802.3 c45 complied PHYs, so don't probe it at first.
>>>> @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>    		return 0;
>>>>    	}
>>>> +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>>>> +	c22_present = *devs & BIT(0);
>>>> +	*devs &= ~BIT(0);
>>>> +
>>>>    	/* Now probe Device Identifiers for each device present. */
>>>>    	for (i = 1; i < num_ids; i++) {
>>>>    		if (!(c45_ids->devices_in_package & (1 << i)))
>>>> @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>    		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
>>>>    		if (ret < 0)
>>>>    			return ret;
>>>> +		if (valid_phy_id(c45_ids->device_ids[i]))
>>>> +			valid_id = true;
>>>
>>> Here you are using your "devices in package" validator to validate the
>>> PHY ID value.  One of the things it does is mask this value with
>>> 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
>>> looks completely wrong.
>>
>> I think in this case I was just using it like the comment in
>> get_phy_device() "if the phy_id is mostly F's, there is no device here".
>>
>> My understanding is that the code is trying to avoid the 0xFFFFFFFF returns
>> that seem to indicate "bus ok, phy didn't respond".
>>
>> I just checked the OUI registration, and while there are a couple OUI's
>> registered that have a number of FFF's in them, none of those cases seems to
>> overlap sufficiently to cause this to throw them out. Plus a phy would also
>> have to have model+revision set to 'F's. So while might be possible, if
>> unlikely, at the moment I think the OUI registration keeps this from being a
>> problem. Particularly, if i'm reading the mapping correctly, the OUI mapping
>> guarantees that the field cannot be all '1's due to the OUI having X & M
>> bits cleared. It sort of looks like the mapping is trying to lose those
>> bits, by tossing bit 1 & 2, but the X & M are in the wrong octet (AFAIK, I
>> just read it three times cause it didn't make any sense).
> 
> I should also note that we have at least one supported PHY where one
> of the MMDs returns 0xfffe for even numbered registers and 0x0000 for
> odd numbered registers in one of the vendor MMDs for addresses 0
> through 0xefff - which has a bit set in the devices-in-package.
> 
> It also returns 0x0082 for almost every register in MMD 2, but MMD 2's
> devices-in-package bit is clear in most of the valid MMDs, so we
> shouldn't touch it.
> 
> These reveal the problem of randomly probing MMDs - they can return
> unexpected values and not be as well behaved as we would like them to
> be.  Using register 8 to detect presence may be beneficial, but that
> may also introduce problems as we haven't used that before (and we
> don't know whether any PHY that wrong.)  I know at least the 88x3310
> gets it right for all except the vendor MMDs, where the low addresses
> appear non-confromant to the 802.3 specs.  Both vendor MMDs are
> definitely implemented, just not with anything conforming to 802.3.

Yes, we know even for the NXP reference hardware, one of the phy's 
doesn't probe out correctly because it doesn't respond to the ieee 
defined registers. I think at this point, there really isn't anything we 
can do about that unless we involve the (ACPI) firmware in currently 
nonstandard behaviors.

So, my goals here have been to first, not break anything, and then do a 
slightly better job finding phy's that are (mostly?) responding 
correctly to the 802.3 spec. So we can say "if you hardware is ACPI 
conformant, and you have IEEE conformant phy's you should be ok". So, 
for your example phy, I guess the immediate answer is "use DT" or "find 
a conformant phy", or even "abstract it in the firmware and use a 
mailbox interface".
Russell King (Oracle) May 25, 2020, 10:01 p.m. UTC | #6
On Mon, May 25, 2020 at 04:51:16PM -0500, Jeremy Linton wrote:
> Hi,
> 
> On 5/25/20 5:06 AM, Russell King - ARM Linux admin wrote:
> > On Sun, May 24, 2020 at 10:34:13PM -0500, Jeremy Linton wrote:
> > > Hi,
> > > 
> > > On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
> > > > On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
> > > > > Until this point, we have been sanitizing the c22
> > > > > regs presence bit out of all the MMD device lists.
> > > > > This is incorrect as it causes the 0xFFFFFFFF checks
> > > > > to incorrectly fail. Further, it turns out that we
> > > > > want to utilize this flag to make a determination that
> > > > > there is actually a phy at this location and we should
> > > > > be accessing it using c22.
> > > > > 
> > > > > Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
> > > > > ---
> > > > >    drivers/net/phy/phy_device.c | 16 +++++++++++++---
> > > > >    1 file changed, 13 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> > > > > index f0761fa5e40b..2d677490ecab 100644
> > > > > --- a/drivers/net/phy/phy_device.c
> > > > > +++ b/drivers/net/phy/phy_device.c
> > > > > @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
> > > > >    		return -EIO;
> > > > >    	*devices_in_package |= phy_reg;
> > > > > -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > > > -	*devices_in_package &= ~BIT(0);
> > > > > -
> > > > >    	return 0;
> > > > >    }
> > > > > @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > >    	int i;
> > > > >    	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
> > > > >    	u32 *devs = &c45_ids->devices_in_package;
> > > > > +	bool c22_present = false;
> > > > > +	bool valid_id = false;
> > > > >    	/* Find first non-zero Devices In package. Device zero is reserved
> > > > >    	 * for 802.3 c45 complied PHYs, so don't probe it at first.
> > > > > @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > >    		return 0;
> > > > >    	}
> > > > > +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > > > +	c22_present = *devs & BIT(0);
> > > > > +	*devs &= ~BIT(0);
> > > > > +
> > > > >    	/* Now probe Device Identifiers for each device present. */
> > > > >    	for (i = 1; i < num_ids; i++) {
> > > > >    		if (!(c45_ids->devices_in_package & (1 << i)))
> > > > > @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > >    		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
> > > > >    		if (ret < 0)
> > > > >    			return ret;
> > > > > +		if (valid_phy_id(c45_ids->device_ids[i]))
> > > > > +			valid_id = true;
> > > > 
> > > > Here you are using your "devices in package" validator to validate the
> > > > PHY ID value.  One of the things it does is mask this value with
> > > > 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
> > > > looks completely wrong.
> > > 
> > > I think in this case I was just using it like the comment in
> > > get_phy_device() "if the phy_id is mostly F's, there is no device here".
> > > 
> > > My understanding is that the code is trying to avoid the 0xFFFFFFFF returns
> > > that seem to indicate "bus ok, phy didn't respond".
> > > 
> > > I just checked the OUI registration, and while there are a couple OUI's
> > > registered that have a number of FFF's in them, none of those cases seems to
> > > overlap sufficiently to cause this to throw them out. Plus a phy would also
> > > have to have model+revision set to 'F's. So while might be possible, if
> > > unlikely, at the moment I think the OUI registration keeps this from being a
> > > problem. Particularly, if i'm reading the mapping correctly, the OUI mapping
> > > guarantees that the field cannot be all '1's due to the OUI having X & M
> > > bits cleared. It sort of looks like the mapping is trying to lose those
> > > bits, by tossing bit 1 & 2, but the X & M are in the wrong octet (AFAIK, I
> > > just read it three times cause it didn't make any sense).
> > 
> > I should also note that we have at least one supported PHY where one
> > of the MMDs returns 0xfffe for even numbered registers and 0x0000 for
> > odd numbered registers in one of the vendor MMDs for addresses 0
> > through 0xefff - which has a bit set in the devices-in-package.
> > 
> > It also returns 0x0082 for almost every register in MMD 2, but MMD 2's
> > devices-in-package bit is clear in most of the valid MMDs, so we
> > shouldn't touch it.
> > 
> > These reveal the problem of randomly probing MMDs - they can return
> > unexpected values and not be as well behaved as we would like them to
> > be.  Using register 8 to detect presence may be beneficial, but that
> > may also introduce problems as we haven't used that before (and we
> > don't know whether any PHY that wrong.)  I know at least the 88x3310
> > gets it right for all except the vendor MMDs, where the low addresses
> > appear non-confromant to the 802.3 specs.  Both vendor MMDs are
> > definitely implemented, just not with anything conforming to 802.3.
> 
> Yes, we know even for the NXP reference hardware, one of the phy's doesn't
> probe out correctly because it doesn't respond to the ieee defined
> registers. I think at this point, there really isn't anything we can do
> about that unless we involve the (ACPI) firmware in currently nonstandard
> behaviors.
> 
> So, my goals here have been to first, not break anything, and then do a
> slightly better job finding phy's that are (mostly?) responding correctly to
> the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
> have IEEE conformant phy's you should be ok". So, for your example phy, I
> guess the immediate answer is "use DT" or "find a conformant phy", or even
> "abstract it in the firmware and use a mailbox interface".

You haven't understood.  The PHY does conform for most of the MMDs,
but there are a number that do not conform.
Andrew Lunn May 25, 2020, 10:06 p.m. UTC | #7
> Yes, we know even for the NXP reference hardware, one of the phy's doesn't
> probe out correctly because it doesn't respond to the ieee defined
> registers. I think at this point, there really isn't anything we can do
> about that unless we involve the (ACPI) firmware in currently nonstandard
> behaviors.
> 
> So, my goals here have been to first, not break anything, and then do a
> slightly better job finding phy's that are (mostly?) responding correctly to
> the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
> have IEEE conformant phy's you should be ok". So, for your example phy, I
> guess the immediate answer is "use DT" or "find a conformant phy", or even
> "abstract it in the firmware and use a mailbox interface".
 
Hi Jeremy

You need to be careful here, when you say "use DT". With a c45 PHY
of_mdiobus_register_phy() calls get_phy_device() to see if the device
exists on the bus. So your changes to get_phy_device() etc, needs to
continue to find devices it used to find, even if they are not fully
complient to 802.3.

	  Andrew
Jeremy Linton May 25, 2020, 10:17 p.m. UTC | #8
Hi,

On 5/25/20 5:06 PM, Andrew Lunn wrote:
>> Yes, we know even for the NXP reference hardware, one of the phy's doesn't
>> probe out correctly because it doesn't respond to the ieee defined
>> registers. I think at this point, there really isn't anything we can do
>> about that unless we involve the (ACPI) firmware in currently nonstandard
>> behaviors.
>>
>> So, my goals here have been to first, not break anything, and then do a
>> slightly better job finding phy's that are (mostly?) responding correctly to
>> the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
>> have IEEE conformant phy's you should be ok". So, for your example phy, I
>> guess the immediate answer is "use DT" or "find a conformant phy", or even
>> "abstract it in the firmware and use a mailbox interface".
>   
> Hi Jeremy
> 
> You need to be careful here, when you say "use DT". With a c45 PHY
> of_mdiobus_register_phy() calls get_phy_device() to see if the device
> exists on the bus. So your changes to get_phy_device() etc, needs to
> continue to find devices it used to find, even if they are not fully
> complient to 802.3.
> 

Yes, that is my "don't break anything". But, in a number of cases I 
can't tell if something is an intentional "bug", or what exactly the 
intended side effect actually was. The c22 bit0 sanitation is in this 
bucket, because its basically disabling the MMD0 probe..

I know for sure we find phys that previously weren't found. OTOH, i'm 
not sure how many that were previously "found" are now getting kicked 
out by because they are doing something "bad" that looked like a bug.
Jeremy Linton May 25, 2020, 10:22 p.m. UTC | #9
On 5/25/20 5:01 PM, Russell King - ARM Linux admin wrote:
> On Mon, May 25, 2020 at 04:51:16PM -0500, Jeremy Linton wrote:
>> Hi,
>>
>> On 5/25/20 5:06 AM, Russell King - ARM Linux admin wrote:
>>> On Sun, May 24, 2020 at 10:34:13PM -0500, Jeremy Linton wrote:
>>>> Hi,
>>>>
>>>> On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
>>>>> On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
>>>>>> Until this point, we have been sanitizing the c22
>>>>>> regs presence bit out of all the MMD device lists.
>>>>>> This is incorrect as it causes the 0xFFFFFFFF checks
>>>>>> to incorrectly fail. Further, it turns out that we
>>>>>> want to utilize this flag to make a determination that
>>>>>> there is actually a phy at this location and we should
>>>>>> be accessing it using c22.
>>>>>>
>>>>>> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
>>>>>> ---
>>>>>>     drivers/net/phy/phy_device.c | 16 +++++++++++++---
>>>>>>     1 file changed, 13 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>>>>>> index f0761fa5e40b..2d677490ecab 100644
>>>>>> --- a/drivers/net/phy/phy_device.c
>>>>>> +++ b/drivers/net/phy/phy_device.c
>>>>>> @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
>>>>>>     		return -EIO;
>>>>>>     	*devices_in_package |= phy_reg;
>>>>>> -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>>>>>> -	*devices_in_package &= ~BIT(0);
>>>>>> -
>>>>>>     	return 0;
>>>>>>     }
>>>>>> @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>     	int i;
>>>>>>     	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
>>>>>>     	u32 *devs = &c45_ids->devices_in_package;
>>>>>> +	bool c22_present = false;
>>>>>> +	bool valid_id = false;
>>>>>>     	/* Find first non-zero Devices In package. Device zero is reserved
>>>>>>     	 * for 802.3 c45 complied PHYs, so don't probe it at first.
>>>>>> @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>     		return 0;
>>>>>>     	}
>>>>>> +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>>>>>> +	c22_present = *devs & BIT(0);
>>>>>> +	*devs &= ~BIT(0);
>>>>>> +
>>>>>>     	/* Now probe Device Identifiers for each device present. */
>>>>>>     	for (i = 1; i < num_ids; i++) {
>>>>>>     		if (!(c45_ids->devices_in_package & (1 << i)))
>>>>>> @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>     		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
>>>>>>     		if (ret < 0)
>>>>>>     			return ret;
>>>>>> +		if (valid_phy_id(c45_ids->device_ids[i]))
>>>>>> +			valid_id = true;
>>>>>
>>>>> Here you are using your "devices in package" validator to validate the
>>>>> PHY ID value.  One of the things it does is mask this value with
>>>>> 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
>>>>> looks completely wrong.
>>>>
>>>> I think in this case I was just using it like the comment in
>>>> get_phy_device() "if the phy_id is mostly F's, there is no device here".
>>>>
>>>> My understanding is that the code is trying to avoid the 0xFFFFFFFF returns
>>>> that seem to indicate "bus ok, phy didn't respond".
>>>>
>>>> I just checked the OUI registration, and while there are a couple OUI's
>>>> registered that have a number of FFF's in them, none of those cases seems to
>>>> overlap sufficiently to cause this to throw them out. Plus a phy would also
>>>> have to have model+revision set to 'F's. So while might be possible, if
>>>> unlikely, at the moment I think the OUI registration keeps this from being a
>>>> problem. Particularly, if i'm reading the mapping correctly, the OUI mapping
>>>> guarantees that the field cannot be all '1's due to the OUI having X & M
>>>> bits cleared. It sort of looks like the mapping is trying to lose those
>>>> bits, by tossing bit 1 & 2, but the X & M are in the wrong octet (AFAIK, I
>>>> just read it three times cause it didn't make any sense).
>>>
>>> I should also note that we have at least one supported PHY where one
>>> of the MMDs returns 0xfffe for even numbered registers and 0x0000 for
>>> odd numbered registers in one of the vendor MMDs for addresses 0
>>> through 0xefff - which has a bit set in the devices-in-package.
>>>
>>> It also returns 0x0082 for almost every register in MMD 2, but MMD 2's
>>> devices-in-package bit is clear in most of the valid MMDs, so we
>>> shouldn't touch it.
>>>
>>> These reveal the problem of randomly probing MMDs - they can return
>>> unexpected values and not be as well behaved as we would like them to
>>> be.  Using register 8 to detect presence may be beneficial, but that
>>> may also introduce problems as we haven't used that before (and we
>>> don't know whether any PHY that wrong.)  I know at least the 88x3310
>>> gets it right for all except the vendor MMDs, where the low addresses
>>> appear non-confromant to the 802.3 specs.  Both vendor MMDs are
>>> definitely implemented, just not with anything conforming to 802.3.
>>
>> Yes, we know even for the NXP reference hardware, one of the phy's doesn't
>> probe out correctly because it doesn't respond to the ieee defined
>> registers. I think at this point, there really isn't anything we can do
>> about that unless we involve the (ACPI) firmware in currently nonstandard
>> behaviors.
>>
>> So, my goals here have been to first, not break anything, and then do a
>> slightly better job finding phy's that are (mostly?) responding correctly to
>> the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
>> have IEEE conformant phy's you should be ok". So, for your example phy, I
>> guess the immediate answer is "use DT" or "find a conformant phy", or even
>> "abstract it in the firmware and use a mailbox interface".
> 
> You haven't understood.  The PHY does conform for most of the MMDs,
> but there are a number that do not conform.

Probably...

Except that i'm not sure how that is a problem at the moment, its still 
going to trigger as a found phy, and walk the same mmd list as before 
requesting drivers. Those drivers haven't changed their behavior so 
where is the problem? If there is a problem its in 7/11 where things are 
getting kicked due to seemingly invalid Ids.

The 1/11 devices=0 case actually appears to be a bug i'm fixing because 
you won't get an ID or a MMD list from that (before or after).
Andrew Lunn May 25, 2020, 11:06 p.m. UTC | #10
> I know for sure we find phys that previously weren't found.

That is in itself somewhat dangerous. Those using primitive
configuration systems are probably going to use phy_find_first(),
rather than an address on the bus.  I always recommend against that,
because if another PHY suddenly pops up on the bus, bad things can
happen.

   Andrew
Russell King (Oracle) May 25, 2020, 11:07 p.m. UTC | #11
On Mon, May 25, 2020 at 05:17:27PM -0500, Jeremy Linton wrote:
> Hi,
> 
> On 5/25/20 5:06 PM, Andrew Lunn wrote:
> > > Yes, we know even for the NXP reference hardware, one of the phy's doesn't
> > > probe out correctly because it doesn't respond to the ieee defined
> > > registers. I think at this point, there really isn't anything we can do
> > > about that unless we involve the (ACPI) firmware in currently nonstandard
> > > behaviors.
> > > 
> > > So, my goals here have been to first, not break anything, and then do a
> > > slightly better job finding phy's that are (mostly?) responding correctly to
> > > the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
> > > have IEEE conformant phy's you should be ok". So, for your example phy, I
> > > guess the immediate answer is "use DT" or "find a conformant phy", or even
> > > "abstract it in the firmware and use a mailbox interface".
> > Hi Jeremy
> > 
> > You need to be careful here, when you say "use DT". With a c45 PHY
> > of_mdiobus_register_phy() calls get_phy_device() to see if the device
> > exists on the bus. So your changes to get_phy_device() etc, needs to
> > continue to find devices it used to find, even if they are not fully
> > complient to 802.3.
> > 
> 
> Yes, that is my "don't break anything". But, in a number of cases I can't
> tell if something is an intentional "bug", or what exactly the intended side
> effect actually was. The c22 bit0 sanitation is in this bucket, because its
> basically disabling the MMD0 probe..

I'm really not sure it causes any problem what so ever.  Have you read
the commit adding cortina.c to see what it says - there is an
interesting comment about what it requires in firmware.  That is, it
calls for an explicit "ethernet-phy-id" compatible in DT naming the
PHY ID, but that can't be used for Clause 45 PHYs (it will be ignored.)
So, it will be treated by the kernel as a Clause 22 PHY.

It is presently in use:

arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
arch/powerpc/boot/dts/fsl/t2080rdb.dts: compatible = "ethernet-phy-id13e5.1002";
arch/powerpc/boot/dts/fsl/t2080rdb.dts: compatible = "ethernet-phy-id13e5.1002";

Now, given this, none of the Clause 45 PHY detection code will be
touched while probing for these PHYs, so really that work-around to
read form MMD 0 in the Clause 45 probing function really doesn't seem
to apply to these PHYs.

Next, when you read cortina.c, it becomes obvious that the PHY's MMD 0
doesn't even follow IEEE 802.3 - the ID registers are at register 0/1
not 2/3.  So even if we did try to read the ID from MMD 0, we wouldn't
be reading the ID from the right registers.

Hence, I don't think anything has been broken at all by the commit
you refer to.

> I know for sure we find phys that previously weren't found. OTOH, i'm not
> sure how many that were previously "found" are now getting kicked out by
> because they are doing something "bad" that looked like a bug.

I don't think you've found any problem what so ever.

For these PHYs to be automatically probed, they need to have a DT
string identifying them as clause 45 compliant.  From the DTS files
I've provided above, that isn't the case, this code path won't be
reached, so nothing has been broken.  In any case, for the
reasons I mention above wrt non-standard register layout, it seems
it couldn't have worked via this probing code.

I would dig some more into the history of the change to
get_phy_c45_ids() and how it relates to the addition of the Cortina
driver, but unfortunately my machine is being painfully slow with
git log searching the history that it's way too time consuming for
me to do anything further now, but the conclusion I'm coming to is
that there has been no regression how you think there has been.
Russell King (Oracle) May 25, 2020, 11:09 p.m. UTC | #12
On Mon, May 25, 2020 at 05:22:07PM -0500, Jeremy Linton wrote:
> On 5/25/20 5:01 PM, Russell King - ARM Linux admin wrote:
> > On Mon, May 25, 2020 at 04:51:16PM -0500, Jeremy Linton wrote:
> > > Hi,
> > > 
> > > On 5/25/20 5:06 AM, Russell King - ARM Linux admin wrote:
> > > > On Sun, May 24, 2020 at 10:34:13PM -0500, Jeremy Linton wrote:
> > > > > Hi,
> > > > > 
> > > > > On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
> > > > > > On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
> > > > > > > Until this point, we have been sanitizing the c22
> > > > > > > regs presence bit out of all the MMD device lists.
> > > > > > > This is incorrect as it causes the 0xFFFFFFFF checks
> > > > > > > to incorrectly fail. Further, it turns out that we
> > > > > > > want to utilize this flag to make a determination that
> > > > > > > there is actually a phy at this location and we should
> > > > > > > be accessing it using c22.
> > > > > > > 
> > > > > > > Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
> > > > > > > ---
> > > > > > >     drivers/net/phy/phy_device.c | 16 +++++++++++++---
> > > > > > >     1 file changed, 13 insertions(+), 3 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> > > > > > > index f0761fa5e40b..2d677490ecab 100644
> > > > > > > --- a/drivers/net/phy/phy_device.c
> > > > > > > +++ b/drivers/net/phy/phy_device.c
> > > > > > > @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
> > > > > > >     		return -EIO;
> > > > > > >     	*devices_in_package |= phy_reg;
> > > > > > > -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > > > > > -	*devices_in_package &= ~BIT(0);
> > > > > > > -
> > > > > > >     	return 0;
> > > > > > >     }
> > > > > > > @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > > > >     	int i;
> > > > > > >     	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
> > > > > > >     	u32 *devs = &c45_ids->devices_in_package;
> > > > > > > +	bool c22_present = false;
> > > > > > > +	bool valid_id = false;
> > > > > > >     	/* Find first non-zero Devices In package. Device zero is reserved
> > > > > > >     	 * for 802.3 c45 complied PHYs, so don't probe it at first.
> > > > > > > @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > > > >     		return 0;
> > > > > > >     	}
> > > > > > > +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > > > > > +	c22_present = *devs & BIT(0);
> > > > > > > +	*devs &= ~BIT(0);
> > > > > > > +
> > > > > > >     	/* Now probe Device Identifiers for each device present. */
> > > > > > >     	for (i = 1; i < num_ids; i++) {
> > > > > > >     		if (!(c45_ids->devices_in_package & (1 << i)))
> > > > > > > @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > > > >     		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
> > > > > > >     		if (ret < 0)
> > > > > > >     			return ret;
> > > > > > > +		if (valid_phy_id(c45_ids->device_ids[i]))
> > > > > > > +			valid_id = true;
> > > > > > 
> > > > > > Here you are using your "devices in package" validator to validate the
> > > > > > PHY ID value.  One of the things it does is mask this value with
> > > > > > 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
> > > > > > looks completely wrong.
> > > > > 
> > > > > I think in this case I was just using it like the comment in
> > > > > get_phy_device() "if the phy_id is mostly F's, there is no device here".
> > > > > 
> > > > > My understanding is that the code is trying to avoid the 0xFFFFFFFF returns
> > > > > that seem to indicate "bus ok, phy didn't respond".
> > > > > 
> > > > > I just checked the OUI registration, and while there are a couple OUI's
> > > > > registered that have a number of FFF's in them, none of those cases seems to
> > > > > overlap sufficiently to cause this to throw them out. Plus a phy would also
> > > > > have to have model+revision set to 'F's. So while might be possible, if
> > > > > unlikely, at the moment I think the OUI registration keeps this from being a
> > > > > problem. Particularly, if i'm reading the mapping correctly, the OUI mapping
> > > > > guarantees that the field cannot be all '1's due to the OUI having X & M
> > > > > bits cleared. It sort of looks like the mapping is trying to lose those
> > > > > bits, by tossing bit 1 & 2, but the X & M are in the wrong octet (AFAIK, I
> > > > > just read it three times cause it didn't make any sense).
> > > > 
> > > > I should also note that we have at least one supported PHY where one
> > > > of the MMDs returns 0xfffe for even numbered registers and 0x0000 for
> > > > odd numbered registers in one of the vendor MMDs for addresses 0
> > > > through 0xefff - which has a bit set in the devices-in-package.
> > > > 
> > > > It also returns 0x0082 for almost every register in MMD 2, but MMD 2's
> > > > devices-in-package bit is clear in most of the valid MMDs, so we
> > > > shouldn't touch it.
> > > > 
> > > > These reveal the problem of randomly probing MMDs - they can return
> > > > unexpected values and not be as well behaved as we would like them to
> > > > be.  Using register 8 to detect presence may be beneficial, but that
> > > > may also introduce problems as we haven't used that before (and we
> > > > don't know whether any PHY that wrong.)  I know at least the 88x3310
> > > > gets it right for all except the vendor MMDs, where the low addresses
> > > > appear non-confromant to the 802.3 specs.  Both vendor MMDs are
> > > > definitely implemented, just not with anything conforming to 802.3.
> > > 
> > > Yes, we know even for the NXP reference hardware, one of the phy's doesn't
> > > probe out correctly because it doesn't respond to the ieee defined
> > > registers. I think at this point, there really isn't anything we can do
> > > about that unless we involve the (ACPI) firmware in currently nonstandard
> > > behaviors.
> > > 
> > > So, my goals here have been to first, not break anything, and then do a
> > > slightly better job finding phy's that are (mostly?) responding correctly to
> > > the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
> > > have IEEE conformant phy's you should be ok". So, for your example phy, I
> > > guess the immediate answer is "use DT" or "find a conformant phy", or even
> > > "abstract it in the firmware and use a mailbox interface".
> > 
> > You haven't understood.  The PHY does conform for most of the MMDs,
> > but there are a number that do not conform.
> 
> Probably...
> 
> Except that i'm not sure how that is a problem at the moment, its still
> going to trigger as a found phy, and walk the same mmd list as before
> requesting drivers. Those drivers haven't changed their behavior so where is
> the problem? If there is a problem its in 7/11 where things are getting
> kicked due to seemingly invalid Ids.
> 
> The 1/11 devices=0 case actually appears to be a bug i'm fixing because you
> won't get an ID or a MMD list from that (before or after).

I think I've just flattened that argument in my immediately preceding
reply on the Cortina situation; I think you've grossly misread that
through not fully researching the history and then finding the
existing users.

There is no bug that you are fixing from what I can see.
Andrew Lunn May 25, 2020, 11:12 p.m. UTC | #13
> arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> arch/powerpc/boot/dts/fsl/t2080rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> arch/powerpc/boot/dts/fsl/t2080rdb.dts: compatible = "ethernet-phy-id13e5.1002";

Hi Jeremy

You are doing this work for NXP right? Maybe you can ask them to go
searching in the cellar and find you one of these boards?

	  Andrew
Jeremy Linton May 25, 2020, 11:16 p.m. UTC | #14
Hi,

On 5/25/20 5:01 PM, Russell King - ARM Linux admin wrote:
> On Mon, May 25, 2020 at 04:51:16PM -0500, Jeremy Linton wrote:
>> Hi,
>>
>> On 5/25/20 5:06 AM, Russell King - ARM Linux admin wrote:
>>> On Sun, May 24, 2020 at 10:34:13PM -0500, Jeremy Linton wrote:
>>>> Hi,
>>>>
>>>> On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
>>>>> On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
>>>>>> Until this point, we have been sanitizing the c22
>>>>>> regs presence bit out of all the MMD device lists.
>>>>>> This is incorrect as it causes the 0xFFFFFFFF checks
>>>>>> to incorrectly fail. Further, it turns out that we
>>>>>> want to utilize this flag to make a determination that
>>>>>> there is actually a phy at this location and we should
>>>>>> be accessing it using c22.
>>>>>>
>>>>>> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
>>>>>> ---
>>>>>>     drivers/net/phy/phy_device.c | 16 +++++++++++++---
>>>>>>     1 file changed, 13 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>>>>>> index f0761fa5e40b..2d677490ecab 100644
>>>>>> --- a/drivers/net/phy/phy_device.c
>>>>>> +++ b/drivers/net/phy/phy_device.c
>>>>>> @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
>>>>>>     		return -EIO;
>>>>>>     	*devices_in_package |= phy_reg;
>>>>>> -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>>>>>> -	*devices_in_package &= ~BIT(0);
>>>>>> -
>>>>>>     	return 0;
>>>>>>     }
>>>>>> @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>     	int i;
>>>>>>     	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
>>>>>>     	u32 *devs = &c45_ids->devices_in_package;
>>>>>> +	bool c22_present = false;
>>>>>> +	bool valid_id = false;
>>>>>>     	/* Find first non-zero Devices In package. Device zero is reserved
>>>>>>     	 * for 802.3 c45 complied PHYs, so don't probe it at first.
>>>>>> @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>     		return 0;
>>>>>>     	}
>>>>>> +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>>>>>> +	c22_present = *devs & BIT(0);
>>>>>> +	*devs &= ~BIT(0);
>>>>>> +
>>>>>>     	/* Now probe Device Identifiers for each device present. */
>>>>>>     	for (i = 1; i < num_ids; i++) {
>>>>>>     		if (!(c45_ids->devices_in_package & (1 << i)))
>>>>>> @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>     		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
>>>>>>     		if (ret < 0)
>>>>>>     			return ret;
>>>>>> +		if (valid_phy_id(c45_ids->device_ids[i]))
>>>>>> +			valid_id = true;
>>>>>
>>>>> Here you are using your "devices in package" validator to validate the
>>>>> PHY ID value.  One of the things it does is mask this value with
>>>>> 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
>>>>> looks completely wrong.
>>>>
>>>> I think in this case I was just using it like the comment in
>>>> get_phy_device() "if the phy_id is mostly F's, there is no device here".
>>>>
>>>> My understanding is that the code is trying to avoid the 0xFFFFFFFF returns
>>>> that seem to indicate "bus ok, phy didn't respond".
>>>>
>>>> I just checked the OUI registration, and while there are a couple OUI's
>>>> registered that have a number of FFF's in them, none of those cases seems to
>>>> overlap sufficiently to cause this to throw them out. Plus a phy would also
>>>> have to have model+revision set to 'F's. So while might be possible, if
>>>> unlikely, at the moment I think the OUI registration keeps this from being a
>>>> problem. Particularly, if i'm reading the mapping correctly, the OUI mapping
>>>> guarantees that the field cannot be all '1's due to the OUI having X & M
>>>> bits cleared. It sort of looks like the mapping is trying to lose those
>>>> bits, by tossing bit 1 & 2, but the X & M are in the wrong octet (AFAIK, I
>>>> just read it three times cause it didn't make any sense).
>>>
>>> I should also note that we have at least one supported PHY where one
>>> of the MMDs returns 0xfffe for even numbered registers and 0x0000 for
>>> odd numbered registers in one of the vendor MMDs for addresses 0
>>> through 0xefff - which has a bit set in the devices-in-package.
>>>
>>> It also returns 0x0082 for almost every register in MMD 2, but MMD 2's
>>> devices-in-package bit is clear in most of the valid MMDs, so we
>>> shouldn't touch it.
>>>
>>> These reveal the problem of randomly probing MMDs - they can return
>>> unexpected values and not be as well behaved as we would like them to
>>> be.  Using register 8 to detect presence may be beneficial, but that
>>> may also introduce problems as we haven't used that before (and we
>>> don't know whether any PHY that wrong.)  I know at least the 88x3310
>>> gets it right for all except the vendor MMDs, where the low addresses
>>> appear non-confromant to the 802.3 specs.  Both vendor MMDs are
>>> definitely implemented, just not with anything conforming to 802.3.
>>
>> Yes, we know even for the NXP reference hardware, one of the phy's doesn't
>> probe out correctly because it doesn't respond to the ieee defined
>> registers. I think at this point, there really isn't anything we can do
>> about that unless we involve the (ACPI) firmware in currently nonstandard
>> behaviors.
>>
>> So, my goals here have been to first, not break anything, and then do a
>> slightly better job finding phy's that are (mostly?) responding correctly to
>> the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
>> have IEEE conformant phy's you should be ok". So, for your example phy, I
>> guess the immediate answer is "use DT" or "find a conformant phy", or even
>> "abstract it in the firmware and use a mailbox interface".
> 
> You haven't understood.  The PHY does conform for most of the MMDs,
> but there are a number that do not conform.
> 

Maybe I should clarify. This set is still terminating the search for a 
valid MMD device list on the first MMD that responds. It then probes the 
same ID registers of the flagged MMDs as before. What has changed is 
that we search higher into the MMD address space for a device list. So 
previously if a device didn't respond to MMD0-8 it was ignored. Now it 
needs to fail all of 0-31 to be ignored. Similarly for the ID's, if we 
find what appears to be a valid MMD device list, then we will probe not 
only the original 1-8 MMDs for IDs, but 1-31 MMDs for IDs.

So any device which presented a non zero, non "mostly ff's" devices list 
in 0-8 will get the same device list as before. Similarly we probe for 
ids at the same MMD addresses as before, with additional MMD detection 
 >8. This change means we pick up additional phys, and we detect the 
correct MMD ids for more devices.

The possible negative differences are in 7/11 where we transition a 
device which responded with an ID=0 to 0xFFFFFFFF which will cause the 
code to not request a phy/mmd driver for 00000000 (or potentially some 
bogus OUI's due to the 1fffffff difference). Assuming a valid phy id 
gets applied somewhere, the same phy driver will load, and presumably it 
will know what to do with MMD's in the devices list.

So, I don't see anything in your example above which changes the 
detected MMD devices. Potentially though, not only will you get the 
previous MMD Id's, but you might get a few new ones. Whether MMD2 is 
probed is going to depend on whether MMD0/1's devices list has the MMD2 
device bit set (you weren't clear in the description above).
Jeremy Linton May 25, 2020, 11:22 p.m. UTC | #15
On 5/25/20 6:09 PM, Russell King - ARM Linux admin wrote:
> On Mon, May 25, 2020 at 05:22:07PM -0500, Jeremy Linton wrote:
>> On 5/25/20 5:01 PM, Russell King - ARM Linux admin wrote:
>>> On Mon, May 25, 2020 at 04:51:16PM -0500, Jeremy Linton wrote:
>>>> Hi,
>>>>
>>>> On 5/25/20 5:06 AM, Russell King - ARM Linux admin wrote:
>>>>> On Sun, May 24, 2020 at 10:34:13PM -0500, Jeremy Linton wrote:
>>>>>> Hi,
>>>>>>
>>>>>> On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
>>>>>>> On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
>>>>>>>> Until this point, we have been sanitizing the c22
>>>>>>>> regs presence bit out of all the MMD device lists.
>>>>>>>> This is incorrect as it causes the 0xFFFFFFFF checks
>>>>>>>> to incorrectly fail. Further, it turns out that we
>>>>>>>> want to utilize this flag to make a determination that
>>>>>>>> there is actually a phy at this location and we should
>>>>>>>> be accessing it using c22.
>>>>>>>>
>>>>>>>> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
>>>>>>>> ---
>>>>>>>>      drivers/net/phy/phy_device.c | 16 +++++++++++++---
>>>>>>>>      1 file changed, 13 insertions(+), 3 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>>>>>>>> index f0761fa5e40b..2d677490ecab 100644
>>>>>>>> --- a/drivers/net/phy/phy_device.c
>>>>>>>> +++ b/drivers/net/phy/phy_device.c
>>>>>>>> @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
>>>>>>>>      		return -EIO;
>>>>>>>>      	*devices_in_package |= phy_reg;
>>>>>>>> -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>>>>>>>> -	*devices_in_package &= ~BIT(0);
>>>>>>>> -
>>>>>>>>      	return 0;
>>>>>>>>      }
>>>>>>>> @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>>>      	int i;
>>>>>>>>      	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
>>>>>>>>      	u32 *devs = &c45_ids->devices_in_package;
>>>>>>>> +	bool c22_present = false;
>>>>>>>> +	bool valid_id = false;
>>>>>>>>      	/* Find first non-zero Devices In package. Device zero is reserved
>>>>>>>>      	 * for 802.3 c45 complied PHYs, so don't probe it at first.
>>>>>>>> @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>>>      		return 0;
>>>>>>>>      	}
>>>>>>>> +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>>>>>>>> +	c22_present = *devs & BIT(0);
>>>>>>>> +	*devs &= ~BIT(0);
>>>>>>>> +
>>>>>>>>      	/* Now probe Device Identifiers for each device present. */
>>>>>>>>      	for (i = 1; i < num_ids; i++) {
>>>>>>>>      		if (!(c45_ids->devices_in_package & (1 << i)))
>>>>>>>> @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>>>      		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
>>>>>>>>      		if (ret < 0)
>>>>>>>>      			return ret;
>>>>>>>> +		if (valid_phy_id(c45_ids->device_ids[i]))
>>>>>>>> +			valid_id = true;
>>>>>>>
>>>>>>> Here you are using your "devices in package" validator to validate the
>>>>>>> PHY ID value.  One of the things it does is mask this value with
>>>>>>> 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
>>>>>>> looks completely wrong.
>>>>>>
>>>>>> I think in this case I was just using it like the comment in
>>>>>> get_phy_device() "if the phy_id is mostly F's, there is no device here".
>>>>>>
>>>>>> My understanding is that the code is trying to avoid the 0xFFFFFFFF returns
>>>>>> that seem to indicate "bus ok, phy didn't respond".
>>>>>>
>>>>>> I just checked the OUI registration, and while there are a couple OUI's
>>>>>> registered that have a number of FFF's in them, none of those cases seems to
>>>>>> overlap sufficiently to cause this to throw them out. Plus a phy would also
>>>>>> have to have model+revision set to 'F's. So while might be possible, if
>>>>>> unlikely, at the moment I think the OUI registration keeps this from being a
>>>>>> problem. Particularly, if i'm reading the mapping correctly, the OUI mapping
>>>>>> guarantees that the field cannot be all '1's due to the OUI having X & M
>>>>>> bits cleared. It sort of looks like the mapping is trying to lose those
>>>>>> bits, by tossing bit 1 & 2, but the X & M are in the wrong octet (AFAIK, I
>>>>>> just read it three times cause it didn't make any sense).
>>>>>
>>>>> I should also note that we have at least one supported PHY where one
>>>>> of the MMDs returns 0xfffe for even numbered registers and 0x0000 for
>>>>> odd numbered registers in one of the vendor MMDs for addresses 0
>>>>> through 0xefff - which has a bit set in the devices-in-package.
>>>>>
>>>>> It also returns 0x0082 for almost every register in MMD 2, but MMD 2's
>>>>> devices-in-package bit is clear in most of the valid MMDs, so we
>>>>> shouldn't touch it.
>>>>>
>>>>> These reveal the problem of randomly probing MMDs - they can return
>>>>> unexpected values and not be as well behaved as we would like them to
>>>>> be.  Using register 8 to detect presence may be beneficial, but that
>>>>> may also introduce problems as we haven't used that before (and we
>>>>> don't know whether any PHY that wrong.)  I know at least the 88x3310
>>>>> gets it right for all except the vendor MMDs, where the low addresses
>>>>> appear non-confromant to the 802.3 specs.  Both vendor MMDs are
>>>>> definitely implemented, just not with anything conforming to 802.3.
>>>>
>>>> Yes, we know even for the NXP reference hardware, one of the phy's doesn't
>>>> probe out correctly because it doesn't respond to the ieee defined
>>>> registers. I think at this point, there really isn't anything we can do
>>>> about that unless we involve the (ACPI) firmware in currently nonstandard
>>>> behaviors.
>>>>
>>>> So, my goals here have been to first, not break anything, and then do a
>>>> slightly better job finding phy's that are (mostly?) responding correctly to
>>>> the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
>>>> have IEEE conformant phy's you should be ok". So, for your example phy, I
>>>> guess the immediate answer is "use DT" or "find a conformant phy", or even
>>>> "abstract it in the firmware and use a mailbox interface".
>>>
>>> You haven't understood.  The PHY does conform for most of the MMDs,
>>> but there are a number that do not conform.
>>
>> Probably...
>>
>> Except that i'm not sure how that is a problem at the moment, its still
>> going to trigger as a found phy, and walk the same mmd list as before
>> requesting drivers. Those drivers haven't changed their behavior so where is
>> the problem? If there is a problem its in 7/11 where things are getting
>> kicked due to seemingly invalid Ids.
>>
>> The 1/11 devices=0 case actually appears to be a bug i'm fixing because you
>> won't get an ID or a MMD list from that (before or after).
> 
> I think I've just flattened that argument in my immediately preceding
> reply on the Cortina situation; I think you've grossly misread that
> through not fully researching the history and then finding the
> existing users.
> 
> There is no bug that you are fixing from what I can see.

One of us is missing something,

The "cortina" solution is broken in the current kernel. That is because 
lines 726-742 are dead code due to line 693.

I believe I've understood the problem there, and corrected it in this 
set along with a few others, but its distinctly possible that isn't true.
Russell King (Oracle) May 25, 2020, 11:30 p.m. UTC | #16
On Mon, May 25, 2020 at 06:16:18PM -0500, Jeremy Linton wrote:
> Hi,
> 
> On 5/25/20 5:01 PM, Russell King - ARM Linux admin wrote:
> > On Mon, May 25, 2020 at 04:51:16PM -0500, Jeremy Linton wrote:
> > > So, my goals here have been to first, not break anything, and then do a
> > > slightly better job finding phy's that are (mostly?) responding correctly to
> > > the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
> > > have IEEE conformant phy's you should be ok". So, for your example phy, I
> > > guess the immediate answer is "use DT" or "find a conformant phy", or even
> > > "abstract it in the firmware and use a mailbox interface".
> > 
> > You haven't understood.  The PHY does conform for most of the MMDs,
> > but there are a number that do not conform.
> > 
> 
> Maybe I should clarify. This set is still terminating the search for a valid
> MMD device list on the first MMD that responds. It then probes the same ID
> registers of the flagged MMDs as before. What has changed is that we search
> higher into the MMD address space for a device list. So previously if a
> device didn't respond to MMD0-8 it was ignored. Now it needs to fail all of
> 0-31 to be ignored. Similarly for the ID's, if we find what appears to be a
> valid MMD device list, then we will probe not only the original 1-8 MMDs for
> IDs, but 1-31 MMDs for IDs.

Clarification is not required; I understand what you're doing, but you
are not understanding my points.

For the 88x3310, your change means that the list of IDs for this PHY
will not only 0x002b09aX, 0x01410daX (the official IDs), but also
0x00000000 and 0xfffe0000 from MMD 30 and 31 respectively, which are
not real IDs.  That's two incorrect IDs that should actually not be
there.

Here's what the first few registers from MMD 30 and 31 look like on
this PHY:

MMD30:
 Addr  Data
 0000  0000 0000 0000 0000 0000 0000 0000 0000
 0008  0000 0000 0000 0000 0000 0000 0000 0000
 0010  0000 0000 0000 0000 0000 0000 0000 0000

MMD31:
 0000  fffe 0000 fffe 0000 fffe 0000 fffe 0000
 0008  fffe 0000 fffe 0000 fffe 0000 fffe 0000
 0010  fffe 0000 fffe 0000 fffe 0000 fffe 0000

We've got away with it so far on several counts:
1. The code doesn't probe that high for IDs.
2. We have no driver that may match those IDs.

You're taking away (1), so now all it takes is for condition (2) to
be broken, and we can end up with a regression if another driver
appears that may match using those.

So, I would suggest that you avoid reading IDs from MMD 30/31, or
maybe only read the ID from MMDs > 8 if register 8 indicates that
there is a device present at that MMD.  That would be compliant
with IEEE 802.3, preserve our existing behaviour, while allowing
us to expand the IDs that we probe for to have a better chance of
only detecting truely valid IDs.
Russell King (Oracle) May 25, 2020, 11:33 p.m. UTC | #17
On Mon, May 25, 2020 at 06:22:19PM -0500, Jeremy Linton wrote:
> On 5/25/20 6:09 PM, Russell King - ARM Linux admin wrote:
> > On Mon, May 25, 2020 at 05:22:07PM -0500, Jeremy Linton wrote:
> > > On 5/25/20 5:01 PM, Russell King - ARM Linux admin wrote:
> > > > On Mon, May 25, 2020 at 04:51:16PM -0500, Jeremy Linton wrote:
> > > > > Hi,
> > > > > 
> > > > > On 5/25/20 5:06 AM, Russell King - ARM Linux admin wrote:
> > > > > > On Sun, May 24, 2020 at 10:34:13PM -0500, Jeremy Linton wrote:
> > > > > > > Hi,
> > > > > > > 
> > > > > > > On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
> > > > > > > > On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
> > > > > > > > > Until this point, we have been sanitizing the c22
> > > > > > > > > regs presence bit out of all the MMD device lists.
> > > > > > > > > This is incorrect as it causes the 0xFFFFFFFF checks
> > > > > > > > > to incorrectly fail. Further, it turns out that we
> > > > > > > > > want to utilize this flag to make a determination that
> > > > > > > > > there is actually a phy at this location and we should
> > > > > > > > > be accessing it using c22.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
> > > > > > > > > ---
> > > > > > > > >      drivers/net/phy/phy_device.c | 16 +++++++++++++---
> > > > > > > > >      1 file changed, 13 insertions(+), 3 deletions(-)
> > > > > > > > > 
> > > > > > > > > diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> > > > > > > > > index f0761fa5e40b..2d677490ecab 100644
> > > > > > > > > --- a/drivers/net/phy/phy_device.c
> > > > > > > > > +++ b/drivers/net/phy/phy_device.c
> > > > > > > > > @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
> > > > > > > > >      		return -EIO;
> > > > > > > > >      	*devices_in_package |= phy_reg;
> > > > > > > > > -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > > > > > > > -	*devices_in_package &= ~BIT(0);
> > > > > > > > > -
> > > > > > > > >      	return 0;
> > > > > > > > >      }
> > > > > > > > > @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > > > > > >      	int i;
> > > > > > > > >      	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
> > > > > > > > >      	u32 *devs = &c45_ids->devices_in_package;
> > > > > > > > > +	bool c22_present = false;
> > > > > > > > > +	bool valid_id = false;
> > > > > > > > >      	/* Find first non-zero Devices In package. Device zero is reserved
> > > > > > > > >      	 * for 802.3 c45 complied PHYs, so don't probe it at first.
> > > > > > > > > @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > > > > > >      		return 0;
> > > > > > > > >      	}
> > > > > > > > > +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > > > > > > > +	c22_present = *devs & BIT(0);
> > > > > > > > > +	*devs &= ~BIT(0);
> > > > > > > > > +
> > > > > > > > >      	/* Now probe Device Identifiers for each device present. */
> > > > > > > > >      	for (i = 1; i < num_ids; i++) {
> > > > > > > > >      		if (!(c45_ids->devices_in_package & (1 << i)))
> > > > > > > > > @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > > > > > >      		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
> > > > > > > > >      		if (ret < 0)
> > > > > > > > >      			return ret;
> > > > > > > > > +		if (valid_phy_id(c45_ids->device_ids[i]))
> > > > > > > > > +			valid_id = true;
> > > > > > > > 
> > > > > > > > Here you are using your "devices in package" validator to validate the
> > > > > > > > PHY ID value.  One of the things it does is mask this value with
> > > > > > > > 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
> > > > > > > > looks completely wrong.
> > > > > > > 
> > > > > > > I think in this case I was just using it like the comment in
> > > > > > > get_phy_device() "if the phy_id is mostly F's, there is no device here".
> > > > > > > 
> > > > > > > My understanding is that the code is trying to avoid the 0xFFFFFFFF returns
> > > > > > > that seem to indicate "bus ok, phy didn't respond".
> > > > > > > 
> > > > > > > I just checked the OUI registration, and while there are a couple OUI's
> > > > > > > registered that have a number of FFF's in them, none of those cases seems to
> > > > > > > overlap sufficiently to cause this to throw them out. Plus a phy would also
> > > > > > > have to have model+revision set to 'F's. So while might be possible, if
> > > > > > > unlikely, at the moment I think the OUI registration keeps this from being a
> > > > > > > problem. Particularly, if i'm reading the mapping correctly, the OUI mapping
> > > > > > > guarantees that the field cannot be all '1's due to the OUI having X & M
> > > > > > > bits cleared. It sort of looks like the mapping is trying to lose those
> > > > > > > bits, by tossing bit 1 & 2, but the X & M are in the wrong octet (AFAIK, I
> > > > > > > just read it three times cause it didn't make any sense).
> > > > > > 
> > > > > > I should also note that we have at least one supported PHY where one
> > > > > > of the MMDs returns 0xfffe for even numbered registers and 0x0000 for
> > > > > > odd numbered registers in one of the vendor MMDs for addresses 0
> > > > > > through 0xefff - which has a bit set in the devices-in-package.
> > > > > > 
> > > > > > It also returns 0x0082 for almost every register in MMD 2, but MMD 2's
> > > > > > devices-in-package bit is clear in most of the valid MMDs, so we
> > > > > > shouldn't touch it.
> > > > > > 
> > > > > > These reveal the problem of randomly probing MMDs - they can return
> > > > > > unexpected values and not be as well behaved as we would like them to
> > > > > > be.  Using register 8 to detect presence may be beneficial, but that
> > > > > > may also introduce problems as we haven't used that before (and we
> > > > > > don't know whether any PHY that wrong.)  I know at least the 88x3310
> > > > > > gets it right for all except the vendor MMDs, where the low addresses
> > > > > > appear non-confromant to the 802.3 specs.  Both vendor MMDs are
> > > > > > definitely implemented, just not with anything conforming to 802.3.
> > > > > 
> > > > > Yes, we know even for the NXP reference hardware, one of the phy's doesn't
> > > > > probe out correctly because it doesn't respond to the ieee defined
> > > > > registers. I think at this point, there really isn't anything we can do
> > > > > about that unless we involve the (ACPI) firmware in currently nonstandard
> > > > > behaviors.
> > > > > 
> > > > > So, my goals here have been to first, not break anything, and then do a
> > > > > slightly better job finding phy's that are (mostly?) responding correctly to
> > > > > the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
> > > > > have IEEE conformant phy's you should be ok". So, for your example phy, I
> > > > > guess the immediate answer is "use DT" or "find a conformant phy", or even
> > > > > "abstract it in the firmware and use a mailbox interface".
> > > > 
> > > > You haven't understood.  The PHY does conform for most of the MMDs,
> > > > but there are a number that do not conform.
> > > 
> > > Probably...
> > > 
> > > Except that i'm not sure how that is a problem at the moment, its still
> > > going to trigger as a found phy, and walk the same mmd list as before
> > > requesting drivers. Those drivers haven't changed their behavior so where is
> > > the problem? If there is a problem its in 7/11 where things are getting
> > > kicked due to seemingly invalid Ids.
> > > 
> > > The 1/11 devices=0 case actually appears to be a bug i'm fixing because you
> > > won't get an ID or a MMD list from that (before or after).
> > 
> > I think I've just flattened that argument in my immediately preceding
> > reply on the Cortina situation; I think you've grossly misread that
> > through not fully researching the history and then finding the
> > existing users.
> > 
> > There is no bug that you are fixing from what I can see.
> 
> One of us is missing something,
> 
> The "cortina" solution is broken in the current kernel. That is because
> lines 726-742 are dead code due to line 693.
> 
> I believe I've understood the problem there, and corrected it in this set
> along with a few others, but its distinctly possible that isn't true.

The code you refer to above is NOT used on the platforms that I have
identified use the Cortina PHY.  If this code is not used, it has not
caused any issue, and there is no breakage due to the change you are
referring to.
Jeremy Linton May 25, 2020, 11:42 p.m. UTC | #18
Hi,

On 5/25/20 6:33 PM, Russell King - ARM Linux admin wrote:
> On Mon, May 25, 2020 at 06:22:19PM -0500, Jeremy Linton wrote:
>> On 5/25/20 6:09 PM, Russell King - ARM Linux admin wrote:
>>> On Mon, May 25, 2020 at 05:22:07PM -0500, Jeremy Linton wrote:
>>>> On 5/25/20 5:01 PM, Russell King - ARM Linux admin wrote:
>>>>> On Mon, May 25, 2020 at 04:51:16PM -0500, Jeremy Linton wrote:
>>>>>> Hi,
>>>>>>
>>>>>> On 5/25/20 5:06 AM, Russell King - ARM Linux admin wrote:
>>>>>>> On Sun, May 24, 2020 at 10:34:13PM -0500, Jeremy Linton wrote:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
>>>>>>>>> On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
>>>>>>>>>> Until this point, we have been sanitizing the c22
>>>>>>>>>> regs presence bit out of all the MMD device lists.
>>>>>>>>>> This is incorrect as it causes the 0xFFFFFFFF checks
>>>>>>>>>> to incorrectly fail. Further, it turns out that we
>>>>>>>>>> want to utilize this flag to make a determination that
>>>>>>>>>> there is actually a phy at this location and we should
>>>>>>>>>> be accessing it using c22.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
>>>>>>>>>> ---
>>>>>>>>>>       drivers/net/phy/phy_device.c | 16 +++++++++++++---
>>>>>>>>>>       1 file changed, 13 insertions(+), 3 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>>>>>>>>>> index f0761fa5e40b..2d677490ecab 100644
>>>>>>>>>> --- a/drivers/net/phy/phy_device.c
>>>>>>>>>> +++ b/drivers/net/phy/phy_device.c
>>>>>>>>>> @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
>>>>>>>>>>       		return -EIO;
>>>>>>>>>>       	*devices_in_package |= phy_reg;
>>>>>>>>>> -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>>>>>>>>>> -	*devices_in_package &= ~BIT(0);
>>>>>>>>>> -
>>>>>>>>>>       	return 0;
>>>>>>>>>>       }
>>>>>>>>>> @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>>>>>       	int i;
>>>>>>>>>>       	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
>>>>>>>>>>       	u32 *devs = &c45_ids->devices_in_package;
>>>>>>>>>> +	bool c22_present = false;
>>>>>>>>>> +	bool valid_id = false;
>>>>>>>>>>       	/* Find first non-zero Devices In package. Device zero is reserved
>>>>>>>>>>       	 * for 802.3 c45 complied PHYs, so don't probe it at first.
>>>>>>>>>> @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>>>>>       		return 0;
>>>>>>>>>>       	}
>>>>>>>>>> +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
>>>>>>>>>> +	c22_present = *devs & BIT(0);
>>>>>>>>>> +	*devs &= ~BIT(0);
>>>>>>>>>> +
>>>>>>>>>>       	/* Now probe Device Identifiers for each device present. */
>>>>>>>>>>       	for (i = 1; i < num_ids; i++) {
>>>>>>>>>>       		if (!(c45_ids->devices_in_package & (1 << i)))
>>>>>>>>>> @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>>>>>>>>>>       		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
>>>>>>>>>>       		if (ret < 0)
>>>>>>>>>>       			return ret;
>>>>>>>>>> +		if (valid_phy_id(c45_ids->device_ids[i]))
>>>>>>>>>> +			valid_id = true;
>>>>>>>>>
>>>>>>>>> Here you are using your "devices in package" validator to validate the
>>>>>>>>> PHY ID value.  One of the things it does is mask this value with
>>>>>>>>> 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
>>>>>>>>> looks completely wrong.
>>>>>>>>
>>>>>>>> I think in this case I was just using it like the comment in
>>>>>>>> get_phy_device() "if the phy_id is mostly F's, there is no device here".
>>>>>>>>
>>>>>>>> My understanding is that the code is trying to avoid the 0xFFFFFFFF returns
>>>>>>>> that seem to indicate "bus ok, phy didn't respond".
>>>>>>>>
>>>>>>>> I just checked the OUI registration, and while there are a couple OUI's
>>>>>>>> registered that have a number of FFF's in them, none of those cases seems to
>>>>>>>> overlap sufficiently to cause this to throw them out. Plus a phy would also
>>>>>>>> have to have model+revision set to 'F's. So while might be possible, if
>>>>>>>> unlikely, at the moment I think the OUI registration keeps this from being a
>>>>>>>> problem. Particularly, if i'm reading the mapping correctly, the OUI mapping
>>>>>>>> guarantees that the field cannot be all '1's due to the OUI having X & M
>>>>>>>> bits cleared. It sort of looks like the mapping is trying to lose those
>>>>>>>> bits, by tossing bit 1 & 2, but the X & M are in the wrong octet (AFAIK, I
>>>>>>>> just read it three times cause it didn't make any sense).
>>>>>>>
>>>>>>> I should also note that we have at least one supported PHY where one
>>>>>>> of the MMDs returns 0xfffe for even numbered registers and 0x0000 for
>>>>>>> odd numbered registers in one of the vendor MMDs for addresses 0
>>>>>>> through 0xefff - which has a bit set in the devices-in-package.
>>>>>>>
>>>>>>> It also returns 0x0082 for almost every register in MMD 2, but MMD 2's
>>>>>>> devices-in-package bit is clear in most of the valid MMDs, so we
>>>>>>> shouldn't touch it.
>>>>>>>
>>>>>>> These reveal the problem of randomly probing MMDs - they can return
>>>>>>> unexpected values and not be as well behaved as we would like them to
>>>>>>> be.  Using register 8 to detect presence may be beneficial, but that
>>>>>>> may also introduce problems as we haven't used that before (and we
>>>>>>> don't know whether any PHY that wrong.)  I know at least the 88x3310
>>>>>>> gets it right for all except the vendor MMDs, where the low addresses
>>>>>>> appear non-confromant to the 802.3 specs.  Both vendor MMDs are
>>>>>>> definitely implemented, just not with anything conforming to 802.3.
>>>>>>
>>>>>> Yes, we know even for the NXP reference hardware, one of the phy's doesn't
>>>>>> probe out correctly because it doesn't respond to the ieee defined
>>>>>> registers. I think at this point, there really isn't anything we can do
>>>>>> about that unless we involve the (ACPI) firmware in currently nonstandard
>>>>>> behaviors.
>>>>>>
>>>>>> So, my goals here have been to first, not break anything, and then do a
>>>>>> slightly better job finding phy's that are (mostly?) responding correctly to
>>>>>> the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
>>>>>> have IEEE conformant phy's you should be ok". So, for your example phy, I
>>>>>> guess the immediate answer is "use DT" or "find a conformant phy", or even
>>>>>> "abstract it in the firmware and use a mailbox interface".
>>>>>
>>>>> You haven't understood.  The PHY does conform for most of the MMDs,
>>>>> but there are a number that do not conform.
>>>>
>>>> Probably...
>>>>
>>>> Except that i'm not sure how that is a problem at the moment, its still
>>>> going to trigger as a found phy, and walk the same mmd list as before
>>>> requesting drivers. Those drivers haven't changed their behavior so where is
>>>> the problem? If there is a problem its in 7/11 where things are getting
>>>> kicked due to seemingly invalid Ids.
>>>>
>>>> The 1/11 devices=0 case actually appears to be a bug i'm fixing because you
>>>> won't get an ID or a MMD list from that (before or after).
>>>
>>> I think I've just flattened that argument in my immediately preceding
>>> reply on the Cortina situation; I think you've grossly misread that
>>> through not fully researching the history and then finding the
>>> existing users.
>>>
>>> There is no bug that you are fixing from what I can see.
>>
>> One of us is missing something,
>>
>> The "cortina" solution is broken in the current kernel. That is because
>> lines 726-742 are dead code due to line 693.
>>
>> I believe I've understood the problem there, and corrected it in this set
>> along with a few others, but its distinctly possible that isn't true.
> 
> The code you refer to above is NOT used on the platforms that I have
> identified use the Cortina PHY.  If this code is not used, it has not
> caused any issue, and there is no breakage due to the change you are
> referring to.
> 
Right, which is what I sort of expected. Because its falling back to a 
device list of 0xFFFFFFFF, which means probe every single MMD.

Combined with the lack of filtering means that your getting a bunch of 
MMD IDs that potentially are invalid, along with any that happen to be 
valid. Its that behavior (and some others) which were what blew this set 
up from a couple lines of tweaks into this mess.

I don't really see a way to guess at all the "wrong" ids that are being 
pushed into the system. Which is why I started to think about a "strict" 
mode later in the set. Maybe at this point the only way around some of 
these bugs/side effects/etc is just a second c45 probe routine if we 
don't think its possible to implement a variable strictness scanner in 
this code path without losing phys that previously were detected.
Jeremy Linton May 25, 2020, 11:46 p.m. UTC | #19
Hi,


On 5/25/20 6:12 PM, Andrew Lunn wrote:
>> arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
>> arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
>> arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
>> arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
>> arch/powerpc/boot/dts/fsl/t2080rdb.dts: compatible = "ethernet-phy-id13e5.1002";
>> arch/powerpc/boot/dts/fsl/t2080rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> 
> Hi Jeremy
> 
> You are doing this work for NXP right? Maybe you can ask them to go
> searching in the cellar and find you one of these boards?

Yes, thats a good idea. I've been talking to various parties to let me 
run this code on some of their "odd" devices.
Andrew Lunn May 25, 2020, 11:46 p.m. UTC | #20
> Right, which is what I sort of expected. Because its falling back to a
> device list of 0xFFFFFFFF, which means probe every single MMD.
> 
> Combined with the lack of filtering means that your getting a bunch of MMD
> IDs that potentially are invalid, along with any that happen to be valid.
> Its that behavior (and some others) which were what blew this set up from a
> couple lines of tweaks into this mess.
> 
> I don't really see a way to guess at all the "wrong" ids that are being
> pushed into the system. Which is why I started to think about a "strict"
> mode later in the set. Maybe at this point the only way around some of these
> bugs/side effects/etc is just a second c45 probe routine if we don't think
> its possible to implement a variable strictness scanner in this code path
> without losing phys that previously were detected.

Hi Jeremy

I really think we need to find one of those boards which have a
cortina and see what it actually does. Can you get in contact with NXP
and see if they can find one?

    Thanks
        Andrew
Andrew Lunn May 25, 2020, 11:47 p.m. UTC | #21
On Mon, May 25, 2020 at 06:46:10PM -0500, Jeremy Linton wrote:
> Hi,
> 
> 
> On 5/25/20 6:12 PM, Andrew Lunn wrote:
> > > arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> > > arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> > > arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> > > arch/powerpc/boot/dts/fsl/t4240rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> > > arch/powerpc/boot/dts/fsl/t2080rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> > > arch/powerpc/boot/dts/fsl/t2080rdb.dts: compatible = "ethernet-phy-id13e5.1002";
> > 
> > Hi Jeremy
> > 
> > You are doing this work for NXP right? Maybe you can ask them to go
> > searching in the cellar and find you one of these boards?
> 
> Yes, thats a good idea. I've been talking to various parties to let me run
> this code on some of their "odd" devices.

O.K. great.

Then i think we should all stop emailing for a while until we know
more.

	Andrew
Russell King (Oracle) May 25, 2020, 11:57 p.m. UTC | #22
On Mon, May 25, 2020 at 06:42:50PM -0500, Jeremy Linton wrote:
> Hi,
> 
> On 5/25/20 6:33 PM, Russell King - ARM Linux admin wrote:
> > On Mon, May 25, 2020 at 06:22:19PM -0500, Jeremy Linton wrote:
> > > On 5/25/20 6:09 PM, Russell King - ARM Linux admin wrote:
> > > > On Mon, May 25, 2020 at 05:22:07PM -0500, Jeremy Linton wrote:
> > > > > On 5/25/20 5:01 PM, Russell King - ARM Linux admin wrote:
> > > > > > On Mon, May 25, 2020 at 04:51:16PM -0500, Jeremy Linton wrote:
> > > > > > > Hi,
> > > > > > > 
> > > > > > > On 5/25/20 5:06 AM, Russell King - ARM Linux admin wrote:
> > > > > > > > On Sun, May 24, 2020 at 10:34:13PM -0500, Jeremy Linton wrote:
> > > > > > > > > Hi,
> > > > > > > > > 
> > > > > > > > > On 5/23/20 1:37 PM, Russell King - ARM Linux admin wrote:
> > > > > > > > > > On Fri, May 22, 2020 at 04:30:52PM -0500, Jeremy Linton wrote:
> > > > > > > > > > > Until this point, we have been sanitizing the c22
> > > > > > > > > > > regs presence bit out of all the MMD device lists.
> > > > > > > > > > > This is incorrect as it causes the 0xFFFFFFFF checks
> > > > > > > > > > > to incorrectly fail. Further, it turns out that we
> > > > > > > > > > > want to utilize this flag to make a determination that
> > > > > > > > > > > there is actually a phy at this location and we should
> > > > > > > > > > > be accessing it using c22.
> > > > > > > > > > > 
> > > > > > > > > > > Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
> > > > > > > > > > > ---
> > > > > > > > > > >       drivers/net/phy/phy_device.c | 16 +++++++++++++---
> > > > > > > > > > >       1 file changed, 13 insertions(+), 3 deletions(-)
> > > > > > > > > > > 
> > > > > > > > > > > diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> > > > > > > > > > > index f0761fa5e40b..2d677490ecab 100644
> > > > > > > > > > > --- a/drivers/net/phy/phy_device.c
> > > > > > > > > > > +++ b/drivers/net/phy/phy_device.c
> > > > > > > > > > > @@ -689,9 +689,6 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
> > > > > > > > > > >       		return -EIO;
> > > > > > > > > > >       	*devices_in_package |= phy_reg;
> > > > > > > > > > > -	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > > > > > > > > > -	*devices_in_package &= ~BIT(0);
> > > > > > > > > > > -
> > > > > > > > > > >       	return 0;
> > > > > > > > > > >       }
> > > > > > > > > > > @@ -742,6 +739,8 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > > > > > > > >       	int i;
> > > > > > > > > > >       	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
> > > > > > > > > > >       	u32 *devs = &c45_ids->devices_in_package;
> > > > > > > > > > > +	bool c22_present = false;
> > > > > > > > > > > +	bool valid_id = false;
> > > > > > > > > > >       	/* Find first non-zero Devices In package. Device zero is reserved
> > > > > > > > > > >       	 * for 802.3 c45 complied PHYs, so don't probe it at first.
> > > > > > > > > > > @@ -770,6 +769,10 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > > > > > > > >       		return 0;
> > > > > > > > > > >       	}
> > > > > > > > > > > +	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
> > > > > > > > > > > +	c22_present = *devs & BIT(0);
> > > > > > > > > > > +	*devs &= ~BIT(0);
> > > > > > > > > > > +
> > > > > > > > > > >       	/* Now probe Device Identifiers for each device present. */
> > > > > > > > > > >       	for (i = 1; i < num_ids; i++) {
> > > > > > > > > > >       		if (!(c45_ids->devices_in_package & (1 << i)))
> > > > > > > > > > > @@ -778,6 +781,13 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
> > > > > > > > > > >       		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
> > > > > > > > > > >       		if (ret < 0)
> > > > > > > > > > >       			return ret;
> > > > > > > > > > > +		if (valid_phy_id(c45_ids->device_ids[i]))
> > > > > > > > > > > +			valid_id = true;
> > > > > > > > > > 
> > > > > > > > > > Here you are using your "devices in package" validator to validate the
> > > > > > > > > > PHY ID value.  One of the things it does is mask this value with
> > > > > > > > > > 0x1fffffff.  That means you lose some of the vendor OUI.  To me, this
> > > > > > > > > > looks completely wrong.
> > > > > > > > > 
> > > > > > > > > I think in this case I was just using it like the comment in
> > > > > > > > > get_phy_device() "if the phy_id is mostly F's, there is no device here".
> > > > > > > > > 
> > > > > > > > > My understanding is that the code is trying to avoid the 0xFFFFFFFF returns
> > > > > > > > > that seem to indicate "bus ok, phy didn't respond".
> > > > > > > > > 
> > > > > > > > > I just checked the OUI registration, and while there are a couple OUI's
> > > > > > > > > registered that have a number of FFF's in them, none of those cases seems to
> > > > > > > > > overlap sufficiently to cause this to throw them out. Plus a phy would also
> > > > > > > > > have to have model+revision set to 'F's. So while might be possible, if
> > > > > > > > > unlikely, at the moment I think the OUI registration keeps this from being a
> > > > > > > > > problem. Particularly, if i'm reading the mapping correctly, the OUI mapping
> > > > > > > > > guarantees that the field cannot be all '1's due to the OUI having X & M
> > > > > > > > > bits cleared. It sort of looks like the mapping is trying to lose those
> > > > > > > > > bits, by tossing bit 1 & 2, but the X & M are in the wrong octet (AFAIK, I
> > > > > > > > > just read it three times cause it didn't make any sense).
> > > > > > > > 
> > > > > > > > I should also note that we have at least one supported PHY where one
> > > > > > > > of the MMDs returns 0xfffe for even numbered registers and 0x0000 for
> > > > > > > > odd numbered registers in one of the vendor MMDs for addresses 0
> > > > > > > > through 0xefff - which has a bit set in the devices-in-package.
> > > > > > > > 
> > > > > > > > It also returns 0x0082 for almost every register in MMD 2, but MMD 2's
> > > > > > > > devices-in-package bit is clear in most of the valid MMDs, so we
> > > > > > > > shouldn't touch it.
> > > > > > > > 
> > > > > > > > These reveal the problem of randomly probing MMDs - they can return
> > > > > > > > unexpected values and not be as well behaved as we would like them to
> > > > > > > > be.  Using register 8 to detect presence may be beneficial, but that
> > > > > > > > may also introduce problems as we haven't used that before (and we
> > > > > > > > don't know whether any PHY that wrong.)  I know at least the 88x3310
> > > > > > > > gets it right for all except the vendor MMDs, where the low addresses
> > > > > > > > appear non-confromant to the 802.3 specs.  Both vendor MMDs are
> > > > > > > > definitely implemented, just not with anything conforming to 802.3.
> > > > > > > 
> > > > > > > Yes, we know even for the NXP reference hardware, one of the phy's doesn't
> > > > > > > probe out correctly because it doesn't respond to the ieee defined
> > > > > > > registers. I think at this point, there really isn't anything we can do
> > > > > > > about that unless we involve the (ACPI) firmware in currently nonstandard
> > > > > > > behaviors.
> > > > > > > 
> > > > > > > So, my goals here have been to first, not break anything, and then do a
> > > > > > > slightly better job finding phy's that are (mostly?) responding correctly to
> > > > > > > the 802.3 spec. So we can say "if you hardware is ACPI conformant, and you
> > > > > > > have IEEE conformant phy's you should be ok". So, for your example phy, I
> > > > > > > guess the immediate answer is "use DT" or "find a conformant phy", or even
> > > > > > > "abstract it in the firmware and use a mailbox interface".
> > > > > > 
> > > > > > You haven't understood.  The PHY does conform for most of the MMDs,
> > > > > > but there are a number that do not conform.
> > > > > 
> > > > > Probably...
> > > > > 
> > > > > Except that i'm not sure how that is a problem at the moment, its still
> > > > > going to trigger as a found phy, and walk the same mmd list as before
> > > > > requesting drivers. Those drivers haven't changed their behavior so where is
> > > > > the problem? If there is a problem its in 7/11 where things are getting
> > > > > kicked due to seemingly invalid Ids.
> > > > > 
> > > > > The 1/11 devices=0 case actually appears to be a bug i'm fixing because you
> > > > > won't get an ID or a MMD list from that (before or after).
> > > > 
> > > > I think I've just flattened that argument in my immediately preceding
> > > > reply on the Cortina situation; I think you've grossly misread that
> > > > through not fully researching the history and then finding the
> > > > existing users.
> > > > 
> > > > There is no bug that you are fixing from what I can see.
> > > 
> > > One of us is missing something,
> > > 
> > > The "cortina" solution is broken in the current kernel. That is because
> > > lines 726-742 are dead code due to line 693.
> > > 
> > > I believe I've understood the problem there, and corrected it in this set
> > > along with a few others, but its distinctly possible that isn't true.
> > 
> > The code you refer to above is NOT used on the platforms that I have
> > identified use the Cortina PHY.  If this code is not used, it has not
> > caused any issue, and there is no breakage due to the change you are
> > referring to.
> > 
> Right, which is what I sort of expected. Because its falling back to a
> device list of 0xFFFFFFFF, which means probe every single MMD.

No.  no no no no no.

In the platforms that I have identified, the Cortina PHY will be
created by the DT code (drivers/of/of_mdio.c).  The PHY device
will be created by phy_device_create() with is_c45 *false*.
phydev->c45_ids will actually end up containing all-zeros.
So, there is no list of MMDs in this case.

The phy_device_create() path does not call get_phy_c45_ids().
This code is not run for any of the platforms I've identified
for a Cortina PHY.

The workaround for the devices-in-package was added back in
2015, two years _before_ there was a Cortina PHY driver, when
the phylib support for Clause 45 PHYs was in its infancy - there
was something really basic that didn't care what ID the PHY was,
just assumed it was a 10G PHY, no configuration of it, and just
reported link up/down.  So, back then IDs were mostly meaningless
for Clause 45 PHYs.

In 2017, a Cortina PHY driver was added, and we now have some
platforms that use this, and they _totally_ avoid this code path.

The workaround is likely obsolete and redundant, but we've no way
to know if removing it will create a regression.

In any case, for the reasons I've already clearly set out in one of
my previous emails analysing the Cortina situation (which you seem
to have ignored), pointing out that even the MMD 0 register set is
not compatible and would likely not reveal a valid ID, I think it's
highly likely that Cortina PHYs did not work for very long between
2015 and 2017, but continue to work fine today.

Please, rather than immediately writing yet another email trying to
"clarify" something in reply to this email, please go away and think
about some of the points I've raised, read the code as it stands,
not only in phylib but also in drivers/of.  Look at the device tree
descriptions for the boards I've pointed out.  Analyse what the code
would do.  It will help you immensely to have that understanding,
and I'm sure you will come to the same conclusion I have that the
workaround we see in get_phy_c45_ids() is likely obsolete.
diff mbox series

Patch

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index f0761fa5e40b..2d677490ecab 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -689,9 +689,6 @@  static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
 		return -EIO;
 	*devices_in_package |= phy_reg;
 
-	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
-	*devices_in_package &= ~BIT(0);
-
 	return 0;
 }
 
@@ -742,6 +739,8 @@  static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
 	int i;
 	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
 	u32 *devs = &c45_ids->devices_in_package;
+	bool c22_present = false;
+	bool valid_id = false;
 
 	/* Find first non-zero Devices In package. Device zero is reserved
 	 * for 802.3 c45 complied PHYs, so don't probe it at first.
@@ -770,6 +769,10 @@  static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
 		return 0;
 	}
 
+	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
+	c22_present = *devs & BIT(0);
+	*devs &= ~BIT(0);
+
 	/* Now probe Device Identifiers for each device present. */
 	for (i = 1; i < num_ids; i++) {
 		if (!(c45_ids->devices_in_package & (1 << i)))
@@ -778,6 +781,13 @@  static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
 		ret = _get_phy_id(bus, addr, i, &c45_ids->device_ids[i], true);
 		if (ret < 0)
 			return ret;
+		if (valid_phy_id(c45_ids->device_ids[i]))
+			valid_id = true;
+	}
+
+	if (!valid_id && c22_present) {
+		*phy_id = 0xffffffff;
+	        return 0;
 	}
 	*phy_id = 0;
 	return 0;