diff mbox series

[net-next,v2,09/11] net: dsa: microchip: ksz9477: add hardware time stamping support

Message ID 20201112153537.22383-10-ceggers@arri.de
State Superseded
Headers show
Series net: dsa: microchip: PTP support for KSZ956x | expand

Commit Message

Christian Eggers Nov. 12, 2020, 3:35 p.m. UTC
Add routines required for TX hardware time stamping.

The KSZ9563 only supports one step time stamping
(HWTSTAMP_TX_ONESTEP_P2P), which requires linuxptp-2.0 or later. PTP
mode is permanently enabled (changes tail tag; depends on
CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP).TX time stamps are reported via an
interrupt / device registers whilst RX time stamps are reported via an
additional tail tag.

Currently, only P2P delay measurement is supported. See patchwork
discussion and comments in ksz9477_ptp_init() for details:
https://patchwork.ozlabs.org/project/netdev/patch/20201019172435.4416-8-ceggers@arri.de/

One step TX time stamping of PDelay_Resp requires the RX time stamp from
the associated PDelay_Req message. linuxptp assumes that the RX time
stamp has already been subtracted from the PDelay_Req correction field
(as done by the TI PHYTER). linuxptp will echo back the value of the
correction field in the PDelay_Resp message.

In order to be compatible to this already established interface, the
KSZ9563 code emulates this behavior. When processing the PDelay_Resp
message, the time stamp is moved back from the correction field to the
tail tag, as the hardware doesn't support negative values on this field.
Of course, the UDP checksums (if any) have to be corrected after this
(for both directions).

Everything has been tested on a Microchip KSZ9563 switch.

Signed-off-by: Christian Eggers <ceggers@arri.de>
---
 drivers/net/dsa/microchip/ksz9477_main.c |   9 +-
 drivers/net/dsa/microchip/ksz9477_ptp.c  | 544 +++++++++++++++++++++++
 drivers/net/dsa/microchip/ksz9477_ptp.h  |  27 ++
 include/linux/dsa/ksz_common.h           |  47 ++
 net/dsa/tag_ksz.c                        | 121 ++++-
 5 files changed, 740 insertions(+), 8 deletions(-)

Comments

Vladimir Oltean Nov. 13, 2020, 2:40 a.m. UTC | #1
On Thu, Nov 12, 2020 at 04:35:35PM +0100, Christian Eggers wrote:
> Add routines required for TX hardware time stamping.
>
> The KSZ9563 only supports one step time stamping
> (HWTSTAMP_TX_ONESTEP_P2P), which requires linuxptp-2.0 or later. PTP
> mode is permanently enabled (changes tail tag; depends on
> CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP).TX time stamps are reported via an
> interrupt / device registers whilst RX time stamps are reported via an
> additional tail tag.
>
> Currently, only P2P delay measurement is supported. See patchwork
> discussion and comments in ksz9477_ptp_init() for details:
> https://patchwork.ozlabs.org/project/netdev/patch/20201019172435.4416-8-ceggers@arri.de/
>
> One step TX time stamping of PDelay_Resp requires the RX time stamp from
> the associated PDelay_Req message. linuxptp assumes that the RX time
> stamp has already been subtracted from the PDelay_Req correction field
> (as done by the TI PHYTER). linuxptp will echo back the value of the

No, not TI PHYTER, this was discussed before, but the ZHAW InES PTP time
stamping IP core.

> correction field in the PDelay_Resp message.

Misleading. Not just linuxptp will do that. Any PTP stack must do that.

> In order to be compatible to this already established interface, the
> KSZ9563 code emulates this behavior. When processing the PDelay_Resp
> message, the time stamp is moved back from the correction field to the
> tail tag, as the hardware doesn't support negative values on this field.

Old comment.

> Of course, the UDP checksums (if any) have to be corrected after this
> (for both directions).
>
> Everything has been tested on a Microchip KSZ9563 switch.
>
> Signed-off-by: Christian Eggers <ceggers@arri.de>
> ---
>  drivers/net/dsa/microchip/ksz9477_main.c |   9 +-
>  drivers/net/dsa/microchip/ksz9477_ptp.c  | 544 +++++++++++++++++++++++
>  drivers/net/dsa/microchip/ksz9477_ptp.h  |  27 ++
>  include/linux/dsa/ksz_common.h           |  47 ++
>  net/dsa/tag_ksz.c                        | 121 ++++-
>  5 files changed, 740 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/dsa/microchip/ksz9477_main.c b/drivers/net/dsa/microchip/ksz9477_main.c
> index 7d623400139f..42cd17c8c25d 100644
> --- a/drivers/net/dsa/microchip/ksz9477_main.c
> +++ b/drivers/net/dsa/microchip/ksz9477_main.c
> @@ -1388,6 +1388,7 @@ static const struct dsa_switch_ops ksz9477_switch_ops = {
>  	.phy_read		= ksz9477_phy_read16,
>  	.phy_write		= ksz9477_phy_write16,
>  	.phylink_mac_link_down	= ksz_mac_link_down,
> +	.get_ts_info		= ksz9477_ptp_get_ts_info,
>  	.port_enable		= ksz_enable_port,
>  	.get_strings		= ksz9477_get_strings,
>  	.get_ethtool_stats	= ksz_get_ethtool_stats,
> @@ -1408,6 +1409,11 @@ static const struct dsa_switch_ops ksz9477_switch_ops = {
>  	.port_mdb_del           = ksz9477_port_mdb_del,
>  	.port_mirror_add	= ksz9477_port_mirror_add,
>  	.port_mirror_del	= ksz9477_port_mirror_del,
> +	.port_hwtstamp_get      = ksz9477_ptp_port_hwtstamp_get,
> +	.port_hwtstamp_set      = ksz9477_ptp_port_hwtstamp_set,
> +	.port_txtstamp          = ksz9477_ptp_port_txtstamp,
> +	/* never defer rx delivery, tstamping is done via tail tagging */
> +	.port_rxtstamp          = NULL,
>  };
>
>  static u32 ksz9477_get_port_addr(int port, int offset)
> @@ -1554,7 +1560,8 @@ static irqreturn_t ksz9477_switch_irq_thread(int irq, void *dev_id)
>  			if (ret)
>  				return result;
>
> -			/* ToDo: Add specific handling of port interrupts */
> +			if (data8 & PORT_PTP_INT)
> +				result |= ksz9477_ptp_port_interrupt(dev, port);

No... why are you making assumptions about the value of IRQ_NONE?

>  		}
>  	}
>
> diff --git a/drivers/net/dsa/microchip/ksz9477_ptp.c b/drivers/net/dsa/microchip/ksz9477_ptp.c
> index 44d7bbdea518..12698568b68b 100644
> --- a/drivers/net/dsa/microchip/ksz9477_ptp.c
> +++ b/drivers/net/dsa/microchip/ksz9477_ptp.c
> @@ -6,7 +6,10 @@
>   * Copyright (c) 2020 ARRI Lighting
>   */
>
> +#include <linux/net_tstamp.h>
> +#include <linux/ptp_classify.h>
>  #include <linux/ptp_clock_kernel.h>
> +#include <linux/sysfs.h>
>
>  #include "ksz_common.h"
>  #include "ksz9477_reg.h"
> @@ -71,8 +74,10 @@ static int ksz9477_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
>  static int ksz9477_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
>  {
>  	struct ksz_device *dev = container_of(ptp, struct ksz_device, ptp_caps);
> +	struct timespec64 delta64 = ns_to_timespec64(delta);
>  	s32 sec, nsec;
>  	u16 data16;
> +	unsigned long flags;

Reverse Christmas tree variable declaration, please.

>  	int ret;
>
>  	mutex_lock(&dev->ptp_mutex);
> @@ -103,6 +108,10 @@ static int ksz9477_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
>  	if (ret)
>  		goto error_return;
>
> +	spin_lock_irqsave(&dev->ptp_clock_lock, flags);

I believe that spin_lock_irqsave is unnecessary, since there is no code
that runs in hardirq context.

> +	dev->ptp_clock_time = timespec64_add(dev->ptp_clock_time, delta64);
> +	spin_unlock_irqrestore(&dev->ptp_clock_lock, flags);
> +
>  error_return:
>  	mutex_unlock(&dev->ptp_mutex);
>  	return ret;
> @@ -160,6 +169,7 @@ static int ksz9477_ptp_settime(struct ptp_clock_info *ptp, struct timespec64 con
>  {
>  	struct ksz_device *dev = container_of(ptp, struct ksz_device, ptp_caps);
>  	u16 data16;
> +	unsigned long flags;
>  	int ret;
>
>  	mutex_lock(&dev->ptp_mutex);
> @@ -198,6 +208,10 @@ static int ksz9477_ptp_settime(struct ptp_clock_info *ptp, struct timespec64 con
>  	if (ret)
>  		goto error_return;
>
> +	spin_lock_irqsave(&dev->ptp_clock_lock, flags);
> +	dev->ptp_clock_time = *ts;
> +	spin_unlock_irqrestore(&dev->ptp_clock_lock, flags);
> +
>  error_return:
>  	mutex_unlock(&dev->ptp_mutex);
>  	return ret;
> @@ -208,9 +222,27 @@ static int ksz9477_ptp_enable(struct ptp_clock_info *ptp, struct ptp_clock_reque
>  	return -ENOTTY;
>  }
>
> +static long ksz9477_ptp_do_aux_work(struct ptp_clock_info *ptp)
> +{
> +	struct ksz_device *dev = container_of(ptp, struct ksz_device, ptp_caps);
> +	struct timespec64 ts;
> +	unsigned long flags;
> +
> +	mutex_lock(&dev->ptp_mutex);
> +	_ksz9477_ptp_gettime(dev, &ts);
> +	mutex_unlock(&dev->ptp_mutex);
> +
> +	spin_lock_irqsave(&dev->ptp_clock_lock, flags);
> +	dev->ptp_clock_time = ts;
> +	spin_unlock_irqrestore(&dev->ptp_clock_lock, flags);
> +
> +	return HZ;  /* reschedule in 1 second */
> +}
> +
>  static int ksz9477_ptp_start_clock(struct ksz_device *dev)
>  {
>  	u16 data;
> +	unsigned long flags;
>  	int ret;
>
>  	ret = ksz_read16(dev, REG_PTP_CLK_CTRL, &data);
> @@ -230,6 +262,11 @@ static int ksz9477_ptp_start_clock(struct ksz_device *dev)
>  	if (ret)
>  		return ret;
>
> +	spin_lock_irqsave(&dev->ptp_clock_lock, flags);
> +	dev->ptp_clock_time.tv_sec = 0;
> +	dev->ptp_clock_time.tv_nsec = 0;
> +	spin_unlock_irqrestore(&dev->ptp_clock_lock, flags);
> +
>  	return 0;
>  }
>
> @@ -251,11 +288,249 @@ static int ksz9477_ptp_stop_clock(struct ksz_device *dev)
>  	return 0;
>  }
>
> +/* Time stamping support */
> +
> +static int ksz9477_ptp_enable_mode(struct ksz_device *dev)
> +{
> +	u16 data;
> +	int ret;
> +
> +	ret = ksz_read16(dev, REG_PTP_MSG_CONF1, &data);
> +	if (ret)
> +		return ret;
> +
> +	/* Enable PTP mode */
> +	data |= PTP_ENABLE;
> +	ret = ksz_write16(dev, REG_PTP_MSG_CONF1, data);
> +	if (ret)
> +		return ret;
> +
> +	return 0;

	return ksz_write16(dev, REG_PTP_MSG_CONF1, data);

> +}
> +
> +static int ksz9477_ptp_disable_mode(struct ksz_device *dev)
> +{
> +	u16 data;
> +	int ret;
> +
> +	ret = ksz_read16(dev, REG_PTP_MSG_CONF1, &data);
> +	if (ret)
> +		return ret;
> +
> +	/* Disable PTP mode */
> +	data &= ~PTP_ENABLE;
> +	ret = ksz_write16(dev, REG_PTP_MSG_CONF1, data);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}

Please merge this with ksz9477_ptp_enable_mode.

> +
> +static int ksz9477_ptp_enable_port_ptp_interrupts(struct ksz_device *dev, int port)
> +{
> +	u32 addr = PORT_CTRL_ADDR(port, REG_PORT_INT_MASK);
> +	u8 data;
> +	int ret;
> +
> +	ret = ksz_read8(dev, addr, &data);
> +	if (ret)
> +		return ret;
> +
> +	/* Enable port PTP interrupt (0 means enabled) */
> +	data &= ~PORT_PTP_INT;
> +	ret = ksz_write8(dev, addr, data);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +
> +static int ksz9477_ptp_disable_port_ptp_interrupts(struct ksz_device *dev, int port)
> +{
> +	u32 addr = PORT_CTRL_ADDR(port, REG_PORT_INT_MASK);
> +	u8 data;
> +	int ret;
> +
> +	ret = ksz_read8(dev, addr, &data);
> +	if (ret)
> +		return ret;
> +
> +	/* Enable port PTP interrupt (1 means disabled) */
> +	data |= PORT_PTP_INT;
> +	ret = ksz_write8(dev, addr, data);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}

Same comments as above.

> +
> +static int ksz9477_ptp_enable_port_egress_interrupts(struct ksz_device *dev, int port)

Could we make this line shorter?

> +{
> +	u32 addr = PORT_CTRL_ADDR(port, REG_PTP_PORT_TX_INT_ENABLE__2);
> +	u16 data;
> +	int ret;
> +
> +	ret = ksz_read16(dev, addr, &data);
> +	if (ret)
> +		return ret;
> +
> +	/* Enable port xdelay egress timestamp interrupt (1 means enabled) */
> +	data |= PTP_PORT_XDELAY_REQ_INT;
> +	ret = ksz_write16(dev, addr, data);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +
> +static int ksz9477_ptp_disable_port_egress_interrupts(struct ksz_device *dev, int port)
> +{
> +	u32 addr = PORT_CTRL_ADDR(port, REG_PTP_PORT_TX_INT_ENABLE__2);
> +	u16 data;
> +	int ret;
> +
> +	ret = ksz_read16(dev, addr, &data);
> +	if (ret)
> +		return ret;
> +
> +	/* Disable port xdelay egress timestamp interrupts (0 means disabled) */
> +	data &= PTP_PORT_XDELAY_REQ_INT;
> +	ret = ksz_write16(dev, addr, data);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}

Don't you get tired of copy-pasting code? :)

> +
> +static int ksz9477_ptp_port_init(struct ksz_device *dev, int port)
> +{
> +	struct ksz_port *prt = &dev->ports[port];
> +	int ret;
> +
> +	/* Read rx and tx delay from port registers */
> +	ret = ksz_read16(dev, PORT_CTRL_ADDR(port, REG_PTP_PORT_RX_DELAY__2),
> +			 &prt->tstamp_rx_latency_ns);
> +	if (ret)
> +		return ret;
> +
> +	ret = ksz_read16(dev, PORT_CTRL_ADDR(port, REG_PTP_PORT_TX_DELAY__2),
> +			 &prt->tstamp_tx_latency_ns);
> +	if (ret)
> +		return ret;
> +
> +	if (port != dev->cpu_port) {

Just return early for the CPU port.
	if (port == dev->cpu_port)
		return 0;

> +		ret = ksz9477_ptp_enable_port_ptp_interrupts(dev, port);
> +		if (ret)
> +			return ret;
> +
> +		ret = ksz9477_ptp_enable_port_egress_interrupts(dev, port);
> +		if (ret)
> +			goto error_disable_port_ptp_interrupts;
> +	}
> +
> +	return 0;
> +
> +error_disable_port_ptp_interrupts:
> +	if (port != dev->cpu_port)
> +		ksz9477_ptp_disable_port_ptp_interrupts(dev, port);

The "if" condition here is completely unnecessary since the only goto
that can reach this code has already satisfied that condition.

> +	return ret;
> +}
> +
> +static void ksz9477_ptp_port_deinit(struct ksz_device *dev, int port)
> +{
> +	if (port != dev->cpu_port) {

Return early?

> +		ksz9477_ptp_disable_port_egress_interrupts(dev, port);
> +		ksz9477_ptp_disable_port_ptp_interrupts(dev, port);
> +	}
> +}
> +
> +static int ksz9477_ptp_ports_init(struct ksz_device *dev)
> +{
> +	int port;
> +	int ret;
> +
> +	for (port = 0; port < dev->port_cnt; port++) {
> +		ret = ksz9477_ptp_port_init(dev, port);
> +		if (ret)
> +			goto error_deinit;
> +	}
> +
> +	return 0;
> +
> +error_deinit:
> +	for (--port; port >= 0; --port)

Is that idiom runtime-tested? If not, or you're unsure if it works or
not, you can use:
	while (port-- > 0)

> +		ksz9477_ptp_port_deinit(dev, port);
> +	return ret;
> +}
> +
> +static void ksz9477_ptp_ports_deinit(struct ksz_device *dev)
> +{
> +	int port;
> +
> +	for (port = dev->port_cnt - 1; port >= 0; --port)

Nice, but also probably not worth the effort?

> +		ksz9477_ptp_port_deinit(dev, port);
> +}
> +
> +/* device attributes */
> +
> +enum ksz9477_ptp_tcmode {
> +	KSZ9477_PTP_TCMODE_E2E,
> +	KSZ9477_PTP_TCMODE_P2P,
> +};
> +
> +static int ksz9477_ptp_tcmode_set(struct ksz_device *dev, enum ksz9477_ptp_tcmode tcmode)
> +{
> +	u16 data;
> +	int ret;
> +
> +	ret = ksz_read16(dev, REG_PTP_MSG_CONF1, &data);
> +	if (ret)
> +		return ret;
> +
> +	if (tcmode == KSZ9477_PTP_TCMODE_P2P)
> +		data |= PTP_TC_P2P;
> +	else
> +		data &= ~PTP_TC_P2P;
> +
> +	ret = ksz_write16(dev, REG_PTP_MSG_CONF1, data);
> +	if (ret)
> +		return ret;
> +
> +	return 0;

	return ksz_write16(dev, REG_PTP_MSG_CONF1, data);

> +}
> +
> +enum ksz9477_ptp_ocmode {
> +	KSZ9477_PTP_OCMODE_SLAVE,
> +	KSZ9477_PTP_OCMODE_MASTER,
> +};
> +
> +static int ksz9477_ptp_ocmode_set(struct ksz_device *dev, enum ksz9477_ptp_ocmode ocmode)
> +{
> +	u16 data;
> +	int ret;
> +
> +	ret = ksz_read16(dev, REG_PTP_MSG_CONF1, &data);
> +	if (ret)
> +		return ret;
> +
> +	if (ocmode == KSZ9477_PTP_OCMODE_MASTER)
> +		data |= PTP_MASTER;
> +	else
> +		data &= ~PTP_MASTER;
> +
> +	ret = ksz_write16(dev, REG_PTP_MSG_CONF1, data);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +
>  int ksz9477_ptp_init(struct ksz_device *dev)
>  {
>  	int ret;
>
>  	mutex_init(&dev->ptp_mutex);
> +	spin_lock_init(&dev->ptp_clock_lock);
>
>  	/* PTP clock properties */
>
> @@ -275,6 +550,7 @@ int ksz9477_ptp_init(struct ksz_device *dev)
>  	dev->ptp_caps.gettime64   = ksz9477_ptp_gettime;
>  	dev->ptp_caps.settime64   = ksz9477_ptp_settime;
>  	dev->ptp_caps.enable      = ksz9477_ptp_enable;
> +	dev->ptp_caps.do_aux_work = ksz9477_ptp_do_aux_work;
>
>  	/* Start hardware counter (will overflow after 136 years) */
>  	ret = ksz9477_ptp_start_clock(dev);
> @@ -287,8 +563,44 @@ int ksz9477_ptp_init(struct ksz_device *dev)
>  		goto error_stop_clock;
>  	}
>
> +	/* Enable PTP mode (will affect tail tagging format) */
> +	ret = ksz9477_ptp_enable_mode(dev);
> +	if (ret)
> +		goto error_unregister_clock;
> +
> +	/* Init switch ports */
> +	ret = ksz9477_ptp_ports_init(dev);
> +	if (ret)
> +		goto error_disable_mode;
> +
> +	/* Currently, only P2P delay measurement is supported.  Setting ocmode to
> +	 * slave will work independently of actually being master or slave.
> +	 * For E2E delay measurement, switching between master and slave would
> +	 * be required, as the KSZ devices filters out PTP messages depending on
> +	 * the ocmode setting:
> +	 * - in slave mode, DelayReq messages are filtered out
> +	 * - in master mode, Sync messages are filtered out
> +	 * Currently (and probably also in future) there is no interface in the
> +	 * kernel which allows switching between master and slave mode.  For this
> +	 * reason, E2E cannot be supported. See patchwork for full discussion:
> +	 * https://patchwork.ozlabs.org/project/netdev/patch/20201019172435.4416-8-ceggers@arri.de/
> +	 */
> +	ksz9477_ptp_tcmode_set(dev, KSZ9477_PTP_TCMODE_P2P);
> +	ksz9477_ptp_ocmode_set(dev, KSZ9477_PTP_OCMODE_SLAVE);
> +
> +	/* Schedule cyclic call of ksz_ptp_do_aux_work() */
> +	ret = ptp_schedule_worker(dev->ptp_clock, 0);
> +	if (ret)
> +		goto error_ports_deinit;
> +
>  	return 0;
>
> +error_ports_deinit:
> +	ksz9477_ptp_ports_deinit(dev);
> +error_disable_mode:
> +	ksz9477_ptp_disable_mode(dev);
> +error_unregister_clock:
> +	ptp_clock_unregister(dev->ptp_clock);
>  error_stop_clock:
>  	ksz9477_ptp_stop_clock(dev);
>  	return ret;
> @@ -296,6 +608,238 @@ int ksz9477_ptp_init(struct ksz_device *dev)
>
>  void ksz9477_ptp_deinit(struct ksz_device *dev)
>  {
> +	ksz9477_ptp_ports_deinit(dev);
> +	ksz9477_ptp_disable_mode(dev);
>  	ptp_clock_unregister(dev->ptp_clock);
>  	ksz9477_ptp_stop_clock(dev);
>  }
> +
> +irqreturn_t ksz9477_ptp_port_interrupt(struct ksz_device *dev, int port)
> +{
> +	u32 addr = PORT_CTRL_ADDR(port, REG_PTP_PORT_TX_INT_STATUS__2);
> +	struct ksz_port *prt = &dev->ports[port];
> +	irqreturn_t result = IRQ_NONE;
> +	u16 data;
> +	int ret;
> +
> +	ret = ksz_read16(dev, addr, &data);
> +	if (ret)
> +		return IRQ_NONE;
> +
> +	if ((data & PTP_PORT_XDELAY_REQ_INT) && prt->tstamp_tx_xdelay_skb) {

It's almost as if you just want the "if" condition for the IRQ being
asserted, and WARN_ON(!prt->tstamp_tx_xdelay_skb). Otherwise, it would
mean that you received a TX timestamp interrupt for a packet that the
kernel has no clue of whatsoever, and you just keep chugging along.
Of course, you would still need to take care that you don't access a
NULL pointer, and you would still need to clear the IRQ, as you do now.

> +		/* Timestamp for Pdelay_Req / Delay_Req */
> +		u32 tstamp_raw;
> +		ktime_t tstamp;
> +		struct skb_shared_hwtstamps shhwtstamps;
> +		struct sk_buff *tmp_skb;

Reverse Christmas notation.

> +
> +		/* In contrast to the KSZ9563R data sheet, the format of the
> +		 * port time stamp registers is also 2 bit seconds + 30 bit
> +		 * nanoseconds (same as in the tail tags).
> +		 */
> +		ret = ksz_read32(dev, PORT_CTRL_ADDR(port, REG_PTP_PORT_XDELAY_TS), &tstamp_raw);
> +		if (ret)
> +			return result;

No reason whatsoever to keep IRQ_NONE in a variable.

> +
> +		tstamp = ksz9477_decode_tstamp(tstamp_raw, prt->tstamp_tx_latency_ns);
> +		memset(&shhwtstamps, 0, sizeof(shhwtstamps));
> +		shhwtstamps.hwtstamp = ksz9477_tstamp_to_clock(dev, tstamp);
> +
> +		/* skb_complete_tx_timestamp() will free up the client to make
> +		 * another timestamp-able transmit. We have to be ready for it
> +		 * -- by clearing the ps->tx_skb "flag" -- beforehand.
> +		 */
> +
> +		tmp_skb = prt->tstamp_tx_xdelay_skb;
> +		prt->tstamp_tx_xdelay_skb = NULL;
> +		clear_bit_unlock(KSZ_HWTSTAMP_TX_XDELAY_IN_PROGRESS, &prt->tstamp_state);
> +		skb_complete_tx_timestamp(tmp_skb, &shhwtstamps);
> +	}
> +
> +	/* Clear interrupt(s) (W1C) */
> +	ret = ksz_write16(dev, addr, data);
> +	if (ret)
> +		return IRQ_NONE;
> +
> +	return IRQ_HANDLED;
> +}
> +
> +/* DSA PTP operations */
> +
> +int ksz9477_ptp_get_ts_info(struct dsa_switch *ds, int port __always_unused,
> +			    struct ethtool_ts_info *ts)
> +{
> +	struct ksz_device *dev = ds->priv;
> +
> +	ts->so_timestamping =
> +			SOF_TIMESTAMPING_TX_HARDWARE |
> +			SOF_TIMESTAMPING_RX_HARDWARE |
> +			SOF_TIMESTAMPING_RAW_HARDWARE;
> +
> +	ts->phc_index = ptp_clock_index(dev->ptp_clock);
> +
> +	ts->tx_types =

Odd that you didn't continue on the same line? (same above)

> +			BIT(HWTSTAMP_TX_OFF) |
> +			BIT(HWTSTAMP_TX_ONESTEP_P2P);
> +
> +	ts->rx_filters =
> +			BIT(HWTSTAMP_FILTER_NONE) |
> +			BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT) |
> +			BIT(HWTSTAMP_FILTER_PTP_V2_L2_EVENT) |
> +			BIT(HWTSTAMP_FILTER_PTP_V2_EVENT);
> +
> +	return 0;
> +}
> +
> +static int ksz9477_set_hwtstamp_config(struct ksz_device *dev, int port,
> +				       struct hwtstamp_config *config)
> +{
> +	struct ksz_port *prt = &dev->ports[port];
> +	bool tstamp_enable = false;
> +
> +	/* Prevent the TX/RX paths from trying to interact with the
> +	 * timestamp hardware while we reconfigure it.
> +	 */
> +	clear_bit_unlock(KSZ_HWTSTAMP_ENABLED, &prt->tstamp_state);
> +
> +	/* reserved for future extensions */
> +	if (config->flags)
> +		return -EINVAL;
> +
> +	switch (config->tx_type) {
> +	case HWTSTAMP_TX_OFF:
> +		tstamp_enable = false;
> +		break;
> +	case HWTSTAMP_TX_ONESTEP_P2P:
> +		tstamp_enable = true;
> +		break;
> +	default:
> +		return -ERANGE;
> +	}
> +
> +	switch (config->rx_filter) {
> +	case HWTSTAMP_FILTER_NONE:
> +		tstamp_enable = false;
> +		break;
> +	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
> +	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
> +	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
> +		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;  /* UDPv4/UDPv6 */

I don't think the comments are really necessary. Additionally, you
exceed the 80 characters limit.

> +		break;
> +	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
> +	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
> +	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:

I don't think you want to accept DELAY_REQ though? These rx_filters are
a bit arbitrary to begin with, and you might be one of the first to
differentiate here, but you really can't timestamp delay request/response.

> +		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;  /* 802.3 ether */
> +		break;
> +	case HWTSTAMP_FILTER_PTP_V2_EVENT:
> +	case HWTSTAMP_FILTER_PTP_V2_SYNC:
> +	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
> +		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;     /* UDP / 802.3 */
> +		break;
> +	case HWTSTAMP_FILTER_ALL:
> +	default:
> +		config->rx_filter = HWTSTAMP_FILTER_NONE;
> +		return -ERANGE;
> +	}
> +
> +	/* Once hardware has been configured, enable timestamp checks
> +	 * in the RX/TX paths.
> +	 */
> +	if (tstamp_enable)
> +		set_bit(KSZ_HWTSTAMP_ENABLED, &prt->tstamp_state);
> +
> +	return 0;
> +}
> +
> +int ksz9477_ptp_port_hwtstamp_get(struct dsa_switch *ds, int port, struct ifreq *ifr)
> +{
> +	struct ksz_device *dev = ds->priv;
> +	struct hwtstamp_config *port_tstamp_config = &dev->ports[port].tstamp_config;
> +
> +	return copy_to_user(ifr->ifr_data,
> +			    port_tstamp_config, sizeof(*port_tstamp_config)) ? -EFAULT : 0;
> +}
> +
> +int ksz9477_ptp_port_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr)
> +{
> +	struct ksz_device *dev = ds->priv;
> +	struct hwtstamp_config *port_tstamp_config = &dev->ports[port].tstamp_config;
> +	struct hwtstamp_config config;
> +	int err;
> +
> +	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
> +		return -EFAULT;
> +
> +	err = ksz9477_set_hwtstamp_config(dev, port, &config);
> +	if (err)
> +		return err;
> +
> +	/* Save the chosen configuration to be returned later. */
> +	memcpy(port_tstamp_config, &config, sizeof(config));
> +
> +	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? -EFAULT : 0;
> +}
> +
> +/* Returns a pointer to the PTP header if the caller should time stamp,
> + * or NULL if the caller should not.
> + */
> +static struct ptp_header *ksz9477_ptp_should_tstamp(struct ksz_port *port, struct sk_buff *skb,
> +						    unsigned int type)

The fact that others have a function called "should_tstamp" which
returns a pointer to a struct ptp_header doesn't mean that this is a
good model to copy. Also, you exceeded 80 characters by quite a bit.

> +{
> +	enum ksz9477_ptp_event_messages msg_type;
> +	struct ptp_header *hdr;
> +
> +	if (!test_bit(KSZ_HWTSTAMP_ENABLED, &port->tstamp_state))
> +		return NULL;
> +
> +	hdr = ptp_parse_header(skb, type);
> +	if (!hdr)
> +		return NULL;
> +
> +	msg_type = ptp_get_msgtype(hdr, type);
> +
> +	switch (msg_type) {
> +	/* As the KSZ9563 always performs one step time stamping, only the time
> +	 * stamp for Pdelay_Req is reported to the application via socket error
> +	 * queue.  Time stamps for Sync and Pdelay_resp will be applied directly
> +	 * to the outgoing message (e.g. correction field), but will NOT be
> +	 * reported to the socket.
> +	 * Delay_Req is not time stamped as E2E is currently not supported by
> +	 * this driver.  See ksz9477_ptp_init() for details.
> +	 */
> +	case PTP_Event_Message_Pdelay_Req:
> +		break;
> +	default:
> +		return NULL;
> +	}
> +
> +	return hdr;
> +}
> +
> +bool ksz9477_ptp_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *clone,
> +			       unsigned int type)
> +{
> +	struct ksz_device *dev = ds->priv;
> +	struct ksz_port *prt = &dev->ports[port];
> +	struct ptp_header *hdr;
> +
> +	/* KSZ9563 supports PTPv2 only */
> +	if (!(type & PTP_CLASS_V2))
> +		return false;

It should be sufficient that you specified this in the
config->rx_filters from ksz9477_set_hwtstamp_config? I'm not even sure
who uses PTP v1 anyway.

> +
> +	/* Should already been tested in dsa_skb_tx_timestamp()? */
> +	if (!(skb_shinfo(clone)->tx_flags & SKBTX_HW_TSTAMP))
> +		return false;

Yeah, should have...
What do you think about this one though:
https://lore.kernel.org/netdev/20201104015834.mcn2eoibxf6j3ksw@skbuf/

> +
> +	hdr = ksz9477_ptp_should_tstamp(prt, clone, type);
> +	if (!hdr)
> +		return false;
> +
> +	if (test_and_set_bit_lock(KSZ_HWTSTAMP_TX_XDELAY_IN_PROGRESS,
> +				  &prt->tstamp_state))
> +		return false;  /* free cloned skb */
> +
> +	prt->tstamp_tx_xdelay_skb = clone;

Who do you think will guarantee you that a second timestampable packet
may not come in before the TX timestamp interrupt for the first one has
fired?

Either the hardware supports matching a TX timestamp to a PTP event
message (by sequenceId or whatnot), case in which you need more complex
logic in the IRQ handler, or the hardware can take a single TX timestamp
at a time, case in which you'll need an skb_queue and a process context
to wait for the TX timestamp of the previous PTP message before calling
dsa_enqueue_skb for the next PTP event message. There are already
implementations of both models in DSA that you can look at.

> +
> +	return true;  /* keep cloned skb */
> +}
> diff --git a/drivers/net/dsa/microchip/ksz9477_ptp.h b/drivers/net/dsa/microchip/ksz9477_ptp.h
> index 0076538419fa..e8d50a086311 100644
> --- a/drivers/net/dsa/microchip/ksz9477_ptp.h
> +++ b/drivers/net/dsa/microchip/ksz9477_ptp.h
> @@ -10,6 +10,9 @@
>  #ifndef DRIVERS_NET_DSA_MICROCHIP_KSZ9477_PTP_H_
>  #define DRIVERS_NET_DSA_MICROCHIP_KSZ9477_PTP_H_
>
> +#include <linux/irqreturn.h>
> +#include <linux/types.h>
> +
>  #include "ksz_common.h"
>
>  #if IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP)
> @@ -17,11 +20,35 @@
>  int ksz9477_ptp_init(struct ksz_device *dev);
>  void ksz9477_ptp_deinit(struct ksz_device *dev);
>
> +irqreturn_t ksz9477_ptp_port_interrupt(struct ksz_device *dev, int port);
> +
> +int ksz9477_ptp_get_ts_info(struct dsa_switch *ds, int port, struct ethtool_ts_info *ts);
> +int ksz9477_ptp_port_hwtstamp_get(struct dsa_switch *ds, int port, struct ifreq *ifr);
> +int ksz9477_ptp_port_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr);
> +bool ksz9477_ptp_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *clone,
> +			       unsigned int type);
> +
>  #else
>
>  static inline int ksz9477_ptp_init(struct ksz_device *dev) { return 0; }
>  static inline void ksz9477_ptp_deinit(struct ksz_device *dev) {}
>
> +static inline irqreturn_t ksz9477_ptp_port_interrupt(struct ksz_device *dev, int port)
> +	{ return IRQ_NONE; }
> +
> +static inline int ksz9477_ptp_get_ts_info(struct dsa_switch *ds, int port,
> +					  struct ethtool_ts_info *ts) { return -EOPNOTSUPP; }
> +
> +static inline int ksz9477_ptp_port_hwtstamp_get(struct dsa_switch *ds, int port,
> +						struct ifreq *ifr) { return -EOPNOTSUPP; }
> +
> +static inline int ksz9477_ptp_port_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr)
> +	{ return -EOPNOTSUPP; }
> +
> +static inline bool ksz9477_ptp_port_txtstamp(struct dsa_switch *ds, int port,
> +					     struct sk_buff *clone, unsigned int type)
> +	{ return false; }
> +

In networking we still have the 80-characters rule, please follow it.

>  #endif
>
>  #endif /* DRIVERS_NET_DSA_MICROCHIP_KSZ9477_PTP_H_ */
> diff --git a/include/linux/dsa/ksz_common.h b/include/linux/dsa/ksz_common.h
> index 4d5b6cc9429a..bfe8873b1636 100644
> --- a/include/linux/dsa/ksz_common.h
> +++ b/include/linux/dsa/ksz_common.h
> @@ -5,16 +5,27 @@
>  #ifndef _NET_DSA_KSZ_COMMON_H_
>  #define _NET_DSA_KSZ_COMMON_H_
>
> +#include <linux/bitfield.h>
> +#include <linux/bits.h>
>  #include <linux/gpio/consumer.h>
>  #include <linux/kernel.h>
> +#include <linux/ktime.h>
>  #include <linux/mutex.h>
> +#include <linux/net_tstamp.h>
>  #include <linux/phy.h>
>  #include <linux/ptp_clock_kernel.h>
> +#include <linux/spinlock.h>
>  #include <linux/regmap.h>
>  #include <linux/timer.h>
>  #include <linux/workqueue.h>
>  #include <net/dsa.h>
>
> +/* All time stamps from the KSZ consist of 2 bits for seconds and 30 bits for
> + * nanoseconds. This is NOT the same as 32 bits for nanoseconds.
> + */
> +#define KSZ_TSTAMP_SEC_MASK  GENMASK(31, 30)
> +#define KSZ_TSTAMP_NSEC_MASK GENMASK(29, 0)
> +
>  struct ksz_platform_data;
>  struct ksz_dev_ops;
>  struct vlan_table;
> @@ -25,6 +36,12 @@ struct ksz_port_mib {
>  	u64 *counters;
>  };
>
> +/* state flags for ksz_port::tstamp_state */
> +enum {
> +	KSZ_HWTSTAMP_ENABLED,
> +	KSZ_HWTSTAMP_TX_XDELAY_IN_PROGRESS,
> +};
> +
>  struct ksz_port {
>  	u16 member;
>  	u16 vid_member;
> @@ -41,6 +58,13 @@ struct ksz_port {
>
>  	struct ksz_port_mib mib;
>  	phy_interface_t interface;
> +#if IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP)
> +	u16 tstamp_rx_latency_ns;   /* rx delay from wire to tstamp unit */
> +	u16 tstamp_tx_latency_ns;   /* tx delay from tstamp unit to wire */
> +	struct hwtstamp_config tstamp_config;
> +	struct sk_buff *tstamp_tx_xdelay_skb;
> +	unsigned long tstamp_state;
> +#endif
>  };
>
>  struct ksz_device {
> @@ -98,7 +122,30 @@ struct ksz_device {
>  	struct ptp_clock *ptp_clock;
>  	struct ptp_clock_info ptp_caps;
>  	struct mutex ptp_mutex;
> +	spinlock_t ptp_clock_lock; /* for ptp_clock_time */
> +	/* approximated current time, read once per second from hardware */
> +	struct timespec64 ptp_clock_time;
>  #endif
>  };
>
> +/* KSZ9563 will only timestamp event messages */
> +enum ksz9477_ptp_event_messages {
> +	PTP_Event_Message_Sync        = 0x0,
> +	PTP_Event_Message_Delay_Req   = 0x1,
> +	PTP_Event_Message_Pdelay_Req  = 0x2,
> +	PTP_Event_Message_Pdelay_Resp = 0x3,
> +};
> +
> +/* net/dsa/tag_ksz.c */
> +static inline ktime_t ksz9477_decode_tstamp(u32 tstamp, int offset_ns)
> +{
> +	u64 ns = FIELD_GET(KSZ_TSTAMP_SEC_MASK, tstamp) * NSEC_PER_SEC +
> +		 FIELD_GET(KSZ_TSTAMP_NSEC_MASK, tstamp);
> +
> +	/* Add/remove excess delay between wire and time stamp unit */
> +	return ns_to_ktime(ns + offset_ns);
> +}
> +
> +ktime_t ksz9477_tstamp_to_clock(struct ksz_device *ksz, ktime_t tstamp);
> +
>  #endif /* _NET_DSA_KSZ_COMMON_H_ */
> diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
> index 4820dbcedfa2..c16eb9eae19c 100644
> --- a/net/dsa/tag_ksz.c
> +++ b/net/dsa/tag_ksz.c
> @@ -4,10 +4,14 @@
>   * Copyright (c) 2017 Microchip Technology
>   */
>
> +#include <asm/unaligned.h>
> +#include <linux/dsa/ksz_common.h>
>  #include <linux/etherdevice.h>
>  #include <linux/list.h>
> +#include <linux/ptp_classify.h>
>  #include <linux/slab.h>
>  #include <net/dsa.h>
> +#include <net/checksum.h>
>  #include "dsa_priv.h"
>
>  /* Typically only one byte is used for tail tag. */
> @@ -87,26 +91,125 @@ MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795);
>  /*
>   * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS.
>   * ---------------------------------------------------------------------------
> - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
> + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
>   * ---------------------------------------------------------------------------
> + * ts   : time stamp (only present if PTP is enabled on the hardware).
>   * tag0 : Prioritization (not used now)
>   * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
>   *
> - * For Egress (KSZ9477 -> Host), 1 byte is added before FCS.
> + * For Egress (KSZ9477 -> Host), 1/4 bytes are added before FCS.
>   * ---------------------------------------------------------------------------
> - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes)
> + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes)
>   * ---------------------------------------------------------------------------
> + * ts   : time stamp (only present of bit 7 in tag0 is set
>   * tag0 : zero-based value represents port
>   *	  (eg, 0x00=port1, 0x02=port3, 0x06=port7)
>   */
>
>  #define KSZ9477_INGRESS_TAG_LEN		2
>  #define KSZ9477_PTP_TAG_LEN		4
> -#define KSZ9477_PTP_TAG_INDICATION	0x80
> +#define KSZ9477_PTP_TAG_INDICATION	BIT(7)
>
>  #define KSZ9477_TAIL_TAG_OVERRIDE	BIT(9)
>  #define KSZ9477_TAIL_TAG_LOOKUP		BIT(10)
>
> +#if IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP)
> +/* Time stamp tag is only inserted if PTP is enabled in hardware. */
> +static void ksz9477_xmit_timestamp(struct sk_buff *skb)
> +{
> +	/* We send always 0 in the tail tag.  For PDelay_Resp, the ingress
> +	 * time stamp of the PDelay_Req message has already been subtracted from
> +	 * the correction field on rx.
> +	 */
> +	put_unaligned_be32(0, skb_put(skb, KSZ9477_PTP_TAG_LEN));
> +}

On TX you don't need the "PTP time stamp" field at all, can't you
disable it?

> +
> +ktime_t ksz9477_tstamp_to_clock(struct ksz_device *ksz, ktime_t tstamp)
> +{
> +	struct timespec64 ts = ktime_to_timespec64(tstamp);
> +	struct timespec64 ptp_clock_time;
> +	struct timespec64 diff;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&ksz->ptp_clock_lock, flags);
> +	ptp_clock_time = ksz->ptp_clock_time;
> +	spin_unlock_irqrestore(&ksz->ptp_clock_lock, flags);
> +
> +	/* calculate full time from partial time stamp */
> +	ts.tv_sec = (ptp_clock_time.tv_sec & ~3) | ts.tv_sec;
> +
> +	/* find nearest possible point in time */
> +	diff = timespec64_sub(ts, ptp_clock_time);
> +	if (diff.tv_sec > 2)
> +		ts.tv_sec -= 4;
> +	else if (diff.tv_sec < -2)
> +		ts.tv_sec += 4;
> +
> +	return timespec64_to_ktime(ts);
> +}
> +EXPORT_SYMBOL(ksz9477_tstamp_to_clock);

It should be noted that I tried to find fault with this simplistic
implementation, where you just reconstruct the partial timestamp with
whatever PTP time you've got laying around, but I couldn't (or at least
I couldn't prove it).

In principle there should be a problem when the current PTP time wraps
around before you get the chance to reconstruct the partial timestamp.
That one you can detect by ensuring that the partial timestamp is larger
than the lower bits of the current PTP time. But that imposes the
restriction that the current PTP time must be collected after the
partial timestamp was taken by the MAC. And that means that PTP
timestamping must be done in process context, because it's accessing
SPI/I2C. Which means a very convoluted implementation, a nightmare
frankly.

The way you seem to be avoiding this, while still detecting the
wraparound case, is that you're just patching in the partial timestamp
into the "current" (i.e. at most 1 second old) PTP time, and then you
take a look at how well it fits. If the diff is larger than half the
wraparound interval, and positive (like: the "current" PTP time was
collected by the driver after the MAC took the partial timestamp), then
the current PTP time is too far ahead and must have wrapped around. If
the diff is large but negative (like: the partial timestamp, which is in
the "current" PTP time's future, has wrapped around), then the "current"
PTP time needs to be manually boosted.

This all seems to work because you have a somewhat workable budget of 4
seconds wraparound time. I am not sure that reading the PTP time once
per second is desirable under all circumstances if avoidable, and
there's also an even bigger assumption, which is that the PTP worker
will in fact get scheduled with a delay no larger than 2 seconds. I
suppose that is an academic only concern though.

So good for you that you can use a function so simple for timestamp
reconstruction. By the way, what about the name ksz9477_tstamp_reconstruct?
I don't exactly understand where does the "tstamp_to_clock" name come
from.

> +
> +static void ksz9477_rcv_timestamp(struct sk_buff *skb, u8 *tag,
> +				  struct net_device *dev, unsigned int port)
> +{
> +	struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
> +	u8 *tstamp_raw = tag - KSZ9477_PTP_TAG_LEN;
> +	enum ksz9477_ptp_event_messages msg_type;
> +	struct dsa_switch *ds = dev->dsa_ptr->ds;
> +	struct ksz_device *ksz = ds->priv;
> +	struct ksz_port *prt = &ksz->ports[port];
> +	struct ptp_header *ptp_hdr;
> +	unsigned int ptp_type;
> +	ktime_t tstamp;
> +
> +	/* convert time stamp and write to skb */
> +	tstamp = ksz9477_decode_tstamp(get_unaligned_be32(tstamp_raw),
> +				       -prt->tstamp_rx_latency_ns);
> +	memset(hwtstamps, 0, sizeof(*hwtstamps));
> +	hwtstamps->hwtstamp = ksz9477_tstamp_to_clock(ksz, tstamp);
> +
> +	/* For PDelay_Req messages, user space (ptp4l) expects that the hardware
> +	 * subtracts the ingress time stamp from the correction field.  The
> +	 * separate hw time stamp from the sk_buff struct will not be used in
> +	 * this case.
> +	 */
> +	if (skb_headroom(skb) < ETH_HLEN)
> +		return;

And what does the comment have to do with the code?

> +
> +	__skb_push(skb, ETH_HLEN);
> +	ptp_type = ptp_classify_raw(skb);
> +	__skb_pull(skb, ETH_HLEN);
> +
> +	if (ptp_type == PTP_CLASS_NONE)
> +		return;
> +
> +	ptp_hdr = ptp_parse_header(skb, ptp_type);
> +	if (!ptp_hdr)
> +		return;
> +
> +	msg_type = ptp_get_msgtype(ptp_hdr, ptp_type);
> +	if (msg_type != PTP_Event_Message_Pdelay_Req)
> +		return;

Would you be so generous to also modify ptp_get_msgtype to return this
enum of yours? There is also some opportunity for reuse with drivers/ptp/ptp_ines.c.

> +
> +	/* Only subtract partial time stamp from the correction field.  When the
> +	 * hardware adds the egress time stamp to the correction field of the
> +	 * PDelay_Resp message on tx, also only the partial time stamp will be
> +	 * added.
> +	 */
> +	ptp_onestep_p2p_move_t2_to_correction(skb, ptp_type, ptp_hdr, tstamp);
> +}
> +#else   /* IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP) */
> +static void ksz9477_xmit_timestamp(struct sk_buff *skb __maybe_unused)
> +{
> +}
> +
> +static void ksz9477_rcv_timestamp(struct sk_buff *skb __maybe_unused, u8 *tag __maybe_unused,
> +				  struct net_device *dev __maybe_unused,
> +				  unsigned int port __maybe_unused)

Where did you see __maybe_unused being utilized in this way? And what's
so "maybe" about it? They are absolutely unused, and the compiler should
not complain. Please remove these variable attributes.

> +{
> +}
> +#endif  /* IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP) */
> +
>  static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
>  				    struct net_device *dev)
>  {
> @@ -116,6 +219,7 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
>  	u16 val;
>
>  	/* Tag encoding */
> +	ksz9477_xmit_timestamp(skb);
>  	tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN);
>  	addr = skb_mac_header(skb);
>
> @@ -138,8 +242,10 @@ static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev,
>  	unsigned int len = KSZ_EGRESS_TAG_LEN;
>
>  	/* Extra 4-bytes PTP timestamp */
> -	if (tag[0] & KSZ9477_PTP_TAG_INDICATION)
> +	if (tag[0] & KSZ9477_PTP_TAG_INDICATION) {
> +		ksz9477_rcv_timestamp(skb, tag, dev, port);
>  		len += KSZ9477_PTP_TAG_LEN;
> +	}
>
>  	return ksz_common_rcv(skb, dev, port, len);
>  }
> @@ -149,7 +255,7 @@ static const struct dsa_device_ops ksz9477_netdev_ops = {
>  	.proto	= DSA_TAG_PROTO_KSZ9477,
>  	.xmit	= ksz9477_xmit,
>  	.rcv	= ksz9477_rcv,
> -	.overhead = KSZ9477_INGRESS_TAG_LEN,
> +	.overhead = KSZ9477_INGRESS_TAG_LEN + KSZ9477_PTP_TAG_LEN,
>  	.tail_tag = true,
>  };
>
> @@ -167,6 +273,7 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb,
>  	u8 *tag;
>
>  	/* Tag encoding */
> +	ksz9477_xmit_timestamp(skb);
>  	tag = skb_put(skb, KSZ_INGRESS_TAG_LEN);
>  	addr = skb_mac_header(skb);
>
> @@ -183,7 +290,7 @@ static const struct dsa_device_ops ksz9893_netdev_ops = {
>  	.proto	= DSA_TAG_PROTO_KSZ9893,
>  	.xmit	= ksz9893_xmit,
>  	.rcv	= ksz9477_rcv,
> -	.overhead = KSZ_INGRESS_TAG_LEN,
> +	.overhead = KSZ_INGRESS_TAG_LEN + KSZ9477_PTP_TAG_LEN,
>  	.tail_tag = true,
>  };
>
> --

Long and exhausting patch... Could you split it up into a patch for the
control path and another one for the data path?
Richard Cochran Nov. 13, 2020, 3:31 a.m. UTC | #2
On Fri, Nov 13, 2020 at 04:40:20AM +0200, Vladimir Oltean wrote:
> > @@ -103,6 +108,10 @@ static int ksz9477_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
> >  	if (ret)
> >  		goto error_return;
> >
> > +	spin_lock_irqsave(&dev->ptp_clock_lock, flags);
> 
> I believe that spin_lock_irqsave is unnecessary, since there is no code
> that runs in hardirq context.

The .adjtime method is in a system call context.  If all of the code
that manipulates dev->ptp_clock_time can sleep, then you can use a
mutex.  Otherwise spin_lock_irqsave is the safe choice.

Thanks,
Richard
Christian Eggers Nov. 13, 2020, 6:57 p.m. UTC | #3
On Friday, 13 November 2020, 03:40:20 CET, Vladimir Oltean wrote:
> On Thu, Nov 12, 2020 at 04:35:35PM +0100, Christian Eggers wrote:
[...]
> > @@ -103,6 +108,10 @@ static int ksz9477_ptp_adjtime(struct ptp_clock_info
> > *ptp, s64 delta)> 
> >  	if (ret)
> >  	
> >  		goto error_return;
> > 
> > +	spin_lock_irqsave(&dev->ptp_clock_lock, flags);
> 
> I believe that spin_lock_irqsave is unnecessary, since there is no code
> that runs in hardirq context.
I'll check this again. Originally I had only a mutex for everything, but later
it turned out that for ptp_clock_time a spinlock is required. Maybe this has
changed since starting of my work on the driver.

> 
> > +	dev->ptp_clock_time = timespec64_add(dev->ptp_clock_time, delta64);
> > +	spin_unlock_irqrestore(&dev->ptp_clock_lock, flags);
> > +

[...]

> Could we make this line shorter?
...
> Additionally, you exceed the 80 characters limit.
...
> Also, you exceeded 80 characters by quite a bit.
...
> In networking we still have the 80-characters rule, please follow it.
Can this be added to the netdev-FAQ (just below the section about 
"comment style convention")?

> > +static void ksz9477_ptp_ports_deinit(struct ksz_device *dev)
> > +{
> > +	int port;
> > +
> > +	for (port = dev->port_cnt - 1; port >= 0; --port)
> 
> Nice, but also probably not worth the effort?
What do you mean. Shall I used forward direction?

> 
> > +		ksz9477_ptp_port_deinit(dev, port);
> > +}
> 

[...]

> > +bool ksz9477_ptp_port_txtstamp(struct dsa_switch *ds, int port, struct
> > sk_buff *clone, +			       unsigned int type)
> > +{
> > +	struct ksz_device *dev = ds->priv;
> > +	struct ksz_port *prt = &dev->ports[port];
> > +	struct ptp_header *hdr;
> > +
> > +	/* KSZ9563 supports PTPv2 only */
> > +	if (!(type & PTP_CLASS_V2))
> > +		return false;
> 
> It should be sufficient that you specified this in the
> config->rx_filters from ksz9477_set_hwtstamp_config? I'm not even sure
> who uses PTP v1 anyway.
> 
> > +
> > +	/* Should already been tested in dsa_skb_tx_timestamp()? */
> > +	if (!(skb_shinfo(clone)->tx_flags & SKBTX_HW_TSTAMP))
> > +		return false;
> 
> Yeah, should have...
> What do you think about this one though:
> https://lore.kernel.org/netdev/20201104015834.mcn2eoibxf6j3ksw@skbuf/
I am not an expert for performance stuff. But for me it looks obvious that
cheaper checks should be performed first. What about also moving checking
for ops->port_txtstamp above ptp_classify_raw()?

Is there any reason why this isn't already applied?

> 
> > +
> > +	hdr = ksz9477_ptp_should_tstamp(prt, clone, type);
> > +	if (!hdr)
> > +		return false;
> > +
> > +	if (test_and_set_bit_lock(KSZ_HWTSTAMP_TX_XDELAY_IN_PROGRESS,
> > +				  &prt->tstamp_state))
> > +		return false;  /* free cloned skb */
> > +
> > +	prt->tstamp_tx_xdelay_skb = clone;
> 
> Who do you think will guarantee you that a second timestampable packet
> may not come in before the TX timestamp interrupt for the first one has
> fired?
> 
> Either the hardware supports matching a TX timestamp to a PTP event
> message (by sequenceId or whatnot),
it doesn't

> case in which you need more complex
> logic in the IRQ handler, or the hardware can take a single TX timestamp
> at a time, 
yes

> case in which you'll need an skb_queue and a process context
> to wait for the TX timestamp of the previous PTP message before calling
> dsa_enqueue_skb for the next PTP event message. There are already
> implementations of both models in DSA that you can look at.
In the past I got sometimes a "timeout waiting for hw timestamp" (or similar)
message from ptp4l. I am not sure whether this is still the case, but this may
explain this type of problems.

> 
> > +
> > +	return true;  /* keep cloned skb */
> > +}
> 

[...]

> > diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
> > index 4820dbcedfa2..c16eb9eae19c 100644
> > --- a/net/dsa/tag_ksz.c
> > +++ b/net/dsa/tag_ksz.c

[...]

> > 
> > +#if IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP)
> > +/* Time stamp tag is only inserted if PTP is enabled in hardware. */
> > +static void ksz9477_xmit_timestamp(struct sk_buff *skb)
> > +{
> > +	/* We send always 0 in the tail tag.  For PDelay_Resp, the ingress
> > +	 * time stamp of the PDelay_Req message has already been subtracted from
> > +	 * the correction field on rx.
> > +	 */
> > +	put_unaligned_be32(0, skb_put(skb, KSZ9477_PTP_TAG_LEN));
> > +}
> 
> On TX you don't need the "PTP time stamp" field at all, can't you
> disable it?

No :-( 

From the data sheet, section 4.4.9:

"In the host-to-switch direction, the 4-byte timestamp field is always present when PTP is enabled, as shown in Figure 4-
6. This is true for all packets sent by the host, including IBA packets. The host uses this field to insert the receive time-
stamp from PTP Pdelay_Req messages into the Pdelay_Resp messages. For all other traffic and PTP message types,
the host should populate the timestamp field with zeros."

> 
> > +
> > +ktime_t ksz9477_tstamp_to_clock(struct ksz_device *ksz, ktime_t tstamp)
> > +{
> > +	struct timespec64 ts = ktime_to_timespec64(tstamp);
> > +	struct timespec64 ptp_clock_time;
> > +	struct timespec64 diff;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&ksz->ptp_clock_lock, flags);
> > +	ptp_clock_time = ksz->ptp_clock_time;
> > +	spin_unlock_irqrestore(&ksz->ptp_clock_lock, flags);
> > +
> > +	/* calculate full time from partial time stamp */
> > +	ts.tv_sec = (ptp_clock_time.tv_sec & ~3) | ts.tv_sec;
> > +
> > +	/* find nearest possible point in time */
> > +	diff = timespec64_sub(ts, ptp_clock_time);
> > +	if (diff.tv_sec > 2)
> > +		ts.tv_sec -= 4;
> > +	else if (diff.tv_sec < -2)
> > +		ts.tv_sec += 4;
> > +
> > +	return timespec64_to_ktime(ts);
> > +}
> > +EXPORT_SYMBOL(ksz9477_tstamp_to_clock);
> 
> It should be noted that I tried to find fault with this simplistic
> implementation, where you just reconstruct the partial timestamp with
> whatever PTP time you've got laying around, but I couldn't (or at least
> I couldn't prove it).
> 
> In principle there should be a problem when the current PTP time wraps
> around before you get the chance to reconstruct the partial timestamp.
> That one you can detect by ensuring that the partial timestamp is larger
> than the lower bits of the current PTP time. But that imposes the
> restriction that the current PTP time must be collected after the
> partial timestamp was taken by the MAC. And that means that PTP
> timestamping must be done in process context, because it's accessing
> SPI/I2C. Which means a very convoluted implementation, a nightmare
> frankly.
> 
> The way you seem to be avoiding this, while still detecting the
> wraparound case, is that you're just patching in the partial timestamp
> into the "current" (i.e. at most 1 second old) PTP time, and then you
> take a look at how well it fits. If the diff is larger than half the
> wraparound interval, and positive (like: the "current" PTP time was
> collected by the driver after the MAC took the partial timestamp), then
> the current PTP time is too far ahead and must have wrapped around. If
> the diff is large but negative (like: the partial timestamp, which is in
> the "current" PTP time's future, has wrapped around), then the "current"
> PTP time needs to be manually boosted.
> 
> This all seems to work because you have a somewhat workable budget of 4
> seconds wraparound time. I am not sure that reading the PTP time once
> per second is desirable under all circumstances if avoidable, and
> there's also an even bigger assumption, which is that the PTP worker
> will in fact get scheduled with a delay no larger than 2 seconds. I
> suppose that is an academic only concern though.
> 
> So good for you that you can use a function so simple for timestamp
> reconstruction. 
You already told me that another hardware has much less budget than 4 seconds.
How is timestamp reconstruction done there? Is there any code which I should
reuse?

> By the way, what about the name ksz9477_tstamp_reconstruct?
it's ok.

> I don't exactly understand where does the "tstamp_to_clock" name come
> from.
Invented by me. The function "converts" a time stamp into the "clock" time. 
But "reconstruct" fits better than "convert".

> 
> > +
> > +static void ksz9477_rcv_timestamp(struct sk_buff *skb, u8 *tag,
> > +				  struct net_device *dev, unsigned int port)
> > +{
> > +	struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
> > +	u8 *tstamp_raw = tag - KSZ9477_PTP_TAG_LEN;
> > +	enum ksz9477_ptp_event_messages msg_type;
> > +	struct dsa_switch *ds = dev->dsa_ptr->ds;
> > +	struct ksz_device *ksz = ds->priv;
> > +	struct ksz_port *prt = &ksz->ports[port];
> > +	struct ptp_header *ptp_hdr;
> > +	unsigned int ptp_type;
> > +	ktime_t tstamp;
> > +
> > +	/* convert time stamp and write to skb */
> > +	tstamp = ksz9477_decode_tstamp(get_unaligned_be32(tstamp_raw),
> > +				       -prt->tstamp_rx_latency_ns);
> > +	memset(hwtstamps, 0, sizeof(*hwtstamps));
> > +	hwtstamps->hwtstamp = ksz9477_tstamp_to_clock(ksz, tstamp);
> > +
> > +	/* For PDelay_Req messages, user space (ptp4l) expects that the hardware
> > +	 * subtracts the ingress time stamp from the correction field.  The
> > +	 * separate hw time stamp from the sk_buff struct will not be used in
> > +	 * this case.
> > +	 */
> > +	if (skb_headroom(skb) < ETH_HLEN)
> > +		return;
> 
> And what does the comment have to do with the code?
The comment if for the whole remaining part of the function, not for the single line.
>
> > +
> > +	__skb_push(skb, ETH_HLEN);
> > +	ptp_type = ptp_classify_raw(skb);
> > +	__skb_pull(skb, ETH_HLEN);
> > +
> > +	if (ptp_type == PTP_CLASS_NONE)
> > +		return;
> > +
> > +	ptp_hdr = ptp_parse_header(skb, ptp_type);
> > +	if (!ptp_hdr)
> > +		return;
> > +
> > +	msg_type = ptp_get_msgtype(ptp_hdr, ptp_type);
> > +	if (msg_type != PTP_Event_Message_Pdelay_Req)
> > +		return;
> 
> Would you be so generous to also modify ptp_get_msgtype to return this
> enum of yours? There is also some opportunity for reuse with
> drivers/ptp/ptp_ines.c.
no problem.

> > +
> > +	/* Only subtract partial time stamp from the correction field.  When the
> > +	 * hardware adds the egress time stamp to the correction field of the
> > +	 * PDelay_Resp message on tx, also only the partial time stamp will be
> > +	 * added.
> > +	 */
> > +	ptp_onestep_p2p_move_t2_to_correction(skb, ptp_type, ptp_hdr, tstamp);
> > +}
> > +#else   /* IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP) */
> > +static void ksz9477_xmit_timestamp(struct sk_buff *skb __maybe_unused)
> > +{
> > +}
> > +
> > +static void ksz9477_rcv_timestamp(struct sk_buff *skb __maybe_unused, u8
> > *tag __maybe_unused, +				  struct net_device *dev __maybe_unused,
> > +				  unsigned int port __maybe_unused)
> 
> Where did you see __maybe_unused being utilized in this way? And what's
> so "maybe" about it? They are absolutely unused, and the compiler should
> not complain. Please remove these variable attributes.
ok, __always_unused would fit.

I added the attributes due to Documentation/process/4.Coding.rst:

"Code submitted for review should, as a rule, not produce any compiler
warnings." [...] "Note that not all compiler warnings are enabled by default.  Build the
kernel with "make EXTRA_CFLAGS=-W" to get the full set."

I assumed that reducing the number of warnings raised by "-W" should be reduced
as a long term goal. Is this wrong.

Side note: Documentation/kbuild/makefiles.rst declares usage of EXTRA_CFLAGS as
deprecated.

[...]
> 
> Long and exhausting patch... Could you split it up into a patch for the
> control path and another one for the data path?
I am not sure whether the result will exactly look as you expect, but I'll try.

Thanks a lot for the fast and comprehensive review! As soon as I get your
answer regarding patch 3/11 (split ksz_common.h), I will perform the changes.

Have a nice weekend
Christian
Vladimir Oltean Nov. 14, 2020, 12:54 a.m. UTC | #4
On Fri, Nov 13, 2020 at 07:57:32PM +0100, Christian Eggers wrote:
> On Friday, 13 November 2020, 03:40:20 CET, Vladimir Oltean wrote:
> > On Thu, Nov 12, 2020 at 04:35:35PM +0100, Christian Eggers wrote:
> [...]
> > > @@ -103,6 +108,10 @@ static int ksz9477_ptp_adjtime(struct ptp_clock_info
> > > *ptp, s64 delta)>
> > >     if (ret)
> > >
> > >             goto error_return;
> > >
> > > +   spin_lock_irqsave(&dev->ptp_clock_lock, flags);
> >
> > I believe that spin_lock_irqsave is unnecessary, since there is no code
> > that runs in hardirq context.
> I'll check this again. Originally I had only a mutex for everything, but later
> it turned out that for ptp_clock_time a spinlock is required. Maybe this has
> changed since starting of my work on the driver.

Yes, it's called from the networking softirq.
The typical assumption is that the networking data path can run in
both hardirq and softirq context (or, well, in process context if it
gets picked up by ksoftirqd), so one would think that _irqsave would be
justified. But the hardirq stuff is only used by netpoll, for
netconsole. So you would never hit that condition for PTP timestamping.

> >
> > > +   dev->ptp_clock_time = timespec64_add(dev->ptp_clock_time, delta64);
> > > +   spin_unlock_irqrestore(&dev->ptp_clock_lock, flags);
> > > +
>
> [...]
>
> > Could we make this line shorter?
> ...
> > Additionally, you exceed the 80 characters limit.
> ...
> > Also, you exceeded 80 characters by quite a bit.
> ...
> > In networking we still have the 80-characters rule, please follow it.
> Can this be added to the netdev-FAQ (just below the section about
> "comment style convention")?
>
> > > +static void ksz9477_ptp_ports_deinit(struct ksz_device *dev)
> > > +{
> > > +   int port;
> > > +
> > > +   for (port = dev->port_cnt - 1; port >= 0; --port)
> >
> > Nice, but also probably not worth the effort?
> What do you mean. Shall I used forward direction?

Yes, that's what I meant.

> > > +
> > > +   /* Should already been tested in dsa_skb_tx_timestamp()? */
> > > +   if (!(skb_shinfo(clone)->tx_flags & SKBTX_HW_TSTAMP))
> > > +           return false;
> >
> > Yeah, should have...
> > What do you think about this one though:
> > https://lore.kernel.org/netdev/20201104015834.mcn2eoibxf6j3ksw@skbuf/
> I am not an expert for performance stuff. But for me it looks obvious that
> cheaper checks should be performed first. What about also moving checking
> for ops->port_txtstamp above ptp_classify_raw()?

I am no expert either. Also, it looks like I'm not even keeping on top
of things lately. I'll try to return to that investigation during this
weekend.

>
> Is there any reason why this isn't already applied?

Probably because nobody sent a patch for it?

> > case in which you'll need an skb_queue and a process context
> > to wait for the TX timestamp of the previous PTP message before calling
> > dsa_enqueue_skb for the next PTP event message. There are already
> > implementations of both models in DSA that you can look at.
> In the past I got sometimes a "timeout waiting for hw timestamp" (or similar)
> message from ptp4l. I am not sure whether this is still the case, but this may
> explain this type of problems.

Yeah, well, the default tx_timestamp_timeout value of 1 ms chosen by
ptp4l is not going to be enough in general for DSA. If you schedule a
workqueue for timestamping, that delay will only get worse, but luckily
you can increase the timestamp timeout value and all should be fine.

> > So good for you that you can use a function so simple for timestamp
> > reconstruction.
> You already told me that another hardware has much less budget than 4 seconds.

sja1105 has 24 bits of partial timestamp (and 1 bit measures 8 ns). So
it wraps around in 135 ms. You can imagine that periodically reading the
PTP clock over SPI is not an option there :)

> How is timestamp reconstruction done there? Is there any code which I should
> reuse?

No, I wasn't suggesting you reuse that logic, since it's very
error-prone. If you can get away with reconstruction done on-the-fly,
great. But just for reference:
- In drivers/net/dsa/sja1105/, the actual transmission of the PTP
  packets is done synchronously, from process context, and an interrupt
  is not even used. See sja1105_ptp_txtstamp_skb and
  sja1105_tstamp_reconstruct. Actually, more interesting would be the RX
  timestamping case, since we have a worse problem there: the partial
  PTP timestamp is obtained in softirq context, and we need process
  context for the current PTP time. For that, see sja1105_port_rxtstamp
  and sja1105_rxtstamp_work.
- In drivers/net/dsa/ocelot/, the reconstruction is done in IRQ context,
  since it is a memory-mapped switch and therefore, reading the PTP time
  is "cheap". See ocelot_get_txtstamp and ocelot_get_hwtimestamp.
The point is that both these drivers read the full PTP current time
_after_ the partial timestamp was obtained. That's what gives you a
solid guarantee that the "partial_timestamp > current_ptp_time & lower_bits"
condition means "wraparound occurred" and not something else.

> > > +static void ksz9477_rcv_timestamp(struct sk_buff *skb __maybe_unused, u8
> > > *tag __maybe_unused, +                                struct net_device *dev __maybe_unused,
> > > +                             unsigned int port __maybe_unused)
> >
> > Where did you see __maybe_unused being utilized in this way? And what's
> > so "maybe" about it? They are absolutely unused, and the compiler should
> > not complain. Please remove these variable attributes.
> ok, __always_unused would fit.
>
> I added the attributes due to Documentation/process/4.Coding.rst:
>
> "Code submitted for review should, as a rule, not produce any compiler
> warnings." [...] "Note that not all compiler warnings are enabled by default.  Build the
> kernel with "make EXTRA_CFLAGS=-W" to get the full set."
>
> I assumed that reducing the number of warnings raised by "-W" should be reduced
> as a long term goal. Is this wrong.

Uhm, I don't know of any compiler flag that would cause warnings for
unused arguments of a function. And the kernel uses this stub function
scheme for so many things, that even if that flag existed, the chance of
it getting enabled in the kernel's Makefile would be zero. It would
flood everybody in useless warning messages.

So please don't add __maybe_unused or __always_unused or whatever. Just
do what everybody else does.

> Side note: Documentation/kbuild/makefiles.rst declares usage of EXTRA_CFLAGS as
> deprecated.

Yeah, if you can build-test with "make W=1" and "make W=2" you should be
safe, no need to overthink anything. Beware though, there's going to be
a lot of output coming your way... There's also "make C=1" for sparse,
if you want to be proactive and avoid later patches from the static
analysis crew. But you'll need to use a source-built sparse binary
there, the one packaged by typical distributions won't cut it, and will
give up saying "error: too many
errors".
diff mbox series

Patch

diff --git a/drivers/net/dsa/microchip/ksz9477_main.c b/drivers/net/dsa/microchip/ksz9477_main.c
index 7d623400139f..42cd17c8c25d 100644
--- a/drivers/net/dsa/microchip/ksz9477_main.c
+++ b/drivers/net/dsa/microchip/ksz9477_main.c
@@ -1388,6 +1388,7 @@  static const struct dsa_switch_ops ksz9477_switch_ops = {
 	.phy_read		= ksz9477_phy_read16,
 	.phy_write		= ksz9477_phy_write16,
 	.phylink_mac_link_down	= ksz_mac_link_down,
+	.get_ts_info		= ksz9477_ptp_get_ts_info,
 	.port_enable		= ksz_enable_port,
 	.get_strings		= ksz9477_get_strings,
 	.get_ethtool_stats	= ksz_get_ethtool_stats,
@@ -1408,6 +1409,11 @@  static const struct dsa_switch_ops ksz9477_switch_ops = {
 	.port_mdb_del           = ksz9477_port_mdb_del,
 	.port_mirror_add	= ksz9477_port_mirror_add,
 	.port_mirror_del	= ksz9477_port_mirror_del,
+	.port_hwtstamp_get      = ksz9477_ptp_port_hwtstamp_get,
+	.port_hwtstamp_set      = ksz9477_ptp_port_hwtstamp_set,
+	.port_txtstamp          = ksz9477_ptp_port_txtstamp,
+	/* never defer rx delivery, tstamping is done via tail tagging */
+	.port_rxtstamp          = NULL,
 };
 
 static u32 ksz9477_get_port_addr(int port, int offset)
@@ -1554,7 +1560,8 @@  static irqreturn_t ksz9477_switch_irq_thread(int irq, void *dev_id)
 			if (ret)
 				return result;
 
-			/* ToDo: Add specific handling of port interrupts */
+			if (data8 & PORT_PTP_INT)
+				result |= ksz9477_ptp_port_interrupt(dev, port);
 		}
 	}
 
diff --git a/drivers/net/dsa/microchip/ksz9477_ptp.c b/drivers/net/dsa/microchip/ksz9477_ptp.c
index 44d7bbdea518..12698568b68b 100644
--- a/drivers/net/dsa/microchip/ksz9477_ptp.c
+++ b/drivers/net/dsa/microchip/ksz9477_ptp.c
@@ -6,7 +6,10 @@ 
  * Copyright (c) 2020 ARRI Lighting
  */
 
+#include <linux/net_tstamp.h>
+#include <linux/ptp_classify.h>
 #include <linux/ptp_clock_kernel.h>
+#include <linux/sysfs.h>
 
 #include "ksz_common.h"
 #include "ksz9477_reg.h"
@@ -71,8 +74,10 @@  static int ksz9477_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
 static int ksz9477_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 {
 	struct ksz_device *dev = container_of(ptp, struct ksz_device, ptp_caps);
+	struct timespec64 delta64 = ns_to_timespec64(delta);
 	s32 sec, nsec;
 	u16 data16;
+	unsigned long flags;
 	int ret;
 
 	mutex_lock(&dev->ptp_mutex);
@@ -103,6 +108,10 @@  static int ksz9477_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 	if (ret)
 		goto error_return;
 
+	spin_lock_irqsave(&dev->ptp_clock_lock, flags);
+	dev->ptp_clock_time = timespec64_add(dev->ptp_clock_time, delta64);
+	spin_unlock_irqrestore(&dev->ptp_clock_lock, flags);
+
 error_return:
 	mutex_unlock(&dev->ptp_mutex);
 	return ret;
@@ -160,6 +169,7 @@  static int ksz9477_ptp_settime(struct ptp_clock_info *ptp, struct timespec64 con
 {
 	struct ksz_device *dev = container_of(ptp, struct ksz_device, ptp_caps);
 	u16 data16;
+	unsigned long flags;
 	int ret;
 
 	mutex_lock(&dev->ptp_mutex);
@@ -198,6 +208,10 @@  static int ksz9477_ptp_settime(struct ptp_clock_info *ptp, struct timespec64 con
 	if (ret)
 		goto error_return;
 
+	spin_lock_irqsave(&dev->ptp_clock_lock, flags);
+	dev->ptp_clock_time = *ts;
+	spin_unlock_irqrestore(&dev->ptp_clock_lock, flags);
+
 error_return:
 	mutex_unlock(&dev->ptp_mutex);
 	return ret;
@@ -208,9 +222,27 @@  static int ksz9477_ptp_enable(struct ptp_clock_info *ptp, struct ptp_clock_reque
 	return -ENOTTY;
 }
 
+static long ksz9477_ptp_do_aux_work(struct ptp_clock_info *ptp)
+{
+	struct ksz_device *dev = container_of(ptp, struct ksz_device, ptp_caps);
+	struct timespec64 ts;
+	unsigned long flags;
+
+	mutex_lock(&dev->ptp_mutex);
+	_ksz9477_ptp_gettime(dev, &ts);
+	mutex_unlock(&dev->ptp_mutex);
+
+	spin_lock_irqsave(&dev->ptp_clock_lock, flags);
+	dev->ptp_clock_time = ts;
+	spin_unlock_irqrestore(&dev->ptp_clock_lock, flags);
+
+	return HZ;  /* reschedule in 1 second */
+}
+
 static int ksz9477_ptp_start_clock(struct ksz_device *dev)
 {
 	u16 data;
+	unsigned long flags;
 	int ret;
 
 	ret = ksz_read16(dev, REG_PTP_CLK_CTRL, &data);
@@ -230,6 +262,11 @@  static int ksz9477_ptp_start_clock(struct ksz_device *dev)
 	if (ret)
 		return ret;
 
+	spin_lock_irqsave(&dev->ptp_clock_lock, flags);
+	dev->ptp_clock_time.tv_sec = 0;
+	dev->ptp_clock_time.tv_nsec = 0;
+	spin_unlock_irqrestore(&dev->ptp_clock_lock, flags);
+
 	return 0;
 }
 
@@ -251,11 +288,249 @@  static int ksz9477_ptp_stop_clock(struct ksz_device *dev)
 	return 0;
 }
 
+/* Time stamping support */
+
+static int ksz9477_ptp_enable_mode(struct ksz_device *dev)
+{
+	u16 data;
+	int ret;
+
+	ret = ksz_read16(dev, REG_PTP_MSG_CONF1, &data);
+	if (ret)
+		return ret;
+
+	/* Enable PTP mode */
+	data |= PTP_ENABLE;
+	ret = ksz_write16(dev, REG_PTP_MSG_CONF1, data);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int ksz9477_ptp_disable_mode(struct ksz_device *dev)
+{
+	u16 data;
+	int ret;
+
+	ret = ksz_read16(dev, REG_PTP_MSG_CONF1, &data);
+	if (ret)
+		return ret;
+
+	/* Disable PTP mode */
+	data &= ~PTP_ENABLE;
+	ret = ksz_write16(dev, REG_PTP_MSG_CONF1, data);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int ksz9477_ptp_enable_port_ptp_interrupts(struct ksz_device *dev, int port)
+{
+	u32 addr = PORT_CTRL_ADDR(port, REG_PORT_INT_MASK);
+	u8 data;
+	int ret;
+
+	ret = ksz_read8(dev, addr, &data);
+	if (ret)
+		return ret;
+
+	/* Enable port PTP interrupt (0 means enabled) */
+	data &= ~PORT_PTP_INT;
+	ret = ksz_write8(dev, addr, data);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int ksz9477_ptp_disable_port_ptp_interrupts(struct ksz_device *dev, int port)
+{
+	u32 addr = PORT_CTRL_ADDR(port, REG_PORT_INT_MASK);
+	u8 data;
+	int ret;
+
+	ret = ksz_read8(dev, addr, &data);
+	if (ret)
+		return ret;
+
+	/* Enable port PTP interrupt (1 means disabled) */
+	data |= PORT_PTP_INT;
+	ret = ksz_write8(dev, addr, data);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int ksz9477_ptp_enable_port_egress_interrupts(struct ksz_device *dev, int port)
+{
+	u32 addr = PORT_CTRL_ADDR(port, REG_PTP_PORT_TX_INT_ENABLE__2);
+	u16 data;
+	int ret;
+
+	ret = ksz_read16(dev, addr, &data);
+	if (ret)
+		return ret;
+
+	/* Enable port xdelay egress timestamp interrupt (1 means enabled) */
+	data |= PTP_PORT_XDELAY_REQ_INT;
+	ret = ksz_write16(dev, addr, data);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int ksz9477_ptp_disable_port_egress_interrupts(struct ksz_device *dev, int port)
+{
+	u32 addr = PORT_CTRL_ADDR(port, REG_PTP_PORT_TX_INT_ENABLE__2);
+	u16 data;
+	int ret;
+
+	ret = ksz_read16(dev, addr, &data);
+	if (ret)
+		return ret;
+
+	/* Disable port xdelay egress timestamp interrupts (0 means disabled) */
+	data &= PTP_PORT_XDELAY_REQ_INT;
+	ret = ksz_write16(dev, addr, data);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int ksz9477_ptp_port_init(struct ksz_device *dev, int port)
+{
+	struct ksz_port *prt = &dev->ports[port];
+	int ret;
+
+	/* Read rx and tx delay from port registers */
+	ret = ksz_read16(dev, PORT_CTRL_ADDR(port, REG_PTP_PORT_RX_DELAY__2),
+			 &prt->tstamp_rx_latency_ns);
+	if (ret)
+		return ret;
+
+	ret = ksz_read16(dev, PORT_CTRL_ADDR(port, REG_PTP_PORT_TX_DELAY__2),
+			 &prt->tstamp_tx_latency_ns);
+	if (ret)
+		return ret;
+
+	if (port != dev->cpu_port) {
+		ret = ksz9477_ptp_enable_port_ptp_interrupts(dev, port);
+		if (ret)
+			return ret;
+
+		ret = ksz9477_ptp_enable_port_egress_interrupts(dev, port);
+		if (ret)
+			goto error_disable_port_ptp_interrupts;
+	}
+
+	return 0;
+
+error_disable_port_ptp_interrupts:
+	if (port != dev->cpu_port)
+		ksz9477_ptp_disable_port_ptp_interrupts(dev, port);
+	return ret;
+}
+
+static void ksz9477_ptp_port_deinit(struct ksz_device *dev, int port)
+{
+	if (port != dev->cpu_port) {
+		ksz9477_ptp_disable_port_egress_interrupts(dev, port);
+		ksz9477_ptp_disable_port_ptp_interrupts(dev, port);
+	}
+}
+
+static int ksz9477_ptp_ports_init(struct ksz_device *dev)
+{
+	int port;
+	int ret;
+
+	for (port = 0; port < dev->port_cnt; port++) {
+		ret = ksz9477_ptp_port_init(dev, port);
+		if (ret)
+			goto error_deinit;
+	}
+
+	return 0;
+
+error_deinit:
+	for (--port; port >= 0; --port)
+		ksz9477_ptp_port_deinit(dev, port);
+	return ret;
+}
+
+static void ksz9477_ptp_ports_deinit(struct ksz_device *dev)
+{
+	int port;
+
+	for (port = dev->port_cnt - 1; port >= 0; --port)
+		ksz9477_ptp_port_deinit(dev, port);
+}
+
+/* device attributes */
+
+enum ksz9477_ptp_tcmode {
+	KSZ9477_PTP_TCMODE_E2E,
+	KSZ9477_PTP_TCMODE_P2P,
+};
+
+static int ksz9477_ptp_tcmode_set(struct ksz_device *dev, enum ksz9477_ptp_tcmode tcmode)
+{
+	u16 data;
+	int ret;
+
+	ret = ksz_read16(dev, REG_PTP_MSG_CONF1, &data);
+	if (ret)
+		return ret;
+
+	if (tcmode == KSZ9477_PTP_TCMODE_P2P)
+		data |= PTP_TC_P2P;
+	else
+		data &= ~PTP_TC_P2P;
+
+	ret = ksz_write16(dev, REG_PTP_MSG_CONF1, data);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+enum ksz9477_ptp_ocmode {
+	KSZ9477_PTP_OCMODE_SLAVE,
+	KSZ9477_PTP_OCMODE_MASTER,
+};
+
+static int ksz9477_ptp_ocmode_set(struct ksz_device *dev, enum ksz9477_ptp_ocmode ocmode)
+{
+	u16 data;
+	int ret;
+
+	ret = ksz_read16(dev, REG_PTP_MSG_CONF1, &data);
+	if (ret)
+		return ret;
+
+	if (ocmode == KSZ9477_PTP_OCMODE_MASTER)
+		data |= PTP_MASTER;
+	else
+		data &= ~PTP_MASTER;
+
+	ret = ksz_write16(dev, REG_PTP_MSG_CONF1, data);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
 int ksz9477_ptp_init(struct ksz_device *dev)
 {
 	int ret;
 
 	mutex_init(&dev->ptp_mutex);
+	spin_lock_init(&dev->ptp_clock_lock);
 
 	/* PTP clock properties */
 
@@ -275,6 +550,7 @@  int ksz9477_ptp_init(struct ksz_device *dev)
 	dev->ptp_caps.gettime64   = ksz9477_ptp_gettime;
 	dev->ptp_caps.settime64   = ksz9477_ptp_settime;
 	dev->ptp_caps.enable      = ksz9477_ptp_enable;
+	dev->ptp_caps.do_aux_work = ksz9477_ptp_do_aux_work;
 
 	/* Start hardware counter (will overflow after 136 years) */
 	ret = ksz9477_ptp_start_clock(dev);
@@ -287,8 +563,44 @@  int ksz9477_ptp_init(struct ksz_device *dev)
 		goto error_stop_clock;
 	}
 
+	/* Enable PTP mode (will affect tail tagging format) */
+	ret = ksz9477_ptp_enable_mode(dev);
+	if (ret)
+		goto error_unregister_clock;
+
+	/* Init switch ports */
+	ret = ksz9477_ptp_ports_init(dev);
+	if (ret)
+		goto error_disable_mode;
+
+	/* Currently, only P2P delay measurement is supported.  Setting ocmode to
+	 * slave will work independently of actually being master or slave.
+	 * For E2E delay measurement, switching between master and slave would
+	 * be required, as the KSZ devices filters out PTP messages depending on
+	 * the ocmode setting:
+	 * - in slave mode, DelayReq messages are filtered out
+	 * - in master mode, Sync messages are filtered out
+	 * Currently (and probably also in future) there is no interface in the
+	 * kernel which allows switching between master and slave mode.  For this
+	 * reason, E2E cannot be supported. See patchwork for full discussion:
+	 * https://patchwork.ozlabs.org/project/netdev/patch/20201019172435.4416-8-ceggers@arri.de/
+	 */
+	ksz9477_ptp_tcmode_set(dev, KSZ9477_PTP_TCMODE_P2P);
+	ksz9477_ptp_ocmode_set(dev, KSZ9477_PTP_OCMODE_SLAVE);
+
+	/* Schedule cyclic call of ksz_ptp_do_aux_work() */
+	ret = ptp_schedule_worker(dev->ptp_clock, 0);
+	if (ret)
+		goto error_ports_deinit;
+
 	return 0;
 
+error_ports_deinit:
+	ksz9477_ptp_ports_deinit(dev);
+error_disable_mode:
+	ksz9477_ptp_disable_mode(dev);
+error_unregister_clock:
+	ptp_clock_unregister(dev->ptp_clock);
 error_stop_clock:
 	ksz9477_ptp_stop_clock(dev);
 	return ret;
@@ -296,6 +608,238 @@  int ksz9477_ptp_init(struct ksz_device *dev)
 
 void ksz9477_ptp_deinit(struct ksz_device *dev)
 {
+	ksz9477_ptp_ports_deinit(dev);
+	ksz9477_ptp_disable_mode(dev);
 	ptp_clock_unregister(dev->ptp_clock);
 	ksz9477_ptp_stop_clock(dev);
 }
+
+irqreturn_t ksz9477_ptp_port_interrupt(struct ksz_device *dev, int port)
+{
+	u32 addr = PORT_CTRL_ADDR(port, REG_PTP_PORT_TX_INT_STATUS__2);
+	struct ksz_port *prt = &dev->ports[port];
+	irqreturn_t result = IRQ_NONE;
+	u16 data;
+	int ret;
+
+	ret = ksz_read16(dev, addr, &data);
+	if (ret)
+		return IRQ_NONE;
+
+	if ((data & PTP_PORT_XDELAY_REQ_INT) && prt->tstamp_tx_xdelay_skb) {
+		/* Timestamp for Pdelay_Req / Delay_Req */
+		u32 tstamp_raw;
+		ktime_t tstamp;
+		struct skb_shared_hwtstamps shhwtstamps;
+		struct sk_buff *tmp_skb;
+
+		/* In contrast to the KSZ9563R data sheet, the format of the
+		 * port time stamp registers is also 2 bit seconds + 30 bit
+		 * nanoseconds (same as in the tail tags).
+		 */
+		ret = ksz_read32(dev, PORT_CTRL_ADDR(port, REG_PTP_PORT_XDELAY_TS), &tstamp_raw);
+		if (ret)
+			return result;
+
+		tstamp = ksz9477_decode_tstamp(tstamp_raw, prt->tstamp_tx_latency_ns);
+		memset(&shhwtstamps, 0, sizeof(shhwtstamps));
+		shhwtstamps.hwtstamp = ksz9477_tstamp_to_clock(dev, tstamp);
+
+		/* skb_complete_tx_timestamp() will free up the client to make
+		 * another timestamp-able transmit. We have to be ready for it
+		 * -- by clearing the ps->tx_skb "flag" -- beforehand.
+		 */
+
+		tmp_skb = prt->tstamp_tx_xdelay_skb;
+		prt->tstamp_tx_xdelay_skb = NULL;
+		clear_bit_unlock(KSZ_HWTSTAMP_TX_XDELAY_IN_PROGRESS, &prt->tstamp_state);
+		skb_complete_tx_timestamp(tmp_skb, &shhwtstamps);
+	}
+
+	/* Clear interrupt(s) (W1C) */
+	ret = ksz_write16(dev, addr, data);
+	if (ret)
+		return IRQ_NONE;
+
+	return IRQ_HANDLED;
+}
+
+/* DSA PTP operations */
+
+int ksz9477_ptp_get_ts_info(struct dsa_switch *ds, int port __always_unused,
+			    struct ethtool_ts_info *ts)
+{
+	struct ksz_device *dev = ds->priv;
+
+	ts->so_timestamping =
+			SOF_TIMESTAMPING_TX_HARDWARE |
+			SOF_TIMESTAMPING_RX_HARDWARE |
+			SOF_TIMESTAMPING_RAW_HARDWARE;
+
+	ts->phc_index = ptp_clock_index(dev->ptp_clock);
+
+	ts->tx_types =
+			BIT(HWTSTAMP_TX_OFF) |
+			BIT(HWTSTAMP_TX_ONESTEP_P2P);
+
+	ts->rx_filters =
+			BIT(HWTSTAMP_FILTER_NONE) |
+			BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT) |
+			BIT(HWTSTAMP_FILTER_PTP_V2_L2_EVENT) |
+			BIT(HWTSTAMP_FILTER_PTP_V2_EVENT);
+
+	return 0;
+}
+
+static int ksz9477_set_hwtstamp_config(struct ksz_device *dev, int port,
+				       struct hwtstamp_config *config)
+{
+	struct ksz_port *prt = &dev->ports[port];
+	bool tstamp_enable = false;
+
+	/* Prevent the TX/RX paths from trying to interact with the
+	 * timestamp hardware while we reconfigure it.
+	 */
+	clear_bit_unlock(KSZ_HWTSTAMP_ENABLED, &prt->tstamp_state);
+
+	/* reserved for future extensions */
+	if (config->flags)
+		return -EINVAL;
+
+	switch (config->tx_type) {
+	case HWTSTAMP_TX_OFF:
+		tstamp_enable = false;
+		break;
+	case HWTSTAMP_TX_ONESTEP_P2P:
+		tstamp_enable = true;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	switch (config->rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		tstamp_enable = false;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;  /* UDPv4/UDPv6 */
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;  /* 802.3 ether */
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		config->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;     /* UDP / 802.3 */
+		break;
+	case HWTSTAMP_FILTER_ALL:
+	default:
+		config->rx_filter = HWTSTAMP_FILTER_NONE;
+		return -ERANGE;
+	}
+
+	/* Once hardware has been configured, enable timestamp checks
+	 * in the RX/TX paths.
+	 */
+	if (tstamp_enable)
+		set_bit(KSZ_HWTSTAMP_ENABLED, &prt->tstamp_state);
+
+	return 0;
+}
+
+int ksz9477_ptp_port_hwtstamp_get(struct dsa_switch *ds, int port, struct ifreq *ifr)
+{
+	struct ksz_device *dev = ds->priv;
+	struct hwtstamp_config *port_tstamp_config = &dev->ports[port].tstamp_config;
+
+	return copy_to_user(ifr->ifr_data,
+			    port_tstamp_config, sizeof(*port_tstamp_config)) ? -EFAULT : 0;
+}
+
+int ksz9477_ptp_port_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr)
+{
+	struct ksz_device *dev = ds->priv;
+	struct hwtstamp_config *port_tstamp_config = &dev->ports[port].tstamp_config;
+	struct hwtstamp_config config;
+	int err;
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	err = ksz9477_set_hwtstamp_config(dev, port, &config);
+	if (err)
+		return err;
+
+	/* Save the chosen configuration to be returned later. */
+	memcpy(port_tstamp_config, &config, sizeof(config));
+
+	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? -EFAULT : 0;
+}
+
+/* Returns a pointer to the PTP header if the caller should time stamp,
+ * or NULL if the caller should not.
+ */
+static struct ptp_header *ksz9477_ptp_should_tstamp(struct ksz_port *port, struct sk_buff *skb,
+						    unsigned int type)
+{
+	enum ksz9477_ptp_event_messages msg_type;
+	struct ptp_header *hdr;
+
+	if (!test_bit(KSZ_HWTSTAMP_ENABLED, &port->tstamp_state))
+		return NULL;
+
+	hdr = ptp_parse_header(skb, type);
+	if (!hdr)
+		return NULL;
+
+	msg_type = ptp_get_msgtype(hdr, type);
+
+	switch (msg_type) {
+	/* As the KSZ9563 always performs one step time stamping, only the time
+	 * stamp for Pdelay_Req is reported to the application via socket error
+	 * queue.  Time stamps for Sync and Pdelay_resp will be applied directly
+	 * to the outgoing message (e.g. correction field), but will NOT be
+	 * reported to the socket.
+	 * Delay_Req is not time stamped as E2E is currently not supported by
+	 * this driver.  See ksz9477_ptp_init() for details.
+	 */
+	case PTP_Event_Message_Pdelay_Req:
+		break;
+	default:
+		return NULL;
+	}
+
+	return hdr;
+}
+
+bool ksz9477_ptp_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *clone,
+			       unsigned int type)
+{
+	struct ksz_device *dev = ds->priv;
+	struct ksz_port *prt = &dev->ports[port];
+	struct ptp_header *hdr;
+
+	/* KSZ9563 supports PTPv2 only */
+	if (!(type & PTP_CLASS_V2))
+		return false;
+
+	/* Should already been tested in dsa_skb_tx_timestamp()? */
+	if (!(skb_shinfo(clone)->tx_flags & SKBTX_HW_TSTAMP))
+		return false;
+
+	hdr = ksz9477_ptp_should_tstamp(prt, clone, type);
+	if (!hdr)
+		return false;
+
+	if (test_and_set_bit_lock(KSZ_HWTSTAMP_TX_XDELAY_IN_PROGRESS,
+				  &prt->tstamp_state))
+		return false;  /* free cloned skb */
+
+	prt->tstamp_tx_xdelay_skb = clone;
+
+	return true;  /* keep cloned skb */
+}
diff --git a/drivers/net/dsa/microchip/ksz9477_ptp.h b/drivers/net/dsa/microchip/ksz9477_ptp.h
index 0076538419fa..e8d50a086311 100644
--- a/drivers/net/dsa/microchip/ksz9477_ptp.h
+++ b/drivers/net/dsa/microchip/ksz9477_ptp.h
@@ -10,6 +10,9 @@ 
 #ifndef DRIVERS_NET_DSA_MICROCHIP_KSZ9477_PTP_H_
 #define DRIVERS_NET_DSA_MICROCHIP_KSZ9477_PTP_H_
 
+#include <linux/irqreturn.h>
+#include <linux/types.h>
+
 #include "ksz_common.h"
 
 #if IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP)
@@ -17,11 +20,35 @@ 
 int ksz9477_ptp_init(struct ksz_device *dev);
 void ksz9477_ptp_deinit(struct ksz_device *dev);
 
+irqreturn_t ksz9477_ptp_port_interrupt(struct ksz_device *dev, int port);
+
+int ksz9477_ptp_get_ts_info(struct dsa_switch *ds, int port, struct ethtool_ts_info *ts);
+int ksz9477_ptp_port_hwtstamp_get(struct dsa_switch *ds, int port, struct ifreq *ifr);
+int ksz9477_ptp_port_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr);
+bool ksz9477_ptp_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *clone,
+			       unsigned int type);
+
 #else
 
 static inline int ksz9477_ptp_init(struct ksz_device *dev) { return 0; }
 static inline void ksz9477_ptp_deinit(struct ksz_device *dev) {}
 
+static inline irqreturn_t ksz9477_ptp_port_interrupt(struct ksz_device *dev, int port)
+	{ return IRQ_NONE; }
+
+static inline int ksz9477_ptp_get_ts_info(struct dsa_switch *ds, int port,
+					  struct ethtool_ts_info *ts) { return -EOPNOTSUPP; }
+
+static inline int ksz9477_ptp_port_hwtstamp_get(struct dsa_switch *ds, int port,
+						struct ifreq *ifr) { return -EOPNOTSUPP; }
+
+static inline int ksz9477_ptp_port_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr)
+	{ return -EOPNOTSUPP; }
+
+static inline bool ksz9477_ptp_port_txtstamp(struct dsa_switch *ds, int port,
+					     struct sk_buff *clone, unsigned int type)
+	{ return false; }
+
 #endif
 
 #endif /* DRIVERS_NET_DSA_MICROCHIP_KSZ9477_PTP_H_ */
diff --git a/include/linux/dsa/ksz_common.h b/include/linux/dsa/ksz_common.h
index 4d5b6cc9429a..bfe8873b1636 100644
--- a/include/linux/dsa/ksz_common.h
+++ b/include/linux/dsa/ksz_common.h
@@ -5,16 +5,27 @@ 
 #ifndef _NET_DSA_KSZ_COMMON_H_
 #define _NET_DSA_KSZ_COMMON_H_
 
+#include <linux/bitfield.h>
+#include <linux/bits.h>
 #include <linux/gpio/consumer.h>
 #include <linux/kernel.h>
+#include <linux/ktime.h>
 #include <linux/mutex.h>
+#include <linux/net_tstamp.h>
 #include <linux/phy.h>
 #include <linux/ptp_clock_kernel.h>
+#include <linux/spinlock.h>
 #include <linux/regmap.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <net/dsa.h>
 
+/* All time stamps from the KSZ consist of 2 bits for seconds and 30 bits for
+ * nanoseconds. This is NOT the same as 32 bits for nanoseconds.
+ */
+#define KSZ_TSTAMP_SEC_MASK  GENMASK(31, 30)
+#define KSZ_TSTAMP_NSEC_MASK GENMASK(29, 0)
+
 struct ksz_platform_data;
 struct ksz_dev_ops;
 struct vlan_table;
@@ -25,6 +36,12 @@  struct ksz_port_mib {
 	u64 *counters;
 };
 
+/* state flags for ksz_port::tstamp_state */
+enum {
+	KSZ_HWTSTAMP_ENABLED,
+	KSZ_HWTSTAMP_TX_XDELAY_IN_PROGRESS,
+};
+
 struct ksz_port {
 	u16 member;
 	u16 vid_member;
@@ -41,6 +58,13 @@  struct ksz_port {
 
 	struct ksz_port_mib mib;
 	phy_interface_t interface;
+#if IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP)
+	u16 tstamp_rx_latency_ns;   /* rx delay from wire to tstamp unit */
+	u16 tstamp_tx_latency_ns;   /* tx delay from tstamp unit to wire */
+	struct hwtstamp_config tstamp_config;
+	struct sk_buff *tstamp_tx_xdelay_skb;
+	unsigned long tstamp_state;
+#endif
 };
 
 struct ksz_device {
@@ -98,7 +122,30 @@  struct ksz_device {
 	struct ptp_clock *ptp_clock;
 	struct ptp_clock_info ptp_caps;
 	struct mutex ptp_mutex;
+	spinlock_t ptp_clock_lock; /* for ptp_clock_time */
+	/* approximated current time, read once per second from hardware */
+	struct timespec64 ptp_clock_time;
 #endif
 };
 
+/* KSZ9563 will only timestamp event messages */
+enum ksz9477_ptp_event_messages {
+	PTP_Event_Message_Sync        = 0x0,
+	PTP_Event_Message_Delay_Req   = 0x1,
+	PTP_Event_Message_Pdelay_Req  = 0x2,
+	PTP_Event_Message_Pdelay_Resp = 0x3,
+};
+
+/* net/dsa/tag_ksz.c */
+static inline ktime_t ksz9477_decode_tstamp(u32 tstamp, int offset_ns)
+{
+	u64 ns = FIELD_GET(KSZ_TSTAMP_SEC_MASK, tstamp) * NSEC_PER_SEC +
+		 FIELD_GET(KSZ_TSTAMP_NSEC_MASK, tstamp);
+
+	/* Add/remove excess delay between wire and time stamp unit */
+	return ns_to_ktime(ns + offset_ns);
+}
+
+ktime_t ksz9477_tstamp_to_clock(struct ksz_device *ksz, ktime_t tstamp);
+
 #endif /* _NET_DSA_KSZ_COMMON_H_ */
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 4820dbcedfa2..c16eb9eae19c 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -4,10 +4,14 @@ 
  * Copyright (c) 2017 Microchip Technology
  */
 
+#include <asm/unaligned.h>
+#include <linux/dsa/ksz_common.h>
 #include <linux/etherdevice.h>
 #include <linux/list.h>
+#include <linux/ptp_classify.h>
 #include <linux/slab.h>
 #include <net/dsa.h>
+#include <net/checksum.h>
 #include "dsa_priv.h"
 
 /* Typically only one byte is used for tail tag. */
@@ -87,26 +91,125 @@  MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795);
 /*
  * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS.
  * ---------------------------------------------------------------------------
- * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
  * ---------------------------------------------------------------------------
+ * ts   : time stamp (only present if PTP is enabled on the hardware).
  * tag0 : Prioritization (not used now)
  * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
  *
- * For Egress (KSZ9477 -> Host), 1 byte is added before FCS.
+ * For Egress (KSZ9477 -> Host), 1/4 bytes are added before FCS.
  * ---------------------------------------------------------------------------
- * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes)
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes)
  * ---------------------------------------------------------------------------
+ * ts   : time stamp (only present of bit 7 in tag0 is set
  * tag0 : zero-based value represents port
  *	  (eg, 0x00=port1, 0x02=port3, 0x06=port7)
  */
 
 #define KSZ9477_INGRESS_TAG_LEN		2
 #define KSZ9477_PTP_TAG_LEN		4
-#define KSZ9477_PTP_TAG_INDICATION	0x80
+#define KSZ9477_PTP_TAG_INDICATION	BIT(7)
 
 #define KSZ9477_TAIL_TAG_OVERRIDE	BIT(9)
 #define KSZ9477_TAIL_TAG_LOOKUP		BIT(10)
 
+#if IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP)
+/* Time stamp tag is only inserted if PTP is enabled in hardware. */
+static void ksz9477_xmit_timestamp(struct sk_buff *skb)
+{
+	/* We send always 0 in the tail tag.  For PDelay_Resp, the ingress
+	 * time stamp of the PDelay_Req message has already been subtracted from
+	 * the correction field on rx.
+	 */
+	put_unaligned_be32(0, skb_put(skb, KSZ9477_PTP_TAG_LEN));
+}
+
+ktime_t ksz9477_tstamp_to_clock(struct ksz_device *ksz, ktime_t tstamp)
+{
+	struct timespec64 ts = ktime_to_timespec64(tstamp);
+	struct timespec64 ptp_clock_time;
+	struct timespec64 diff;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ksz->ptp_clock_lock, flags);
+	ptp_clock_time = ksz->ptp_clock_time;
+	spin_unlock_irqrestore(&ksz->ptp_clock_lock, flags);
+
+	/* calculate full time from partial time stamp */
+	ts.tv_sec = (ptp_clock_time.tv_sec & ~3) | ts.tv_sec;
+
+	/* find nearest possible point in time */
+	diff = timespec64_sub(ts, ptp_clock_time);
+	if (diff.tv_sec > 2)
+		ts.tv_sec -= 4;
+	else if (diff.tv_sec < -2)
+		ts.tv_sec += 4;
+
+	return timespec64_to_ktime(ts);
+}
+EXPORT_SYMBOL(ksz9477_tstamp_to_clock);
+
+static void ksz9477_rcv_timestamp(struct sk_buff *skb, u8 *tag,
+				  struct net_device *dev, unsigned int port)
+{
+	struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
+	u8 *tstamp_raw = tag - KSZ9477_PTP_TAG_LEN;
+	enum ksz9477_ptp_event_messages msg_type;
+	struct dsa_switch *ds = dev->dsa_ptr->ds;
+	struct ksz_device *ksz = ds->priv;
+	struct ksz_port *prt = &ksz->ports[port];
+	struct ptp_header *ptp_hdr;
+	unsigned int ptp_type;
+	ktime_t tstamp;
+
+	/* convert time stamp and write to skb */
+	tstamp = ksz9477_decode_tstamp(get_unaligned_be32(tstamp_raw),
+				       -prt->tstamp_rx_latency_ns);
+	memset(hwtstamps, 0, sizeof(*hwtstamps));
+	hwtstamps->hwtstamp = ksz9477_tstamp_to_clock(ksz, tstamp);
+
+	/* For PDelay_Req messages, user space (ptp4l) expects that the hardware
+	 * subtracts the ingress time stamp from the correction field.  The
+	 * separate hw time stamp from the sk_buff struct will not be used in
+	 * this case.
+	 */
+	if (skb_headroom(skb) < ETH_HLEN)
+		return;
+
+	__skb_push(skb, ETH_HLEN);
+	ptp_type = ptp_classify_raw(skb);
+	__skb_pull(skb, ETH_HLEN);
+
+	if (ptp_type == PTP_CLASS_NONE)
+		return;
+
+	ptp_hdr = ptp_parse_header(skb, ptp_type);
+	if (!ptp_hdr)
+		return;
+
+	msg_type = ptp_get_msgtype(ptp_hdr, ptp_type);
+	if (msg_type != PTP_Event_Message_Pdelay_Req)
+		return;
+
+	/* Only subtract partial time stamp from the correction field.  When the
+	 * hardware adds the egress time stamp to the correction field of the
+	 * PDelay_Resp message on tx, also only the partial time stamp will be
+	 * added.
+	 */
+	ptp_onestep_p2p_move_t2_to_correction(skb, ptp_type, ptp_hdr, tstamp);
+}
+#else   /* IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP) */
+static void ksz9477_xmit_timestamp(struct sk_buff *skb __maybe_unused)
+{
+}
+
+static void ksz9477_rcv_timestamp(struct sk_buff *skb __maybe_unused, u8 *tag __maybe_unused,
+				  struct net_device *dev __maybe_unused,
+				  unsigned int port __maybe_unused)
+{
+}
+#endif  /* IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ9477_PTP) */
+
 static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
 				    struct net_device *dev)
 {
@@ -116,6 +219,7 @@  static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
 	u16 val;
 
 	/* Tag encoding */
+	ksz9477_xmit_timestamp(skb);
 	tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN);
 	addr = skb_mac_header(skb);
 
@@ -138,8 +242,10 @@  static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev,
 	unsigned int len = KSZ_EGRESS_TAG_LEN;
 
 	/* Extra 4-bytes PTP timestamp */
-	if (tag[0] & KSZ9477_PTP_TAG_INDICATION)
+	if (tag[0] & KSZ9477_PTP_TAG_INDICATION) {
+		ksz9477_rcv_timestamp(skb, tag, dev, port);
 		len += KSZ9477_PTP_TAG_LEN;
+	}
 
 	return ksz_common_rcv(skb, dev, port, len);
 }
@@ -149,7 +255,7 @@  static const struct dsa_device_ops ksz9477_netdev_ops = {
 	.proto	= DSA_TAG_PROTO_KSZ9477,
 	.xmit	= ksz9477_xmit,
 	.rcv	= ksz9477_rcv,
-	.overhead = KSZ9477_INGRESS_TAG_LEN,
+	.overhead = KSZ9477_INGRESS_TAG_LEN + KSZ9477_PTP_TAG_LEN,
 	.tail_tag = true,
 };
 
@@ -167,6 +273,7 @@  static struct sk_buff *ksz9893_xmit(struct sk_buff *skb,
 	u8 *tag;
 
 	/* Tag encoding */
+	ksz9477_xmit_timestamp(skb);
 	tag = skb_put(skb, KSZ_INGRESS_TAG_LEN);
 	addr = skb_mac_header(skb);
 
@@ -183,7 +290,7 @@  static const struct dsa_device_ops ksz9893_netdev_ops = {
 	.proto	= DSA_TAG_PROTO_KSZ9893,
 	.xmit	= ksz9893_xmit,
 	.rcv	= ksz9477_rcv,
-	.overhead = KSZ_INGRESS_TAG_LEN,
+	.overhead = KSZ_INGRESS_TAG_LEN + KSZ9477_PTP_TAG_LEN,
 	.tail_tag = true,
 };