diff mbox series

[v2,1/2] igb: RX descriptors handling cleanup

Message ID 20230426160908.4768-1-t.dzieciol@partner.samsung.com
State New
Headers show
Series [v2,1/2] igb: RX descriptors handling cleanup | expand

Commit Message

Format of Intel 82576 was changed in comparison to Intel 82574 extended
descriptors. This change updates filling of advanced descriptors fields
accordingly:
* remove TCP ACK detection
* add IPv6 with extensions traffic detection
* fragment checksum and IP ID is filled only when RXCSUM.IPPCSE is set (in
  addition to RXCSUM.PCSD bit cleared condition)

Refactoring is done in preparation for support of multiple advanced
descriptors RX modes, especially packet-split modes.

Signed-off-by: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
---
 hw/net/e1000e_core.c     |  16 +-
 hw/net/e1000x_regs.h     |   1 +
 hw/net/igb_core.c        | 505 ++++++++++++++++++++++++---------------
 hw/net/igb_regs.h        |  21 +-
 hw/net/trace-events      |   6 +-
 tests/qtest/libqos/igb.c |   5 +
 6 files changed, 344 insertions(+), 210 deletions(-)

Comments

Akihiko Odaki April 27, 2023, 7:24 a.m. UTC | #1
On 2023/04/27 1:09, Tomasz Dzieciol wrote:
> Format of Intel 82576 was changed in comparison to Intel 82574 extended
> descriptors. This change updates filling of advanced descriptors fields
> accordingly:
> * remove TCP ACK detection
> * add IPv6 with extensions traffic detection
> * fragment checksum and IP ID is filled only when RXCSUM.IPPCSE is set (in
>    addition to RXCSUM.PCSD bit cleared condition)
> 
> Refactoring is done in preparation for support of multiple advanced
> descriptors RX modes, especially packet-split modes.
> 
> Signed-off-by: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>

As I wrote earlier:
 > Please rebase to:
 > https://patchew.org/QEMU/20230420054657.50367-1-akihiko.odaki@daynix.com/
 >
 > And add a cover letter with the following lines:
 > Based-on: <20230420054657.50367-1-akihiko.odaki@daynix.com>
 > ("[PATCH v2 00/41] igb: Fix for DPDK")
 >
 > I want the series to be merged first as it includes several bug fixes.

Also, next time CC: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
He is a reviewer of igb.

> ---
>   hw/net/e1000e_core.c     |  16 +-
>   hw/net/e1000x_regs.h     |   1 +
>   hw/net/igb_core.c        | 505 ++++++++++++++++++++++++---------------
>   hw/net/igb_regs.h        |  21 +-
>   hw/net/trace-events      |   6 +-
>   tests/qtest/libqos/igb.c |   5 +
>   6 files changed, 344 insertions(+), 210 deletions(-)
> 
> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
> index 78373d7db7..d874809d93 100644
> --- a/hw/net/e1000e_core.c
> +++ b/hw/net/e1000e_core.c
> @@ -1418,11 +1418,11 @@ e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
>   }
>   
>   static void
> -e1000e_write_to_rx_buffers(E1000ECore *core,
> -                           hwaddr ba[MAX_PS_BUFFERS],
> -                           e1000e_ba_state *bastate,
> -                           const char *data,
> -                           dma_addr_t data_len)
> +e1000e_write_payload_to_rx_buffers(E1000ECore *core,
> +                                   hwaddr ba[MAX_PS_BUFFERS],
> +                                   e1000e_ba_state *bastate,
> +                                   const char *data,
> +                                   dma_addr_t data_len)
>   {
>       while (data_len > 0) {
>           uint32_t cur_buf_len = core->rxbuf_sizes[bastate->cur_idx];
> @@ -1594,8 +1594,8 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
>                   while (copy_size) {
>                       iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
>   
> -                    e1000e_write_to_rx_buffers(core, ba, &bastate,
> -                                            iov->iov_base + iov_ofs, iov_copy);
> +                    e1000e_write_payload_to_rx_buffers(core, ba, &bastate,
> +                                                       iov->iov_base + iov_ofs, iov_copy);
>   
>                       copy_size -= iov_copy;
>                       iov_ofs += iov_copy;
> @@ -1607,7 +1607,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
>   
>                   if (desc_offset + desc_size >= total_size) {
>                       /* Simulate FCS checksum presence in the last descriptor */
> -                    e1000e_write_to_rx_buffers(core, ba, &bastate,
> +                    e1000e_write_payload_to_rx_buffers(core, ba, &bastate,
>                             (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
>                   }
>               }
> diff --git a/hw/net/e1000x_regs.h b/hw/net/e1000x_regs.h
> index 13760c66d3..344fd10359 100644
> --- a/hw/net/e1000x_regs.h
> +++ b/hw/net/e1000x_regs.h
> @@ -827,6 +827,7 @@ union e1000_rx_desc_packet_split {
>   /* Receive Checksum Control bits */
>   #define E1000_RXCSUM_IPOFLD     0x100   /* IP Checksum Offload Enable */
>   #define E1000_RXCSUM_TUOFLD     0x200   /* TCP/UDP Checksum Offload Enable */
> +#define E1000_RXCSUM_IPPCSE     0x1000  /* IP Payload Checksum enable */
>   #define E1000_RXCSUM_PCSD       0x2000  /* Packet Checksum Disable */
>   
>   #define E1000_RING_DESC_LEN       (16)
> diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
> index 96b7335b31..47c4a16e84 100644
> --- a/hw/net/igb_core.c
> +++ b/hw/net/igb_core.c
> @@ -267,6 +267,21 @@ igb_rx_use_legacy_descriptor(IGBCore *core)
>       return false;
>   }
>   
> +typedef struct E1000E_RingInfo_st {
> +    int dbah;
> +    int dbal;
> +    int dlen;
> +    int dh;
> +    int dt;
> +    int idx;
> +} E1000E_RingInfo;
> +
> +static uint32_t
> +igb_rx_queue_desctyp_get(IGBCore *core, const E1000E_RingInfo *r)
> +{
> +    return core->mac[E1000_SRRCTL(r->idx) >> 2] & E1000_SRRCTL_DESCTYPE_MASK;
> +}
> +
>   static inline bool
>   igb_rss_enabled(IGBCore *core)
>   {
> @@ -694,15 +709,6 @@ static uint32_t igb_rx_wb_eic(IGBCore *core, int queue_idx)
>       return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0;
>   }
>   
> -typedef struct E1000E_RingInfo_st {
> -    int dbah;
> -    int dbal;
> -    int dlen;
> -    int dh;
> -    int dt;
> -    int idx;
> -} E1000E_RingInfo;
> -
>   static inline bool
>   igb_ring_empty(IGBCore *core, const E1000E_RingInfo *r)
>   {
> @@ -941,6 +947,14 @@ igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
>                            bufsize;
>   }
>   
> +static uint32_t
> +igb_get_queue_rx_header_buf_size(IGBCore *core, const E1000E_RingInfo *r)
> +{
> +    uint32_t srr_size = (core->mac[E1000_SRRCTL(r->idx) >> 2] &
> +                         E1000_SRRCTL_BSIZEHDR_MASK) >> 8;
> +    return srr_size * 64;

Instead, do:
uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
return (srrctl & E1000_SRRCTL_BSIZEHDRSIZE_MASK) >> 
E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;

x >> E1000_SRRCTL_BSIZEHDRSIZE_SHIFT is identical with: (x >> 8) * 64

Also, you should use E1000_SRRCTL_BSIZEHDRSIZE_MASK rather than 
E1000_SRRCTL_BSIZEHDR_MASK here. Section 8.10.2 of the datasheet says 
the field is 3-bit.

> +}
> +
>   void
>   igb_start_recv(IGBCore *core)
>   {
> @@ -1225,21 +1239,34 @@ igb_read_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
>   }
>   
>   static inline void
> -igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
> -                      hwaddr *buff_addr)
> +igb_read_adv_rx_single_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
> +                                 hwaddr *buff_addr)
>   {
>       *buff_addr = le64_to_cpu(desc->read.pkt_addr);
>   }
>   
>   static inline void
>   igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
> -                  hwaddr *buff_addr)
> +                  hwaddr buff_addr[MAX_PS_BUFFERS],

Don't use an array here. Instead use a struct as e1000_adv_rx_desc does; 
it will be more explicit and easier to read. And I think this should be 
rather part of the next patch.

> +                  const E1000E_RingInfo *r)
>   {
> +    uint32_t desc_type;
> +
>       if (igb_rx_use_legacy_descriptor(core)) {
> -        igb_read_lgcy_rx_descr(core, &desc->legacy, buff_addr);
> -    } else {
> -        igb_read_adv_rx_descr(core, &desc->adv, buff_addr);
> +        igb_read_lgcy_rx_descr(core, &desc->legacy, &buff_addr[1]);
> +        buff_addr[0] = 0;
> +        return;
>       }
> +
> +    /* modes other than advanced single buffer descriptor not supported */
> +    desc_type = igb_rx_queue_desctyp_get(core, r);
> +    if (desc_type != E1000_SRRCTL_DESCTYPE_ADV_ONEBUF) {
> +        trace_igb_wrn_rx_desc_modes_not_supp(desc_type);
> +    }
> +
> +    /* advanced single buffer descriptor */
> +    igb_read_adv_rx_single_buf_descr(core, &desc->adv, &buff_addr[1]);
> +    buff_addr[0] = 0;
>   }
>   
>   static void
> @@ -1281,15 +1308,11 @@ igb_verify_csum_in_sw(IGBCore *core,
>   }
>   
>   static void
> -igb_build_rx_metadata(IGBCore *core,
> -                      struct NetRxPkt *pkt,
> -                      bool is_eop,
> -                      const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
> -                      uint16_t *pkt_info, uint16_t *hdr_info,
> -                      uint32_t *rss,
> -                      uint32_t *status_flags,
> -                      uint16_t *ip_id,
> -                      uint16_t *vlan_tag)
> +igb_build_rx_metadata_common(IGBCore *core,
> +                             struct NetRxPkt *pkt,
> +                             bool is_eop,
> +                             uint32_t *status_flags,
> +                             uint16_t *vlan_tag)
>   {
>       struct virtio_net_hdr *vhdr;
>       bool hasip4, hasip6, csum_valid;
> @@ -1298,7 +1321,6 @@ igb_build_rx_metadata(IGBCore *core,
>       *status_flags = E1000_RXD_STAT_DD;
>   
>       /* No additional metadata needed for non-EOP descriptors */
> -    /* TODO: EOP apply only to status so don't skip whole function. */
>       if (!is_eop) {
>           goto func_exit;
>       }
> @@ -1315,64 +1337,6 @@ igb_build_rx_metadata(IGBCore *core,
>           trace_e1000e_rx_metadata_vlan(*vlan_tag);
>       }
>   
> -    /* Packet parsing results */
> -    if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
> -        if (rss_info->enabled) {
> -            *rss = cpu_to_le32(rss_info->hash);
> -            trace_igb_rx_metadata_rss(*rss);
> -        }
> -    } else if (hasip4) {
> -            *status_flags |= E1000_RXD_STAT_IPIDV;
> -            *ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
> -            trace_e1000e_rx_metadata_ip_id(*ip_id);
> -    }
> -
> -    if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && net_rx_pkt_is_tcp_ack(pkt)) {
> -        *status_flags |= E1000_RXD_STAT_ACK;
> -        trace_e1000e_rx_metadata_ack();
> -    }
> -
> -    if (pkt_info) {
> -        *pkt_info = rss_info->enabled ? rss_info->type : 0;
> -
> -        if (etqf < 8) {
> -            *pkt_info |= BIT(11) | (etqf << 4);
> -        } else {
> -            if (hasip4) {
> -                *pkt_info |= E1000_ADVRXD_PKT_IP4;
> -            }
> -
> -            if (hasip6) {
> -                *pkt_info |= E1000_ADVRXD_PKT_IP6;
> -            }
> -
> -            switch (l4hdr_proto) {
> -            case ETH_L4_HDR_PROTO_TCP:
> -                *pkt_info |= E1000_ADVRXD_PKT_TCP;
> -                break;
> -
> -            case ETH_L4_HDR_PROTO_UDP:
> -                *pkt_info |= E1000_ADVRXD_PKT_UDP;
> -                break;
> -
> -            case ETH_L4_HDR_PROTO_SCTP:
> -                *pkt_info |= E1000_ADVRXD_PKT_SCTP;
> -                break;
> -
> -            default:
> -                break;
> -            }
> -        }
> -    }
> -
> -    if (hdr_info) {
> -        *hdr_info = 0;
> -    }
> -
> -    if (ts) {
> -        *status_flags |= BIT(16);
> -    }
> -
>       /* RX CSO information */
>       if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) {
>           trace_e1000e_rx_metadata_ipv6_sum_disabled();
> @@ -1426,58 +1390,125 @@ func_exit:
>   }
>   
>   static inline void
> -igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
> +igb_write_lgcy_rx_descr(IGBCore *core,
> +                        struct e1000_rx_desc *d,

Why renaming desc to d?

>                           struct NetRxPkt *pkt,
> -                        const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
> +                        const E1000E_RSSInfo *rss_info,
>                           uint16_t length)
>   {
> -    uint32_t status_flags, rss;
> -    uint16_t ip_id;
> +    uint32_t status_flags;
>   
>       assert(!rss_info->enabled);
> -    desc->length = cpu_to_le16(length);
> -    desc->csum = 0;
>   
> -    igb_build_rx_metadata(core, pkt, pkt != NULL,
> -                          rss_info, etqf, ts,
> -                          NULL, NULL, &rss,
> -                          &status_flags, &ip_id,
> -                          &desc->special);
> -    desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
> -    desc->status = (uint8_t) le32_to_cpu(status_flags);
> +    memset(d, 0, sizeof(*d));
> +    d->length = cpu_to_le16(length);
> +    igb_build_rx_metadata_common(core, pkt, pkt != NULL,
> +                                 &status_flags,
> +                                 &d->special);
> +
> +    d->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
> +    d->status = (uint8_t) le32_to_cpu(status_flags);
> +}
> +
> +static uint16_t
> +igb_rx_desc_get_packet_type(IGBCore *core, struct NetRxPkt *pkt, uint16_t etqf)
> +{
> +    uint16_t pkt_type = 0;
> +    bool hasip4, hasip6;
> +    EthL4HdrProto l4hdr_proto;
> +
> +    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
> +
> +    if (hasip6 && !(core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
> +        eth_ip6_hdr_info *ip6hdr_info  = net_rx_pkt_get_ip6_info(pkt);
> +        pkt_type |= ip6hdr_info->has_ext_hdrs ? E1000_ADVRXD_PKT_IP6E :
> +                                                E1000_ADVRXD_PKT_IP6;
> +    } else if (hasip4) {
> +        pkt_type = E1000_ADVRXD_PKT_IP4;
> +    }
> +
> +    if (etqf < 8) {
> +        pkt_type |= (BIT(11) >> 4) | etqf;
> +        return pkt_type;
> +    }
> +
> +    switch (l4hdr_proto) {
> +    case ETH_L4_HDR_PROTO_TCP:
> +        pkt_type |= E1000_ADVRXD_PKT_TCP;
> +        break;
> +    case ETH_L4_HDR_PROTO_UDP:
> +        pkt_type |= E1000_ADVRXD_PKT_UDP;
> +        break;
> +    case ETH_L4_HDR_PROTO_SCTP:
> +        pkt_type |= E1000_ADVRXD_PKT_SCTP;
> +        break;
> +    default:
> +        break;
> +    }
> +
> +    return pkt_type;
>   }
>   
>   static inline void
> -igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
> +igb_write_adv_rx_descr(IGBCore *core,
> +                       union e1000_adv_rx_desc *d,
>                          struct NetRxPkt *pkt,
> -                       const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
> +                       const E1000E_RSSInfo *rss_info,
> +                       uint16_t etqf,
> +                       bool ts,
>                          uint16_t length)
>   {
> -    memset(&desc->wb, 0, sizeof(desc->wb));
> +    bool hasip4, hasip6;
> +    EthL4HdrProto l4hdr_proto;
> +    uint16_t rss_type = 0, pkt_type;
> +    bool eop = (pkt != NULL);
> +    memset(&d->wb, 0, sizeof(d->wb));
> +
> +    d->wb.upper.length = cpu_to_le16(length);
> +    igb_build_rx_metadata_common(core, pkt, eop,
> +                                 &d->wb.upper.status_error,
> +                                 &d->wb.upper.vlan);
> +
> +    if (!eop) {
> +        return;
> +    }
> +
> +    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
>   
> -    desc->wb.upper.length = cpu_to_le16(length);
> +    if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) && rss_info->enabled) {

I think the older style better described that RSS and checksumming is 
mutally exclusive.

> +        d->wb.lower.hi_dword.rss = cpu_to_le32(rss_info->hash);
> +        rss_type = rss_info->type;
> +        trace_igb_rx_metadata_rss(d->wb.lower.hi_dword.rss, rss_type);
> +    } else if (!(core->mac[RXCSUM] & E1000_RXCSUM_PCSD) &&
> +               core->mac[RXCSUM] & E1000_RXCSUM_IPPCSE &&
> +               hasip4) {
> +        d->wb.lower.hi_dword.csum_ip.ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
> +        trace_e1000e_rx_metadata_ip_id(d->wb.lower.hi_dword.csum_ip.ip_id);
> +    }
>   
> -    igb_build_rx_metadata(core, pkt, pkt != NULL,
> -                          rss_info, etqf, ts,
> -                          &desc->wb.lower.lo_dword.pkt_info,
> -                          &desc->wb.lower.lo_dword.hdr_info,
> -                          &desc->wb.lower.hi_dword.rss,
> -                          &desc->wb.upper.status_error,
> -                          &desc->wb.lower.hi_dword.csum_ip.ip_id,
> -                          &desc->wb.upper.vlan);
> +    if (ts) {
> +        d->wb.upper.status_error |= BIT(16);
> +    }
> +
> +    pkt_type = igb_rx_desc_get_packet_type(core, pkt, etqf);
> +    trace_e1000e_rx_metadata_pkt_type(pkt_type);
> +    d->wb.lower.lo_dword.pkt_info = cpu_to_le16(rss_type | (pkt_type << 4));
>   }
>   
>   static inline void
> -igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
> -                   struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
> -                   uint16_t etqf, bool ts, uint16_t length)
> +igb_write_rx_descr(IGBCore *core,
> +                   union e1000_rx_desc_union *desc,
> +                   struct NetRxPkt *pkt,
> +                   const E1000E_RSSInfo *rss_info,
> +                   uint16_t etqf,
> +                   bool ts,
> +                   uint16_t(*written)[MAX_PS_BUFFERS],
> +                   const E1000E_RingInfo *r)
>   {
>       if (igb_rx_use_legacy_descriptor(core)) {
> -        igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info,
> -                                etqf, ts, length);
> +        igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, (*written)[1]);
>       } else {
> -        igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info,
> -                               etqf, ts, length);
> +        igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info, etqf, ts, (*written)[1]);
>       }
>   }
>   
> @@ -1513,19 +1544,6 @@ igb_pci_dma_write_rx_desc(IGBCore *core, PCIDevice *dev, dma_addr_t addr,
>       }
>   }
>   
> -static void
> -igb_write_to_rx_buffers(IGBCore *core,
> -                        PCIDevice *d,
> -                        hwaddr ba,
> -                        uint16_t *written,
> -                        const char *data,
> -                        dma_addr_t data_len)
> -{
> -    trace_igb_rx_desc_buff_write(ba, *written, data, data_len);
> -    pci_dma_write(d, ba + *written, data, data_len);
> -    *written += data_len;
> -}
> -
>   static void
>   igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
>                       size_t pkt_size, size_t pkt_fcs_size)
> @@ -1551,6 +1569,136 @@ igb_rx_descr_threshold_hit(IGBCore *core, const E1000E_RingInfo *rxi)
>              ((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16;
>   }
>   
> +typedef struct igb_ba_state_st {
> +    uint16_t written[MAX_PS_BUFFERS];
> +    uint8_t cur_idx;
> +} igb_ba_state;
> +
> +typedef struct {
> +    size_t size;
> +    size_t total_size;
> +    size_t ps_hdr_len;
> +    size_t desc_size;
> +    size_t desc_offset;
> +    uint32_t rx_desc_packet_buf_size;
> +    uint32_t rx_desc_header_buf_size;
> +    struct iovec *iov;
> +    size_t iov_ofs;
> +    bool is_first;
> +    igb_ba_state bastate;
> +    hwaddr ba[MAX_PS_BUFFERS];
> +} igb_packet_tx_dma_state;

Follow the convention described in docs/devel/style.rst for new typedefs.

Also, it says tx but it's used for rx.

> +
> +static void
> +igb_truncate_to_descriptor_size(igb_packet_tx_dma_state *pdma_st, size_t *size)
> +{
> +    if (*size > pdma_st->rx_desc_packet_buf_size) {
> +        *size = pdma_st->rx_desc_packet_buf_size;
> +    }
> +}
> +
> +static void
> +igb_write_payload_to_rx_buffers(IGBCore *core,
> +                                PCIDevice *d,
> +                                hwaddr (*ba)[MAX_PS_BUFFERS],


> +                                igb_ba_state *bastate,
> +                                uint32_t cur_buf_len,
> +                                const char *data,
> +                                dma_addr_t data_len)
> +{
> +    while (data_len > 0) {
> +        assert(bastate->cur_idx < MAX_PS_BUFFERS);
> +
> +        uint32_t cur_buf_bytes_left = cur_buf_len -
> +                                      bastate->written[bastate->cur_idx];
> +        uint32_t bytes_to_write = MIN(data_len, cur_buf_bytes_left);
> +
> +        trace_igb_rx_desc_buff_write(bastate->cur_idx,
> +                                     (*ba)[bastate->cur_idx],
> +                                     bastate->written[bastate->cur_idx],
> +                                     data,
> +                                     bytes_to_write);
> +
> +        pci_dma_write(d,
> +                      (*ba)[bastate->cur_idx] +
> +                      bastate->written[bastate->cur_idx],
> +                      data, bytes_to_write);
> +
> +        bastate->written[bastate->cur_idx] += bytes_to_write;
> +        data += bytes_to_write;
> +        data_len -= bytes_to_write;
> +
> +        if (bastate->written[bastate->cur_idx] == cur_buf_len) {
> +            bastate->cur_idx++;
> +        }
> +    }
> +}
> +
> +static void
> +igb_write_packet_payload_to_descr_addr(IGBCore *core,

The names of igb_write_payload_to_rx_buffers() and 
igb_write_packet_payload_to_descr_addr() don't disambiguate themselves. 
Since igb_write_payload_to_rx_buffers() only writes a framgent of the 
payload, you may rename it to 
igb_write_payload_fragment_to_rx_buffers(), and rename 
igb_write_packet_payload_to_descr_addr() to 
igb_write_payload_to_rx_buffers().

> +                                       struct NetRxPkt *pkt,
> +                                       PCIDevice *d,
> +                                       igb_packet_tx_dma_state *pdma_st,
> +                                       size_t *copy_size)
> +{
> +    static const uint32_t fcs_pad;
> +    size_t iov_copy;
> +
> +    /* Copy packet payload */
> +    while (*copy_size) {
> +        iov_copy = MIN(*copy_size, pdma_st->iov->iov_len - pdma_st->iov_ofs);
> +        igb_write_payload_to_rx_buffers(core, d,
> +                                        &pdma_st->ba,
> +                                        &pdma_st->bastate,
> +                                        pdma_st->rx_desc_packet_buf_size,
> +                                        pdma_st->iov->iov_base + pdma_st->iov_ofs,
> +                                        iov_copy);
> +
> +        *copy_size -= iov_copy;
> +        pdma_st->iov_ofs += iov_copy;
> +        if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
> +            pdma_st->iov++;
> +            pdma_st->iov_ofs = 0;
> +        }
> +    }
> +
> +    if (pdma_st->desc_offset + pdma_st->desc_size >= pdma_st->total_size) {
> +        /* Simulate FCS checksum presence in the last descriptor */
> +        igb_write_payload_to_rx_buffers(core, d,
> +                                        &pdma_st->ba,
> +                                        &pdma_st->bastate,
> +                                        pdma_st->rx_desc_packet_buf_size,
> +                                        (const char *) &fcs_pad,
> +                                        e1000x_fcs_len(core->mac));
> +    }
> +}
> +
> +static void
> +igb_write_to_rx_buffers(IGBCore *core,
> +                        struct NetRxPkt *pkt,
> +                        PCIDevice *d,
> +                        igb_packet_tx_dma_state *pdma_st)
> +{
> +    size_t copy_size;
> +
> +    if (!(pdma_st->ba)[1]) {
> +        /* as per intel docs; skip descriptors with null buf addr */
> +        trace_e1000e_rx_null_descriptor();
> +        return;
> +    }
> +
> +    if (pdma_st->desc_offset >= pdma_st->size) {
> +        return;
> +    }
> +
> +    pdma_st->desc_size = pdma_st->total_size - pdma_st->desc_offset;
> +    igb_truncate_to_descriptor_size(pdma_st, &pdma_st->desc_size);
> +    copy_size = pdma_st->size - pdma_st->desc_offset;
> +    igb_truncate_to_descriptor_size(pdma_st, &copy_size);
> +    pdma_st->bastate.cur_idx = 1;
> +    igb_write_packet_payload_to_descr_addr(core, pkt, d, pdma_st, &copy_size);
> +}
> +
>   static void
>   igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
>                             const E1000E_RxRing *rxr,
> @@ -1560,91 +1708,58 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
>       PCIDevice *d;
>       dma_addr_t base;
>       union e1000_rx_desc_union desc;
> -    size_t desc_size;
> -    size_t desc_offset = 0;
> -    size_t iov_ofs = 0;
> -
> -    struct iovec *iov = net_rx_pkt_get_iovec(pkt);
> -    size_t size = net_rx_pkt_get_total_len(pkt);
> -    size_t total_size = size + e1000x_fcs_len(core->mac);
> -    const E1000E_RingInfo *rxi = rxr->i;
> -    size_t bufsize = igb_rxbufsize(core, rxi);
> -
> +    const E1000E_RingInfo *rxi;
> +    size_t rx_desc_len;
> +
> +    igb_packet_tx_dma_state pdma_st = {0};
> +    pdma_st.is_first = true;
> +    pdma_st.size = net_rx_pkt_get_total_len(pkt);
> +    pdma_st.total_size = pdma_st.size + e1000x_fcs_len(core->mac);
> +
> +    rxi = rxr->i;
> +    rx_desc_len = core->rx_desc_len;
> +    pdma_st.rx_desc_packet_buf_size =
> +        igb_rxbufsize(core, rxi);
> +    pdma_st.rx_desc_header_buf_size =
> +        igb_get_queue_rx_header_buf_size(core, rxi);
> +    pdma_st.iov = net_rx_pkt_get_iovec(pkt);
>       d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8);
>       if (!d) {
>           d = core->owner;
>       }
>   
>       do {
> -        hwaddr ba;
> -        uint16_t written = 0;
> +        memset(&pdma_st.bastate, 0, sizeof(igb_ba_state));
>           bool is_last = false;
>   
> -        desc_size = total_size - desc_offset;
> -
> -        if (desc_size > bufsize) {
> -            desc_size = bufsize;
> -        }
> -
>           if (igb_ring_empty(core, rxi)) {
>               return;
>           }
>   
>           base = igb_ring_head_descr(core, rxi);
> +        pci_dma_read(d, base, &desc, rx_desc_len);
> +        trace_e1000e_rx_descr(rxi->idx, base, rx_desc_len);
>   
> -        pci_dma_read(d, base, &desc, core->rx_desc_len);
> -
> -        trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len);
> -
> -        igb_read_rx_descr(core, &desc, &ba);
> -
> -        if (ba) {
> -            if (desc_offset < size) {
> -                static const uint32_t fcs_pad;
> -                size_t iov_copy;
> -                size_t copy_size = size - desc_offset;
> -                if (copy_size > bufsize) {
> -                    copy_size = bufsize;
> -                }
> -
> -                /* Copy packet payload */
> -                while (copy_size) {
> -                    iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
> -
> -                    igb_write_to_rx_buffers(core, d, ba, &written,
> -                                            iov->iov_base + iov_ofs, iov_copy);
> +        igb_read_rx_descr(core, &desc, pdma_st.ba, rxi);
>   
> -                    copy_size -= iov_copy;
> -                    iov_ofs += iov_copy;
> -                    if (iov_ofs == iov->iov_len) {
> -                        iov++;
> -                        iov_ofs = 0;
> -                    }
> -                }
> -
> -                if (desc_offset + desc_size >= total_size) {
> -                    /* Simulate FCS checksum presence in the last descriptor */
> -                    igb_write_to_rx_buffers(core, d, ba, &written,
> -                          (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
> -                }
> -            }
> -        } else { /* as per intel docs; skip descriptors with null buf addr */
> -            trace_e1000e_rx_null_descriptor();
> -        }
> -        desc_offset += desc_size;
> -        if (desc_offset >= total_size) {
> +        igb_write_to_rx_buffers(core, pkt, d, &pdma_st);
> +        pdma_st.desc_offset += pdma_st.desc_size;
> +        if (pdma_st.desc_offset >= pdma_st.total_size) {
>               is_last = true;
>           }
>   
> -        igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL,
> -                           rss_info, etqf, ts, written);
> -        igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len);
> -
> -        igb_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN);
> -
> -    } while (desc_offset < total_size);
> -
> -    igb_update_rx_stats(core, rxi, size, total_size);
> +        igb_write_rx_descr(core, &desc,
> +                           is_last ? pkt : NULL,
> +                           rss_info,
> +                           etqf, ts,
> +                           &pdma_st.bastate.written,
> +                           rxi);
> +        pci_dma_write(d, base, &desc, rx_desc_len);
> +        igb_ring_advance(core, rxi,
> +                         rx_desc_len / E1000_MIN_RX_DESC_LEN);
> +    } while (pdma_st.desc_offset < pdma_st.total_size);
> +
> +    igb_update_rx_stats(core, rxi, pdma_st.size, pdma_st.total_size);
>   }
>   
>   static bool
> diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h
> index 82ff195dfc..b14ca834ad 100644
> --- a/hw/net/igb_regs.h
> +++ b/hw/net/igb_regs.h
> @@ -76,6 +76,15 @@ union e1000_adv_rx_desc {
>       } wb;  /* writeback */
>   };
>   
> +#define IGB_MAX_PS_BUFFERS  2
> +#define IGB_MAX_RX_DESC_LEN (sizeof(uint64_t) * MAX_PS_BUFFERS)
> +
> +#define RX_DESC_ADV_HDR_LEN_OFFSET    (21 - 16)
> +#define RX_DESC_ADV_HDR_LEN_MASK      ((BIT(10) - 1) << \
> +                                       RX_DESC_ADV_HDR_LEN_OFFSET)
> +#define RX_DESC_ADV_HDR_SPH           BIT(15)
> +#define RX_DESC_ADV_ST_ERR_HBO_OFFSET BIT(3 + 20)
> +

These should be added afer /* new */

And use the name ADVRXD to denote the advanced Rx descriptor to make it 
consistent with ADVTXD.

>   /* from igb/e1000_phy.h */
>   
>   /* IGP01E1000 Specific Registers */
> @@ -452,6 +461,7 @@ union e1000_adv_rx_desc {
>   #define E1000_SRRCTL_BSIZEHDRSIZE_MASK         0x00000F00
>   #define E1000_SRRCTL_BSIZEHDRSIZE_SHIFT        2  /* Shift _left_ */
>   #define E1000_SRRCTL_DESCTYPE_ADV_ONEBUF       0x02000000
> +#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT        0x04000000
>   #define E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS 0x0A000000
>   #define E1000_SRRCTL_DESCTYPE_MASK             0x0E000000
>   #define E1000_SRRCTL_DROP_EN                   0x80000000
> @@ -692,11 +702,12 @@ union e1000_adv_rx_desc {
>   
>   #define E1000_STATUS_NUM_VFS_SHIFT 14
>   
> -#define E1000_ADVRXD_PKT_IP4 BIT(4)
> -#define E1000_ADVRXD_PKT_IP6 BIT(6)
> -#define E1000_ADVRXD_PKT_TCP BIT(8)
> -#define E1000_ADVRXD_PKT_UDP BIT(9)
> -#define E1000_ADVRXD_PKT_SCTP BIT(10)
> +#define E1000_ADVRXD_PKT_IP4  BIT(0)
> +#define E1000_ADVRXD_PKT_IP6  BIT(2)
> +#define E1000_ADVRXD_PKT_IP6E BIT(3)
> +#define E1000_ADVRXD_PKT_TCP  BIT(4)
> +#define E1000_ADVRXD_PKT_UDP  BIT(5)
> +#define E1000_ADVRXD_PKT_SCTP BIT(6)
>   
>   static inline uint8_t igb_ivar_entry_rx(uint8_t i)
>   {
> diff --git a/hw/net/trace-events b/hw/net/trace-events
> index e4a98b2c7d..9a02261360 100644
> --- a/hw/net/trace-events
> +++ b/hw/net/trace-events
> @@ -277,9 +277,9 @@ igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED"
>   igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool pfrstd) "Set extended link params: ASD check: %d, Speed select bypass: %d, PF reset done: %d"
>   
>   igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
> -igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
> +igb_rx_desc_buff_write(uint8_t idx, uint64_t addr, uint16_t offset, const void* source, uint32_t len) "buffer #%u, addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
>   
> -igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X"
> +igb_rx_metadata_rss(uint32_t rss, uint16_t rss_pkt_type) "RSS data: rss: 0x%X, rss_pkt_type: 0x%X"
>   
>   igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR enabled"
>   igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x"
> @@ -294,6 +294,8 @@ igb_irq_eitr_set(uint32_t eitr_num, uint32_t val) "EITR[%u] = 0x%x"
>   igb_set_pfmailbox(uint32_t vf_num, uint32_t val) "PFMailbox[%d]: 0x%x"
>   igb_set_vfmailbox(uint32_t vf_num, uint32_t val) "VFMailbox[%d]: 0x%x"
>   
> +igb_wrn_rx_desc_modes_not_supp(int desc_type) "Not supported descriptor type: %d"
> +
>   # igbvf.c
>   igbvf_wrn_io_addr_unknown(uint64_t addr) "IO unknown register 0x%"PRIx64
>   
> diff --git a/tests/qtest/libqos/igb.c b/tests/qtest/libqos/igb.c
> index a603468beb..4e720ee7e7 100644
> --- a/tests/qtest/libqos/igb.c
> +++ b/tests/qtest/libqos/igb.c
> @@ -55,6 +55,7 @@ static void igb_pci_start_hw(QOSGraphObject *obj)
>   {
>       static const uint8_t address[] = E1000E_ADDRESS;
>       QE1000E_PCI *d = (QE1000E_PCI *) obj;
> +    uint32_t srrctl0;
>       uint32_t val;
>   
>       /* Enable the device */
> @@ -109,6 +110,10 @@ static void igb_pci_start_hw(QOSGraphObject *obj)
>                           E1000_RAH_AV | E1000_RAH_POOL_1 |
>                           le16_to_cpu(*(uint16_t *)(address + 4)));
>   
> +    /* Set supported receive descriptor mode */
> +    srrctl0 = e1000e_macreg_read(&d->e1000e, E1000_SRRCTL(0));
> +    e1000e_macreg_write(&d->e1000e, E1000_SRRCTL(0), srrctl0 | BIT(25));

As I wrote in previous review:
 > You don't need to read the initial value. The Linux driver also
 > doesn't do that.
 >
 > Also use: E1000_SRRCTL_DESCTYPE_ADV_ONEBUF

> +
>       /* Enable receive */
>       e1000e_macreg_write(&d->e1000e, E1000_RFCTL, E1000_RFCTL_EXTEN);
>       e1000e_macreg_write(&d->e1000e, E1000_RCTL, E1000_RCTL_EN);
On Thu, Apr 27, 2023 at 04:24:10PM +0900, Akihiko Odaki wrote:
> On 2023/04/27 1:09, Tomasz Dzieciol wrote:
> > Format of Intel 82576 was changed in comparison to Intel 82574 extended
> > descriptors. This change updates filling of advanced descriptors fields
> > accordingly:
> > * remove TCP ACK detection
> > * add IPv6 with extensions traffic detection
> > * fragment checksum and IP ID is filled only when RXCSUM.IPPCSE is set (in
> >    addition to RXCSUM.PCSD bit cleared condition)
> > 
> > Refactoring is done in preparation for support of multiple advanced
> > descriptors RX modes, especially packet-split modes.
> > 
> > Signed-off-by: Tomasz Dzieciol <t.dzieciol@partner.samsung.com>
> 
> As I wrote earlier:
> > Please rebase to:
> > https://protect2.fireeye.com/v1/url?k=512318e9-30a80dd9-512293a6-000babffaa23-3bee4c44b5723bf0&q=1&e=355018c1-72f7-43ba-be71-fd86193bc66d&u=https%3A%2F%2Fpatchew.org%2FQEMU%2F20230420054657.50367-1-akihiko.odaki%40daynix.com%2F
> >
> > And add a cover letter with the following lines:
> > Based-on: <20230420054657.50367-1-akihiko.odaki@daynix.com>
> > ("[PATCH v2 00/41] igb: Fix for DPDK")
> >
> > I want the series to be merged first as it includes several bug fixes.
> 
> Also, next time CC: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> He is a reviewer of igb.

I rebased it to v3, just didn't added cover letter. Should I rebase it to v2 as pointed by the link you provided or to v4 as it it currently the newest?

I will apply rest of comments.

> 
> > ---
> >   hw/net/e1000e_core.c     |  16 +-
> >   hw/net/e1000x_regs.h     |   1 +
> >   hw/net/igb_core.c        | 505 ++++++++++++++++++++++++---------------
> >   hw/net/igb_regs.h        |  21 +-
> >   hw/net/trace-events      |   6 +-
> >   tests/qtest/libqos/igb.c |   5 +
> >   6 files changed, 344 insertions(+), 210 deletions(-)
> > 
> > diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
> > index 78373d7db7..d874809d93 100644
> > --- a/hw/net/e1000e_core.c
> > +++ b/hw/net/e1000e_core.c
> > @@ -1418,11 +1418,11 @@ e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
> >   }
> >   static void
> > -e1000e_write_to_rx_buffers(E1000ECore *core,
> > -                           hwaddr ba[MAX_PS_BUFFERS],
> > -                           e1000e_ba_state *bastate,
> > -                           const char *data,
> > -                           dma_addr_t data_len)
> > +e1000e_write_payload_to_rx_buffers(E1000ECore *core,
> > +                                   hwaddr ba[MAX_PS_BUFFERS],
> > +                                   e1000e_ba_state *bastate,
> > +                                   const char *data,
> > +                                   dma_addr_t data_len)
> >   {
> >       while (data_len > 0) {
> >           uint32_t cur_buf_len = core->rxbuf_sizes[bastate->cur_idx];
> > @@ -1594,8 +1594,8 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
> >                   while (copy_size) {
> >                       iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
> > -                    e1000e_write_to_rx_buffers(core, ba, &bastate,
> > -                                            iov->iov_base + iov_ofs, iov_copy);
> > +                    e1000e_write_payload_to_rx_buffers(core, ba, &bastate,
> > +                                                       iov->iov_base + iov_ofs, iov_copy);
> >                       copy_size -= iov_copy;
> >                       iov_ofs += iov_copy;
> > @@ -1607,7 +1607,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
> >                   if (desc_offset + desc_size >= total_size) {
> >                       /* Simulate FCS checksum presence in the last descriptor */
> > -                    e1000e_write_to_rx_buffers(core, ba, &bastate,
> > +                    e1000e_write_payload_to_rx_buffers(core, ba, &bastate,
> >                             (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
> >                   }
> >               }
> > diff --git a/hw/net/e1000x_regs.h b/hw/net/e1000x_regs.h
> > index 13760c66d3..344fd10359 100644
> > --- a/hw/net/e1000x_regs.h
> > +++ b/hw/net/e1000x_regs.h
> > @@ -827,6 +827,7 @@ union e1000_rx_desc_packet_split {
> >   /* Receive Checksum Control bits */
> >   #define E1000_RXCSUM_IPOFLD     0x100   /* IP Checksum Offload Enable */
> >   #define E1000_RXCSUM_TUOFLD     0x200   /* TCP/UDP Checksum Offload Enable */
> > +#define E1000_RXCSUM_IPPCSE     0x1000  /* IP Payload Checksum enable */
> >   #define E1000_RXCSUM_PCSD       0x2000  /* Packet Checksum Disable */
> >   #define E1000_RING_DESC_LEN       (16)
> > diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
> > index 96b7335b31..47c4a16e84 100644
> > --- a/hw/net/igb_core.c
> > +++ b/hw/net/igb_core.c
> > @@ -267,6 +267,21 @@ igb_rx_use_legacy_descriptor(IGBCore *core)
> >       return false;
> >   }
> > +typedef struct E1000E_RingInfo_st {
> > +    int dbah;
> > +    int dbal;
> > +    int dlen;
> > +    int dh;
> > +    int dt;
> > +    int idx;
> > +} E1000E_RingInfo;
> > +
> > +static uint32_t
> > +igb_rx_queue_desctyp_get(IGBCore *core, const E1000E_RingInfo *r)
> > +{
> > +    return core->mac[E1000_SRRCTL(r->idx) >> 2] & E1000_SRRCTL_DESCTYPE_MASK;
> > +}
> > +
> >   static inline bool
> >   igb_rss_enabled(IGBCore *core)
> >   {
> > @@ -694,15 +709,6 @@ static uint32_t igb_rx_wb_eic(IGBCore *core, int queue_idx)
> >       return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0;
> >   }
> > -typedef struct E1000E_RingInfo_st {
> > -    int dbah;
> > -    int dbal;
> > -    int dlen;
> > -    int dh;
> > -    int dt;
> > -    int idx;
> > -} E1000E_RingInfo;
> > -
> >   static inline bool
> >   igb_ring_empty(IGBCore *core, const E1000E_RingInfo *r)
> >   {
> > @@ -941,6 +947,14 @@ igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
> >                            bufsize;
> >   }
> > +static uint32_t
> > +igb_get_queue_rx_header_buf_size(IGBCore *core, const E1000E_RingInfo *r)
> > +{
> > +    uint32_t srr_size = (core->mac[E1000_SRRCTL(r->idx) >> 2] &
> > +                         E1000_SRRCTL_BSIZEHDR_MASK) >> 8;
> > +    return srr_size * 64;
> 
> Instead, do:
> uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
> return (srrctl & E1000_SRRCTL_BSIZEHDRSIZE_MASK) >>
> E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
> 
> x >> E1000_SRRCTL_BSIZEHDRSIZE_SHIFT is identical with: (x >> 8) * 64
> 
> Also, you should use E1000_SRRCTL_BSIZEHDRSIZE_MASK rather than
> E1000_SRRCTL_BSIZEHDR_MASK here. Section 8.10.2 of the datasheet says the
> field is 3-bit.
> 
> > +}
> > +
> >   void
> >   igb_start_recv(IGBCore *core)
> >   {
> > @@ -1225,21 +1239,34 @@ igb_read_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
> >   }
> >   static inline void
> > -igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
> > -                      hwaddr *buff_addr)
> > +igb_read_adv_rx_single_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
> > +                                 hwaddr *buff_addr)
> >   {
> >       *buff_addr = le64_to_cpu(desc->read.pkt_addr);
> >   }
> >   static inline void
> >   igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
> > -                  hwaddr *buff_addr)
> > +                  hwaddr buff_addr[MAX_PS_BUFFERS],
> 
> Don't use an array here. Instead use a struct as e1000_adv_rx_desc does; it
> will be more explicit and easier to read. And I think this should be rather
> part of the next patch.
> 
> > +                  const E1000E_RingInfo *r)
> >   {
> > +    uint32_t desc_type;
> > +
> >       if (igb_rx_use_legacy_descriptor(core)) {
> > -        igb_read_lgcy_rx_descr(core, &desc->legacy, buff_addr);
> > -    } else {
> > -        igb_read_adv_rx_descr(core, &desc->adv, buff_addr);
> > +        igb_read_lgcy_rx_descr(core, &desc->legacy, &buff_addr[1]);
> > +        buff_addr[0] = 0;
> > +        return;
> >       }
> > +
> > +    /* modes other than advanced single buffer descriptor not supported */
> > +    desc_type = igb_rx_queue_desctyp_get(core, r);
> > +    if (desc_type != E1000_SRRCTL_DESCTYPE_ADV_ONEBUF) {
> > +        trace_igb_wrn_rx_desc_modes_not_supp(desc_type);
> > +    }
> > +
> > +    /* advanced single buffer descriptor */
> > +    igb_read_adv_rx_single_buf_descr(core, &desc->adv, &buff_addr[1]);
> > +    buff_addr[0] = 0;
> >   }
> >   static void
> > @@ -1281,15 +1308,11 @@ igb_verify_csum_in_sw(IGBCore *core,
> >   }
> >   static void
> > -igb_build_rx_metadata(IGBCore *core,
> > -                      struct NetRxPkt *pkt,
> > -                      bool is_eop,
> > -                      const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
> > -                      uint16_t *pkt_info, uint16_t *hdr_info,
> > -                      uint32_t *rss,
> > -                      uint32_t *status_flags,
> > -                      uint16_t *ip_id,
> > -                      uint16_t *vlan_tag)
> > +igb_build_rx_metadata_common(IGBCore *core,
> > +                             struct NetRxPkt *pkt,
> > +                             bool is_eop,
> > +                             uint32_t *status_flags,
> > +                             uint16_t *vlan_tag)
> >   {
> >       struct virtio_net_hdr *vhdr;
> >       bool hasip4, hasip6, csum_valid;
> > @@ -1298,7 +1321,6 @@ igb_build_rx_metadata(IGBCore *core,
> >       *status_flags = E1000_RXD_STAT_DD;
> >       /* No additional metadata needed for non-EOP descriptors */
> > -    /* TODO: EOP apply only to status so don't skip whole function. */
> >       if (!is_eop) {
> >           goto func_exit;
> >       }
> > @@ -1315,64 +1337,6 @@ igb_build_rx_metadata(IGBCore *core,
> >           trace_e1000e_rx_metadata_vlan(*vlan_tag);
> >       }
> > -    /* Packet parsing results */
> > -    if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
> > -        if (rss_info->enabled) {
> > -            *rss = cpu_to_le32(rss_info->hash);
> > -            trace_igb_rx_metadata_rss(*rss);
> > -        }
> > -    } else if (hasip4) {
> > -            *status_flags |= E1000_RXD_STAT_IPIDV;
> > -            *ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
> > -            trace_e1000e_rx_metadata_ip_id(*ip_id);
> > -    }
> > -
> > -    if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && net_rx_pkt_is_tcp_ack(pkt)) {
> > -        *status_flags |= E1000_RXD_STAT_ACK;
> > -        trace_e1000e_rx_metadata_ack();
> > -    }
> > -
> > -    if (pkt_info) {
> > -        *pkt_info = rss_info->enabled ? rss_info->type : 0;
> > -
> > -        if (etqf < 8) {
> > -            *pkt_info |= BIT(11) | (etqf << 4);
> > -        } else {
> > -            if (hasip4) {
> > -                *pkt_info |= E1000_ADVRXD_PKT_IP4;
> > -            }
> > -
> > -            if (hasip6) {
> > -                *pkt_info |= E1000_ADVRXD_PKT_IP6;
> > -            }
> > -
> > -            switch (l4hdr_proto) {
> > -            case ETH_L4_HDR_PROTO_TCP:
> > -                *pkt_info |= E1000_ADVRXD_PKT_TCP;
> > -                break;
> > -
> > -            case ETH_L4_HDR_PROTO_UDP:
> > -                *pkt_info |= E1000_ADVRXD_PKT_UDP;
> > -                break;
> > -
> > -            case ETH_L4_HDR_PROTO_SCTP:
> > -                *pkt_info |= E1000_ADVRXD_PKT_SCTP;
> > -                break;
> > -
> > -            default:
> > -                break;
> > -            }
> > -        }
> > -    }
> > -
> > -    if (hdr_info) {
> > -        *hdr_info = 0;
> > -    }
> > -
> > -    if (ts) {
> > -        *status_flags |= BIT(16);
> > -    }
> > -
> >       /* RX CSO information */
> >       if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) {
> >           trace_e1000e_rx_metadata_ipv6_sum_disabled();
> > @@ -1426,58 +1390,125 @@ func_exit:
> >   }
> >   static inline void
> > -igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
> > +igb_write_lgcy_rx_descr(IGBCore *core,
> > +                        struct e1000_rx_desc *d,
> 
> Why renaming desc to d?
> 
> >                           struct NetRxPkt *pkt,
> > -                        const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
> > +                        const E1000E_RSSInfo *rss_info,
> >                           uint16_t length)
> >   {
> > -    uint32_t status_flags, rss;
> > -    uint16_t ip_id;
> > +    uint32_t status_flags;
> >       assert(!rss_info->enabled);
> > -    desc->length = cpu_to_le16(length);
> > -    desc->csum = 0;
> > -    igb_build_rx_metadata(core, pkt, pkt != NULL,
> > -                          rss_info, etqf, ts,
> > -                          NULL, NULL, &rss,
> > -                          &status_flags, &ip_id,
> > -                          &desc->special);
> > -    desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
> > -    desc->status = (uint8_t) le32_to_cpu(status_flags);
> > +    memset(d, 0, sizeof(*d));
> > +    d->length = cpu_to_le16(length);
> > +    igb_build_rx_metadata_common(core, pkt, pkt != NULL,
> > +                                 &status_flags,
> > +                                 &d->special);
> > +
> > +    d->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
> > +    d->status = (uint8_t) le32_to_cpu(status_flags);
> > +}
> > +
> > +static uint16_t
> > +igb_rx_desc_get_packet_type(IGBCore *core, struct NetRxPkt *pkt, uint16_t etqf)
> > +{
> > +    uint16_t pkt_type = 0;
> > +    bool hasip4, hasip6;
> > +    EthL4HdrProto l4hdr_proto;
> > +
> > +    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
> > +
> > +    if (hasip6 && !(core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
> > +        eth_ip6_hdr_info *ip6hdr_info  = net_rx_pkt_get_ip6_info(pkt);
> > +        pkt_type |= ip6hdr_info->has_ext_hdrs ? E1000_ADVRXD_PKT_IP6E :
> > +                                                E1000_ADVRXD_PKT_IP6;
> > +    } else if (hasip4) {
> > +        pkt_type = E1000_ADVRXD_PKT_IP4;
> > +    }
> > +
> > +    if (etqf < 8) {
> > +        pkt_type |= (BIT(11) >> 4) | etqf;
> > +        return pkt_type;
> > +    }
> > +
> > +    switch (l4hdr_proto) {
> > +    case ETH_L4_HDR_PROTO_TCP:
> > +        pkt_type |= E1000_ADVRXD_PKT_TCP;
> > +        break;
> > +    case ETH_L4_HDR_PROTO_UDP:
> > +        pkt_type |= E1000_ADVRXD_PKT_UDP;
> > +        break;
> > +    case ETH_L4_HDR_PROTO_SCTP:
> > +        pkt_type |= E1000_ADVRXD_PKT_SCTP;
> > +        break;
> > +    default:
> > +        break;
> > +    }
> > +
> > +    return pkt_type;
> >   }
> >   static inline void
> > -igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
> > +igb_write_adv_rx_descr(IGBCore *core,
> > +                       union e1000_adv_rx_desc *d,
> >                          struct NetRxPkt *pkt,
> > -                       const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
> > +                       const E1000E_RSSInfo *rss_info,
> > +                       uint16_t etqf,
> > +                       bool ts,
> >                          uint16_t length)
> >   {
> > -    memset(&desc->wb, 0, sizeof(desc->wb));
> > +    bool hasip4, hasip6;
> > +    EthL4HdrProto l4hdr_proto;
> > +    uint16_t rss_type = 0, pkt_type;
> > +    bool eop = (pkt != NULL);
> > +    memset(&d->wb, 0, sizeof(d->wb));
> > +
> > +    d->wb.upper.length = cpu_to_le16(length);
> > +    igb_build_rx_metadata_common(core, pkt, eop,
> > +                                 &d->wb.upper.status_error,
> > +                                 &d->wb.upper.vlan);
> > +
> > +    if (!eop) {
> > +        return;
> > +    }
> > +
> > +    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
> > -    desc->wb.upper.length = cpu_to_le16(length);
> > +    if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) && rss_info->enabled) {
> 
> I think the older style better described that RSS and checksumming is
> mutally exclusive.
> 
> > +        d->wb.lower.hi_dword.rss = cpu_to_le32(rss_info->hash);
> > +        rss_type = rss_info->type;
> > +        trace_igb_rx_metadata_rss(d->wb.lower.hi_dword.rss, rss_type);
> > +    } else if (!(core->mac[RXCSUM] & E1000_RXCSUM_PCSD) &&
> > +               core->mac[RXCSUM] & E1000_RXCSUM_IPPCSE &&
> > +               hasip4) {
> > +        d->wb.lower.hi_dword.csum_ip.ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
> > +        trace_e1000e_rx_metadata_ip_id(d->wb.lower.hi_dword.csum_ip.ip_id);
> > +    }
> > -    igb_build_rx_metadata(core, pkt, pkt != NULL,
> > -                          rss_info, etqf, ts,
> > -                          &desc->wb.lower.lo_dword.pkt_info,
> > -                          &desc->wb.lower.lo_dword.hdr_info,
> > -                          &desc->wb.lower.hi_dword.rss,
> > -                          &desc->wb.upper.status_error,
> > -                          &desc->wb.lower.hi_dword.csum_ip.ip_id,
> > -                          &desc->wb.upper.vlan);
> > +    if (ts) {
> > +        d->wb.upper.status_error |= BIT(16);
> > +    }
> > +
> > +    pkt_type = igb_rx_desc_get_packet_type(core, pkt, etqf);
> > +    trace_e1000e_rx_metadata_pkt_type(pkt_type);
> > +    d->wb.lower.lo_dword.pkt_info = cpu_to_le16(rss_type | (pkt_type << 4));
> >   }
> >   static inline void
> > -igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
> > -                   struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
> > -                   uint16_t etqf, bool ts, uint16_t length)
> > +igb_write_rx_descr(IGBCore *core,
> > +                   union e1000_rx_desc_union *desc,
> > +                   struct NetRxPkt *pkt,
> > +                   const E1000E_RSSInfo *rss_info,
> > +                   uint16_t etqf,
> > +                   bool ts,
> > +                   uint16_t(*written)[MAX_PS_BUFFERS],
> > +                   const E1000E_RingInfo *r)
> >   {
> >       if (igb_rx_use_legacy_descriptor(core)) {
> > -        igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info,
> > -                                etqf, ts, length);
> > +        igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, (*written)[1]);
> >       } else {
> > -        igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info,
> > -                               etqf, ts, length);
> > +        igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info, etqf, ts, (*written)[1]);
> >       }
> >   }
> > @@ -1513,19 +1544,6 @@ igb_pci_dma_write_rx_desc(IGBCore *core, PCIDevice *dev, dma_addr_t addr,
> >       }
> >   }
> > -static void
> > -igb_write_to_rx_buffers(IGBCore *core,
> > -                        PCIDevice *d,
> > -                        hwaddr ba,
> > -                        uint16_t *written,
> > -                        const char *data,
> > -                        dma_addr_t data_len)
> > -{
> > -    trace_igb_rx_desc_buff_write(ba, *written, data, data_len);
> > -    pci_dma_write(d, ba + *written, data, data_len);
> > -    *written += data_len;
> > -}
> > -
> >   static void
> >   igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
> >                       size_t pkt_size, size_t pkt_fcs_size)
> > @@ -1551,6 +1569,136 @@ igb_rx_descr_threshold_hit(IGBCore *core, const E1000E_RingInfo *rxi)
> >              ((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16;
> >   }
> > +typedef struct igb_ba_state_st {
> > +    uint16_t written[MAX_PS_BUFFERS];
> > +    uint8_t cur_idx;
> > +} igb_ba_state;
> > +
> > +typedef struct {
> > +    size_t size;
> > +    size_t total_size;
> > +    size_t ps_hdr_len;
> > +    size_t desc_size;
> > +    size_t desc_offset;
> > +    uint32_t rx_desc_packet_buf_size;
> > +    uint32_t rx_desc_header_buf_size;
> > +    struct iovec *iov;
> > +    size_t iov_ofs;
> > +    bool is_first;
> > +    igb_ba_state bastate;
> > +    hwaddr ba[MAX_PS_BUFFERS];
> > +} igb_packet_tx_dma_state;
> 
> Follow the convention described in docs/devel/style.rst for new typedefs.
> 
> Also, it says tx but it's used for rx.
> 
> > +
> > +static void
> > +igb_truncate_to_descriptor_size(igb_packet_tx_dma_state *pdma_st, size_t *size)
> > +{
> > +    if (*size > pdma_st->rx_desc_packet_buf_size) {
> > +        *size = pdma_st->rx_desc_packet_buf_size;
> > +    }
> > +}
> > +
> > +static void
> > +igb_write_payload_to_rx_buffers(IGBCore *core,
> > +                                PCIDevice *d,
> > +                                hwaddr (*ba)[MAX_PS_BUFFERS],
> 
> 
> > +                                igb_ba_state *bastate,
> > +                                uint32_t cur_buf_len,
> > +                                const char *data,
> > +                                dma_addr_t data_len)
> > +{
> > +    while (data_len > 0) {
> > +        assert(bastate->cur_idx < MAX_PS_BUFFERS);
> > +
> > +        uint32_t cur_buf_bytes_left = cur_buf_len -
> > +                                      bastate->written[bastate->cur_idx];
> > +        uint32_t bytes_to_write = MIN(data_len, cur_buf_bytes_left);
> > +
> > +        trace_igb_rx_desc_buff_write(bastate->cur_idx,
> > +                                     (*ba)[bastate->cur_idx],
> > +                                     bastate->written[bastate->cur_idx],
> > +                                     data,
> > +                                     bytes_to_write);
> > +
> > +        pci_dma_write(d,
> > +                      (*ba)[bastate->cur_idx] +
> > +                      bastate->written[bastate->cur_idx],
> > +                      data, bytes_to_write);
> > +
> > +        bastate->written[bastate->cur_idx] += bytes_to_write;
> > +        data += bytes_to_write;
> > +        data_len -= bytes_to_write;
> > +
> > +        if (bastate->written[bastate->cur_idx] == cur_buf_len) {
> > +            bastate->cur_idx++;
> > +        }
> > +    }
> > +}
> > +
> > +static void
> > +igb_write_packet_payload_to_descr_addr(IGBCore *core,
> 
> The names of igb_write_payload_to_rx_buffers() and
> igb_write_packet_payload_to_descr_addr() don't disambiguate themselves.
> Since igb_write_payload_to_rx_buffers() only writes a framgent of the
> payload, you may rename it to igb_write_payload_fragment_to_rx_buffers(),
> and rename igb_write_packet_payload_to_descr_addr() to
> igb_write_payload_to_rx_buffers().
> 
> > +                                       struct NetRxPkt *pkt,
> > +                                       PCIDevice *d,
> > +                                       igb_packet_tx_dma_state *pdma_st,
> > +                                       size_t *copy_size)
> > +{
> > +    static const uint32_t fcs_pad;
> > +    size_t iov_copy;
> > +
> > +    /* Copy packet payload */
> > +    while (*copy_size) {
> > +        iov_copy = MIN(*copy_size, pdma_st->iov->iov_len - pdma_st->iov_ofs);
> > +        igb_write_payload_to_rx_buffers(core, d,
> > +                                        &pdma_st->ba,
> > +                                        &pdma_st->bastate,
> > +                                        pdma_st->rx_desc_packet_buf_size,
> > +                                        pdma_st->iov->iov_base + pdma_st->iov_ofs,
> > +                                        iov_copy);
> > +
> > +        *copy_size -= iov_copy;
> > +        pdma_st->iov_ofs += iov_copy;
> > +        if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
> > +            pdma_st->iov++;
> > +            pdma_st->iov_ofs = 0;
> > +        }
> > +    }
> > +
> > +    if (pdma_st->desc_offset + pdma_st->desc_size >= pdma_st->total_size) {
> > +        /* Simulate FCS checksum presence in the last descriptor */
> > +        igb_write_payload_to_rx_buffers(core, d,
> > +                                        &pdma_st->ba,
> > +                                        &pdma_st->bastate,
> > +                                        pdma_st->rx_desc_packet_buf_size,
> > +                                        (const char *) &fcs_pad,
> > +                                        e1000x_fcs_len(core->mac));
> > +    }
> > +}
> > +
> > +static void
> > +igb_write_to_rx_buffers(IGBCore *core,
> > +                        struct NetRxPkt *pkt,
> > +                        PCIDevice *d,
> > +                        igb_packet_tx_dma_state *pdma_st)
> > +{
> > +    size_t copy_size;
> > +
> > +    if (!(pdma_st->ba)[1]) {
> > +        /* as per intel docs; skip descriptors with null buf addr */
> > +        trace_e1000e_rx_null_descriptor();
> > +        return;
> > +    }
> > +
> > +    if (pdma_st->desc_offset >= pdma_st->size) {
> > +        return;
> > +    }
> > +
> > +    pdma_st->desc_size = pdma_st->total_size - pdma_st->desc_offset;
> > +    igb_truncate_to_descriptor_size(pdma_st, &pdma_st->desc_size);
> > +    copy_size = pdma_st->size - pdma_st->desc_offset;
> > +    igb_truncate_to_descriptor_size(pdma_st, &copy_size);
> > +    pdma_st->bastate.cur_idx = 1;
> > +    igb_write_packet_payload_to_descr_addr(core, pkt, d, pdma_st, &copy_size);
> > +}
> > +
> >   static void
> >   igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
> >                             const E1000E_RxRing *rxr,
> > @@ -1560,91 +1708,58 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
> >       PCIDevice *d;
> >       dma_addr_t base;
> >       union e1000_rx_desc_union desc;
> > -    size_t desc_size;
> > -    size_t desc_offset = 0;
> > -    size_t iov_ofs = 0;
> > -
> > -    struct iovec *iov = net_rx_pkt_get_iovec(pkt);
> > -    size_t size = net_rx_pkt_get_total_len(pkt);
> > -    size_t total_size = size + e1000x_fcs_len(core->mac);
> > -    const E1000E_RingInfo *rxi = rxr->i;
> > -    size_t bufsize = igb_rxbufsize(core, rxi);
> > -
> > +    const E1000E_RingInfo *rxi;
> > +    size_t rx_desc_len;
> > +
> > +    igb_packet_tx_dma_state pdma_st = {0};
> > +    pdma_st.is_first = true;
> > +    pdma_st.size = net_rx_pkt_get_total_len(pkt);
> > +    pdma_st.total_size = pdma_st.size + e1000x_fcs_len(core->mac);
> > +
> > +    rxi = rxr->i;
> > +    rx_desc_len = core->rx_desc_len;
> > +    pdma_st.rx_desc_packet_buf_size =
> > +        igb_rxbufsize(core, rxi);
> > +    pdma_st.rx_desc_header_buf_size =
> > +        igb_get_queue_rx_header_buf_size(core, rxi);
> > +    pdma_st.iov = net_rx_pkt_get_iovec(pkt);
> >       d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8);
> >       if (!d) {
> >           d = core->owner;
> >       }
> >       do {
> > -        hwaddr ba;
> > -        uint16_t written = 0;
> > +        memset(&pdma_st.bastate, 0, sizeof(igb_ba_state));
> >           bool is_last = false;
> > -        desc_size = total_size - desc_offset;
> > -
> > -        if (desc_size > bufsize) {
> > -            desc_size = bufsize;
> > -        }
> > -
> >           if (igb_ring_empty(core, rxi)) {
> >               return;
> >           }
> >           base = igb_ring_head_descr(core, rxi);
> > +        pci_dma_read(d, base, &desc, rx_desc_len);
> > +        trace_e1000e_rx_descr(rxi->idx, base, rx_desc_len);
> > -        pci_dma_read(d, base, &desc, core->rx_desc_len);
> > -
> > -        trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len);
> > -
> > -        igb_read_rx_descr(core, &desc, &ba);
> > -
> > -        if (ba) {
> > -            if (desc_offset < size) {
> > -                static const uint32_t fcs_pad;
> > -                size_t iov_copy;
> > -                size_t copy_size = size - desc_offset;
> > -                if (copy_size > bufsize) {
> > -                    copy_size = bufsize;
> > -                }
> > -
> > -                /* Copy packet payload */
> > -                while (copy_size) {
> > -                    iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
> > -
> > -                    igb_write_to_rx_buffers(core, d, ba, &written,
> > -                                            iov->iov_base + iov_ofs, iov_copy);
> > +        igb_read_rx_descr(core, &desc, pdma_st.ba, rxi);
> > -                    copy_size -= iov_copy;
> > -                    iov_ofs += iov_copy;
> > -                    if (iov_ofs == iov->iov_len) {
> > -                        iov++;
> > -                        iov_ofs = 0;
> > -                    }
> > -                }
> > -
> > -                if (desc_offset + desc_size >= total_size) {
> > -                    /* Simulate FCS checksum presence in the last descriptor */
> > -                    igb_write_to_rx_buffers(core, d, ba, &written,
> > -                          (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
> > -                }
> > -            }
> > -        } else { /* as per intel docs; skip descriptors with null buf addr */
> > -            trace_e1000e_rx_null_descriptor();
> > -        }
> > -        desc_offset += desc_size;
> > -        if (desc_offset >= total_size) {
> > +        igb_write_to_rx_buffers(core, pkt, d, &pdma_st);
> > +        pdma_st.desc_offset += pdma_st.desc_size;
> > +        if (pdma_st.desc_offset >= pdma_st.total_size) {
> >               is_last = true;
> >           }
> > -        igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL,
> > -                           rss_info, etqf, ts, written);
> > -        igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len);
> > -
> > -        igb_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN);
> > -
> > -    } while (desc_offset < total_size);
> > -
> > -    igb_update_rx_stats(core, rxi, size, total_size);
> > +        igb_write_rx_descr(core, &desc,
> > +                           is_last ? pkt : NULL,
> > +                           rss_info,
> > +                           etqf, ts,
> > +                           &pdma_st.bastate.written,
> > +                           rxi);
> > +        pci_dma_write(d, base, &desc, rx_desc_len);
> > +        igb_ring_advance(core, rxi,
> > +                         rx_desc_len / E1000_MIN_RX_DESC_LEN);
> > +    } while (pdma_st.desc_offset < pdma_st.total_size);
> > +
> > +    igb_update_rx_stats(core, rxi, pdma_st.size, pdma_st.total_size);
> >   }
> >   static bool
> > diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h
> > index 82ff195dfc..b14ca834ad 100644
> > --- a/hw/net/igb_regs.h
> > +++ b/hw/net/igb_regs.h
> > @@ -76,6 +76,15 @@ union e1000_adv_rx_desc {
> >       } wb;  /* writeback */
> >   };
> > +#define IGB_MAX_PS_BUFFERS  2
> > +#define IGB_MAX_RX_DESC_LEN (sizeof(uint64_t) * MAX_PS_BUFFERS)
> > +
> > +#define RX_DESC_ADV_HDR_LEN_OFFSET    (21 - 16)
> > +#define RX_DESC_ADV_HDR_LEN_MASK      ((BIT(10) - 1) << \
> > +                                       RX_DESC_ADV_HDR_LEN_OFFSET)
> > +#define RX_DESC_ADV_HDR_SPH           BIT(15)
> > +#define RX_DESC_ADV_ST_ERR_HBO_OFFSET BIT(3 + 20)
> > +
> 
> These should be added afer /* new */
> 
> And use the name ADVRXD to denote the advanced Rx descriptor to make it
> consistent with ADVTXD.
> 
> >   /* from igb/e1000_phy.h */
> >   /* IGP01E1000 Specific Registers */
> > @@ -452,6 +461,7 @@ union e1000_adv_rx_desc {
> >   #define E1000_SRRCTL_BSIZEHDRSIZE_MASK         0x00000F00
> >   #define E1000_SRRCTL_BSIZEHDRSIZE_SHIFT        2  /* Shift _left_ */
> >   #define E1000_SRRCTL_DESCTYPE_ADV_ONEBUF       0x02000000
> > +#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT        0x04000000
> >   #define E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS 0x0A000000
> >   #define E1000_SRRCTL_DESCTYPE_MASK             0x0E000000
> >   #define E1000_SRRCTL_DROP_EN                   0x80000000
> > @@ -692,11 +702,12 @@ union e1000_adv_rx_desc {
> >   #define E1000_STATUS_NUM_VFS_SHIFT 14
> > -#define E1000_ADVRXD_PKT_IP4 BIT(4)
> > -#define E1000_ADVRXD_PKT_IP6 BIT(6)
> > -#define E1000_ADVRXD_PKT_TCP BIT(8)
> > -#define E1000_ADVRXD_PKT_UDP BIT(9)
> > -#define E1000_ADVRXD_PKT_SCTP BIT(10)
> > +#define E1000_ADVRXD_PKT_IP4  BIT(0)
> > +#define E1000_ADVRXD_PKT_IP6  BIT(2)
> > +#define E1000_ADVRXD_PKT_IP6E BIT(3)
> > +#define E1000_ADVRXD_PKT_TCP  BIT(4)
> > +#define E1000_ADVRXD_PKT_UDP  BIT(5)
> > +#define E1000_ADVRXD_PKT_SCTP BIT(6)
> >   static inline uint8_t igb_ivar_entry_rx(uint8_t i)
> >   {
> > diff --git a/hw/net/trace-events b/hw/net/trace-events
> > index e4a98b2c7d..9a02261360 100644
> > --- a/hw/net/trace-events
> > +++ b/hw/net/trace-events
> > @@ -277,9 +277,9 @@ igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED"
> >   igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool pfrstd) "Set extended link params: ASD check: %d, Speed select bypass: %d, PF reset done: %d"
> >   igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
> > -igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
> > +igb_rx_desc_buff_write(uint8_t idx, uint64_t addr, uint16_t offset, const void* source, uint32_t len) "buffer #%u, addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
> > -igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X"
> > +igb_rx_metadata_rss(uint32_t rss, uint16_t rss_pkt_type) "RSS data: rss: 0x%X, rss_pkt_type: 0x%X"
> >   igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR enabled"
> >   igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x"
> > @@ -294,6 +294,8 @@ igb_irq_eitr_set(uint32_t eitr_num, uint32_t val) "EITR[%u] = 0x%x"
> >   igb_set_pfmailbox(uint32_t vf_num, uint32_t val) "PFMailbox[%d]: 0x%x"
> >   igb_set_vfmailbox(uint32_t vf_num, uint32_t val) "VFMailbox[%d]: 0x%x"
> > +igb_wrn_rx_desc_modes_not_supp(int desc_type) "Not supported descriptor type: %d"
> > +
> >   # igbvf.c
> >   igbvf_wrn_io_addr_unknown(uint64_t addr) "IO unknown register 0x%"PRIx64
> > diff --git a/tests/qtest/libqos/igb.c b/tests/qtest/libqos/igb.c
> > index a603468beb..4e720ee7e7 100644
> > --- a/tests/qtest/libqos/igb.c
> > +++ b/tests/qtest/libqos/igb.c
> > @@ -55,6 +55,7 @@ static void igb_pci_start_hw(QOSGraphObject *obj)
> >   {
> >       static const uint8_t address[] = E1000E_ADDRESS;
> >       QE1000E_PCI *d = (QE1000E_PCI *) obj;
> > +    uint32_t srrctl0;
> >       uint32_t val;
> >       /* Enable the device */
> > @@ -109,6 +110,10 @@ static void igb_pci_start_hw(QOSGraphObject *obj)
> >                           E1000_RAH_AV | E1000_RAH_POOL_1 |
> >                           le16_to_cpu(*(uint16_t *)(address + 4)));
> > +    /* Set supported receive descriptor mode */
> > +    srrctl0 = e1000e_macreg_read(&d->e1000e, E1000_SRRCTL(0));
> > +    e1000e_macreg_write(&d->e1000e, E1000_SRRCTL(0), srrctl0 | BIT(25));
> 
> As I wrote in previous review:
> > You don't need to read the initial value. The Linux driver also
> > doesn't do that.
> >
> > Also use: E1000_SRRCTL_DESCTYPE_ADV_ONEBUF
> 
> > +
> >       /* Enable receive */
> >       e1000e_macreg_write(&d->e1000e, E1000_RFCTL, E1000_RFCTL_EXTEN);
> >       e1000e_macreg_write(&d->e1000e, E1000_RCTL, E1000_RCTL_EN);
>
diff mbox series

Patch

diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index 78373d7db7..d874809d93 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -1418,11 +1418,11 @@  e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
 }
 
 static void
-e1000e_write_to_rx_buffers(E1000ECore *core,
-                           hwaddr ba[MAX_PS_BUFFERS],
-                           e1000e_ba_state *bastate,
-                           const char *data,
-                           dma_addr_t data_len)
+e1000e_write_payload_to_rx_buffers(E1000ECore *core,
+                                   hwaddr ba[MAX_PS_BUFFERS],
+                                   e1000e_ba_state *bastate,
+                                   const char *data,
+                                   dma_addr_t data_len)
 {
     while (data_len > 0) {
         uint32_t cur_buf_len = core->rxbuf_sizes[bastate->cur_idx];
@@ -1594,8 +1594,8 @@  e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
                 while (copy_size) {
                     iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
 
-                    e1000e_write_to_rx_buffers(core, ba, &bastate,
-                                            iov->iov_base + iov_ofs, iov_copy);
+                    e1000e_write_payload_to_rx_buffers(core, ba, &bastate,
+                                                       iov->iov_base + iov_ofs, iov_copy);
 
                     copy_size -= iov_copy;
                     iov_ofs += iov_copy;
@@ -1607,7 +1607,7 @@  e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
 
                 if (desc_offset + desc_size >= total_size) {
                     /* Simulate FCS checksum presence in the last descriptor */
-                    e1000e_write_to_rx_buffers(core, ba, &bastate,
+                    e1000e_write_payload_to_rx_buffers(core, ba, &bastate,
                           (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
                 }
             }
diff --git a/hw/net/e1000x_regs.h b/hw/net/e1000x_regs.h
index 13760c66d3..344fd10359 100644
--- a/hw/net/e1000x_regs.h
+++ b/hw/net/e1000x_regs.h
@@ -827,6 +827,7 @@  union e1000_rx_desc_packet_split {
 /* Receive Checksum Control bits */
 #define E1000_RXCSUM_IPOFLD     0x100   /* IP Checksum Offload Enable */
 #define E1000_RXCSUM_TUOFLD     0x200   /* TCP/UDP Checksum Offload Enable */
+#define E1000_RXCSUM_IPPCSE     0x1000  /* IP Payload Checksum enable */
 #define E1000_RXCSUM_PCSD       0x2000  /* Packet Checksum Disable */
 
 #define E1000_RING_DESC_LEN       (16)
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 96b7335b31..47c4a16e84 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -267,6 +267,21 @@  igb_rx_use_legacy_descriptor(IGBCore *core)
     return false;
 }
 
+typedef struct E1000E_RingInfo_st {
+    int dbah;
+    int dbal;
+    int dlen;
+    int dh;
+    int dt;
+    int idx;
+} E1000E_RingInfo;
+
+static uint32_t
+igb_rx_queue_desctyp_get(IGBCore *core, const E1000E_RingInfo *r)
+{
+    return core->mac[E1000_SRRCTL(r->idx) >> 2] & E1000_SRRCTL_DESCTYPE_MASK;
+}
+
 static inline bool
 igb_rss_enabled(IGBCore *core)
 {
@@ -694,15 +709,6 @@  static uint32_t igb_rx_wb_eic(IGBCore *core, int queue_idx)
     return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0;
 }
 
-typedef struct E1000E_RingInfo_st {
-    int dbah;
-    int dbal;
-    int dlen;
-    int dh;
-    int dt;
-    int idx;
-} E1000E_RingInfo;
-
 static inline bool
 igb_ring_empty(IGBCore *core, const E1000E_RingInfo *r)
 {
@@ -941,6 +947,14 @@  igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
                          bufsize;
 }
 
+static uint32_t
+igb_get_queue_rx_header_buf_size(IGBCore *core, const E1000E_RingInfo *r)
+{
+    uint32_t srr_size = (core->mac[E1000_SRRCTL(r->idx) >> 2] &
+                         E1000_SRRCTL_BSIZEHDR_MASK) >> 8;
+    return srr_size * 64;
+}
+
 void
 igb_start_recv(IGBCore *core)
 {
@@ -1225,21 +1239,34 @@  igb_read_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
 }
 
 static inline void
-igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
-                      hwaddr *buff_addr)
+igb_read_adv_rx_single_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
+                                 hwaddr *buff_addr)
 {
     *buff_addr = le64_to_cpu(desc->read.pkt_addr);
 }
 
 static inline void
 igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
-                  hwaddr *buff_addr)
+                  hwaddr buff_addr[MAX_PS_BUFFERS],
+                  const E1000E_RingInfo *r)
 {
+    uint32_t desc_type;
+
     if (igb_rx_use_legacy_descriptor(core)) {
-        igb_read_lgcy_rx_descr(core, &desc->legacy, buff_addr);
-    } else {
-        igb_read_adv_rx_descr(core, &desc->adv, buff_addr);
+        igb_read_lgcy_rx_descr(core, &desc->legacy, &buff_addr[1]);
+        buff_addr[0] = 0;
+        return;
     }
+
+    /* modes other than advanced single buffer descriptor not supported */
+    desc_type = igb_rx_queue_desctyp_get(core, r);
+    if (desc_type != E1000_SRRCTL_DESCTYPE_ADV_ONEBUF) {
+        trace_igb_wrn_rx_desc_modes_not_supp(desc_type);
+    }
+
+    /* advanced single buffer descriptor */
+    igb_read_adv_rx_single_buf_descr(core, &desc->adv, &buff_addr[1]);
+    buff_addr[0] = 0;
 }
 
 static void
@@ -1281,15 +1308,11 @@  igb_verify_csum_in_sw(IGBCore *core,
 }
 
 static void
-igb_build_rx_metadata(IGBCore *core,
-                      struct NetRxPkt *pkt,
-                      bool is_eop,
-                      const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
-                      uint16_t *pkt_info, uint16_t *hdr_info,
-                      uint32_t *rss,
-                      uint32_t *status_flags,
-                      uint16_t *ip_id,
-                      uint16_t *vlan_tag)
+igb_build_rx_metadata_common(IGBCore *core,
+                             struct NetRxPkt *pkt,
+                             bool is_eop,
+                             uint32_t *status_flags,
+                             uint16_t *vlan_tag)
 {
     struct virtio_net_hdr *vhdr;
     bool hasip4, hasip6, csum_valid;
@@ -1298,7 +1321,6 @@  igb_build_rx_metadata(IGBCore *core,
     *status_flags = E1000_RXD_STAT_DD;
 
     /* No additional metadata needed for non-EOP descriptors */
-    /* TODO: EOP apply only to status so don't skip whole function. */
     if (!is_eop) {
         goto func_exit;
     }
@@ -1315,64 +1337,6 @@  igb_build_rx_metadata(IGBCore *core,
         trace_e1000e_rx_metadata_vlan(*vlan_tag);
     }
 
-    /* Packet parsing results */
-    if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
-        if (rss_info->enabled) {
-            *rss = cpu_to_le32(rss_info->hash);
-            trace_igb_rx_metadata_rss(*rss);
-        }
-    } else if (hasip4) {
-            *status_flags |= E1000_RXD_STAT_IPIDV;
-            *ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
-            trace_e1000e_rx_metadata_ip_id(*ip_id);
-    }
-
-    if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && net_rx_pkt_is_tcp_ack(pkt)) {
-        *status_flags |= E1000_RXD_STAT_ACK;
-        trace_e1000e_rx_metadata_ack();
-    }
-
-    if (pkt_info) {
-        *pkt_info = rss_info->enabled ? rss_info->type : 0;
-
-        if (etqf < 8) {
-            *pkt_info |= BIT(11) | (etqf << 4);
-        } else {
-            if (hasip4) {
-                *pkt_info |= E1000_ADVRXD_PKT_IP4;
-            }
-
-            if (hasip6) {
-                *pkt_info |= E1000_ADVRXD_PKT_IP6;
-            }
-
-            switch (l4hdr_proto) {
-            case ETH_L4_HDR_PROTO_TCP:
-                *pkt_info |= E1000_ADVRXD_PKT_TCP;
-                break;
-
-            case ETH_L4_HDR_PROTO_UDP:
-                *pkt_info |= E1000_ADVRXD_PKT_UDP;
-                break;
-
-            case ETH_L4_HDR_PROTO_SCTP:
-                *pkt_info |= E1000_ADVRXD_PKT_SCTP;
-                break;
-
-            default:
-                break;
-            }
-        }
-    }
-
-    if (hdr_info) {
-        *hdr_info = 0;
-    }
-
-    if (ts) {
-        *status_flags |= BIT(16);
-    }
-
     /* RX CSO information */
     if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) {
         trace_e1000e_rx_metadata_ipv6_sum_disabled();
@@ -1426,58 +1390,125 @@  func_exit:
 }
 
 static inline void
-igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
+igb_write_lgcy_rx_descr(IGBCore *core,
+                        struct e1000_rx_desc *d,
                         struct NetRxPkt *pkt,
-                        const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
+                        const E1000E_RSSInfo *rss_info,
                         uint16_t length)
 {
-    uint32_t status_flags, rss;
-    uint16_t ip_id;
+    uint32_t status_flags;
 
     assert(!rss_info->enabled);
-    desc->length = cpu_to_le16(length);
-    desc->csum = 0;
 
-    igb_build_rx_metadata(core, pkt, pkt != NULL,
-                          rss_info, etqf, ts,
-                          NULL, NULL, &rss,
-                          &status_flags, &ip_id,
-                          &desc->special);
-    desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
-    desc->status = (uint8_t) le32_to_cpu(status_flags);
+    memset(d, 0, sizeof(*d));
+    d->length = cpu_to_le16(length);
+    igb_build_rx_metadata_common(core, pkt, pkt != NULL,
+                                 &status_flags,
+                                 &d->special);
+
+    d->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
+    d->status = (uint8_t) le32_to_cpu(status_flags);
+}
+
+static uint16_t
+igb_rx_desc_get_packet_type(IGBCore *core, struct NetRxPkt *pkt, uint16_t etqf)
+{
+    uint16_t pkt_type = 0;
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
+
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
+
+    if (hasip6 && !(core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
+        eth_ip6_hdr_info *ip6hdr_info  = net_rx_pkt_get_ip6_info(pkt);
+        pkt_type |= ip6hdr_info->has_ext_hdrs ? E1000_ADVRXD_PKT_IP6E :
+                                                E1000_ADVRXD_PKT_IP6;
+    } else if (hasip4) {
+        pkt_type = E1000_ADVRXD_PKT_IP4;
+    }
+
+    if (etqf < 8) {
+        pkt_type |= (BIT(11) >> 4) | etqf;
+        return pkt_type;
+    }
+
+    switch (l4hdr_proto) {
+    case ETH_L4_HDR_PROTO_TCP:
+        pkt_type |= E1000_ADVRXD_PKT_TCP;
+        break;
+    case ETH_L4_HDR_PROTO_UDP:
+        pkt_type |= E1000_ADVRXD_PKT_UDP;
+        break;
+    case ETH_L4_HDR_PROTO_SCTP:
+        pkt_type |= E1000_ADVRXD_PKT_SCTP;
+        break;
+    default:
+        break;
+    }
+
+    return pkt_type;
 }
 
 static inline void
-igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
+igb_write_adv_rx_descr(IGBCore *core,
+                       union e1000_adv_rx_desc *d,
                        struct NetRxPkt *pkt,
-                       const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
+                       const E1000E_RSSInfo *rss_info,
+                       uint16_t etqf,
+                       bool ts,
                        uint16_t length)
 {
-    memset(&desc->wb, 0, sizeof(desc->wb));
+    bool hasip4, hasip6;
+    EthL4HdrProto l4hdr_proto;
+    uint16_t rss_type = 0, pkt_type;
+    bool eop = (pkt != NULL);
+    memset(&d->wb, 0, sizeof(d->wb));
+
+    d->wb.upper.length = cpu_to_le16(length);
+    igb_build_rx_metadata_common(core, pkt, eop,
+                                 &d->wb.upper.status_error,
+                                 &d->wb.upper.vlan);
+
+    if (!eop) {
+        return;
+    }
+
+    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
 
-    desc->wb.upper.length = cpu_to_le16(length);
+    if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) && rss_info->enabled) {
+        d->wb.lower.hi_dword.rss = cpu_to_le32(rss_info->hash);
+        rss_type = rss_info->type;
+        trace_igb_rx_metadata_rss(d->wb.lower.hi_dword.rss, rss_type);
+    } else if (!(core->mac[RXCSUM] & E1000_RXCSUM_PCSD) &&
+               core->mac[RXCSUM] & E1000_RXCSUM_IPPCSE &&
+               hasip4) {
+        d->wb.lower.hi_dword.csum_ip.ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
+        trace_e1000e_rx_metadata_ip_id(d->wb.lower.hi_dword.csum_ip.ip_id);
+    }
 
-    igb_build_rx_metadata(core, pkt, pkt != NULL,
-                          rss_info, etqf, ts,
-                          &desc->wb.lower.lo_dword.pkt_info,
-                          &desc->wb.lower.lo_dword.hdr_info,
-                          &desc->wb.lower.hi_dword.rss,
-                          &desc->wb.upper.status_error,
-                          &desc->wb.lower.hi_dword.csum_ip.ip_id,
-                          &desc->wb.upper.vlan);
+    if (ts) {
+        d->wb.upper.status_error |= BIT(16);
+    }
+
+    pkt_type = igb_rx_desc_get_packet_type(core, pkt, etqf);
+    trace_e1000e_rx_metadata_pkt_type(pkt_type);
+    d->wb.lower.lo_dword.pkt_info = cpu_to_le16(rss_type | (pkt_type << 4));
 }
 
 static inline void
-igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
-                   struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
-                   uint16_t etqf, bool ts, uint16_t length)
+igb_write_rx_descr(IGBCore *core,
+                   union e1000_rx_desc_union *desc,
+                   struct NetRxPkt *pkt,
+                   const E1000E_RSSInfo *rss_info,
+                   uint16_t etqf,
+                   bool ts,
+                   uint16_t(*written)[MAX_PS_BUFFERS],
+                   const E1000E_RingInfo *r)
 {
     if (igb_rx_use_legacy_descriptor(core)) {
-        igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info,
-                                etqf, ts, length);
+        igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, (*written)[1]);
     } else {
-        igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info,
-                               etqf, ts, length);
+        igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info, etqf, ts, (*written)[1]);
     }
 }
 
@@ -1513,19 +1544,6 @@  igb_pci_dma_write_rx_desc(IGBCore *core, PCIDevice *dev, dma_addr_t addr,
     }
 }
 
-static void
-igb_write_to_rx_buffers(IGBCore *core,
-                        PCIDevice *d,
-                        hwaddr ba,
-                        uint16_t *written,
-                        const char *data,
-                        dma_addr_t data_len)
-{
-    trace_igb_rx_desc_buff_write(ba, *written, data, data_len);
-    pci_dma_write(d, ba + *written, data, data_len);
-    *written += data_len;
-}
-
 static void
 igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
                     size_t pkt_size, size_t pkt_fcs_size)
@@ -1551,6 +1569,136 @@  igb_rx_descr_threshold_hit(IGBCore *core, const E1000E_RingInfo *rxi)
            ((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16;
 }
 
+typedef struct igb_ba_state_st {
+    uint16_t written[MAX_PS_BUFFERS];
+    uint8_t cur_idx;
+} igb_ba_state;
+
+typedef struct {
+    size_t size;
+    size_t total_size;
+    size_t ps_hdr_len;
+    size_t desc_size;
+    size_t desc_offset;
+    uint32_t rx_desc_packet_buf_size;
+    uint32_t rx_desc_header_buf_size;
+    struct iovec *iov;
+    size_t iov_ofs;
+    bool is_first;
+    igb_ba_state bastate;
+    hwaddr ba[MAX_PS_BUFFERS];
+} igb_packet_tx_dma_state;
+
+static void
+igb_truncate_to_descriptor_size(igb_packet_tx_dma_state *pdma_st, size_t *size)
+{
+    if (*size > pdma_st->rx_desc_packet_buf_size) {
+        *size = pdma_st->rx_desc_packet_buf_size;
+    }
+}
+
+static void
+igb_write_payload_to_rx_buffers(IGBCore *core,
+                                PCIDevice *d,
+                                hwaddr (*ba)[MAX_PS_BUFFERS],
+                                igb_ba_state *bastate,
+                                uint32_t cur_buf_len,
+                                const char *data,
+                                dma_addr_t data_len)
+{
+    while (data_len > 0) {
+        assert(bastate->cur_idx < MAX_PS_BUFFERS);
+
+        uint32_t cur_buf_bytes_left = cur_buf_len -
+                                      bastate->written[bastate->cur_idx];
+        uint32_t bytes_to_write = MIN(data_len, cur_buf_bytes_left);
+
+        trace_igb_rx_desc_buff_write(bastate->cur_idx,
+                                     (*ba)[bastate->cur_idx],
+                                     bastate->written[bastate->cur_idx],
+                                     data,
+                                     bytes_to_write);
+
+        pci_dma_write(d,
+                      (*ba)[bastate->cur_idx] +
+                      bastate->written[bastate->cur_idx],
+                      data, bytes_to_write);
+
+        bastate->written[bastate->cur_idx] += bytes_to_write;
+        data += bytes_to_write;
+        data_len -= bytes_to_write;
+
+        if (bastate->written[bastate->cur_idx] == cur_buf_len) {
+            bastate->cur_idx++;
+        }
+    }
+}
+
+static void
+igb_write_packet_payload_to_descr_addr(IGBCore *core,
+                                       struct NetRxPkt *pkt,
+                                       PCIDevice *d,
+                                       igb_packet_tx_dma_state *pdma_st,
+                                       size_t *copy_size)
+{
+    static const uint32_t fcs_pad;
+    size_t iov_copy;
+
+    /* Copy packet payload */
+    while (*copy_size) {
+        iov_copy = MIN(*copy_size, pdma_st->iov->iov_len - pdma_st->iov_ofs);
+        igb_write_payload_to_rx_buffers(core, d,
+                                        &pdma_st->ba,
+                                        &pdma_st->bastate,
+                                        pdma_st->rx_desc_packet_buf_size,
+                                        pdma_st->iov->iov_base + pdma_st->iov_ofs,
+                                        iov_copy);
+
+        *copy_size -= iov_copy;
+        pdma_st->iov_ofs += iov_copy;
+        if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
+            pdma_st->iov++;
+            pdma_st->iov_ofs = 0;
+        }
+    }
+
+    if (pdma_st->desc_offset + pdma_st->desc_size >= pdma_st->total_size) {
+        /* Simulate FCS checksum presence in the last descriptor */
+        igb_write_payload_to_rx_buffers(core, d,
+                                        &pdma_st->ba,
+                                        &pdma_st->bastate,
+                                        pdma_st->rx_desc_packet_buf_size,
+                                        (const char *) &fcs_pad,
+                                        e1000x_fcs_len(core->mac));
+    }
+}
+
+static void
+igb_write_to_rx_buffers(IGBCore *core,
+                        struct NetRxPkt *pkt,
+                        PCIDevice *d,
+                        igb_packet_tx_dma_state *pdma_st)
+{
+    size_t copy_size;
+
+    if (!(pdma_st->ba)[1]) {
+        /* as per intel docs; skip descriptors with null buf addr */
+        trace_e1000e_rx_null_descriptor();
+        return;
+    }
+
+    if (pdma_st->desc_offset >= pdma_st->size) {
+        return;
+    }
+
+    pdma_st->desc_size = pdma_st->total_size - pdma_st->desc_offset;
+    igb_truncate_to_descriptor_size(pdma_st, &pdma_st->desc_size);
+    copy_size = pdma_st->size - pdma_st->desc_offset;
+    igb_truncate_to_descriptor_size(pdma_st, &copy_size);
+    pdma_st->bastate.cur_idx = 1;
+    igb_write_packet_payload_to_descr_addr(core, pkt, d, pdma_st, &copy_size);
+}
+
 static void
 igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
                           const E1000E_RxRing *rxr,
@@ -1560,91 +1708,58 @@  igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
     PCIDevice *d;
     dma_addr_t base;
     union e1000_rx_desc_union desc;
-    size_t desc_size;
-    size_t desc_offset = 0;
-    size_t iov_ofs = 0;
-
-    struct iovec *iov = net_rx_pkt_get_iovec(pkt);
-    size_t size = net_rx_pkt_get_total_len(pkt);
-    size_t total_size = size + e1000x_fcs_len(core->mac);
-    const E1000E_RingInfo *rxi = rxr->i;
-    size_t bufsize = igb_rxbufsize(core, rxi);
-
+    const E1000E_RingInfo *rxi;
+    size_t rx_desc_len;
+
+    igb_packet_tx_dma_state pdma_st = {0};
+    pdma_st.is_first = true;
+    pdma_st.size = net_rx_pkt_get_total_len(pkt);
+    pdma_st.total_size = pdma_st.size + e1000x_fcs_len(core->mac);
+
+    rxi = rxr->i;
+    rx_desc_len = core->rx_desc_len;
+    pdma_st.rx_desc_packet_buf_size =
+        igb_rxbufsize(core, rxi);
+    pdma_st.rx_desc_header_buf_size =
+        igb_get_queue_rx_header_buf_size(core, rxi);
+    pdma_st.iov = net_rx_pkt_get_iovec(pkt);
     d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8);
     if (!d) {
         d = core->owner;
     }
 
     do {
-        hwaddr ba;
-        uint16_t written = 0;
+        memset(&pdma_st.bastate, 0, sizeof(igb_ba_state));
         bool is_last = false;
 
-        desc_size = total_size - desc_offset;
-
-        if (desc_size > bufsize) {
-            desc_size = bufsize;
-        }
-
         if (igb_ring_empty(core, rxi)) {
             return;
         }
 
         base = igb_ring_head_descr(core, rxi);
+        pci_dma_read(d, base, &desc, rx_desc_len);
+        trace_e1000e_rx_descr(rxi->idx, base, rx_desc_len);
 
-        pci_dma_read(d, base, &desc, core->rx_desc_len);
-
-        trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len);
-
-        igb_read_rx_descr(core, &desc, &ba);
-
-        if (ba) {
-            if (desc_offset < size) {
-                static const uint32_t fcs_pad;
-                size_t iov_copy;
-                size_t copy_size = size - desc_offset;
-                if (copy_size > bufsize) {
-                    copy_size = bufsize;
-                }
-
-                /* Copy packet payload */
-                while (copy_size) {
-                    iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
-
-                    igb_write_to_rx_buffers(core, d, ba, &written,
-                                            iov->iov_base + iov_ofs, iov_copy);
+        igb_read_rx_descr(core, &desc, pdma_st.ba, rxi);
 
-                    copy_size -= iov_copy;
-                    iov_ofs += iov_copy;
-                    if (iov_ofs == iov->iov_len) {
-                        iov++;
-                        iov_ofs = 0;
-                    }
-                }
-
-                if (desc_offset + desc_size >= total_size) {
-                    /* Simulate FCS checksum presence in the last descriptor */
-                    igb_write_to_rx_buffers(core, d, ba, &written,
-                          (const char *) &fcs_pad, e1000x_fcs_len(core->mac));
-                }
-            }
-        } else { /* as per intel docs; skip descriptors with null buf addr */
-            trace_e1000e_rx_null_descriptor();
-        }
-        desc_offset += desc_size;
-        if (desc_offset >= total_size) {
+        igb_write_to_rx_buffers(core, pkt, d, &pdma_st);
+        pdma_st.desc_offset += pdma_st.desc_size;
+        if (pdma_st.desc_offset >= pdma_st.total_size) {
             is_last = true;
         }
 
-        igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL,
-                           rss_info, etqf, ts, written);
-        igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len);
-
-        igb_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN);
-
-    } while (desc_offset < total_size);
-
-    igb_update_rx_stats(core, rxi, size, total_size);
+        igb_write_rx_descr(core, &desc,
+                           is_last ? pkt : NULL,
+                           rss_info,
+                           etqf, ts,
+                           &pdma_st.bastate.written,
+                           rxi);
+        pci_dma_write(d, base, &desc, rx_desc_len);
+        igb_ring_advance(core, rxi,
+                         rx_desc_len / E1000_MIN_RX_DESC_LEN);
+    } while (pdma_st.desc_offset < pdma_st.total_size);
+
+    igb_update_rx_stats(core, rxi, pdma_st.size, pdma_st.total_size);
 }
 
 static bool
diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h
index 82ff195dfc..b14ca834ad 100644
--- a/hw/net/igb_regs.h
+++ b/hw/net/igb_regs.h
@@ -76,6 +76,15 @@  union e1000_adv_rx_desc {
     } wb;  /* writeback */
 };
 
+#define IGB_MAX_PS_BUFFERS  2
+#define IGB_MAX_RX_DESC_LEN (sizeof(uint64_t) * MAX_PS_BUFFERS)
+
+#define RX_DESC_ADV_HDR_LEN_OFFSET    (21 - 16)
+#define RX_DESC_ADV_HDR_LEN_MASK      ((BIT(10) - 1) << \
+                                       RX_DESC_ADV_HDR_LEN_OFFSET)
+#define RX_DESC_ADV_HDR_SPH           BIT(15)
+#define RX_DESC_ADV_ST_ERR_HBO_OFFSET BIT(3 + 20)
+
 /* from igb/e1000_phy.h */
 
 /* IGP01E1000 Specific Registers */
@@ -452,6 +461,7 @@  union e1000_adv_rx_desc {
 #define E1000_SRRCTL_BSIZEHDRSIZE_MASK         0x00000F00
 #define E1000_SRRCTL_BSIZEHDRSIZE_SHIFT        2  /* Shift _left_ */
 #define E1000_SRRCTL_DESCTYPE_ADV_ONEBUF       0x02000000
+#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT        0x04000000
 #define E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS 0x0A000000
 #define E1000_SRRCTL_DESCTYPE_MASK             0x0E000000
 #define E1000_SRRCTL_DROP_EN                   0x80000000
@@ -692,11 +702,12 @@  union e1000_adv_rx_desc {
 
 #define E1000_STATUS_NUM_VFS_SHIFT 14
 
-#define E1000_ADVRXD_PKT_IP4 BIT(4)
-#define E1000_ADVRXD_PKT_IP6 BIT(6)
-#define E1000_ADVRXD_PKT_TCP BIT(8)
-#define E1000_ADVRXD_PKT_UDP BIT(9)
-#define E1000_ADVRXD_PKT_SCTP BIT(10)
+#define E1000_ADVRXD_PKT_IP4  BIT(0)
+#define E1000_ADVRXD_PKT_IP6  BIT(2)
+#define E1000_ADVRXD_PKT_IP6E BIT(3)
+#define E1000_ADVRXD_PKT_TCP  BIT(4)
+#define E1000_ADVRXD_PKT_UDP  BIT(5)
+#define E1000_ADVRXD_PKT_SCTP BIT(6)
 
 static inline uint8_t igb_ivar_entry_rx(uint8_t i)
 {
diff --git a/hw/net/trace-events b/hw/net/trace-events
index e4a98b2c7d..9a02261360 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -277,9 +277,9 @@  igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED"
 igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool pfrstd) "Set extended link params: ASD check: %d, Speed select bypass: %d, PF reset done: %d"
 
 igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
-igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
+igb_rx_desc_buff_write(uint8_t idx, uint64_t addr, uint16_t offset, const void* source, uint32_t len) "buffer #%u, addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
 
-igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X"
+igb_rx_metadata_rss(uint32_t rss, uint16_t rss_pkt_type) "RSS data: rss: 0x%X, rss_pkt_type: 0x%X"
 
 igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR enabled"
 igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x"
@@ -294,6 +294,8 @@  igb_irq_eitr_set(uint32_t eitr_num, uint32_t val) "EITR[%u] = 0x%x"
 igb_set_pfmailbox(uint32_t vf_num, uint32_t val) "PFMailbox[%d]: 0x%x"
 igb_set_vfmailbox(uint32_t vf_num, uint32_t val) "VFMailbox[%d]: 0x%x"
 
+igb_wrn_rx_desc_modes_not_supp(int desc_type) "Not supported descriptor type: %d"
+
 # igbvf.c
 igbvf_wrn_io_addr_unknown(uint64_t addr) "IO unknown register 0x%"PRIx64
 
diff --git a/tests/qtest/libqos/igb.c b/tests/qtest/libqos/igb.c
index a603468beb..4e720ee7e7 100644
--- a/tests/qtest/libqos/igb.c
+++ b/tests/qtest/libqos/igb.c
@@ -55,6 +55,7 @@  static void igb_pci_start_hw(QOSGraphObject *obj)
 {
     static const uint8_t address[] = E1000E_ADDRESS;
     QE1000E_PCI *d = (QE1000E_PCI *) obj;
+    uint32_t srrctl0;
     uint32_t val;
 
     /* Enable the device */
@@ -109,6 +110,10 @@  static void igb_pci_start_hw(QOSGraphObject *obj)
                         E1000_RAH_AV | E1000_RAH_POOL_1 |
                         le16_to_cpu(*(uint16_t *)(address + 4)));
 
+    /* Set supported receive descriptor mode */
+    srrctl0 = e1000e_macreg_read(&d->e1000e, E1000_SRRCTL(0));
+    e1000e_macreg_write(&d->e1000e, E1000_SRRCTL(0), srrctl0 | BIT(25));
+
     /* Enable receive */
     e1000e_macreg_write(&d->e1000e, E1000_RFCTL, E1000_RFCTL_EXTEN);
     e1000e_macreg_write(&d->e1000e, E1000_RCTL, E1000_RCTL_EN);