diff mbox series

[ovs-dev,v2,10/11] dp-packet-gso: Refactor software segmentation code.

Message ID 20251112170420.3155127-11-david.marchand@redhat.com
State Changes Requested
Headers show
Series Outer UDP checksum optimisations. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed

Commit Message

David Marchand Nov. 12, 2025, 5:04 p.m. UTC
Refactor software segmentation dp_packet_gso() using two helpers:
- dp_packet_gso_seg_new: create a segment from the original packet,
  copying headers and relevant payload,
- dp_packet_gso_update_segment: update the headers,

For dp_packet_gso_seg_new, prefer working on data offset instead of
passing a pointer to the packet data.
This simplifies calling this helper: each segment's data is either of size
tso_segsz, or the remaining data for the last segment.

For dp_packet_gso_update_segment, (inner and outer) ip_id is based
on the value copied from the original packet + the segment index.
The first and last segments handling are separated from the other
segments:
- rather than copying data for the first segment, we can simply resize the
  original packet (once all segments have been copied), this small
  optimisation saves one tso_segsz copy,
- the last segment needs special handling wrt its length and TCP flags,

Finally, replace the check on tso_segsz presence with an ovs_assert() as
this code should only be invoked when segmenting.
Thanks to this, dp_packet_gso() can be declared void, and the
netdev_soft_seg_drops counter has no reason to exist anymore.

The code is easier to read this way, and it will be beneficial to the next
commit.

Note: this patch is easier to read if ignoring whitespace changes.

Signed-off-by: David Marchand <david.marchand@redhat.com>
---
 lib/dp-packet-gso.c | 235 ++++++++++++++++++++++----------------------
 lib/dp-packet-gso.h |   2 +-
 lib/netdev.c        |  10 +-
 3 files changed, 122 insertions(+), 125 deletions(-)

Comments

0-day Robot Nov. 18, 2025, 8:09 a.m. UTC | #1
git sha: 20251112170420.3155127-11-david.marchand@redhat.com
Author: David Marchand <david.marchand@redhat.com>
One line subject: dp-packet-gso: Refactor software segmentation code.

This commit refactors the software segmentation code in dp_packet_gso() by
extracting two helper functions and optimizing the first segment handling.
The refactoring changes dp_packet_gso() from returning bool to void and
removes the netdev_soft_seg_drops counter.

> diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
> index fe7186ddf4..6b8da8b370 100644
> --- a/lib/dp-packet-gso.c
> +++ b/lib/dp-packet-gso.c
> @@ -29,19 +29,19 @@ VLOG_DEFINE_THIS_MODULE(dp_packet_gso);
>   *
>   * The new packet is initialized with 'hdr_len' bytes from the
>   * start of packet 'p' and then appended with 'data_len' bytes
> - * from the 'data' buffer.
> + * from the packet 'p' at offset 'data_off'.
>   *
>   * Note: The packet headers are not updated. */
>  static struct dp_packet *
>  dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len,
> -                      const char *data, size_t data_len)
> +                      size_t data_off, size_t data_len)
>  {
>      struct dp_packet *seg = dp_packet_new_with_headroom(hdr_len + data_len,
>                                                          dp_packet_headroom(p));
>  
>      /* Append the original packet headers and then the payload. */
>      dp_packet_put(seg, dp_packet_data(p), hdr_len);
> -    dp_packet_put(seg, data, data_len);
> +    dp_packet_put(seg, (char *) dp_packet_data(p) + data_off, data_len);

[ ... ]

> -/* Perform software segmentation on packet 'p'.
> - *
> - * Segments packet 'p' into the array of preallocated batches in 'batches',
> - * updating the 'batches' pointer as needed and returns true.
> - *
> - * Returns false if the packet cannot be segmented. */
> -bool
> +/* Perform software segmentation on packet 'p'.
> + *
> + * Segments packet 'p' into the array of preallocated batches in 'batches',
> + * updating the 'batches' pointer as needed. */
> +void
>  dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
>  {
> -    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>      struct dp_packet_batch *curr_batch = *batches;
> [ ... ]
>      struct dp_packet *seg;
>      uint16_t tso_segsz;
> -    uint16_t inner_ip_id = 0;
> -    uint16_t outer_ip_id = 0;
> -    struct dp_packet *seg;
> -    uint16_t tcp_offset;
> -    uint32_t tcp_seq;
> -    bool outer_ipv4;
> -    int hdr_len;
> -    int seg_len;
> -    bool udp_tnl = dp_packet_tunnel_vxlan(p)
> -                   || dp_packet_tunnel_geneve(p);
> -    bool gre_tnl = dp_packet_tunnel_gre(p);
> +    size_t data_len;
> +    size_t hdr_len;
> +    int n_segs;
>  
>      tso_segsz = dp_packet_get_tso_segsz(p);
> -    if (!tso_segsz) {
> -        VLOG_WARN_RL(&rl, "GSO packet with len %d with no segment size.",
> -                     dp_packet_size(p));
> -        return false;
> -    }
> +    ovs_assert(tso_segsz);

Does this code risk hitting the assert when processing legitimate packets?
The original function returned false for packets without tso_segsz, but now
it would abort. Are there any conditions where packets might reach this
function without tso_segsz being set?

> +    n_segs = dp_packet_gso_nr_segs(p);
> +
> +    /* Put back the first segment in the batch, it will be trimmed after
> +     * all segments have been copied. */
> +    if (dp_packet_batch_is_full(curr_batch)) {
> +        curr_batch++;
> +    }
> +    dp_packet_batch_add(curr_batch, p);

[ ... ]

> +    for (int i = 1; i < n_segs - 1; i++) {
> +        seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + i * tso_segsz,
> +                                    tso_segsz);

Does this arithmetic for the data_off parameter handle edge cases correctly?
When i=1, data_off becomes hdr_len + tso_segsz, which should point to the
second segment's payload start.

> +        dp_packet_gso_update_segment(seg, i, n_segs, tso_segsz);
> +
> +        if (dp_packet_batch_is_full(curr_batch)) {
> +            curr_batch++;
> +        }
> +        dp_packet_batch_add(curr_batch, seg);
> +    }
> +
> +    /* Create the last segment. */
> +    seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + (n_segs - 1) * tso_segsz,
> +                                data_len - (n_segs - 1) * tso_segsz);

Can this arithmetic for the last segment's data_len result in negative
values or underflow? If data_len is smaller than (n_segs - 1) * tso_segsz,
the subtraction could wrap around since data_len is size_t.

> +    dp_packet_gso_update_segment(seg, n_segs - 1, n_segs, tso_segsz);
> +
> +    if (dp_packet_batch_is_full(curr_batch)) {
> +        curr_batch++;
> +    }
> +    dp_packet_batch_add(curr_batch, seg);
> +
> +    /* Trim the first segment and reset TSO. */
> +    dp_packet_set_size(p, hdr_len + tso_segsz);
> +    dp_packet_set_tso_segsz(p, 0);
> +    dp_packet_gso_update_segment(p, 0, n_segs, tso_segsz);

[ ... ]

> diff --git a/lib/netdev.c b/lib/netdev.c
> index 6a05e9a7e5..f493dab005 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -71,7 +71,6 @@ COVERAGE_DEFINE(netdev_add_router);
>  COVERAGE_DEFINE(netdev_get_stats);
>  COVERAGE_DEFINE(netdev_push_header_drops);
>  COVERAGE_DEFINE(netdev_soft_seg_good);
> -COVERAGE_DEFINE(netdev_soft_seg_drops);

[ ... ]

> @@ -843,17 +842,12 @@ netdev_send_tso(struct netdev *netdev, int qid,
>      curr_batch = batches;
>      DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) {
>          if (dp_packet_get_tso_segsz(packet)) {
> -            if (dp_packet_gso(packet, &curr_batch)) {
> -                COVERAGE_INC(netdev_soft_seg_good);
> -            } else {
> -                COVERAGE_INC(netdev_soft_seg_drops);
> -            }
> -            dp_packet_delete(packet);
> +            dp_packet_gso(packet, &curr_batch);
> +            COVERAGE_INC(netdev_soft_seg_good);

Does this code leak the original packet? The previous version called
dp_packet_delete(packet) after dp_packet_gso(), but now the packet p is
added to curr_batch and reused as the first segment. Is this intentional
behavior change properly handled?

>          } else {
>              if (dp_packet_batch_is_full(curr_batch)) {
>                  curr_batch++;
>              }
> -
>              dp_packet_batch_add(curr_batch, packet);
>          }
>      }
Kevin Traynor Dec. 5, 2025, 5:25 p.m. UTC | #2
On 12/11/2025 17:04, David Marchand via dev wrote:
> Refactor software segmentation dp_packet_gso() using two helpers:
> - dp_packet_gso_seg_new: create a segment from the original packet,
>   copying headers and relevant payload,
> - dp_packet_gso_update_segment: update the headers,
> 
> For dp_packet_gso_seg_new, prefer working on data offset instead of
> passing a pointer to the packet data.
> This simplifies calling this helper: segments data are either of size
> tso_segsz, or the remaining data for the last segment.
> 
> For dp_packet_gso_update_segment, (inner and outer) ip_id is based
> on the value copied from the original packet + the segment index.
> The first and last segments handling are separated from the other
> segments:
> - rather than copying data for the first segment, we can simply resize the
>   original packet (once all segments have been copied), this small
>   optimisation saves one tso_segsz copy,
> - the last segment needs special handling wrt its length and TCP flags,
> 
> Finally, replace the check on tso_segsz presence with an ovs_assert() as
> this code should only be invoked when segmenting.
> Thanks to this, dp_packet_gso() can be declared void, and the
> netdev_soft_seg_drops counter has no reason to exist anymore.
> 
> The code is easier to read this way, and it will be beneficial to the next
> commit.
> 

> Note: this patch is easier to read if ignoring whitespace changes.
> 

Can drop this from commit msg

One comment below, but LGTM
Acked-by: Kevin Traynor <ktraynor@redhat.com>

> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---
>  lib/dp-packet-gso.c | 235 ++++++++++++++++++++++----------------------
>  lib/dp-packet-gso.h |   2 +-
>  lib/netdev.c        |  10 +-
>  3 files changed, 122 insertions(+), 125 deletions(-)
> 
> diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
> index fe7186ddf4..6b8da8b370 100644
> --- a/lib/dp-packet-gso.c
> +++ b/lib/dp-packet-gso.c
> @@ -29,19 +29,19 @@ VLOG_DEFINE_THIS_MODULE(dp_packet_gso);
>   *
>   * The new packet is initialized with 'hdr_len' bytes from the
>   * start of packet 'p' and then appended with 'data_len' bytes
> - * from the 'data' buffer.
> + * from the packet 'p' at offset 'data_off'.
>   *
>   * Note: The packet headers are not updated. */
>  static struct dp_packet *
>  dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len,
> -                      const char *data, size_t data_len)
> +                      size_t data_off, size_t data_len)
>  {
>      struct dp_packet *seg = dp_packet_new_with_headroom(hdr_len + data_len,
>                                                          dp_packet_headroom(p));
>  
>      /* Append the original packet headers and then the payload. */
>      dp_packet_put(seg, dp_packet_data(p), hdr_len);
> -    dp_packet_put(seg, data, data_len);
> +    dp_packet_put(seg, (char *) dp_packet_data(p) + data_off, data_len);
>  
>      /* The new segment should have the same offsets. */
>      seg->l2_5_ofs = p->l2_5_ofs;
> @@ -77,151 +77,154 @@ dp_packet_gso_nr_segs(struct dp_packet *p)
>      return DIV_ROUND_UP(data_length, segsz);
>  }
>  
> -/* Perform software segmentation on packet 'p'.
> - *
> - * Segments packet 'p' into the array of preallocated batches in 'batches',
> - * updating the 'batches' pointer as needed and returns true.
> - *
> - * Returns false if the packet cannot be segmented. */
> -bool
> -dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> +static void
> +dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
> +                             uint16_t tso_segsz)
>  {
> -    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> -    struct dp_packet_batch *curr_batch = *batches;

> +    bool udp_tnl = dp_packet_tunnel_vxlan(seg)
> +                   || dp_packet_tunnel_geneve(seg);
> +    bool gre_tnl = dp_packet_tunnel_gre(seg);

The only disadvantage I can see from this change is that these have to be
read for every update. I guess it shouldn't make any real difference, and
adding them to the API doesn't seem very nice either

>      struct tcp_header *tcp_hdr;
>      struct ip_header *ip_hdr;
> -    uint16_t inner_ip_id = 0;
> -    uint16_t outer_ip_id = 0;
> -    struct dp_packet *seg;
> -    uint16_t tcp_offset;
> -    uint16_t tso_segsz;
>      uint32_t tcp_seq;
> -    bool outer_ipv4;
> -    int hdr_len;
> -    int seg_len;
> -    bool udp_tnl = dp_packet_tunnel_vxlan(p)
> -                   || dp_packet_tunnel_geneve(p);
> -    bool gre_tnl = dp_packet_tunnel_gre(p);
>  
> -    tso_segsz = dp_packet_get_tso_segsz(p);
> -    if (!tso_segsz) {
> -        VLOG_WARN_RL(&rl, "GSO packet with len %d with no segment size.",
> -                     dp_packet_size(p));
> -        return false;
> +    if (udp_tnl) {
> +        /* Update tunnel UDP header length. */
> +        struct udp_header *tnl_hdr;
> +
> +        tnl_hdr = dp_packet_l4(seg);
> +        tnl_hdr->udp_len = htons(dp_packet_l4_size(seg));
>      }
>  
>      if (udp_tnl || gre_tnl) {
> -        ip_hdr = dp_packet_inner_l3(p);
> +        /* Update tunnel inner L3 header. */
> +        ip_hdr = dp_packet_inner_l3(seg);
>          if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
> -            inner_ip_id = ntohs(ip_hdr->ip_id);
> +            ip_hdr->ip_tot_len = htons(dp_packet_inner_l3_size(seg));
> +            ip_hdr->ip_id = htons(ntohs(ip_hdr->ip_id) + seg_no);
> +            ip_hdr->ip_csum = 0;
> +            dp_packet_inner_ip_checksum_set_partial(seg);
> +        } else {
> +            struct ovs_16aligned_ip6_hdr *ip6_hdr;
> +
> +            ip6_hdr = dp_packet_inner_l3(seg);
> +            ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
> +                = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
>          }
> +    }
>  
> -        tcp_hdr = dp_packet_inner_l4(p);
> +    /* Update L3 header. */
> +    ip_hdr = dp_packet_l3(seg);
> +    if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
> +        ip_hdr->ip_tot_len = htons(dp_packet_l3_size(seg));
> +        ip_hdr->ip_id = htons(ntohs(ip_hdr->ip_id) + seg_no);
> +        ip_hdr->ip_csum = 0;
> +        dp_packet_ip_checksum_set_partial(seg);
>      } else {
> -        tcp_hdr = dp_packet_l4(p);
> -    }
> +        struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
>  
> -    ip_hdr = dp_packet_l3(p);
> -    outer_ipv4 = IP_VER(ip_hdr->ip_ihl_ver) == 4;
> -    if (outer_ipv4) {
> -        outer_ip_id = ntohs(ip_hdr->ip_id);
> +        ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
> +            = htons(dp_packet_l3_size(seg) - sizeof *ip6_hdr);
>      }
>  
> -    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> +    /* Update L4 header. */
> +    if (udp_tnl || gre_tnl) {
> +        tcp_hdr = dp_packet_inner_l4(seg);
> +        dp_packet_inner_l4_checksum_set_partial(seg);
> +    } else {
> +        tcp_hdr = dp_packet_l4(seg);
> +        dp_packet_l4_checksum_set_partial(seg);
> +    }
>      tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
> -    hdr_len = ((char *) tcp_hdr - (char *) dp_packet_eth(p))
> -              + tcp_offset * 4;
> -    const char *data_tail = (char *) dp_packet_tail(p)
> -                            - dp_packet_l2_pad_size(p);
> -    const char *data_pos = (char *) tcp_hdr + tcp_offset * 4;
> -    int n_segs = dp_packet_gso_nr_segs(p);
> -
> -    for (int i = 0; i < n_segs; i++) {
> -        seg_len = data_tail - data_pos;
> -        if (seg_len > tso_segsz) {
> -            seg_len = tso_segsz;
> -        }
> -
> -        seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len);
> -        data_pos += seg_len;
> +    tcp_seq += seg_no * tso_segsz;
> +    put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
> +
> +    if (OVS_LIKELY(seg_no < (n_segs - 1))) {
> +        uint16_t tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> +        /* Reset flags PUSH and FIN unless it is the last segment. */
> +        uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
> +                             & ~(TCP_PSH | TCP_FIN);
> +        tcp_hdr->tcp_ctl = TCP_CTL(tcp_flags, tcp_offset);
> +    }
>  
> -        if (udp_tnl) {
> -            /* Update tunnel UDP header length. */
> -            struct udp_header *tnl_hdr;
> +    if (gre_tnl) {
> +        struct gre_base_hdr *ghdr;
>  
> -            tnl_hdr = dp_packet_l4(seg);
> -            tnl_hdr->udp_len = htons(dp_packet_l4_size(seg));
> -        }
> +        ghdr = dp_packet_l4(seg);
>  
> -        if (udp_tnl || gre_tnl) {
> -            /* Update tunnel inner L3 header. */
> -            ip_hdr = dp_packet_inner_l3(seg);
> -            if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
> -                ip_hdr->ip_tot_len = htons(dp_packet_inner_l3_size(seg));
> -                ip_hdr->ip_id = htons(inner_ip_id);
> -                ip_hdr->ip_csum = 0;
> -                dp_packet_inner_ip_checksum_set_partial(seg);
> -                inner_ip_id++;
> -            } else {
> -                struct ovs_16aligned_ip6_hdr *ip6_hdr;
> -
> -                ip6_hdr = dp_packet_inner_l3(seg);
> -                ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
> -                    = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
> -            }
> +        if (ghdr->flags & htons(GRE_CSUM)) {
> +            ovs_be16 *csum_opt = (ovs_be16 *) (ghdr + 1);
> +            *csum_opt = 0;
> +            *csum_opt = csum(ghdr, dp_packet_l4_size(seg));
>          }
> +    }
> +}
>  
> -        /* Update L3 header. */
> -        if (outer_ipv4) {
> -            ip_hdr = dp_packet_l3(seg);
> -            ip_hdr->ip_tot_len = htons(dp_packet_l3_size(seg));
> -            ip_hdr->ip_id = htons(outer_ip_id);
> -            ip_hdr->ip_csum = 0;
> -            dp_packet_ip_checksum_set_partial(seg);
> -            outer_ip_id++;
> -        } else {
> -            struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
> +/* Perform software segmentation on packet 'p'.
> + *
> + * Segments packet 'p' into the array of preallocated batches in 'batches',
> + * updating the 'batches' pointer as needed. */
> +void
> +dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
> +{
> +    struct dp_packet_batch *curr_batch = *batches;
> +    struct dp_packet *seg;
> +    uint16_t tso_segsz;
> +    size_t data_len;
> +    size_t hdr_len;
> +    int n_segs;
>  
> -            ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
> -                = htons(dp_packet_l3_size(seg) - sizeof *ip6_hdr);
> -        }
> +    tso_segsz = dp_packet_get_tso_segsz(p);
> +    ovs_assert(tso_segsz);
> +    n_segs = dp_packet_gso_nr_segs(p);
>  
> -        /* Update L4 header. */
> -        if (udp_tnl || gre_tnl) {
> -            tcp_hdr = dp_packet_inner_l4(seg);
> -            dp_packet_inner_l4_checksum_set_partial(seg);
> -        } else {
> -            tcp_hdr = dp_packet_l4(seg);
> -            dp_packet_l4_checksum_set_partial(seg);
> -        }
> -        put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
> -        tcp_seq += seg_len;
> -        if (OVS_LIKELY(i < (n_segs - 1))) {
> -            /* Reset flags PUSH and FIN unless it is the last segment. */
> -            uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
> -                                 & ~(TCP_PSH | TCP_FIN);
> -            tcp_hdr->tcp_ctl = TCP_CTL(tcp_flags, tcp_offset);
> -        }
> +    /* Put back the first segment in the batch, it will be trimmed after
> +     * all segments have been copied. */
> +    if (dp_packet_batch_is_full(curr_batch)) {
> +        curr_batch++;
> +    }
> +    dp_packet_batch_add(curr_batch, p);
>  
> -        if (gre_tnl) {
> -            struct gre_base_hdr *ghdr;
> +    if (n_segs == 1) {
> +        goto out;
> +    }
>  
> -            ghdr = dp_packet_l4(seg);
> +    if (dp_packet_tunnel(p)) {
> +        hdr_len = (char *) dp_packet_get_inner_tcp_payload(p)
> +                  - (char *) dp_packet_eth(p);
> +        data_len = dp_packet_get_inner_tcp_payload_length(p);
> +    } else {
> +        hdr_len = (char *) dp_packet_get_tcp_payload(p)
> +                  - (char *) dp_packet_eth(p);
> +        data_len = dp_packet_get_tcp_payload_length(p);
> +    }
>  
> -            if (ghdr->flags & htons(GRE_CSUM)) {
> -                ovs_be16 *csum_opt = (ovs_be16 *) (ghdr + 1);
> -                *csum_opt = 0;
> -                *csum_opt = csum(ghdr, dp_packet_l4_size(seg));
> -            }
> -        }
> +    for (int i = 1; i < n_segs - 1; i++) {
> +        seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + i * tso_segsz,
> +                                    tso_segsz);
> +        dp_packet_gso_update_segment(seg, i, n_segs, tso_segsz);
>  
>          if (dp_packet_batch_is_full(curr_batch)) {
>              curr_batch++;
>          }
> -
>          dp_packet_batch_add(curr_batch, seg);
>      }
>  
> +    /* Create the last segment. */
> +    seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + (n_segs - 1) * tso_segsz,
> +                                data_len - (n_segs - 1) * tso_segsz);
> +    dp_packet_gso_update_segment(seg, n_segs - 1, n_segs, tso_segsz);
> +
> +    if (dp_packet_batch_is_full(curr_batch)) {
> +        curr_batch++;
> +    }
> +    dp_packet_batch_add(curr_batch, seg);
> +
> +    /* Trim the first segment and reset TSO. */
> +    dp_packet_set_size(p, hdr_len + tso_segsz);
> +    dp_packet_set_tso_segsz(p, 0);
> +    dp_packet_gso_update_segment(p, 0, n_segs, tso_segsz);
> +
> +out:
>      *batches = curr_batch;
> -    return true;
>  }
> diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h
> index 9c282fb86c..2abc19ea31 100644
> --- a/lib/dp-packet-gso.h
> +++ b/lib/dp-packet-gso.h
> @@ -17,7 +17,7 @@
>  #ifndef DP_PACKET_GSO_H
>  #define DP_PACKET_GSO_H 1
>  
> -bool dp_packet_gso(struct dp_packet *, struct dp_packet_batch **);
> +void dp_packet_gso(struct dp_packet *, struct dp_packet_batch **);
>  int dp_packet_gso_nr_segs(struct dp_packet *);
>  
>  #endif /* dp-packet-gso.h */
> diff --git a/lib/netdev.c b/lib/netdev.c
> index 6a05e9a7e5..f493dab005 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -71,7 +71,6 @@ COVERAGE_DEFINE(netdev_add_router);
>  COVERAGE_DEFINE(netdev_get_stats);
>  COVERAGE_DEFINE(netdev_push_header_drops);
>  COVERAGE_DEFINE(netdev_soft_seg_good);
> -COVERAGE_DEFINE(netdev_soft_seg_drops);
>  
>  struct netdev_saved_flags {
>      struct netdev *netdev;
> @@ -843,17 +842,12 @@ netdev_send_tso(struct netdev *netdev, int qid,
>      curr_batch = batches;
>      DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) {
>          if (dp_packet_get_tso_segsz(packet)) {
> -            if (dp_packet_gso(packet, &curr_batch)) {
> -                COVERAGE_INC(netdev_soft_seg_good);
> -            } else {
> -                COVERAGE_INC(netdev_soft_seg_drops);
> -            }
> -            dp_packet_delete(packet);
> +            dp_packet_gso(packet, &curr_batch);
> +            COVERAGE_INC(netdev_soft_seg_good);
>          } else {
>              if (dp_packet_batch_is_full(curr_batch)) {
>                  curr_batch++;
>              }
> -
>              dp_packet_batch_add(curr_batch, packet);
>          }
>      }
David Marchand Dec. 9, 2025, 9:59 a.m. UTC | #3
On Fri, 5 Dec 2025 at 18:25, Kevin Traynor <ktraynor@redhat.com> wrote:
>
> On 12/11/2025 17:04, David Marchand via dev wrote:
> > Refactor software segmentation dp_packet_gso() using two helpers:
> > - dp_packet_gso_seg_new: create a segment from the original packet,
> >   copying headers and relevant payload,
> > - dp_packet_gso_update_segment: update the headers,
> >
> > For dp_packet_gso_seg_new, prefer working on data offset instead of
> > passing a pointer to the packet data.
> > This simplifies calling this helper: segments data are either of size
> > tso_segsz, or the remaining data for the last segment.
> >
> > For dp_packet_gso_update_segment, (inner and outer) ip_id is based
> > on the value copied from the original packet + the segment index.
> > The first and last segments handling are separated from the other
> > segments:
> > - rather than copying data for the first segment, we can simply resize the
> >   original packet (once all segments have been copied), this small
> >   optimisation saves one tso_segsz copy,
> > - the last segment needs special handling wrt its length and TCP flags,
> >
> > Finally, replace the check on tso_segsz presence with an ovs_assert() as
> > this code should only be invoked when segmenting.
> > Thanks to this, dp_packet_gso() can be declared void, and the
> > netdev_soft_seg_drops counter has no reason to exist anymore.
> >
> > The code is easier to read this way, and it will be beneficial to the next
> > commit.
> >
>
> > Note: this patch is easier to read if ignoring whitespace changes.
> >
>
> Can drop this from commit msg

Yes, will do.

[snip]

> > +static void
> > +dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
> > +                             uint16_t tso_segsz)
> >  {
> > -    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> > -    struct dp_packet_batch *curr_batch = *batches;
>
> > +    bool udp_tnl = dp_packet_tunnel_vxlan(seg)
> > +                   || dp_packet_tunnel_geneve(seg);
> > +    bool gre_tnl = dp_packet_tunnel_gre(seg);
>
> Only disadvantage I can see from change is that these have to be read
> for every update. I guess it shouldn't make any real difference and
> adding them to the API doesn't seem very nice either

I agree, reading the current segment properties is unneeded, because
the properties of the original packet are known from the start.

This is an internal API, I don't mind updating it: we are already
passing segment numbers which are not that elegant to me :-).
Maybe a cleaner API would be to pass the original packet, and let the
compiler notice we always refer to the same info (idem with nr_segs
and tso_segsz) for all segments?
David Marchand Dec. 9, 2025, 11:03 a.m. UTC | #4
On Tue, 9 Dec 2025 at 10:59, David Marchand <david.marchand@redhat.com> wrote:
> > > +static void
> > > +dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
> > > +                             uint16_t tso_segsz)
> > >  {
> > > -    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> > > -    struct dp_packet_batch *curr_batch = *batches;
> >
> > > +    bool udp_tnl = dp_packet_tunnel_vxlan(seg)
> > > +                   || dp_packet_tunnel_geneve(seg);
> > > +    bool gre_tnl = dp_packet_tunnel_gre(seg);
> >
> > Only disadvantage I can see from change is that these have to be read
> > for every update. I guess it shouldn't make any real difference and
> > adding them to the API doesn't seem very nice either
>
> I agree, reading the current segment properties is unneeded, because
> the properties of the original packet are known from the start.
>
> This is an internal API, I don't mind updating it: we are already
> passing segment numbers which are not that elegant to me :-).
> Maybe a cleaner API would be to pass the original packet, and let the
> compiler notice we always refer to the same info (idem with nr_segs
> and tso_segsz) for all segments?

This triggers a divide by 0, as the original packet gets its TSO segment
size reset. I could defer this until after the call to
dp_packet_gso_update_segment(), but still, this is awkward with the next
patch.

I'll simply pass all needed info as input parameters.
diff mbox series

Patch

diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
index fe7186ddf4..6b8da8b370 100644
--- a/lib/dp-packet-gso.c
+++ b/lib/dp-packet-gso.c
@@ -29,19 +29,19 @@  VLOG_DEFINE_THIS_MODULE(dp_packet_gso);
  *
  * The new packet is initialized with 'hdr_len' bytes from the
  * start of packet 'p' and then appended with 'data_len' bytes
- * from the 'data' buffer.
+ * from the packet 'p' at offset 'data_off'.
  *
  * Note: The packet headers are not updated. */
 static struct dp_packet *
 dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len,
-                      const char *data, size_t data_len)
+                      size_t data_off, size_t data_len)
 {
     struct dp_packet *seg = dp_packet_new_with_headroom(hdr_len + data_len,
                                                         dp_packet_headroom(p));
 
     /* Append the original packet headers and then the payload. */
     dp_packet_put(seg, dp_packet_data(p), hdr_len);
-    dp_packet_put(seg, data, data_len);
+    dp_packet_put(seg, (char *) dp_packet_data(p) + data_off, data_len);
 
     /* The new segment should have the same offsets. */
     seg->l2_5_ofs = p->l2_5_ofs;
@@ -77,151 +77,154 @@  dp_packet_gso_nr_segs(struct dp_packet *p)
     return DIV_ROUND_UP(data_length, segsz);
 }
 
-/* Perform software segmentation on packet 'p'.
- *
- * Segments packet 'p' into the array of preallocated batches in 'batches',
- * updating the 'batches' pointer as needed and returns true.
- *
- * Returns false if the packet cannot be segmented. */
-bool
-dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
+static void
+dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
+                             uint16_t tso_segsz)
 {
-    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
-    struct dp_packet_batch *curr_batch = *batches;
+    bool udp_tnl = dp_packet_tunnel_vxlan(seg)
+                   || dp_packet_tunnel_geneve(seg);
+    bool gre_tnl = dp_packet_tunnel_gre(seg);
     struct tcp_header *tcp_hdr;
     struct ip_header *ip_hdr;
-    uint16_t inner_ip_id = 0;
-    uint16_t outer_ip_id = 0;
-    struct dp_packet *seg;
-    uint16_t tcp_offset;
-    uint16_t tso_segsz;
     uint32_t tcp_seq;
-    bool outer_ipv4;
-    int hdr_len;
-    int seg_len;
-    bool udp_tnl = dp_packet_tunnel_vxlan(p)
-                   || dp_packet_tunnel_geneve(p);
-    bool gre_tnl = dp_packet_tunnel_gre(p);
 
-    tso_segsz = dp_packet_get_tso_segsz(p);
-    if (!tso_segsz) {
-        VLOG_WARN_RL(&rl, "GSO packet with len %d with no segment size.",
-                     dp_packet_size(p));
-        return false;
+    if (udp_tnl) {
+        /* Update tunnel UDP header length. */
+        struct udp_header *tnl_hdr;
+
+        tnl_hdr = dp_packet_l4(seg);
+        tnl_hdr->udp_len = htons(dp_packet_l4_size(seg));
     }
 
     if (udp_tnl || gre_tnl) {
-        ip_hdr = dp_packet_inner_l3(p);
+        /* Update tunnel inner L3 header. */
+        ip_hdr = dp_packet_inner_l3(seg);
         if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
-            inner_ip_id = ntohs(ip_hdr->ip_id);
+            ip_hdr->ip_tot_len = htons(dp_packet_inner_l3_size(seg));
+            ip_hdr->ip_id = htons(ntohs(ip_hdr->ip_id) + seg_no);
+            ip_hdr->ip_csum = 0;
+            dp_packet_inner_ip_checksum_set_partial(seg);
+        } else {
+            struct ovs_16aligned_ip6_hdr *ip6_hdr;
+
+            ip6_hdr = dp_packet_inner_l3(seg);
+            ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
+                = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
         }
+    }
 
-        tcp_hdr = dp_packet_inner_l4(p);
+    /* Update L3 header. */
+    ip_hdr = dp_packet_l3(seg);
+    if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
+        ip_hdr->ip_tot_len = htons(dp_packet_l3_size(seg));
+        ip_hdr->ip_id = htons(ntohs(ip_hdr->ip_id) + seg_no);
+        ip_hdr->ip_csum = 0;
+        dp_packet_ip_checksum_set_partial(seg);
     } else {
-        tcp_hdr = dp_packet_l4(p);
-    }
+        struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
 
-    ip_hdr = dp_packet_l3(p);
-    outer_ipv4 = IP_VER(ip_hdr->ip_ihl_ver) == 4;
-    if (outer_ipv4) {
-        outer_ip_id = ntohs(ip_hdr->ip_id);
+        ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
+            = htons(dp_packet_l3_size(seg) - sizeof *ip6_hdr);
     }
 
-    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
+    /* Update L4 header. */
+    if (udp_tnl || gre_tnl) {
+        tcp_hdr = dp_packet_inner_l4(seg);
+        dp_packet_inner_l4_checksum_set_partial(seg);
+    } else {
+        tcp_hdr = dp_packet_l4(seg);
+        dp_packet_l4_checksum_set_partial(seg);
+    }
     tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
-    hdr_len = ((char *) tcp_hdr - (char *) dp_packet_eth(p))
-              + tcp_offset * 4;
-    const char *data_tail = (char *) dp_packet_tail(p)
-                            - dp_packet_l2_pad_size(p);
-    const char *data_pos = (char *) tcp_hdr + tcp_offset * 4;
-    int n_segs = dp_packet_gso_nr_segs(p);
-
-    for (int i = 0; i < n_segs; i++) {
-        seg_len = data_tail - data_pos;
-        if (seg_len > tso_segsz) {
-            seg_len = tso_segsz;
-        }
-
-        seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len);
-        data_pos += seg_len;
+    tcp_seq += seg_no * tso_segsz;
+    put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
+
+    if (OVS_LIKELY(seg_no < (n_segs - 1))) {
+        uint16_t tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
+        /* Reset flags PUSH and FIN unless it is the last segment. */
+        uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
+                             & ~(TCP_PSH | TCP_FIN);
+        tcp_hdr->tcp_ctl = TCP_CTL(tcp_flags, tcp_offset);
+    }
 
-        if (udp_tnl) {
-            /* Update tunnel UDP header length. */
-            struct udp_header *tnl_hdr;
+    if (gre_tnl) {
+        struct gre_base_hdr *ghdr;
 
-            tnl_hdr = dp_packet_l4(seg);
-            tnl_hdr->udp_len = htons(dp_packet_l4_size(seg));
-        }
+        ghdr = dp_packet_l4(seg);
 
-        if (udp_tnl || gre_tnl) {
-            /* Update tunnel inner L3 header. */
-            ip_hdr = dp_packet_inner_l3(seg);
-            if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
-                ip_hdr->ip_tot_len = htons(dp_packet_inner_l3_size(seg));
-                ip_hdr->ip_id = htons(inner_ip_id);
-                ip_hdr->ip_csum = 0;
-                dp_packet_inner_ip_checksum_set_partial(seg);
-                inner_ip_id++;
-            } else {
-                struct ovs_16aligned_ip6_hdr *ip6_hdr;
-
-                ip6_hdr = dp_packet_inner_l3(seg);
-                ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
-                    = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
-            }
+        if (ghdr->flags & htons(GRE_CSUM)) {
+            ovs_be16 *csum_opt = (ovs_be16 *) (ghdr + 1);
+            *csum_opt = 0;
+            *csum_opt = csum(ghdr, dp_packet_l4_size(seg));
         }
+    }
+}
 
-        /* Update L3 header. */
-        if (outer_ipv4) {
-            ip_hdr = dp_packet_l3(seg);
-            ip_hdr->ip_tot_len = htons(dp_packet_l3_size(seg));
-            ip_hdr->ip_id = htons(outer_ip_id);
-            ip_hdr->ip_csum = 0;
-            dp_packet_ip_checksum_set_partial(seg);
-            outer_ip_id++;
-        } else {
-            struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
+/* Perform software segmentation on packet 'p'.
+ *
+ * Segments packet 'p' into the array of preallocated batches in 'batches',
+ * updating the 'batches' pointer as needed. */
+void
+dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
+{
+    struct dp_packet_batch *curr_batch = *batches;
+    struct dp_packet *seg;
+    uint16_t tso_segsz;
+    size_t data_len;
+    size_t hdr_len;
+    int n_segs;
 
-            ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
-                = htons(dp_packet_l3_size(seg) - sizeof *ip6_hdr);
-        }
+    tso_segsz = dp_packet_get_tso_segsz(p);
+    ovs_assert(tso_segsz);
+    n_segs = dp_packet_gso_nr_segs(p);
 
-        /* Update L4 header. */
-        if (udp_tnl || gre_tnl) {
-            tcp_hdr = dp_packet_inner_l4(seg);
-            dp_packet_inner_l4_checksum_set_partial(seg);
-        } else {
-            tcp_hdr = dp_packet_l4(seg);
-            dp_packet_l4_checksum_set_partial(seg);
-        }
-        put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
-        tcp_seq += seg_len;
-        if (OVS_LIKELY(i < (n_segs - 1))) {
-            /* Reset flags PUSH and FIN unless it is the last segment. */
-            uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
-                                 & ~(TCP_PSH | TCP_FIN);
-            tcp_hdr->tcp_ctl = TCP_CTL(tcp_flags, tcp_offset);
-        }
+    /* Put back the first segment in the batch, it will be trimmed after
+     * all segments have been copied. */
+    if (dp_packet_batch_is_full(curr_batch)) {
+        curr_batch++;
+    }
+    dp_packet_batch_add(curr_batch, p);
 
-        if (gre_tnl) {
-            struct gre_base_hdr *ghdr;
+    if (n_segs == 1) {
+        goto out;
+    }
 
-            ghdr = dp_packet_l4(seg);
+    if (dp_packet_tunnel(p)) {
+        hdr_len = (char *) dp_packet_get_inner_tcp_payload(p)
+                  - (char *) dp_packet_eth(p);
+        data_len = dp_packet_get_inner_tcp_payload_length(p);
+    } else {
+        hdr_len = (char *) dp_packet_get_tcp_payload(p)
+                  - (char *) dp_packet_eth(p);
+        data_len = dp_packet_get_tcp_payload_length(p);
+    }
 
-            if (ghdr->flags & htons(GRE_CSUM)) {
-                ovs_be16 *csum_opt = (ovs_be16 *) (ghdr + 1);
-                *csum_opt = 0;
-                *csum_opt = csum(ghdr, dp_packet_l4_size(seg));
-            }
-        }
+    for (int i = 1; i < n_segs - 1; i++) {
+        seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + i * tso_segsz,
+                                    tso_segsz);
+        dp_packet_gso_update_segment(seg, i, n_segs, tso_segsz);
 
         if (dp_packet_batch_is_full(curr_batch)) {
             curr_batch++;
         }
-
         dp_packet_batch_add(curr_batch, seg);
     }
 
+    /* Create the last segment. */
+    seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + (n_segs - 1) * tso_segsz,
+                                data_len - (n_segs - 1) * tso_segsz);
+    dp_packet_gso_update_segment(seg, n_segs - 1, n_segs, tso_segsz);
+
+    if (dp_packet_batch_is_full(curr_batch)) {
+        curr_batch++;
+    }
+    dp_packet_batch_add(curr_batch, seg);
+
+    /* Trim the first segment and reset TSO. */
+    dp_packet_set_size(p, hdr_len + tso_segsz);
+    dp_packet_set_tso_segsz(p, 0);
+    dp_packet_gso_update_segment(p, 0, n_segs, tso_segsz);
+
+out:
     *batches = curr_batch;
-    return true;
 }
diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h
index 9c282fb86c..2abc19ea31 100644
--- a/lib/dp-packet-gso.h
+++ b/lib/dp-packet-gso.h
@@ -17,7 +17,7 @@ 
 #ifndef DP_PACKET_GSO_H
 #define DP_PACKET_GSO_H 1
 
-bool dp_packet_gso(struct dp_packet *, struct dp_packet_batch **);
+void dp_packet_gso(struct dp_packet *, struct dp_packet_batch **);
 int dp_packet_gso_nr_segs(struct dp_packet *);
 
 #endif /* dp-packet-gso.h */
diff --git a/lib/netdev.c b/lib/netdev.c
index 6a05e9a7e5..f493dab005 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -71,7 +71,6 @@  COVERAGE_DEFINE(netdev_add_router);
 COVERAGE_DEFINE(netdev_get_stats);
 COVERAGE_DEFINE(netdev_push_header_drops);
 COVERAGE_DEFINE(netdev_soft_seg_good);
-COVERAGE_DEFINE(netdev_soft_seg_drops);
 
 struct netdev_saved_flags {
     struct netdev *netdev;
@@ -843,17 +842,12 @@  netdev_send_tso(struct netdev *netdev, int qid,
     curr_batch = batches;
     DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) {
         if (dp_packet_get_tso_segsz(packet)) {
-            if (dp_packet_gso(packet, &curr_batch)) {
-                COVERAGE_INC(netdev_soft_seg_good);
-            } else {
-                COVERAGE_INC(netdev_soft_seg_drops);
-            }
-            dp_packet_delete(packet);
+            dp_packet_gso(packet, &curr_batch);
+            COVERAGE_INC(netdev_soft_seg_good);
         } else {
             if (dp_packet_batch_is_full(curr_batch)) {
                 curr_batch++;
             }
-
             dp_packet_batch_add(curr_batch, packet);
         }
     }