[net-next,01/12] net/mlx5e: Add UDP GSO support

Message ID 20180628215103.9141-2-saeedm@mellanox.com
State Accepted
Delegated to: David Miller
Headers show
Series
  • [net-next,01/12] net/mlx5e: Add UDP GSO support
Related show

Commit Message

Saeed Mahameed June 28, 2018, 9:50 p.m.
From: Boris Pismenny <borisp@mellanox.com>

This patch enables UDP GSO support. We enable this by using two WQEs:
the first is a UDP LSO WQE for all segments with equal length, and the
second is for the last segment in case it has a different length.
Due to a HW limitation, before sending, we must adjust the packet length fields.

We measure performance between two Intel(R) Xeon(R) CPU E5-2643 v2 @3.50GHz
machines connected back-to-back with Connectx4-Lx (40Gbps) NICs.
We compare single stream UDP, UDP GSO and UDP GSO with offload.
Performance:
		| MSS (bytes)	| Throughput (Gbps)	| CPU utilization (%)
UDP GSO offload	| 1472		| 35.6			| 8%
UDP GSO 	| 1472		| 25.5			| 17%
UDP 		| 1472		| 10.2			| 17%
UDP GSO offload	| 1024		| 35.6			| 8%
UDP GSO		| 1024		| 19.2			| 17%
UDP 		| 1024		| 5.7			| 17%
UDP GSO offload	| 512		| 33.8			| 16%
UDP GSO		| 512		| 10.4			| 17%
UDP 		| 512		| 3.5			| 17%

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Yossi Kuperman <yossiku@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/Makefile  |   4 +-
 .../mellanox/mlx5/core/en_accel/en_accel.h    |  11 +-
 .../mellanox/mlx5/core/en_accel/rxtx.c        | 108 ++++++++++++++++++
 .../mellanox/mlx5/core/en_accel/rxtx.h        |  14 +++
 .../net/ethernet/mellanox/mlx5/core/en_main.c |   3 +
 .../net/ethernet/mellanox/mlx5/core/en_tx.c   |   8 +-
 6 files changed, 139 insertions(+), 9 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/rxtx.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/rxtx.h

Comments

Willem de Bruijn June 29, 2018, 10:19 p.m. | #1
On Fri, Jun 29, 2018 at 2:24 AM Saeed Mahameed <saeedm@mellanox.com> wrote:
>
> From: Boris Pismenny <borisp@mellanox.com>
>
> This patch enables UDP GSO support. We enable this by using two WQEs
> the first is a UDP LSO WQE for all segments with equal length, and the
> second is for the last segment in case it has different length.
> Due to HW limitation, before sending, we must adjust the packet length fields.
>
> We measure performance between two Intel(R) Xeon(R) CPU E5-2643 v2 @3.50GHz
> machines connected back-to-back with Connectx4-Lx (40Gbps) NICs.
> We compare single stream UDP, UDP GSO and UDP GSO with offload.
> Performance:
>                 | MSS (bytes)   | Throughput (Gbps)     | CPU utilization (%)
> UDP GSO offload | 1472          | 35.6                  | 8%
> UDP GSO         | 1472          | 25.5                  | 17%
> UDP             | 1472          | 10.2                  | 17%
> UDP GSO offload | 1024          | 35.6                  | 8%
> UDP GSO         | 1024          | 19.2                  | 17%
> UDP             | 1024          | 5.7                   | 17%
> UDP GSO offload | 512           | 33.8                  | 16%
> UDP GSO         | 512           | 10.4                  | 17%
> UDP             | 512           | 3.5                   | 17%

Very nice results :)

> +static void mlx5e_udp_gso_prepare_last_skb(struct sk_buff *skb,
> +                                          struct sk_buff *nskb,
> +                                          int remaining)
> +{
> +       int bytes_needed = remaining, remaining_headlen, remaining_page_offset;
> +       int headlen = skb_transport_offset(skb) + sizeof(struct udphdr);
> +       int payload_len = remaining + sizeof(struct udphdr);
> +       int k = 0, i, j;
> +
> +       skb_copy_bits(skb, 0, nskb->data, headlen);
> +       nskb->dev = skb->dev;
> +       skb_reset_mac_header(nskb);
> +       skb_set_network_header(nskb, skb_network_offset(skb));
> +       skb_set_transport_header(nskb, skb_transport_offset(skb));
> +       skb_set_tail_pointer(nskb, headlen);
> +
> +       /* How many frags do we need? */
> +       for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
> +               bytes_needed -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
> +               k++;
> +               if (bytes_needed <= 0)
> +                       break;
> +       }
> +
> +       /* Fill the first frag and split it if necessary */
> +       j = skb_shinfo(skb)->nr_frags - k;
> +       remaining_page_offset = -bytes_needed;
> +       skb_fill_page_desc(nskb, 0,
> +                          skb_shinfo(skb)->frags[j].page.p,
> +                          skb_shinfo(skb)->frags[j].page_offset + remaining_page_offset,
> +                          skb_shinfo(skb)->frags[j].size - remaining_page_offset);
> +
> +       skb_frag_ref(skb, j);
> +
> +       /* Fill the rest of the frags */
> +       for (i = 1; i < k; i++) {
> +               j = skb_shinfo(skb)->nr_frags - k + i;
> +
> +               skb_fill_page_desc(nskb, i,
> +                                  skb_shinfo(skb)->frags[j].page.p,
> +                                  skb_shinfo(skb)->frags[j].page_offset,
> +                                  skb_shinfo(skb)->frags[j].size);
> +               skb_frag_ref(skb, j);
> +       }
> +       skb_shinfo(nskb)->nr_frags = k;
> +
> +       remaining_headlen = remaining - skb->data_len;
> +
> +       /* headlen contains remaining data? */
> +       if (remaining_headlen > 0)
> +               skb_copy_bits(skb, skb->len - remaining, nskb->data + headlen,
> +                             remaining_headlen);
> +       nskb->len = remaining + headlen;
> +       nskb->data_len =  payload_len - sizeof(struct udphdr) +
> +               max_t(int, 0, remaining_headlen);
> +       nskb->protocol = skb->protocol;
> +       if (nskb->protocol == htons(ETH_P_IP)) {
> +               ip_hdr(nskb)->id = htons(ntohs(ip_hdr(nskb)->id) +
> +                                        skb_shinfo(skb)->gso_segs);
> +               ip_hdr(nskb)->tot_len =
> +                       htons(payload_len + sizeof(struct iphdr));
> +       } else {
> +               ipv6_hdr(nskb)->payload_len = htons(payload_len);
> +       }
> +       udp_hdr(nskb)->len = htons(payload_len);
> +       skb_shinfo(nskb)->gso_size = 0;
> +       nskb->ip_summed = skb->ip_summed;
> +       nskb->csum_start = skb->csum_start;
> +       nskb->csum_offset = skb->csum_offset;
> +       nskb->queue_mapping = skb->queue_mapping;
> +}
> +
> +/* might send skbs and update wqe and pi */
> +struct sk_buff *mlx5e_udp_gso_handle_tx_skb(struct net_device *netdev,
> +                                           struct mlx5e_txqsq *sq,
> +                                           struct sk_buff *skb,
> +                                           struct mlx5e_tx_wqe **wqe,
> +                                           u16 *pi)
> +{
> +       int payload_len = skb_shinfo(skb)->gso_size + sizeof(struct udphdr);
> +       int headlen = skb_transport_offset(skb) + sizeof(struct udphdr);
> +       int remaining = (skb->len - headlen) % skb_shinfo(skb)->gso_size;
> +       struct sk_buff *nskb;
> +
> +       if (skb->protocol == htons(ETH_P_IP))
> +               ip_hdr(skb)->tot_len = htons(payload_len + sizeof(struct iphdr));
> +       else
> +               ipv6_hdr(skb)->payload_len = htons(payload_len);
> +       udp_hdr(skb)->len = htons(payload_len);
> +       if (!remaining)
> +               return skb;
> +
> +       nskb = alloc_skb(max_t(int, headlen, headlen + remaining - skb->data_len), GFP_ATOMIC);
> +       if (unlikely(!nskb)) {
> +               sq->stats->dropped++;
> +               return NULL;
> +       }
> +
> +       mlx5e_udp_gso_prepare_last_skb(skb, nskb, remaining);
> +
> +       skb_shinfo(skb)->gso_segs--;
> +       pskb_trim(skb, skb->len - remaining);
> +       mlx5e_sq_xmit(sq, skb, *wqe, *pi);
> +       mlx5e_sq_fetch_wqe(sq, wqe, pi);
> +       return nskb;
> +}

The device driver seems to be implementing the packet split here
similar to NETIF_F_GSO_PARTIAL. When advertising the right flag, the
stack should be able to do that for you and pass two packets to the
driver.
Boris Pismenny June 30, 2018, 4:06 p.m. | #2
On 06/30/18 01:19, Willem de Bruijn wrote:
> On Fri, Jun 29, 2018 at 2:24 AM Saeed Mahameed <saeedm@mellanox.com> wrote:
>> From: Boris Pismenny <borisp@mellanox.com>
>>
>> This patch enables UDP GSO support. We enable this by using two WQEs
>> the first is a UDP LSO WQE for all segments with equal length, and the
>> second is for the last segment in case it has different length.
>> Due to HW limitation, before sending, we must adjust the packet length fields.
>>
>> We measure performance between two Intel(R) Xeon(R) CPU E5-2643 v2 @3.50GHz
>> machines connected back-to-back with Connectx4-Lx (40Gbps) NICs.
>> We compare single stream UDP, UDP GSO and UDP GSO with offload.
>> Performance:
>>                  | MSS (bytes)   | Throughput (Gbps)     | CPU utilization (%)
>> UDP GSO offload | 1472          | 35.6                  | 8%
>> UDP GSO         | 1472          | 25.5                  | 17%
>> UDP             | 1472          | 10.2                  | 17%
>> UDP GSO offload | 1024          | 35.6                  | 8%
>> UDP GSO         | 1024          | 19.2                  | 17%
>> UDP             | 1024          | 5.7                   | 17%
>> UDP GSO offload | 512           | 33.8                  | 16%
>> UDP GSO         | 512           | 10.4                  | 17%
>> UDP             | 512           | 3.5                   | 17%
> Very nice results :)
Thanks. We expect to have 100Gbps results by netdev0x12.

>> +static void mlx5e_udp_gso_prepare_last_skb(struct sk_buff *skb,
>> +                                          struct sk_buff *nskb,
>> +                                          int remaining)
>> +{
>> +       int bytes_needed = remaining, remaining_headlen, remaining_page_offset;
>> +       int headlen = skb_transport_offset(skb) + sizeof(struct udphdr);
>> +       int payload_len = remaining + sizeof(struct udphdr);
>> +       int k = 0, i, j;
>> +
>> +       skb_copy_bits(skb, 0, nskb->data, headlen);
>> +       nskb->dev = skb->dev;
>> +       skb_reset_mac_header(nskb);
>> +       skb_set_network_header(nskb, skb_network_offset(skb));
>> +       skb_set_transport_header(nskb, skb_transport_offset(skb));
>> +       skb_set_tail_pointer(nskb, headlen);
>> +
>> +       /* How many frags do we need? */
>> +       for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
>> +               bytes_needed -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
>> +               k++;
>> +               if (bytes_needed <= 0)
>> +                       break;
>> +       }
>> +
>> +       /* Fill the first frag and split it if necessary */
>> +       j = skb_shinfo(skb)->nr_frags - k;
>> +       remaining_page_offset = -bytes_needed;
>> +       skb_fill_page_desc(nskb, 0,
>> +                          skb_shinfo(skb)->frags[j].page.p,
>> +                          skb_shinfo(skb)->frags[j].page_offset + remaining_page_offset,
>> +                          skb_shinfo(skb)->frags[j].size - remaining_page_offset);
>> +
>> +       skb_frag_ref(skb, j);
>> +
>> +       /* Fill the rest of the frags */
>> +       for (i = 1; i < k; i++) {
>> +               j = skb_shinfo(skb)->nr_frags - k + i;
>> +
>> +               skb_fill_page_desc(nskb, i,
>> +                                  skb_shinfo(skb)->frags[j].page.p,
>> +                                  skb_shinfo(skb)->frags[j].page_offset,
>> +                                  skb_shinfo(skb)->frags[j].size);
>> +               skb_frag_ref(skb, j);
>> +       }
>> +       skb_shinfo(nskb)->nr_frags = k;
>> +
>> +       remaining_headlen = remaining - skb->data_len;
>> +
>> +       /* headlen contains remaining data? */
>> +       if (remaining_headlen > 0)
>> +               skb_copy_bits(skb, skb->len - remaining, nskb->data + headlen,
>> +                             remaining_headlen);
>> +       nskb->len = remaining + headlen;
>> +       nskb->data_len =  payload_len - sizeof(struct udphdr) +
>> +               max_t(int, 0, remaining_headlen);
>> +       nskb->protocol = skb->protocol;
>> +       if (nskb->protocol == htons(ETH_P_IP)) {
>> +               ip_hdr(nskb)->id = htons(ntohs(ip_hdr(nskb)->id) +
>> +                                        skb_shinfo(skb)->gso_segs);
>> +               ip_hdr(nskb)->tot_len =
>> +                       htons(payload_len + sizeof(struct iphdr));
>> +       } else {
>> +               ipv6_hdr(nskb)->payload_len = htons(payload_len);
>> +       }
>> +       udp_hdr(nskb)->len = htons(payload_len);
>> +       skb_shinfo(nskb)->gso_size = 0;
>> +       nskb->ip_summed = skb->ip_summed;
>> +       nskb->csum_start = skb->csum_start;
>> +       nskb->csum_offset = skb->csum_offset;
>> +       nskb->queue_mapping = skb->queue_mapping;
>> +}
>> +
>> +/* might send skbs and update wqe and pi */
>> +struct sk_buff *mlx5e_udp_gso_handle_tx_skb(struct net_device *netdev,
>> +                                           struct mlx5e_txqsq *sq,
>> +                                           struct sk_buff *skb,
>> +                                           struct mlx5e_tx_wqe **wqe,
>> +                                           u16 *pi)
>> +{
>> +       int payload_len = skb_shinfo(skb)->gso_size + sizeof(struct udphdr);
>> +       int headlen = skb_transport_offset(skb) + sizeof(struct udphdr);
>> +       int remaining = (skb->len - headlen) % skb_shinfo(skb)->gso_size;
>> +       struct sk_buff *nskb;
>> +
>> +       if (skb->protocol == htons(ETH_P_IP))
>> +               ip_hdr(skb)->tot_len = htons(payload_len + sizeof(struct iphdr));
>> +       else
>> +               ipv6_hdr(skb)->payload_len = htons(payload_len);
>> +       udp_hdr(skb)->len = htons(payload_len);
>> +       if (!remaining)
>> +               return skb;
>> +
>> +       nskb = alloc_skb(max_t(int, headlen, headlen + remaining - skb->data_len), GFP_ATOMIC);
>> +       if (unlikely(!nskb)) {
>> +               sq->stats->dropped++;
>> +               return NULL;
>> +       }
>> +
>> +       mlx5e_udp_gso_prepare_last_skb(skb, nskb, remaining);
>> +
>> +       skb_shinfo(skb)->gso_segs--;
>> +       pskb_trim(skb, skb->len - remaining);
>> +       mlx5e_sq_xmit(sq, skb, *wqe, *pi);
>> +       mlx5e_sq_fetch_wqe(sq, wqe, pi);
>> +       return nskb;
>> +}
> The device driver seems to be implementing the packet split here
> similar to NETIF_F_GSO_PARTIAL. When advertising the right flag, the
> stack should be able to do that for you and pass two packets to the
> driver.

We've totally missed that!
Thank you. I'll fix and resubmit.
Boris Pismenny June 30, 2018, 7:40 p.m. | #3
On 06/30/18 19:06, Boris Pismenny wrote:
>
>
> On 06/30/18 01:19, Willem de Bruijn wrote:
>> On Fri, Jun 29, 2018 at 2:24 AM Saeed Mahameed <saeedm@mellanox.com> 
>> wrote:
>>> From: Boris Pismenny <borisp@mellanox.com>
>>>
>>> This patch enables UDP GSO support. We enable this by using two WQEs
>>> the first is a UDP LSO WQE for all segments with equal length, and the
>>> second is for the last segment in case it has different length.
>>> Due to HW limitation, before sending, we must adjust the packet 
>>> length fields.
>>>
>>> We measure performance between two Intel(R) Xeon(R) CPU E5-2643 v2 
>>> @3.50GHz
>>> machines connected back-to-back with Connectx4-Lx (40Gbps) NICs.
>>> We compare single stream UDP, UDP GSO and UDP GSO with offload.
>>> Performance:
>>>                  | MSS (bytes)   | Throughput (Gbps)     | CPU 
>>> utilization (%)
>>> UDP GSO offload | 1472          | 35.6                  | 8%
>>> UDP GSO         | 1472          | 25.5                  | 17%
>>> UDP             | 1472          | 10.2                  | 17%
>>> UDP GSO offload | 1024          | 35.6                  | 8%
>>> UDP GSO         | 1024          | 19.2                  | 17%
>>> UDP             | 1024          | 5.7                   | 17%
>>> UDP GSO offload | 512           | 33.8                  | 16%
>>> UDP GSO         | 512           | 10.4                  | 17%
>>> UDP             | 512           | 3.5                   | 17%
>> Very nice results :)
> Thanks. We expect to have 100Gbps results by netdev0x12.
>
>>> +static void mlx5e_udp_gso_prepare_last_skb(struct sk_buff *skb,
>>> +                                          struct sk_buff *nskb,
>>> +                                          int remaining)
>>> +{
>>> +       int bytes_needed = remaining, remaining_headlen, 
>>> remaining_page_offset;
>>> +       int headlen = skb_transport_offset(skb) + sizeof(struct 
>>> udphdr);
>>> +       int payload_len = remaining + sizeof(struct udphdr);
>>> +       int k = 0, i, j;
>>> +
>>> +       skb_copy_bits(skb, 0, nskb->data, headlen);
>>> +       nskb->dev = skb->dev;
>>> +       skb_reset_mac_header(nskb);
>>> +       skb_set_network_header(nskb, skb_network_offset(skb));
>>> +       skb_set_transport_header(nskb, skb_transport_offset(skb));
>>> +       skb_set_tail_pointer(nskb, headlen);
>>> +
>>> +       /* How many frags do we need? */
>>> +       for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
>>> +               bytes_needed -= 
>>> skb_frag_size(&skb_shinfo(skb)->frags[i]);
>>> +               k++;
>>> +               if (bytes_needed <= 0)
>>> +                       break;
>>> +       }
>>> +
>>> +       /* Fill the first frag and split it if necessary */
>>> +       j = skb_shinfo(skb)->nr_frags - k;
>>> +       remaining_page_offset = -bytes_needed;
>>> +       skb_fill_page_desc(nskb, 0,
>>> + skb_shinfo(skb)->frags[j].page.p,
>>> + skb_shinfo(skb)->frags[j].page_offset + remaining_page_offset,
>>> +                          skb_shinfo(skb)->frags[j].size - 
>>> remaining_page_offset);
>>> +
>>> +       skb_frag_ref(skb, j);
>>> +
>>> +       /* Fill the rest of the frags */
>>> +       for (i = 1; i < k; i++) {
>>> +               j = skb_shinfo(skb)->nr_frags - k + i;
>>> +
>>> +               skb_fill_page_desc(nskb, i,
>>> + skb_shinfo(skb)->frags[j].page.p,
>>> + skb_shinfo(skb)->frags[j].page_offset,
>>> + skb_shinfo(skb)->frags[j].size);
>>> +               skb_frag_ref(skb, j);
>>> +       }
>>> +       skb_shinfo(nskb)->nr_frags = k;
>>> +
>>> +       remaining_headlen = remaining - skb->data_len;
>>> +
>>> +       /* headlen contains remaining data? */
>>> +       if (remaining_headlen > 0)
>>> +               skb_copy_bits(skb, skb->len - remaining, nskb->data 
>>> + headlen,
>>> +                             remaining_headlen);
>>> +       nskb->len = remaining + headlen;
>>> +       nskb->data_len =  payload_len - sizeof(struct udphdr) +
>>> +               max_t(int, 0, remaining_headlen);
>>> +       nskb->protocol = skb->protocol;
>>> +       if (nskb->protocol == htons(ETH_P_IP)) {
>>> +               ip_hdr(nskb)->id = htons(ntohs(ip_hdr(nskb)->id) +
>>> + skb_shinfo(skb)->gso_segs);
>>> +               ip_hdr(nskb)->tot_len =
>>> +                       htons(payload_len + sizeof(struct iphdr));
>>> +       } else {
>>> +               ipv6_hdr(nskb)->payload_len = htons(payload_len);
>>> +       }
>>> +       udp_hdr(nskb)->len = htons(payload_len);
>>> +       skb_shinfo(nskb)->gso_size = 0;
>>> +       nskb->ip_summed = skb->ip_summed;
>>> +       nskb->csum_start = skb->csum_start;
>>> +       nskb->csum_offset = skb->csum_offset;
>>> +       nskb->queue_mapping = skb->queue_mapping;
>>> +}
>>> +
>>> +/* might send skbs and update wqe and pi */
>>> +struct sk_buff *mlx5e_udp_gso_handle_tx_skb(struct net_device *netdev,
>>> +                                           struct mlx5e_txqsq *sq,
>>> +                                           struct sk_buff *skb,
>>> +                                           struct mlx5e_tx_wqe **wqe,
>>> +                                           u16 *pi)
>>> +{
>>> +       int payload_len = skb_shinfo(skb)->gso_size + sizeof(struct 
>>> udphdr);
>>> +       int headlen = skb_transport_offset(skb) + sizeof(struct 
>>> udphdr);
>>> +       int remaining = (skb->len - headlen) % 
>>> skb_shinfo(skb)->gso_size;
>>> +       struct sk_buff *nskb;
>>> +
>>> +       if (skb->protocol == htons(ETH_P_IP))
>>> +               ip_hdr(skb)->tot_len = htons(payload_len + 
>>> sizeof(struct iphdr));
>>> +       else
>>> +               ipv6_hdr(skb)->payload_len = htons(payload_len);
>>> +       udp_hdr(skb)->len = htons(payload_len);
>>> +       if (!remaining)
>>> +               return skb;
>>> +
>>> +       nskb = alloc_skb(max_t(int, headlen, headlen + remaining - 
>>> skb->data_len), GFP_ATOMIC);
>>> +       if (unlikely(!nskb)) {
>>> +               sq->stats->dropped++;
>>> +               return NULL;
>>> +       }
>>> +
>>> +       mlx5e_udp_gso_prepare_last_skb(skb, nskb, remaining);
>>> +
>>> +       skb_shinfo(skb)->gso_segs--;
>>> +       pskb_trim(skb, skb->len - remaining);
>>> +       mlx5e_sq_xmit(sq, skb, *wqe, *pi);
>>> +       mlx5e_sq_fetch_wqe(sq, wqe, pi);
>>> +       return nskb;
>>> +}
>> The device driver seems to be implementing the packet split here
>> similar to NETIF_F_GSO_PARTIAL. When advertising the right flag, the
>> stack should be able to do that for you and pass two packets to the
>> driver.
>
> We've totally missed that!
> Thank you. I'll fix and resubmit.

I've noticed that we could get cleaner code in our driver if we remove 
these two lines from net/ipv4/udp_offload.c:
if (skb_is_gso(segs))
              mss *= skb_shinfo(segs)->gso_segs;

I think that this is correct in case of GSO_PARTIAL segmentation for the 
following reasons:
1. After this change the UDP payload field is consistent with the IP 
header payload length field. Currently, IPv4 length is 1500 and UDP 
total length is the full unsegmented length.
2. AFAIU, in GSO_PARTIAL no tunnel headers should be modified except the 
IP ID field, including the UDP length field.
What do you think?
Willem de Bruijn July 2, 2018, 1:45 a.m. | #4
>> I've noticed that we could get cleaner code in our driver if we remove
>> these two lines from net/ipv4/udp_offload.c:
>> if (skb_is_gso(segs))
>>               mss *= skb_shinfo(segs)->gso_segs;
>>
>> I think that this is correct in case of GSO_PARTIAL segmentation for the
>> following reasons:
>> 1. After this change the UDP payload field is consistent with the IP
>> header payload length field. Currently, IPv4 length is 1500 and UDP
>> total length is the full unsegmented length.

How does this simplify the driver? Does it currently have to
change the udph->length field to the mss on the wire, because the
device only splits + replicates the headers + computes the csum?

> I don’t recall that the UDP header length field will match the IP length
> field. I had intentionally left it at the original value used to compute
> the UDP header checksum. That way you could just adjust it by
> cancelling out the length from the partial checksum.

>> 2. AFAIU, in GSO_PARTIAL no tunnel headers should be modified except the
>> IP ID field, including the UDP length field.
>> What do you think?
>
>
> For outer headers this is correct. For inner headers this is not. The inner
> UDP header will need to have the length updated and the checksum
> recomputed as a part of the segmentation.
Boris Pismenny July 2, 2018, 5:29 a.m. | #5
On 7/2/2018 4:45 AM, Willem de Bruijn wrote:
>>> I've noticed that we could get cleaner code in our driver if we remove
>>> these two lines from net/ipv4/udp_offload.c:
>>> if (skb_is_gso(segs))
>>>                mss *= skb_shinfo(segs)->gso_segs;
>>>
>>> I think that this is correct in case of GSO_PARTIAL segmentation for the
>>> following reasons:
>>> 1. After this change the UDP payload field is consistent with the IP
>>> header payload length field. Currently, IPv4 length is 1500 and UDP
>>> total length is the full unsegmented length.
> 
> How does this simplify the driver? Does it currently have to
> change the udph->length field to the mss on the wire, because the
> device only splits + replicates the headers + computes the csum?

Yes, this is the code I have at the moment.

The device's limitation is more subtle than this. It could adjust the 
length, but then the checksum would be wrong.
Willem de Bruijn July 2, 2018, 1:34 p.m. | #6
On Mon, Jul 2, 2018 at 1:30 AM Boris Pismenny <borisp@mellanox.com> wrote:
>
>
>
> On 7/2/2018 4:45 AM, Willem de Bruijn wrote:
> >>> I've noticed that we could get cleaner code in our driver if we remove
> >>> these two lines from net/ipv4/udp_offload.c:
> >>> if (skb_is_gso(segs))
> >>>                mss *= skb_shinfo(segs)->gso_segs;
> >>>
> >>> I think that this is correct in case of GSO_PARTIAL segmentation for the
> >>> following reasons:
> >>> 1. After this change the UDP payload field is consistent with the IP
> >>> header payload length field. Currently, IPv4 length is 1500 and UDP
> >>> total length is the full unsegmented length.
> >
> > How does this simplify the driver? Does it currently have to
> > change the udph->length field to the mss on the wire, because the
> > device only splits + replicates the headers + computes the csum?
>
> Yes, this is the code I have at the moment.
>
> The device's limitation is more subtle than this. It could adjust the
> length, but then the checksum would be wrong.

I see. We do have to keep in mind other devices. Alexander's ixgbe
RFC patch does not have this logic, so that device must update the
field directly.

  https://patchwork.ozlabs.org/patch/908396/
Willem de Bruijn July 2, 2018, 2:45 p.m. | #7
On Mon, Jul 2, 2018 at 9:34 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Mon, Jul 2, 2018 at 1:30 AM Boris Pismenny <borisp@mellanox.com> wrote:
> >
> >
> >
> > On 7/2/2018 4:45 AM, Willem de Bruijn wrote:
> > >>> I've noticed that we could get cleaner code in our driver if we remove
> > >>> these two lines from net/ipv4/udp_offload.c:
> > >>> if (skb_is_gso(segs))
> > >>>                mss *= skb_shinfo(segs)->gso_segs;
> > >>>
> > >>> I think that this is correct in case of GSO_PARTIAL segmentation for the
> > >>> following reasons:
> > >>> 1. After this change the UDP payload field is consistent with the IP
> > >>> header payload length field. Currently, IPv4 length is 1500 and UDP
> > >>> total length is the full unsegmented length.
> > >
> > > How does this simplify the driver? Does it currently have to
> > > change the udph->length field to the mss on the wire, because the
> > > device only splits + replicates the headers + computes the csum?
> >
> > Yes, this is the code I have at the moment.
> >
> > The device's limitation is more subtle than this. It could adjust the
> > length, but then the checksum would be wrong.
>
> I see. We do have to keep in mind other devices. Alexander's ixgbe
> RFC patch does not have this logic, so that device must update the
> field directly.
>
>   https://patchwork.ozlabs.org/patch/908396/

To be clear, I think it's fine to remove these two lines if it does not cause
problems for the ixgbe. It's quite possible that that device sets the udp
length field unconditionally, ignoring the previous value. In which case both
devices will work after this change without additional driver logic.
Boris Pismenny July 2, 2018, 7:17 p.m. | #8
On 7/2/2018 6:32 PM, Alexander Duyck wrote:
> 
> 
> On Mon, Jul 2, 2018 at 7:46 AM Willem de Bruijn 
> <willemdebruijn.kernel@gmail.com 
> <mailto:willemdebruijn.kernel@gmail.com>> wrote:
> 
>     On Mon, Jul 2, 2018 at 9:34 AM Willem de Bruijn
>     <willemdebruijn.kernel@gmail.com
>     <mailto:willemdebruijn.kernel@gmail.com>> wrote:
>      >
>      > On Mon, Jul 2, 2018 at 1:30 AM Boris Pismenny
>     <borisp@mellanox.com <mailto:borisp@mellanox.com>> wrote:
>      > >
>      > >
>      > >
>      > > On 7/2/2018 4:45 AM, Willem de Bruijn wrote:
>      > > >>> I've noticed that we could get cleaner code in our driver
>     if we remove
>      > > >>> these two lines from net/ipv4/udp_offload.c:
>      > > >>> if (skb_is_gso(segs))
>      > > >>>                mss *= skb_shinfo(segs)->gso_segs;
>      > > >>>
>      > > >>> I think that this is correct in case of GSO_PARTIAL
>     segmentation for the
>      > > >>> following reasons:
>      > > >>> 1. After this change the UDP payload field is consistent
>     with the IP
>      > > >>> header payload length field. Currently, IPv4 length is 1500
>     and UDP
>      > > >>> total length is the full unsegmented length.
>      > > >
>      > > > How does this simplify the driver? Does it currently have to
>      > > > change the udph->length field to the mss on the wire, because the
>      > > > device only splits + replicates the headers + computes the csum?
>      > >
>      > > Yes, this is the code I have at the moment.
>      > >
>      > > The device's limitation is more subtle than this. It could
>     adjust the
>      > > length, but then the checksum would be wrong.
>      >
>      > I see. We do have to keep in mind other devices. Alexander's ixgbe
>      > RFC patch does not have this logic, so that device must update the
>      > field directly.
>      >
>      > https://patchwork.ozlabs.org/patch/908396/
> 
>     To be clear, I think it's fine to remove these two lines if it does
>     not cause
>     problems for the ixgbe. It's quite possible that that device sets
>     the udp
>     length field unconditionally, ignoring the previous value. In which
>     case both
>     devices will work after this change without additional driver logic.
> 
> 
> I would prefer we didn’t modify this. Otherwise we cannot cancel out the 
> length from the partial checksum when the time comes. The ixgbe code was 
> making use of it if I recall.
> 

AFAIU, the ixgbe patch doesn't use this. Instead the length is obtained 
by the following code for both TCP and UDP segmentation:
paylen = skb->len - l4_offset;

Could you please check to see if this is actually required?

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 9efbf193ad5a..d923f2f58608 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -14,8 +14,8 @@  mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \
 		fpga/ipsec.o fpga/tls.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
-		en_tx.o en_rx.o en_dim.o en_txrx.o en_stats.o vxlan.o \
-		en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o
+		en_tx.o en_rx.o en_dim.o en_txrx.o en_accel/rxtx.o en_stats.o  \
+		vxlan.o en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o
 
 mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
index f20074dbef32..39a5d13ba459 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
@@ -34,12 +34,11 @@ 
 #ifndef __MLX5E_EN_ACCEL_H__
 #define __MLX5E_EN_ACCEL_H__
 
-#ifdef CONFIG_MLX5_ACCEL
-
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include "en_accel/ipsec_rxtx.h"
 #include "en_accel/tls_rxtx.h"
+#include "en_accel/rxtx.h"
 #include "en.h"
 
 static inline struct sk_buff *mlx5e_accel_handle_tx(struct sk_buff *skb,
@@ -64,9 +63,13 @@  static inline struct sk_buff *mlx5e_accel_handle_tx(struct sk_buff *skb,
 	}
 #endif
 
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
+		skb = mlx5e_udp_gso_handle_tx_skb(dev, sq, skb, wqe, pi);
+		if (unlikely(!skb))
+			return NULL;
+	}
+
 	return skb;
 }
 
-#endif /* CONFIG_MLX5_ACCEL */
-
 #endif /* __MLX5E_EN_ACCEL_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/rxtx.c
new file mode 100644
index 000000000000..4bb1f3b12b96
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/rxtx.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2018 Mellanox Technologies. */
+
+#include "en_accel/rxtx.h"
+
+/* Build the last (shorter) segment of a UDP GSO skb into @nskb.
+ *
+ * @skb:       original GSO skb (headers already sized for full segments)
+ * @nskb:      preallocated skb receiving a copy of skb's headers plus the
+ *             trailing @remaining payload bytes
+ * @remaining: trailing payload length, (skb->len - headers) % gso_size
+ *
+ * Frag pages are shared with @skb via skb_frag_ref(), not copied.
+ */
+static void mlx5e_udp_gso_prepare_last_skb(struct sk_buff *skb,
+					   struct sk_buff *nskb,
+					   int remaining)
+{
+	int bytes_needed = remaining, remaining_headlen, remaining_page_offset;
+	int headlen = skb_transport_offset(skb) + sizeof(struct udphdr);
+	int payload_len = remaining + sizeof(struct udphdr);
+	int k = 0, i, j;
+
+	skb_copy_bits(skb, 0, nskb->data, headlen);
+	nskb->dev = skb->dev;
+	skb_reset_mac_header(nskb);
+	skb_set_network_header(nskb, skb_network_offset(skb));
+	skb_set_transport_header(nskb, skb_transport_offset(skb));
+	skb_set_tail_pointer(nskb, headlen);
+
+	/* How many frags do we need? */
+	for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
+		bytes_needed -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
+		k++;
+		if (bytes_needed <= 0)
+			break;
+	}
+
+	/* Fill the first frag and split it if necessary */
+	j = skb_shinfo(skb)->nr_frags - k;
+	remaining_page_offset = -bytes_needed;
+	skb_fill_page_desc(nskb, 0,
+			   skb_shinfo(skb)->frags[j].page.p,
+			   skb_shinfo(skb)->frags[j].page_offset + remaining_page_offset,
+			   skb_shinfo(skb)->frags[j].size - remaining_page_offset);
+
+	skb_frag_ref(skb, j);
+
+	/* Fill the rest of the frags */
+	for (i = 1; i < k; i++) {
+		j = skb_shinfo(skb)->nr_frags - k + i;
+
+		skb_fill_page_desc(nskb, i,
+				   skb_shinfo(skb)->frags[j].page.p,
+				   skb_shinfo(skb)->frags[j].page_offset,
+				   skb_shinfo(skb)->frags[j].size);
+		skb_frag_ref(skb, j);
+	}
+	skb_shinfo(nskb)->nr_frags = k;
+
+	remaining_headlen = remaining - skb->data_len;
+
+	/* headlen contains remaining data? */
+	if (remaining_headlen > 0)
+		skb_copy_bits(skb, skb->len - remaining, nskb->data + headlen,
+			      remaining_headlen);
+	nskb->len = remaining + headlen;
+	/* data_len counts only the bytes still living in frags; the bytes
+	 * copied into the linear part above must be subtracted, not added.
+	 */
+	nskb->data_len = remaining - max_t(int, 0, remaining_headlen);
+	nskb->protocol = skb->protocol;
+	if (nskb->protocol == htons(ETH_P_IP)) {
+		ip_hdr(nskb)->id = htons(ntohs(ip_hdr(nskb)->id) +
+					 skb_shinfo(skb)->gso_segs);
+		/* use ihl so IPv4 options are accounted for */
+		ip_hdr(nskb)->tot_len =
+			htons(payload_len + (ip_hdr(nskb)->ihl << 2));
+	} else {
+		ipv6_hdr(nskb)->payload_len = htons(payload_len);
+	}
+	udp_hdr(nskb)->len = htons(payload_len);
+	skb_shinfo(nskb)->gso_size = 0;
+	nskb->ip_summed = skb->ip_summed;
+	nskb->csum_start = skb->csum_start;
+	nskb->csum_offset = skb->csum_offset;
+	nskb->queue_mapping = skb->queue_mapping;
+}
+
+/* Fix up a SKB_GSO_UDP_L4 skb for transmission: patch the headers with
+ * the equal-segment length and, when the payload is not a multiple of
+ * gso_size, split off the last segment into a separate skb.
+ *
+ * Returns the skb to hand to mlx5e_sq_xmit() (the trailing segment when
+ * a split happened, in which case the equal-length part was already sent
+ * and *wqe/*pi were refreshed), or NULL if the packet was consumed.
+ */
+struct sk_buff *mlx5e_udp_gso_handle_tx_skb(struct net_device *netdev,
+					    struct mlx5e_txqsq *sq,
+					    struct sk_buff *skb,
+					    struct mlx5e_tx_wqe **wqe,
+					    u16 *pi)
+{
+	int payload_len = skb_shinfo(skb)->gso_size + sizeof(struct udphdr);
+	int headlen = skb_transport_offset(skb) + sizeof(struct udphdr);
+	int remaining = (skb->len - headlen) % skb_shinfo(skb)->gso_size;
+	struct sk_buff *nskb;
+
+	/* HW derives per-segment lengths from the headers; use ihl so
+	 * IPv4 options are accounted for.
+	 */
+	if (skb->protocol == htons(ETH_P_IP))
+		ip_hdr(skb)->tot_len =
+			htons(payload_len + (ip_hdr(skb)->ihl << 2));
+	else
+		ipv6_hdr(skb)->payload_len = htons(payload_len);
+	udp_hdr(skb)->len = htons(payload_len);
+	if (!remaining)
+		return skb;
+
+	nskb = alloc_skb(max_t(int, headlen,
+			       headlen + remaining - skb->data_len),
+			 GFP_ATOMIC);
+	if (unlikely(!nskb)) {
+		/* returning NULL means "consumed": free skb or it leaks */
+		dev_kfree_skb_any(skb);
+		sq->stats->dropped++;
+		return NULL;
+	}
+
+	mlx5e_udp_gso_prepare_last_skb(skb, nskb, remaining);
+
+	skb_shinfo(skb)->gso_segs--;
+	if (unlikely(pskb_trim(skb, skb->len - remaining))) {
+		/* can't shrink the skb: drop both rather than transmit
+		 * data that no longer matches the patched headers
+		 */
+		dev_kfree_skb_any(skb);
+		dev_kfree_skb_any(nskb);
+		sq->stats->dropped++;
+		return NULL;
+	}
+	mlx5e_sq_xmit(sq, skb, *wqe, *pi);
+	mlx5e_sq_fetch_wqe(sq, wqe, pi);
+	return nskb;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/rxtx.h
new file mode 100644
index 000000000000..ed42699a78b3
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/rxtx.h
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2018 Mellanox Technologies. */
+
+#ifndef __MLX5E_EN_ACCEL_RX_TX_H__
+#define __MLX5E_EN_ACCEL_RX_TX_H__
+
+#include <linux/skbuff.h>
+#include "en.h"
+
+/* Fix up an SKB_GSO_UDP_L4 skb before transmission; may transmit the
+ * equal-length part itself and return the trailing segment, or NULL if
+ * the packet was consumed (dropped or fully sent).
+ */
+struct sk_buff *mlx5e_udp_gso_handle_tx_skb(struct net_device *netdev,
+					    struct mlx5e_txqsq *sq,
+					    struct sk_buff *skb,
+					    struct mlx5e_tx_wqe **wqe,
+					    u16 *pi);
+
+#endif /* __MLX5E_EN_ACCEL_RX_TX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 134f20a182b5..e2ef68b1daa2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4592,6 +4592,9 @@  static void mlx5e_build_nic_netdev(struct net_device *netdev)
 	netdev->features         |= NETIF_F_HIGHDMA;
 	netdev->features         |= NETIF_F_HW_VLAN_STAG_FILTER;
 
+	netdev->features         |= NETIF_F_GSO_UDP_L4;
+	netdev->hw_features      |= NETIF_F_GSO_UDP_L4;
+
 	netdev->priv_flags       |= IFF_UNICAST_FLT;
 
 	mlx5e_set_netdev_dev_addr(netdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index f29deb44bf3b..f450d9ca31fb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -228,7 +228,10 @@  mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb)
 		stats->tso_inner_packets++;
 		stats->tso_inner_bytes += skb->len - ihs;
 	} else {
-		ihs = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+			ihs = skb_transport_offset(skb) + sizeof(struct udphdr);
+		else
+			ihs = skb_transport_offset(skb) + tcp_hdrlen(skb);
 		stats->tso_packets++;
 		stats->tso_bytes += skb->len - ihs;
 	}
@@ -443,12 +446,11 @@  netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev)
 	sq = priv->txq2sq[skb_get_queue_mapping(skb)];
 	mlx5e_sq_fetch_wqe(sq, &wqe, &pi);
 
-#ifdef CONFIG_MLX5_ACCEL
 	/* might send skbs and update wqe and pi */
 	skb = mlx5e_accel_handle_tx(skb, sq, dev, &wqe, &pi);
 	if (unlikely(!skb))
 		return NETDEV_TX_OK;
-#endif
+
 	return mlx5e_sq_xmit(sq, skb, wqe, pi);
 }