[net-next,v2,2/4] net: Enable Tx queue selection based on Rx queues

Message ID 152643400925.4991.5029989601625953592.stgit@anamdev.jf.intel.com
State Changes Requested
Delegated to: David Miller
Headers show
Series
  • Symmetric queue selection using XPS for Rx queues
Related show

Commit Message

Nambiar, Amritha May 16, 2018, 1:26 a.m.
This patch adds support to pick Tx queue based on the Rx queue map
configuration set by the admin through the sysfs attribute
for each Tx queue. If the user configuration for receive
queue map does not apply, then the Tx queue selection falls back
to CPU map based selection and finally to hashing.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 include/net/sock.h       |   18 ++++++++++++++++++
 net/core/dev.c           |   36 +++++++++++++++++++++++++++++-------
 net/core/sock.c          |    5 +++++
 net/ipv4/tcp_input.c     |    7 +++++++
 net/ipv4/tcp_ipv4.c      |    1 +
 net/ipv4/tcp_minisocks.c |    1 +
 6 files changed, 61 insertions(+), 7 deletions(-)

Comments

Tom Herbert May 18, 2018, 4:03 a.m. | #1
On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
<amritha.nambiar@intel.com> wrote:
> This patch adds support to pick Tx queue based on the Rx queue map
> configuration set by the admin through the sysfs attribute
> for each Tx queue. If the user configuration for receive
> queue map does not apply, then the Tx queue selection falls back
> to CPU map based selection and finally to hashing.
>
> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> ---
>  include/net/sock.h       |   18 ++++++++++++++++++
>  net/core/dev.c           |   36 +++++++++++++++++++++++++++++-------
>  net/core/sock.c          |    5 +++++
>  net/ipv4/tcp_input.c     |    7 +++++++
>  net/ipv4/tcp_ipv4.c      |    1 +
>  net/ipv4/tcp_minisocks.c |    1 +
>  6 files changed, 61 insertions(+), 7 deletions(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 4f7c584..0613f63 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -139,6 +139,8 @@ typedef __u64 __bitwise __addrpair;
>   *     @skc_node: main hash linkage for various protocol lookup tables
>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>   *     @skc_tx_queue_mapping: tx queue number for this connection
> + *     @skc_rx_queue_mapping: rx queue number for this connection
> + *     @skc_rx_ifindex: rx ifindex for this connection
>   *     @skc_flags: place holder for sk_flags
>   *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>   *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
> @@ -215,6 +217,10 @@ struct sock_common {
>                 struct hlist_nulls_node skc_nulls_node;
>         };
>         int                     skc_tx_queue_mapping;
> +#ifdef CONFIG_XPS
> +       int                     skc_rx_queue_mapping;
> +       int                     skc_rx_ifindex;

Isn't this increasing size of sock_common for a narrow use case functionality?

> +#endif
>         union {
>                 int             skc_incoming_cpu;
>                 u32             skc_rcv_wnd;
> @@ -326,6 +332,10 @@ struct sock {
>  #define sk_nulls_node          __sk_common.skc_nulls_node
>  #define sk_refcnt              __sk_common.skc_refcnt
>  #define sk_tx_queue_mapping    __sk_common.skc_tx_queue_mapping
> +#ifdef CONFIG_XPS
> +#define sk_rx_queue_mapping    __sk_common.skc_rx_queue_mapping
> +#define sk_rx_ifindex          __sk_common.skc_rx_ifindex
> +#endif
>
>  #define sk_dontcopy_begin      __sk_common.skc_dontcopy_begin
>  #define sk_dontcopy_end                __sk_common.skc_dontcopy_end
> @@ -1696,6 +1706,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
>         return sk ? sk->sk_tx_queue_mapping : -1;
>  }
>
> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
> +{
> +#ifdef CONFIG_XPS
> +       sk->sk_rx_ifindex = skb->skb_iif;
> +       sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
> +#endif
> +}
> +
>  static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>  {
>         sk_tx_queue_clear(sk);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 7e5dfdb..4030368 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3458,18 +3458,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
>  }
>  #endif /* CONFIG_NET_EGRESS */
>
> -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
> -{
>  #ifdef CONFIG_XPS
> -       struct xps_dev_maps *dev_maps;
> +static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
> +                              struct xps_dev_maps *dev_maps, unsigned int tci)
> +{
>         struct xps_map *map;
>         int queue_index = -1;
>
> -       rcu_read_lock();
> -       dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>         if (dev_maps) {
> -               unsigned int tci = skb->sender_cpu - 1;
> -
>                 if (dev->num_tc) {
>                         tci *= dev->num_tc;
>                         tci += netdev_get_prio_tc_map(dev, skb->priority);
> @@ -3486,6 +3482,32 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>                                 queue_index = -1;
>                 }
>         }
> +       return queue_index;
> +}
> +#endif
> +
> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
> +{
> +#ifdef CONFIG_XPS
> +       enum xps_map_type i = XPS_MAP_RXQS;
> +       struct xps_dev_maps *dev_maps;
> +       struct sock *sk = skb->sk;
> +       int queue_index = -1;
> +       unsigned int tci = 0;
> +
> +       if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
> +           dev->ifindex == sk->sk_rx_ifindex)
> +               tci = sk->sk_rx_queue_mapping;
> +
> +       rcu_read_lock();
> +       while (queue_index < 0 && i < __XPS_MAP_MAX) {
> +               if (i == XPS_MAP_CPUS)

This while loop typifies exactly why I don't think the XPS maps should
be an array. There's only two and we really don't want this to be an
open ended invitation for people to create new mapping methods. The
code is much simpler and potentially more efficient if the two maps
are just separate pointers. It should look something like this:

dev_maps = rcu_dereference(dev->xps_rxqs_map);
if (dev_maps) {
        queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci);
        if (queue_index < 0) {
              dev_maps = rcu_dereference(dev->xps_rxqs_map);
              if (dev_maps) {
                     queue_index = __get_xps_queue_idx(dev, skb,
dev_rxqs_maps, tci);
               ...

Also, the rxqs is a pretty narrow use case and it's likely to be
rarely configured (relative to a CPU map). A static_key could be used
to eliminate the cost of the extra map check (the static_key could
also be used on for CPU maps, this is analogous to how there are
static keys for RPS and RFS).

> +                       tci = skb->sender_cpu - 1;
> +               dev_maps = rcu_dereference(dev->xps_maps[i]);
> +               queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci);
> +               i++;
> +       }
> +
>         rcu_read_unlock();
>
>         return queue_index;
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 042cfc6..73d7fa8 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2824,6 +2824,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>         sk->sk_pacing_rate = ~0U;
>         sk->sk_pacing_shift = 10;
>         sk->sk_incoming_cpu = -1;
> +
> +#ifdef CONFIG_XPS
> +       sk->sk_rx_ifindex = -1;
> +       sk->sk_rx_queue_mapping = -1;
> +#endif
>         /*
>          * Before updating sk_refcnt, we must commit prior changes to memory
>          * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index b188e0d..d33911c 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -78,6 +78,7 @@
>  #include <linux/errqueue.h>
>  #include <trace/events/tcp.h>
>  #include <linux/static_key.h>
> +#include <net/busy_poll.h>
>
>  int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
>
> @@ -5559,6 +5560,11 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
>                 __tcp_fast_path_on(tp, tp->snd_wnd);
>         else
>                 tp->pred_flags = 0;
> +
> +       if (skb) {
> +               sk_mark_napi_id(sk, skb);
> +               sk_mark_rx_queue(sk, skb);
> +       }
>  }
>
>  static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
> @@ -6371,6 +6377,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
>         tcp_rsk(req)->snt_isn = isn;
>         tcp_rsk(req)->txhash = net_tx_rndhash();
>         tcp_openreq_init_rwin(req, sk, dst);
> +       sk_mark_rx_queue(req_to_sk(req), skb);
>         if (!want_cookie) {
>                 tcp_reqsk_record_syn(sk, req, skb);
>                 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index caf23de..abdf02e 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1479,6 +1479,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
>
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_mark_rx_queue(sk, skb);

Can this be done in sock_rps_save_rxhash?

>                 if (dst) {
>                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
>                             !dst->ops->check(dst, 0)) {
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index f867658..4939c28 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -836,6 +836,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
>
>         /* record NAPI ID of child */
>         sk_mark_napi_id(child, skb);
> +       sk_mark_rx_queue(child, skb);
>
>         tcp_segs_in(tcp_sk(child), skb);
>         if (!sock_owned_by_user(child)) {
>
Willem de Bruijn May 19, 2018, 8:13 p.m. | #2
On Fri, May 18, 2018 at 12:03 AM, Tom Herbert <tom@herbertland.com> wrote:
> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
> <amritha.nambiar@intel.com> wrote:
>> This patch adds support to pick Tx queue based on the Rx queue map
>> configuration set by the admin through the sysfs attribute
>> for each Tx queue. If the user configuration for receive
>> queue map does not apply, then the Tx queue selection falls back
>> to CPU map based selection and finally to hashing.
>>
>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> ---
>>  include/net/sock.h       |   18 ++++++++++++++++++
>>  net/core/dev.c           |   36 +++++++++++++++++++++++++++++-------
>>  net/core/sock.c          |    5 +++++
>>  net/ipv4/tcp_input.c     |    7 +++++++
>>  net/ipv4/tcp_ipv4.c      |    1 +
>>  net/ipv4/tcp_minisocks.c |    1 +
>>  6 files changed, 61 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index 4f7c584..0613f63 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -139,6 +139,8 @@ typedef __u64 __bitwise __addrpair;
>>   *     @skc_node: main hash linkage for various protocol lookup tables
>>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>>   *     @skc_tx_queue_mapping: tx queue number for this connection
>> + *     @skc_rx_queue_mapping: rx queue number for this connection
>> + *     @skc_rx_ifindex: rx ifindex for this connection
>>   *     @skc_flags: place holder for sk_flags
>>   *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>>   *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>> @@ -215,6 +217,10 @@ struct sock_common {
>>                 struct hlist_nulls_node skc_nulls_node;
>>         };
>>         int                     skc_tx_queue_mapping;
>> +#ifdef CONFIG_XPS
>> +       int                     skc_rx_queue_mapping;
>> +       int                     skc_rx_ifindex;
>
> Isn't this increasing size of sock_common for a narrow use case functionality?

You can get the device from the already recorded sk_napi_id.
Sadly, not the queue number as far as I can see.


>> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
>> +{
>> +#ifdef CONFIG_XPS
>> +       sk->sk_rx_ifindex = skb->skb_iif;
>> +       sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
>> +#endif
>> +}
>> +

Instead of adding this function and calls to it in many locations in
the stack, you can expand sk_mark_napi_id.

Also, it is not clear why this should be called in locations where
sk_mark_napi_id is not.


>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>> +{
>> +#ifdef CONFIG_XPS
>> +       enum xps_map_type i = XPS_MAP_RXQS;
>> +       struct xps_dev_maps *dev_maps;
>> +       struct sock *sk = skb->sk;
>> +       int queue_index = -1;
>> +       unsigned int tci = 0;
>> +
>> +       if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
>> +           dev->ifindex == sk->sk_rx_ifindex)
>> +               tci = sk->sk_rx_queue_mapping;
>> +
>> +       rcu_read_lock();
>> +       while (queue_index < 0 && i < __XPS_MAP_MAX) {
>> +               if (i == XPS_MAP_CPUS)
>
> This while loop typifies exactly why I don't think the XPS maps should
> be an array.

+1
Willem de Bruijn May 19, 2018, 8:27 p.m. | #3
On Sat, May 19, 2018 at 4:13 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Fri, May 18, 2018 at 12:03 AM, Tom Herbert <tom@herbertland.com> wrote:
>> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
>> <amritha.nambiar@intel.com> wrote:
>>> This patch adds support to pick Tx queue based on the Rx queue map
>>> configuration set by the admin through the sysfs attribute
>>> for each Tx queue. If the user configuration for receive
>>> queue map does not apply, then the Tx queue selection falls back
>>> to CPU map based selection and finally to hashing.
>>>
>>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>> ---

>>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>> +{
>>> +#ifdef CONFIG_XPS
>>> +       enum xps_map_type i = XPS_MAP_RXQS;
>>> +       struct xps_dev_maps *dev_maps;
>>> +       struct sock *sk = skb->sk;
>>> +       int queue_index = -1;
>>> +       unsigned int tci = 0;
>>> +
>>> +       if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
>>> +           dev->ifindex == sk->sk_rx_ifindex)
>>> +               tci = sk->sk_rx_queue_mapping;
>>> +
>>> +       rcu_read_lock();
>>> +       while (queue_index < 0 && i < __XPS_MAP_MAX) {
>>> +               if (i == XPS_MAP_CPUS)
>>
>> This while loop typifies exactly why I don't think the XPS maps should
>> be an array.
>
> +1

as a matter of fact, as enabling both cpu and rxqueue map at the same
time makes no sense, only one map is needed at any one time. The
only difference is in how it is indexed. It should probably not be possible
to configure both at the same time. Keeping a single map probably also
significantly simplifies patch 1/4.
Tom Herbert May 21, 2018, 2:51 p.m. | #4
On Sat, May 19, 2018 at 1:27 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Sat, May 19, 2018 at 4:13 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Fri, May 18, 2018 at 12:03 AM, Tom Herbert <tom@herbertland.com> wrote:
>>> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
>>> <amritha.nambiar@intel.com> wrote:
>>>> This patch adds support to pick Tx queue based on the Rx queue map
>>>> configuration set by the admin through the sysfs attribute
>>>> for each Tx queue. If the user configuration for receive
>>>> queue map does not apply, then the Tx queue selection falls back
>>>> to CPU map based selection and finally to hashing.
>>>>
>>>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>>> ---
>
>>>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>>> +{
>>>> +#ifdef CONFIG_XPS
>>>> +       enum xps_map_type i = XPS_MAP_RXQS;
>>>> +       struct xps_dev_maps *dev_maps;
>>>> +       struct sock *sk = skb->sk;
>>>> +       int queue_index = -1;
>>>> +       unsigned int tci = 0;
>>>> +
>>>> +       if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
>>>> +           dev->ifindex == sk->sk_rx_ifindex)
>>>> +               tci = sk->sk_rx_queue_mapping;
>>>> +
>>>> +       rcu_read_lock();
>>>> +       while (queue_index < 0 && i < __XPS_MAP_MAX) {
>>>> +               if (i == XPS_MAP_CPUS)
>>>
>>> This while loop typifies exactly why I don't think the XPS maps should
>>> be an array.
>>
>> +1
>
> as a matter of fact, as enabling both cpu and rxqueue map at the same
> time makes no sense, only one map is needed at any one time. The
> only difference is in how it is indexed. It should probably not be possible
> to configure both at the same time. Keeping a single map probably also
> significantly simplifies patch 1/4.

Willem,

I think it might makes sense to have them both. Maybe one application
is spin polling that needs this, where others might be happy with
normal CPU mappings as default.

Tom
Willem de Bruijn May 21, 2018, 3:12 p.m. | #5
On Mon, May 21, 2018 at 10:51 AM, Tom Herbert <tom@herbertland.com> wrote:
> On Sat, May 19, 2018 at 1:27 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Sat, May 19, 2018 at 4:13 PM, Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>> On Fri, May 18, 2018 at 12:03 AM, Tom Herbert <tom@herbertland.com> wrote:
>>>> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
>>>> <amritha.nambiar@intel.com> wrote:
>>>>> This patch adds support to pick Tx queue based on the Rx queue map
>>>>> configuration set by the admin through the sysfs attribute
>>>>> for each Tx queue. If the user configuration for receive
>>>>> queue map does not apply, then the Tx queue selection falls back
>>>>> to CPU map based selection and finally to hashing.
>>>>>
>>>>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>>>> ---
>>
>>>>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>>>> +{
>>>>> +#ifdef CONFIG_XPS
>>>>> +       enum xps_map_type i = XPS_MAP_RXQS;
>>>>> +       struct xps_dev_maps *dev_maps;
>>>>> +       struct sock *sk = skb->sk;
>>>>> +       int queue_index = -1;
>>>>> +       unsigned int tci = 0;
>>>>> +
>>>>> +       if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
>>>>> +           dev->ifindex == sk->sk_rx_ifindex)
>>>>> +               tci = sk->sk_rx_queue_mapping;
>>>>> +
>>>>> +       rcu_read_lock();
>>>>> +       while (queue_index < 0 && i < __XPS_MAP_MAX) {
>>>>> +               if (i == XPS_MAP_CPUS)
>>>>
>>>> This while loop typifies exactly why I don't think the XPS maps should
>>>> be an array.
>>>
>>> +1
>>
>> as a matter of fact, as enabling both cpu and rxqueue map at the same
>> time makes no sense, only one map is needed at any one time. The
>> only difference is in how it is indexed. It should probably not be possible
>> to configure both at the same time. Keeping a single map probably also
>> significantly simplifies patch 1/4.
>
> Willem,
>
> I think it might makes sense to have them both. Maybe one application
> is spin polling that needs this, where others might be happy with
> normal CPU mappings as default.

Some entries in the rx_queue table have queue_pair affinity
configured, the others return -1 to fall through to the cpu
affinity table?

I guess that implies flow steering to those special purpose
queues. I wonder whether this would be used this in practice.
I does make the code more complex by having to duplicate
the map lookup logic (mostly, patch 1/4).
Tom Herbert May 22, 2018, 2:09 p.m. | #6
On Mon, May 21, 2018 at 8:12 AM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Mon, May 21, 2018 at 10:51 AM, Tom Herbert <tom@herbertland.com> wrote:
>> On Sat, May 19, 2018 at 1:27 PM, Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>> On Sat, May 19, 2018 at 4:13 PM, Willem de Bruijn
>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>> On Fri, May 18, 2018 at 12:03 AM, Tom Herbert <tom@herbertland.com> wrote:
>>>>> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
>>>>> <amritha.nambiar@intel.com> wrote:
>>>>>> This patch adds support to pick Tx queue based on the Rx queue map
>>>>>> configuration set by the admin through the sysfs attribute
>>>>>> for each Tx queue. If the user configuration for receive
>>>>>> queue map does not apply, then the Tx queue selection falls back
>>>>>> to CPU map based selection and finally to hashing.
>>>>>>
>>>>>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>>>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>>>>> ---
>>>
>>>>>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>>>>> +{
>>>>>> +#ifdef CONFIG_XPS
>>>>>> +       enum xps_map_type i = XPS_MAP_RXQS;
>>>>>> +       struct xps_dev_maps *dev_maps;
>>>>>> +       struct sock *sk = skb->sk;
>>>>>> +       int queue_index = -1;
>>>>>> +       unsigned int tci = 0;
>>>>>> +
>>>>>> +       if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
>>>>>> +           dev->ifindex == sk->sk_rx_ifindex)
>>>>>> +               tci = sk->sk_rx_queue_mapping;
>>>>>> +
>>>>>> +       rcu_read_lock();
>>>>>> +       while (queue_index < 0 && i < __XPS_MAP_MAX) {
>>>>>> +               if (i == XPS_MAP_CPUS)
>>>>>
>>>>> This while loop typifies exactly why I don't think the XPS maps should
>>>>> be an array.
>>>>
>>>> +1
>>>
>>> as a matter of fact, as enabling both cpu and rxqueue map at the same
>>> time makes no sense, only one map is needed at any one time. The
>>> only difference is in how it is indexed. It should probably not be possible
>>> to configure both at the same time. Keeping a single map probably also
>>> significantly simplifies patch 1/4.
>>
>> Willem,
>>
>> I think it might makes sense to have them both. Maybe one application
>> is spin polling that needs this, where others might be happy with
>> normal CPU mappings as default.
>
> Some entries in the rx_queue table have queue_pair affinity
> configured, the others return -1 to fall through to the cpu
> affinity table?
>
Right, that's the intent of the while loop.

> I guess that implies flow steering to those special purpose
> queues. I wonder whether this would be used this in practice.
> I does make the code more complex by having to duplicate
> the map lookup logic (mostly, patch 1/4).

That's a good pont. I think we need more information on how the
feature is going to be used in practice. My assumption is that there
are some number of "special" queues for which spin polling is being
done.

Tom
Nambiar, Amritha May 23, 2018, 7:19 p.m. | #7
On 5/19/2018 1:13 PM, Willem de Bruijn wrote:
> On Fri, May 18, 2018 at 12:03 AM, Tom Herbert <tom@herbertland.com> wrote:
>> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
>> <amritha.nambiar@intel.com> wrote:
>>> This patch adds support to pick Tx queue based on the Rx queue map
>>> configuration set by the admin through the sysfs attribute
>>> for each Tx queue. If the user configuration for receive
>>> queue map does not apply, then the Tx queue selection falls back
>>> to CPU map based selection and finally to hashing.
>>>
>>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>> ---
>>>  include/net/sock.h       |   18 ++++++++++++++++++
>>>  net/core/dev.c           |   36 +++++++++++++++++++++++++++++-------
>>>  net/core/sock.c          |    5 +++++
>>>  net/ipv4/tcp_input.c     |    7 +++++++
>>>  net/ipv4/tcp_ipv4.c      |    1 +
>>>  net/ipv4/tcp_minisocks.c |    1 +
>>>  6 files changed, 61 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>> index 4f7c584..0613f63 100644
>>> --- a/include/net/sock.h
>>> +++ b/include/net/sock.h
>>> @@ -139,6 +139,8 @@ typedef __u64 __bitwise __addrpair;
>>>   *     @skc_node: main hash linkage for various protocol lookup tables
>>>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>>>   *     @skc_tx_queue_mapping: tx queue number for this connection
>>> + *     @skc_rx_queue_mapping: rx queue number for this connection
>>> + *     @skc_rx_ifindex: rx ifindex for this connection
>>>   *     @skc_flags: place holder for sk_flags
>>>   *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>>>   *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>>> @@ -215,6 +217,10 @@ struct sock_common {
>>>                 struct hlist_nulls_node skc_nulls_node;
>>>         };
>>>         int                     skc_tx_queue_mapping;
>>> +#ifdef CONFIG_XPS
>>> +       int                     skc_rx_queue_mapping;
>>> +       int                     skc_rx_ifindex;
>>
>> Isn't this increasing size of sock_common for a narrow use case functionality?
> 
> You can get the device from the already recorded sk_napi_id.
> Sadly, not the queue number as far as I can see.
> 
I plan to not have the ifindex cached in the sock_common, but retain the
rx_queue only. This way, it'll look similar to skb_tx_hash where
rx_queue recorded is used and if not, fall through to flow hash
calculation. Likewise, we use the rx_queue mapped and fall through to
CPU map on failures.
> 
>>> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
>>> +{
>>> +#ifdef CONFIG_XPS
>>> +       sk->sk_rx_ifindex = skb->skb_iif;
>>> +       sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
>>> +#endif
>>> +}
>>> +
> 
> Instead of adding this function and calls to it in many locations in
> the stack, you can expand sk_mark_napi_id.
> 
> Also, it is not clear why this should be called in locations where
> sk_mark_napi_id is not.
> 
Makes sense, I will add this as part of sk_mark_napi_id.

> 
>>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>> +{
>>> +#ifdef CONFIG_XPS
>>> +       enum xps_map_type i = XPS_MAP_RXQS;
>>> +       struct xps_dev_maps *dev_maps;
>>> +       struct sock *sk = skb->sk;
>>> +       int queue_index = -1;
>>> +       unsigned int tci = 0;
>>> +
>>> +       if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
>>> +           dev->ifindex == sk->sk_rx_ifindex)
>>> +               tci = sk->sk_rx_queue_mapping;
>>> +
>>> +       rcu_read_lock();
>>> +       while (queue_index < 0 && i < __XPS_MAP_MAX) {
>>> +               if (i == XPS_MAP_CPUS)
>>
>> This while loop typifies exactly why I don't think the XPS maps should
>> be an array.
> 
> +1
> 
Okay, I will change this to two maps with separate pointers.
Nambiar, Amritha May 23, 2018, 7:31 p.m. | #8
On 5/22/2018 7:09 AM, Tom Herbert wrote:
> On Mon, May 21, 2018 at 8:12 AM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Mon, May 21, 2018 at 10:51 AM, Tom Herbert <tom@herbertland.com> wrote:
>>> On Sat, May 19, 2018 at 1:27 PM, Willem de Bruijn
>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>> On Sat, May 19, 2018 at 4:13 PM, Willem de Bruijn
>>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>>> On Fri, May 18, 2018 at 12:03 AM, Tom Herbert <tom@herbertland.com> wrote:
>>>>>> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
>>>>>> <amritha.nambiar@intel.com> wrote:
>>>>>>> This patch adds support to pick Tx queue based on the Rx queue map
>>>>>>> configuration set by the admin through the sysfs attribute
>>>>>>> for each Tx queue. If the user configuration for receive
>>>>>>> queue map does not apply, then the Tx queue selection falls back
>>>>>>> to CPU map based selection and finally to hashing.
>>>>>>>
>>>>>>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>>>>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>>>>>> ---
>>>>
>>>>>>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>>>>>> +{
>>>>>>> +#ifdef CONFIG_XPS
>>>>>>> +       enum xps_map_type i = XPS_MAP_RXQS;
>>>>>>> +       struct xps_dev_maps *dev_maps;
>>>>>>> +       struct sock *sk = skb->sk;
>>>>>>> +       int queue_index = -1;
>>>>>>> +       unsigned int tci = 0;
>>>>>>> +
>>>>>>> +       if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
>>>>>>> +           dev->ifindex == sk->sk_rx_ifindex)
>>>>>>> +               tci = sk->sk_rx_queue_mapping;
>>>>>>> +
>>>>>>> +       rcu_read_lock();
>>>>>>> +       while (queue_index < 0 && i < __XPS_MAP_MAX) {
>>>>>>> +               if (i == XPS_MAP_CPUS)
>>>>>>
>>>>>> This while loop typifies exactly why I don't think the XPS maps should
>>>>>> be an array.
>>>>>
>>>>> +1
>>>>
>>>> as a matter of fact, as enabling both cpu and rxqueue map at the same
>>>> time makes no sense, only one map is needed at any one time. The
>>>> only difference is in how it is indexed. It should probably not be possible
>>>> to configure both at the same time. Keeping a single map probably also
>>>> significantly simplifies patch 1/4.
>>>
>>> Willem,
>>>
>>> I think it might makes sense to have them both. Maybe one application
>>> is spin polling that needs this, where others might be happy with
>>> normal CPU mappings as default.
>>
>> Some entries in the rx_queue table have queue_pair affinity
>> configured, the others return -1 to fall through to the cpu
>> affinity table?
>>
> Right, that's the intent of the while loop.
> 

Yes, by default, rx queue maps are not configured for the tx queue.
These maps have to be explicitly configured mapping rx queue(s) to tx
queue(s). If the rx queue map configuration does not apply, then the tx
queue is selected based on the CPUs map.

>> I guess that implies flow steering to those special purpose
>> queues. I wonder whether this would be used this in practice.
>> I does make the code more complex by having to duplicate
>> the map lookup logic (mostly, patch 1/4).
> 
> That's a good pont. I think we need more information on how the
> feature is going to be used in practice. My assumption is that there
> are some number of "special" queues for which spin polling is being
> done.

Will submit v3 with testing hints and performance results.

> 
> Tom
>

Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index 4f7c584..0613f63 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -139,6 +139,8 @@  typedef __u64 __bitwise __addrpair;
  *	@skc_node: main hash linkage for various protocol lookup tables
  *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  *	@skc_tx_queue_mapping: tx queue number for this connection
+ *	@skc_rx_queue_mapping: rx queue number for this connection
+ *	@skc_rx_ifindex: rx ifindex for this connection
  *	@skc_flags: place holder for sk_flags
  *		%SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *		%SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -215,6 +217,10 @@  struct sock_common {
 		struct hlist_nulls_node skc_nulls_node;
 	};
 	int			skc_tx_queue_mapping;
+#ifdef CONFIG_XPS
+	int			skc_rx_queue_mapping;
+	int			skc_rx_ifindex;
+#endif
 	union {
 		int		skc_incoming_cpu;
 		u32		skc_rcv_wnd;
@@ -326,6 +332,10 @@  struct sock {
 #define sk_nulls_node		__sk_common.skc_nulls_node
 #define sk_refcnt		__sk_common.skc_refcnt
 #define sk_tx_queue_mapping	__sk_common.skc_tx_queue_mapping
+#ifdef CONFIG_XPS
+#define sk_rx_queue_mapping	__sk_common.skc_rx_queue_mapping
+#define sk_rx_ifindex		__sk_common.skc_rx_ifindex
+#endif
 
 #define sk_dontcopy_begin	__sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end		__sk_common.skc_dontcopy_end
@@ -1696,6 +1706,14 @@  static inline int sk_tx_queue_get(const struct sock *sk)
 	return sk ? sk->sk_tx_queue_mapping : -1;
 }
 
+static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+	sk->sk_rx_ifindex = skb->skb_iif;
+	sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+#endif
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
 	sk_tx_queue_clear(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index 7e5dfdb..4030368 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3458,18 +3458,14 @@  sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 }
 #endif /* CONFIG_NET_EGRESS */
 
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
-{
 #ifdef CONFIG_XPS
-	struct xps_dev_maps *dev_maps;
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+			       struct xps_dev_maps *dev_maps, unsigned int tci)
+{
 	struct xps_map *map;
 	int queue_index = -1;
 
-	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
 	if (dev_maps) {
-		unsigned int tci = skb->sender_cpu - 1;
-
 		if (dev->num_tc) {
 			tci *= dev->num_tc;
 			tci += netdev_get_prio_tc_map(dev, skb->priority);
@@ -3486,6 +3482,32 @@  static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 				queue_index = -1;
 		}
 	}
+	return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+	enum xps_map_type i = XPS_MAP_RXQS;
+	struct xps_dev_maps *dev_maps;
+	struct sock *sk = skb->sk;
+	int queue_index = -1;
+	unsigned int tci = 0;
+
+	if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
+	    dev->ifindex == sk->sk_rx_ifindex)
+		tci = sk->sk_rx_queue_mapping;
+
+	rcu_read_lock();
+	while (queue_index < 0 && i < __XPS_MAP_MAX) {
+		if (i == XPS_MAP_CPUS)
+			tci = skb->sender_cpu - 1;
+		dev_maps = rcu_dereference(dev->xps_maps[i]);
+		queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci);
+		i++;
+	}
+
 	rcu_read_unlock();
 
 	return queue_index;
diff --git a/net/core/sock.c b/net/core/sock.c
index 042cfc6..73d7fa8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2824,6 +2824,11 @@  void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_pacing_rate = ~0U;
 	sk->sk_pacing_shift = 10;
 	sk->sk_incoming_cpu = -1;
+
+#ifdef CONFIG_XPS
+	sk->sk_rx_ifindex = -1;
+	sk->sk_rx_queue_mapping = -1;
+#endif
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b188e0d..d33911c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,6 +78,7 @@ 
 #include <linux/errqueue.h>
 #include <trace/events/tcp.h>
 #include <linux/static_key.h>
+#include <net/busy_poll.h>
 
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 
@@ -5559,6 +5560,11 @@  void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 		__tcp_fast_path_on(tp, tp->snd_wnd);
 	else
 		tp->pred_flags = 0;
+
+	if (skb) {
+		sk_mark_napi_id(sk, skb);
+		sk_mark_rx_queue(sk, skb);
+	}
 }
 
 static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@ -6371,6 +6377,7 @@  int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_rsk(req)->txhash = net_tx_rndhash();
 	tcp_openreq_init_rwin(req, sk, dst);
+	sk_mark_rx_queue(req_to_sk(req), skb);
 	if (!want_cookie) {
 		tcp_reqsk_record_syn(sk, req, skb);
 		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index caf23de..abdf02e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1479,6 +1479,7 @@  int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
+		sk_mark_rx_queue(sk, skb);
 		if (dst) {
 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 			    !dst->ops->check(dst, 0)) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f867658..4939c28 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -836,6 +836,7 @@  int tcp_child_process(struct sock *parent, struct sock *child,
 
 	/* record NAPI ID of child */
 	sk_mark_napi_id(child, skb);
+	sk_mark_rx_queue(child, skb);
 
 	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {