diff mbox

[tcpdump-workers] vlan tagged packets and libpcap breakage

Message ID alpine.OSX.2.00.1212121205040.78903@animac.local
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Ani Sinha Dec. 12, 2012, 9:53 p.m. UTC
>
> unsigned int netdev_8021q_inskb = 1;
>
> ...
> 	{
> 		.ctl_name	= NET_CORE_8021q_INSKB,
> 		.procname	= "netdev_8021q_inskb",
> 		.data		= &netdev_8021q_inskb,
> 		.maxlen		= sizeof(int),
> 		.mode		= 0444,
> 		.proc_handler	= proc_dointvec
> 	},
>
> would seem to do it to me.
> Then pcap can fopen("/proc/sys/net/core/netdev_8021q_inskb") and if it
> finds it, and it is >0, then do the cmsg thing.
>

Does this work? This is just an experimental patch and by no means final.
I just want to have an idea what everyone thought about it. Once we debate
and discusss, I can cook up a final patch that would be worth commiting.

Also instead of having this /proc interface, we can perhaps check for a
specific
kernel version that :

(a) has the vlan tag info in the skb metadata (as opposed to in the packet
itself)
(b) has the following patch that adds the capability to generate a filter
based on the tag value :

commit f3335031b9452baebfe49b8b5e55d3fe0c4677d1
Author: Eric Dumazet <edumazet@google.com>
Date:   Sat Oct 27 02:26:17 2012 +0000

    net: filter: add vlan tag access

WE need both of the above two things for the userland to generate a filter
code that compares vlan tag values in the skb metadata. For kernels that
has the vlan tag in
the skb metadata but does not have the above commit (b), there is nothing
that can be done. For older kernels that had the vlan tag info in the
packet itself, the filter code can be generated differently to look at
specific offsets within the packet (something that libpcap does
currently).

We have already ruled out the idea of generating a filter and trying to
load and see if that fails (see previous emails on this thread).

Hope this makes sense.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Ani Sinha Dec. 12, 2012, 10:16 p.m. UTC | #1
+ Eric B.


On Wed, Dec 12, 2012 at 1:53 PM, Ani Sinha <ani@aristanetworks.com> wrote:
>
>>
>> unsigned int netdev_8021q_inskb = 1;
>>
>> ...
>>       {
>>               .ctl_name       = NET_CORE_8021q_INSKB,
>>               .procname       = "netdev_8021q_inskb",
>>               .data           = &netdev_8021q_inskb,
>>               .maxlen         = sizeof(int),
>>               .mode           = 0444,
>>               .proc_handler   = proc_dointvec
>>       },
>>
>> would seem to do it to me.
>> Then pcap can fopen("/proc/sys/net/core/netdev_8021q_inskb") and if it
>> finds it, and it is >0, then do the cmsg thing.
>>
>
> Does this work? This is just an experimental patch and by no means final.
> I just want to have an idea what everyone thought about it. Once we debate
> and discusss, I can cook up a final patch that would be worth commiting.
>
> Also instead of having this /proc interface, we can perhaps check for a
> specific
> kernel version that :
>
> (a) has the vlan tag info in the skb metadata (as opposed to in the packet
> itself)
> (b) has the following patch that adds the capability to generate a filter
> based on the tag value :
>
> commit f3335031b9452baebfe49b8b5e55d3fe0c4677d1
> Author: Eric Dumazet <edumazet@google.com>
> Date:   Sat Oct 27 02:26:17 2012 +0000
>
>     net: filter: add vlan tag access
>
> WE need both of the above two things for the userland to generate a filter
> code that compares vlan tag values in the skb metadata. For kernels that
> has the vlan tag in
> the skb metadata but does not have the above commit (b), there is nothing
> that can be done. For older kernels that had the vlan tag info in the
> packet itself, the filter code can be generated differently to look at
> specific offsets within the packet (something that libpcap does
> currently).
>
> We have already ruled out the idea of generating a filter and trying to
> load and see if that fails (see previous emails on this thread).
>
> Hope this makes sense.
>
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index c45eabc..91e2ba3 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -36,6 +36,7 @@ static inline unsigned int sk_filter_len(const struct sk_filter *fp)
>         return fp->len * sizeof(struct sock_filter) + sizeof(*fp);
>  }
>
> +extern bool sysctl_8021q_inskb;
>  extern int sk_filter(struct sock *sk, struct sk_buff *skb);
>  extern unsigned int sk_run_filter(const struct sk_buff *skb,
>                                   const struct sock_filter *filter);
> diff --git a/net/core/filter.c b/net/core/filter.c
> index c23543c..4f5a657 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -41,6 +41,8 @@
>  #include <linux/seccomp.h>
>  #include <linux/if_vlan.h>
>
> +bool sysctl_8021q_inskb = 1;
> +
>  /* No hurry in this branch
>   *
>   * Exported for the bpf jit load helper.
> diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
> index d1b0804..f9a3700 100644
> --- a/net/core/sysctl_net_core.c
> +++ b/net/core/sysctl_net_core.c
> @@ -15,6 +15,7 @@
>  #include <linux/init.h>
>  #include <linux/slab.h>
>  #include <linux/kmemleak.h>
> +#include <linux/filter.h>
>
>  #include <net/ip.h>
>  #include <net/sock.h>
> @@ -189,6 +190,13 @@ static struct ctl_table net_core_table[] = {
>                 .mode           = 0644,
>                 .proc_handler   = proc_dointvec
>         },
> +       {
> +               .procname       = "8021q_inskb",
> +               .data           = &sysctl_8021q_inskb,
> +               .maxlen         = sizeof(bool),
> +               .mode           = 0444,
> +               .proc_handler   = proc_dointvec
> +       },
>         { }
>  };
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann Dec. 13, 2012, 8:35 a.m. UTC | #2
On 12/12/2012 10:53 PM, Ani Sinha wrote:
>> unsigned int netdev_8021q_inskb = 1;
>>
>> ...
>> 	{
>> 		.ctl_name	= NET_CORE_8021q_INSKB,
>> 		.procname	= "netdev_8021q_inskb",
>> 		.data		= &netdev_8021q_inskb,
>> 		.maxlen		= sizeof(int),
>> 		.mode		= 0444,
>> 		.proc_handler	= proc_dointvec
>> 	},
>>
>> would seem to do it to me.
>> Then pcap can fopen("/proc/sys/net/core/netdev_8021q_inskb") and if it
>> finds it, and it is >0, then do the cmsg thing.
>>
>
> Does this work? This is just an experimental patch and by no means final.
> I just want to have an idea what everyone thought about it. Once we debate
> and discusss, I can cook up a final patch that would be worth commiting.
>
> Also instead of having this /proc interface, we can perhaps check for a
> specific
> kernel version that :
>
> (a) has the vlan tag info in the skb metadata (as opposed to in the packet
> itself)
> (b) has the following patch that adds the capability to generate a filter
> based on the tag value :
>
> commit f3335031b9452baebfe49b8b5e55d3fe0c4677d1
> Author: Eric Dumazet <edumazet@google.com>
> Date:   Sat Oct 27 02:26:17 2012 +0000
>
>      net: filter: add vlan tag access
>
> WE need both of the above two things for the userland to generate a filter
> code that compares vlan tag values in the skb metadata. For kernels that
> has the vlan tag in
> the skb metadata but does not have the above commit (b), there is nothing
> that can be done. For older kernels that had the vlan tag info in the
> packet itself, the filter code can be generated differently to look at
> specific offsets within the packet (something that libpcap does
> currently).
>
> We have already ruled out the idea of generating a filter and trying to
> load and see if that fails (see previous emails on this thread).
>
> Hope this makes sense.

I think it doesn't. Because then you are obviously considering adding one
procfs file into /proc/sys/net/core/ *for each* feature that is added into
the ancillary ops which cannot be the right way ...

> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index c45eabc..91e2ba3 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -36,6 +36,7 @@ static inline unsigned int sk_filter_len(const struct sk_filter *fp)
>   	return fp->len * sizeof(struct sock_filter) + sizeof(*fp);
>   }
>
> +extern bool sysctl_8021q_inskb;
>   extern int sk_filter(struct sock *sk, struct sk_buff *skb);
>   extern unsigned int sk_run_filter(const struct sk_buff *skb,
>   				  const struct sock_filter *filter);
> diff --git a/net/core/filter.c b/net/core/filter.c
> index c23543c..4f5a657 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -41,6 +41,8 @@
>   #include <linux/seccomp.h>
>   #include <linux/if_vlan.h>
>
> +bool sysctl_8021q_inskb = 1;
> +
>   /* No hurry in this branch
>    *
>    * Exported for the bpf jit load helper.
> diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
> index d1b0804..f9a3700 100644
> --- a/net/core/sysctl_net_core.c
> +++ b/net/core/sysctl_net_core.c
> @@ -15,6 +15,7 @@
>   #include <linux/init.h>
>   #include <linux/slab.h>
>   #include <linux/kmemleak.h>
> +#include <linux/filter.h>
>
>   #include <net/ip.h>
>   #include <net/sock.h>
> @@ -189,6 +190,13 @@ static struct ctl_table net_core_table[] = {
>   		.mode		= 0644,
>   		.proc_handler	= proc_dointvec
>   	},
> +	{
> +		.procname	= "8021q_inskb",
> +		.data		= &sysctl_8021q_inskb,
> +		.maxlen		= sizeof(bool),
> +		.mode		= 0444,
> +		.proc_handler	= proc_dointvec
> +	},
>   	{ }
>   };
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ani Sinha Dec. 13, 2012, 5:34 p.m. UTC | #3
On Thu, Dec 13, 2012 at 12:35 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 12/12/2012 10:53 PM, Ani Sinha wrote:
>>>
>>> unsigned int netdev_8021q_inskb = 1;
>>>
>>> ...
>>>         {
>>>                 .ctl_name       = NET_CORE_8021q_INSKB,
>>>                 .procname       = "netdev_8021q_inskb",
>>>                 .data           = &netdev_8021q_inskb,
>>>                 .maxlen         = sizeof(int),
>>>                 .mode           = 0444,
>>>                 .proc_handler   = proc_dointvec
>>>         },
>>>
>>> would seem to do it to me.
>>> Then pcap can fopen("/proc/sys/net/core/netdev_8021q_inskb") and if it
>>> finds it, and it is >0, then do the cmsg thing.
>>>
>>
>
> I think it doesn't. Because then you are obviously considering adding one
> procfs file into /proc/sys/net/core/ *for each* feature that is added into
> the ancillary ops which cannot be the right way ...

We had already brought up this topic previously in the same thread. A
suggestion was made to add that proc entry and no one from netdev
responded to it saying that it did not make any sense. Therefore
before I went ahead and made the fixes in libpcap, I wanted to run
this by your guys again to make sure we are still on the same page.

I do agree that instead of a /proc entry, we should check for a kenrel
version >= X where X is the upstream version that first started
supporting all the features needed by libpcap for vlan filtering. This
is not a compile time check but a run time one. Does anyone see any
issues with this? Is there any long term implications of this, like if
you backport patches to an older long term supported kernel? Are there
other better ways to do this, like may be returning feature bits from
an ioctl call? This is something we need to deal with on a continuous
basis as we keep supporting newer AUX fields and libpcap and other
user land code needs to make use of it. At the same time, they need to
handle backward compatibility issues with older kernels.

Thanks
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
danborkmann@iogearbox.net Dec. 13, 2012, 9:49 p.m. UTC | #4
On Thu, Dec 13, 2012 at 6:34 PM, Ani Sinha <ani@aristanetworks.com> wrote:
> On Thu, Dec 13, 2012 at 12:35 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
>> On 12/12/2012 10:53 PM, Ani Sinha wrote:
>>>>
>>>> unsigned int netdev_8021q_inskb = 1;
>>>>
>>>> ...
>>>>         {
>>>>                 .ctl_name       = NET_CORE_8021q_INSKB,
>>>>                 .procname       = "netdev_8021q_inskb",
>>>>                 .data           = &netdev_8021q_inskb,
>>>>                 .maxlen         = sizeof(int),
>>>>                 .mode           = 0444,
>>>>                 .proc_handler   = proc_dointvec
>>>>         },
>>>>
>>>> would seem to do it to me.
>>>> Then pcap can fopen("/proc/sys/net/core/netdev_8021q_inskb") and if it
>>>> finds it, and it is >0, then do the cmsg thing.
>>>>
>>
>> I think it doesn't. Because then you are obviously considering adding one
>> procfs file into /proc/sys/net/core/ *for each* feature that is added into
>> the ancillary ops which cannot be the right way ...
>
> We had already brought up this topic previously in the same thread. A
> suggestion was made to add that proc entry and no one from netdev
> responded to it saying that it did not make any sense. Therefore
> before I went ahead and made the fixes in libpcap, I wanted to run
> this by your guys again to make sure we are still on the same page.

Well, not everyone on netdev might be following this thread (resp.
following fully). The best way to get responses for a patch is to go
through the normal patch submission process on netdev, and if you like
to request for comments, then mark it as RFC in the subject. This way,
people will know and likely comment on it if it makes sense or not.

> I do agree that instead of a /proc entry, we should check for a kenrel
> version >= X where X is the upstream version that first started
> supporting all the features needed by libpcap for vlan filtering. This
> is not a compile time check but a run time one. Does anyone see any
> issues with this? Is there any long term implications of this, like if
> you backport patches to an older long term supported kernel? Are there
> other better ways to do this, like may be returning feature bits from
> an ioctl call? This is something we need to deal with on a continuous
> basis as we keep supporting newer AUX fields and libpcap and other
> user land code needs to make use of it. At the same time, they need to
> handle backward compatibility issues with older kernels.

As Eric mentioned earlier, for now there seems not to be a reliable
way to get to know which ops are present and which not. It's not
really nice, but if you want to make use of those new (ANC*) features,
probably checking kernel version might be the only way if I'm not
missing something. Now net-next is closed, but if it reopens, I'll
submit a version 2 of my patch where you've been CC'd to. If it gets
in, then at least it's for sure that since kernel <xyz> this kind of
feature test is present.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ani Sinha Dec. 13, 2012, 10:07 p.m. UTC | #5
On Thu, Dec 13, 2012 at 1:49 PM, Daniel Borkmann
<danborkmann@iogearbox.net> wrote:

> Well, not everyone on netdev might be following this thread (resp.
> following fully). The best way to get responses for a patch is to go
> through the normal patch submission process on netdev, and if you like
> to request for comments, then mark it as RFC in the subject. This way,
> people will know and likely comment on it if it makes sense or not.

OK good to know.

> As Eric mentioned earlier, for now there seems not to be a reliable
> way to get to know which ops are present and which not. It's not
> really nice, but if you want to make use of those new (ANC*) features,
> probably checking kernel version might be the only way if I'm not
> missing something. Now net-next is closed, but if it reopens, I'll
> submit a version 2 of my patch where you've been CC'd to. If it gets
> in, then at least it's for sure that since kernel <xyz> this kind of
> feature test is present.

thanks, yes, I believe we do need some sort of validation on the
ancilliary features.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight Dec. 17, 2012, 9:50 a.m. UTC | #6
> > I do agree that instead of a /proc entry, we should check for a kenrel
> > version >= X where X is the upstream version that first started
> > supporting all the features needed by libpcap for vlan filtering. This
> > is not a compile time check but a run time one. Does anyone see any
> > issues with this? Is there any long term implications of this, like if
> > you backport patches to an older long term supported kernel? Are there
> > other better ways to do this, like may be returning feature bits from
> > an ioctl call? This is something we need to deal with on a continuous
> > basis as we keep supporting newer AUX fields and libpcap and other
> > user land code needs to make use of it. At the same time, they need to
> > handle backward compatibility issues with older kernels.
> 
> As Eric mentioned earlier, for now there seems not to be a reliable
> way to get to know which ops are present and which not. It's not
> really nice, but if you want to make use of those new (ANC*) features,
> probably checking kernel version might be the only way if I'm not
> missing something. Now net-next is closed, but if it reopens, I'll
> submit a version 2 of my patch where you've been CC'd to. If it gets
> in, then at least it's for sure that since kernel <xyz> this kind of
> feature test is present.

How are you going to tell whether a feature is present in a non-Linux
kernel ?

Testing kernel versions is somewhat suboptimal as support
could be patched into a much older kernel (maybe not for
this but ...)

	David



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Guy Harris Dec. 17, 2012, 10:35 a.m. UTC | #7
On Dec 17, 2012, at 1:50 AM, "David Laight" <David.Laight@ACULAB.COM> wrote:

> How are you going to tell whether a feature is present in a non-Linux
> kernel ?

The Linux memory-mapped capture mechanism is not present in a non-Linux kernel, so all the libpcap work involved here would, if necessary on other platforms, have to be done differently on those platforms.  Those platforms would have to have their own mechanisms to indicate whether any changes to filter code, processing of VLAN tags supplied out of band, etc. would need to be done.

The same would apply to other additional features of the Linux memory-mapped capture mechanism that require changes in libpcap.  (Ideally, those changes would only require changes in order to use them, and would not break existing userland code, including but not limited to libpcap - your reply was to Daniel Borkmann, who is, I believe, the originator of netsniff-ng:

	http://netsniff-ng.org

which has its own code using PF_PACKET sockets.)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
danborkmann@iogearbox.net Dec. 17, 2012, 11:08 a.m. UTC | #8
On Mon, Dec 17, 2012 at 11:35 AM, Guy Harris <guy@alum.mit.edu> wrote:
> On Dec 17, 2012, at 1:50 AM, "David Laight" <David.Laight@ACULAB.COM> wrote:
>
>> How are you going to tell whether a feature is present in a non-Linux
>> kernel ?
>
> The Linux memory-mapped capture mechanism is not present in a non-Linux kernel, so all the libpcap work involved here would, if necessary on other platforms, have to be done differently on those platforms.  Those platforms would have to have their own mechanisms to indicate whether any changes to filter code, processing of VLAN tags supplied out of band, etc. would need to be done.
>
> The same would apply to other additional features of the Linux memory-mapped capture mechanism that require changes in libpcap.

Exactly.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ani Sinha Dec. 17, 2012, 7:49 p.m. UTC | #9
On Mon, Dec 17, 2012 at 2:35 AM, Guy Harris <guy@alum.mit.edu> wrote:
>
> On Dec 17, 2012, at 1:50 AM, "David Laight" <David.Laight@ACULAB.COM> wrote:
>
>> How are you going to tell whether a feature is present in a non-Linux
>> kernel ?
>
> The Linux memory-mapped capture mechanism is not present in a non-Linux kernel, so all the libpcap work involved here would, if necessary on other platforms, have to be done >differently on those platforms.  Those platforms would have to have their own mechanisms to indicate whether any changes to filter code, processing of VLAN tags supplied out of >band, etc. would need to be done.

Actually lib-pcap has these pcap-<platform>.c files that are kind of
like platform specific drivers that plug into platform independent
code like gencode.c or bpf_filter.c. These platform specific drivers
are responsible for getting packets from the kernel and running
filters (kernel or userland) on it. So all linux specific code to get
a packet and packet metadata from the kernel can neatly reside in
pcap-linux.c.

Unfortunately though, in this specific problem involving filtering
with vlan tags, both code generation (gentags.c) and code running the
filter (bpf_filter.c) will have to be aware of linux specific
semantics. Due to the issues that Bill had explained earlier in the
thread, we can not rely on post processing before installing the
kernel filter. Therefore, we need to generate a filter that can be
directly installed in the kernel. For the same reason, bpf_filter()
code also needs to change - be aware of linux specific semantics.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index c45eabc..91e2ba3 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -36,6 +36,7 @@  static inline unsigned int sk_filter_len(const struct sk_filter *fp)
 	return fp->len * sizeof(struct sock_filter) + sizeof(*fp);
 }

+extern bool sysctl_8021q_inskb;
 extern int sk_filter(struct sock *sk, struct sk_buff *skb);
 extern unsigned int sk_run_filter(const struct sk_buff *skb,
 				  const struct sock_filter *filter);
diff --git a/net/core/filter.c b/net/core/filter.c
index c23543c..4f5a657 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -41,6 +41,8 @@ 
 #include <linux/seccomp.h>
 #include <linux/if_vlan.h>

+bool sysctl_8021q_inskb = 1;
+
 /* No hurry in this branch
  *
  * Exported for the bpf jit load helper.
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index d1b0804..f9a3700 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -15,6 +15,7 @@ 
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/kmemleak.h>
+#include <linux/filter.h>

 #include <net/ip.h>
 #include <net/sock.h>
@@ -189,6 +190,13 @@  static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "8021q_inskb",
+		.data		= &sysctl_8021q_inskb,
+		.maxlen		= sizeof(bool),
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };