diff mbox series

[bpf,3/3] selftests/bpf: Test freeing sockmap/sockhash with a socket in it

Message ID 20200206111652.694507-4-jakub@cloudflare.com
State Accepted
Delegated to: BPF Maintainers
Headers show
Series Fix locking order and synchronization on sockmap/sockhash tear-down | expand

Commit Message

Jakub Sitnicki Feb. 6, 2020, 11:16 a.m. UTC
Commit 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear
down") introduced sleeping issues inside RCU critical sections and while
holding a spinlock on sockmap/sockhash tear-down. There has to be at least
one socket in the map for the problem to surface.

This adds a test that triggers the warnings for broken locking rules. Not a
fix per se, but rather tooling to verify the accompanying fixes. Run on a
VM with 1 vCPU to reproduce the warnings.

Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down")
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
---
 .../selftests/bpf/prog_tests/sockmap_basic.c  | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/sockmap_basic.c

Comments

John Fastabend Feb. 6, 2020, 7:03 p.m. UTC | #1
Jakub Sitnicki wrote:
> Commit 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear
> down") introduced sleeping issues inside RCU critical sections and while
> holding a spinlock on sockmap/sockhash tear-down. There has to be at least
> one socket in the map for the problem to surface.
> 
> This adds a test that triggers the warnings for broken locking rules. Not a
> fix per se, but rather tooling to verify the accompanying fixes. Run on a
> VM with 1 vCPU to reproduce the warnings.
> 
> Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down")
> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
> ---
>  .../selftests/bpf/prog_tests/sockmap_basic.c  | 74 +++++++++++++++++++
>  1 file changed, 74 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
> new file mode 100644
> index 000000000000..07f5b462c2ef
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c

Yes! This helps a lot now we will get some coverege on these cases.

Acked-by: John Fastabend <john.fastabend@gmail.com>
Alexei Starovoitov Feb. 9, 2020, 2:41 a.m. UTC | #2
On Thu, Feb 6, 2020 at 3:28 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>
> Commit 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear
> down") introduced sleeping issues inside RCU critical sections and while
> holding a spinlock on sockmap/sockhash tear-down. There has to be at least
> one socket in the map for the problem to surface.
>
> This adds a test that triggers the warnings for broken locking rules. Not a
> fix per se, but rather tooling to verify the accompanying fixes. Run on a
> VM with 1 vCPU to reproduce the warnings.
>
> Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down")
> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>

selftests/bpf no longer builds for me.
make
  BINARY   test_maps
  TEST-OBJ [test_progs] sockmap_basic.test.o
/data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:
In function ‘connected_socket_v4’:
/data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
error: ‘TCP_REPAIR_ON’ undeclared (first use in this function); did
you mean ‘TCP_REPAIR’?
   20 |  repair = TCP_REPAIR_ON;
      |           ^~~~~~~~~~~~~
      |           TCP_REPAIR
/data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
note: each undeclared identifier is reported only once for each
function it appears in
/data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:29:11:
error: ‘TCP_REPAIR_OFF_NO_WP’ undeclared (first use in this function);
did you mean ‘TCP_REPAIR_OPTIONS’?
   29 |  repair = TCP_REPAIR_OFF_NO_WP;
      |           ^~~~~~~~~~~~~~~~~~~~
      |           TCP_REPAIR_OPTIONS

Clearly /usr/include/linux/tcp.h is too old.
Suggestions?
Yonghong Song Feb. 9, 2020, 4:12 a.m. UTC | #3
On 2/8/20 6:41 PM, Alexei Starovoitov wrote:
> On Thu, Feb 6, 2020 at 3:28 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>>
>> Commit 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear
>> down") introduced sleeping issues inside RCU critical sections and while
>> holding a spinlock on sockmap/sockhash tear-down. There has to be at least
>> one socket in the map for the problem to surface.
>>
>> This adds a test that triggers the warnings for broken locking rules. Not a
>> fix per se, but rather tooling to verify the accompanying fixes. Run on a
>> VM with 1 vCPU to reproduce the warnings.
>>
>> Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down")
>> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
> 
> selftests/bpf no longer builds for me.
> make
>    BINARY   test_maps
>    TEST-OBJ [test_progs] sockmap_basic.test.o
> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:
> In function ‘connected_socket_v4’:
> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
> error: ‘TCP_REPAIR_ON’ undeclared (first use in this function); did
> you mean ‘TCP_REPAIR’?
>     20 |  repair = TCP_REPAIR_ON;
>        |           ^~~~~~~~~~~~~
>        |           TCP_REPAIR
> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
> note: each undeclared identifier is reported only once for each
> function it appears in
> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:29:11:
> error: ‘TCP_REPAIR_OFF_NO_WP’ undeclared (first use in this function);
> did you mean ‘TCP_REPAIR_OPTIONS’?
>     29 |  repair = TCP_REPAIR_OFF_NO_WP;
>        |           ^~~~~~~~~~~~~~~~~~~~
>        |           TCP_REPAIR_OPTIONS
> 
> Clearly /usr/include/linux/tcp.h is too old.
> Suggestions?

In the past, when such situation happens, people typically sync to
linux/tools/include/uapi/ directory. This probably will work in this 
case as well.
Jakub Sitnicki Feb. 9, 2020, 3:29 p.m. UTC | #4
On Sun, Feb 09, 2020 at 03:41 AM CET, Alexei Starovoitov wrote:
> On Thu, Feb 6, 2020 at 3:28 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>>
>> Commit 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear
>> down") introduced sleeping issues inside RCU critical sections and while
>> holding a spinlock on sockmap/sockhash tear-down. There has to be at least
>> one socket in the map for the problem to surface.
>>
>> This adds a test that triggers the warnings for broken locking rules. Not a
>> fix per se, but rather tooling to verify the accompanying fixes. Run on a
>> VM with 1 vCPU to reproduce the warnings.
>>
>> Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down")
>> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
>
> selftests/bpf no longer builds for me.
> make
>   BINARY   test_maps
>   TEST-OBJ [test_progs] sockmap_basic.test.o
> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:
> In function ‘connected_socket_v4’:
> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
> error: ‘TCP_REPAIR_ON’ undeclared (first use in this function); did
> you mean ‘TCP_REPAIR’?
>    20 |  repair = TCP_REPAIR_ON;
>       |           ^~~~~~~~~~~~~
>       |           TCP_REPAIR
> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
> note: each undeclared identifier is reported only once for each
> function it appears in
> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:29:11:
> error: ‘TCP_REPAIR_OFF_NO_WP’ undeclared (first use in this function);
> did you mean ‘TCP_REPAIR_OPTIONS’?
>    29 |  repair = TCP_REPAIR_OFF_NO_WP;
>       |           ^~~~~~~~~~~~~~~~~~~~
>       |           TCP_REPAIR_OPTIONS
>
> Clearly /usr/include/linux/tcp.h is too old.
> Suggestions?

Sorry for the inconvenience. I see that tcp.h header is missing under
linux/tools/include/uapi/.

I have been building against my distro kernel headers, completely
unaware of this. This is an oversight on my side.

Can I ask for a revert? I'm traveling today with limited ability to
post patches.

I can resubmit the test with the missing header for bpf-next once it
reopens.
John Fastabend Feb. 10, 2020, 3:55 a.m. UTC | #5
Jakub Sitnicki wrote:
> On Sun, Feb 09, 2020 at 03:41 AM CET, Alexei Starovoitov wrote:
> > On Thu, Feb 6, 2020 at 3:28 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
> >>
> >> Commit 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear
> >> down") introduced sleeping issues inside RCU critical sections and while
> >> holding a spinlock on sockmap/sockhash tear-down. There has to be at least
> >> one socket in the map for the problem to surface.
> >>
> >> This adds a test that triggers the warnings for broken locking rules. Not a
> >> fix per se, but rather tooling to verify the accompanying fixes. Run on a
> >> VM with 1 vCPU to reproduce the warnings.
> >>
> >> Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down")
> >> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
> >
> > selftests/bpf no longer builds for me.
> > make
> >   BINARY   test_maps
> >   TEST-OBJ [test_progs] sockmap_basic.test.o
> > /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:
> > In function ‘connected_socket_v4’:
> > /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
> > error: ‘TCP_REPAIR_ON’ undeclared (first use in this function); did
> > you mean ‘TCP_REPAIR’?
> >    20 |  repair = TCP_REPAIR_ON;
> >       |           ^~~~~~~~~~~~~
> >       |           TCP_REPAIR
> > /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
> > note: each undeclared identifier is reported only once for each
> > function it appears in
> > /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:29:11:
> > error: ‘TCP_REPAIR_OFF_NO_WP’ undeclared (first use in this function);
> > did you mean ‘TCP_REPAIR_OPTIONS’?
> >    29 |  repair = TCP_REPAIR_OFF_NO_WP;
> >       |           ^~~~~~~~~~~~~~~~~~~~
> >       |           TCP_REPAIR_OPTIONS
> >
> > Clearly /usr/include/linux/tcp.h is too old.
> > Suggestions?
> 
> Sorry for the inconvenience. I see that tcp.h header is missing under
> linux/tools/include/uapi/.

How about we just add the couple defines needed to sockmap_basic.c I don't
see a need to pull in all of tcp.h just for a couple defines that wont
change anyways.

> 
> I have been building against my distro kernel headers, completely
> unaware of this. This is an oversight on my side.
> 
> Can I ask for a revert? I'm traveling today with limited ability to
> post patches.

I don't think we need a full revert.

> 
> I can resubmit the test with the missing header for bpf-next once it
> reopens.

If you are traveling I'll post a patch with the defines.
Jakub Sitnicki Feb. 10, 2020, 11:52 a.m. UTC | #6
On Mon, Feb 10, 2020 at 03:55 AM GMT, John Fastabend wrote:
> Jakub Sitnicki wrote:
>> On Sun, Feb 09, 2020 at 03:41 AM CET, Alexei Starovoitov wrote:
>> > On Thu, Feb 6, 2020 at 3:28 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>> >>
>> >> Commit 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear
>> >> down") introduced sleeping issues inside RCU critical sections and while
>> >> holding a spinlock on sockmap/sockhash tear-down. There has to be at least
>> >> one socket in the map for the problem to surface.
>> >>
>> >> This adds a test that triggers the warnings for broken locking rules. Not a
>> >> fix per se, but rather tooling to verify the accompanying fixes. Run on a
>> >> VM with 1 vCPU to reproduce the warnings.
>> >>
>> >> Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down")
>> >> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
>> >
>> > selftests/bpf no longer builds for me.
>> > make
>> >   BINARY   test_maps
>> >   TEST-OBJ [test_progs] sockmap_basic.test.o
>> > /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:
>> > In function ‘connected_socket_v4’:
>> > /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
>> > error: ‘TCP_REPAIR_ON’ undeclared (first use in this function); did
>> > you mean ‘TCP_REPAIR’?
>> >    20 |  repair = TCP_REPAIR_ON;
>> >       |           ^~~~~~~~~~~~~
>> >       |           TCP_REPAIR
>> > /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
>> > note: each undeclared identifier is reported only once for each
>> > function it appears in
>> > /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:29:11:
>> > error: ‘TCP_REPAIR_OFF_NO_WP’ undeclared (first use in this function);
>> > did you mean ‘TCP_REPAIR_OPTIONS’?
>> >    29 |  repair = TCP_REPAIR_OFF_NO_WP;
>> >       |           ^~~~~~~~~~~~~~~~~~~~
>> >       |           TCP_REPAIR_OPTIONS
>> >
>> > Clearly /usr/include/linux/tcp.h is too old.
>> > Suggestions?
>>
>> Sorry for the inconvenience. I see that tcp.h header is missing under
>> linux/tools/include/uapi/.
>
> How about we just add the couple defines needed to sockmap_basic.c I don't
> see a need to pull in all of tcp.h just for a couple defines that wont
> change anyways.

Looking back at how this happened. test_progs.h pulls in netinet/tcp.h:

# 19 "/home/jkbs/src/linux/tools/testing/selftests/bpf/test_progs.h" 2
# 1 "/usr/include/netinet/tcp.h" 1 3 4
# 92 "/usr/include/netinet/tcp.h" 3 4

A glibc header, which gained TCP_REPAIR_* constants in 2.29 [0]:

$ git describe --contains 5cd7dbdea13eb302620491ef44837b17e9d39c5a
glibc-2.29~510

Pulling in linux/tcp.h would conflict with struct definitions in
netinet/tcp.h. So redefining the possibly missing constants, like John
suggests, is the right way out.

I'm not sure, though, how to protect against such mistakes in the
future. Any ideas?

[0] https://sourceware.org/git/?p=glibc.git;a=commit;h=5cd7dbdea13eb302620491ef44837b17e9d39c5a

>
>>
>> I have been building against my distro kernel headers, completely
>> unaware of this. This is an oversight on my side.
>>
>> Can I ask for a revert? I'm traveling today with limited ability to
>> post patches.
>
> I don't think we need a full revert.
>
>>
>> I can resubmit the test with the missing header for bpf-next once it
>> reopens.
>
> If you are traveling I'll post a patch with the defines.

Thanks, again.
Daniel Borkmann Feb. 10, 2020, 11:38 p.m. UTC | #7
On 2/10/20 12:52 PM, Jakub Sitnicki wrote:
> On Mon, Feb 10, 2020 at 03:55 AM GMT, John Fastabend wrote:
>> Jakub Sitnicki wrote:
>>> On Sun, Feb 09, 2020 at 03:41 AM CET, Alexei Starovoitov wrote:
>>>> On Thu, Feb 6, 2020 at 3:28 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>>>>>
>>>>> Commit 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear
>>>>> down") introduced sleeping issues inside RCU critical sections and while
>>>>> holding a spinlock on sockmap/sockhash tear-down. There has to be at least
>>>>> one socket in the map for the problem to surface.
>>>>>
>>>>> This adds a test that triggers the warnings for broken locking rules. Not a
>>>>> fix per se, but rather tooling to verify the accompanying fixes. Run on a
>>>>> VM with 1 vCPU to reproduce the warnings.
>>>>>
>>>>> Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down")
>>>>> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
>>>>
>>>> selftests/bpf no longer builds for me.
>>>> make
>>>>    BINARY   test_maps
>>>>    TEST-OBJ [test_progs] sockmap_basic.test.o
>>>> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:
>>>> In function ‘connected_socket_v4’:
>>>> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
>>>> error: ‘TCP_REPAIR_ON’ undeclared (first use in this function); did
>>>> you mean ‘TCP_REPAIR’?
>>>>     20 |  repair = TCP_REPAIR_ON;
>>>>        |           ^~~~~~~~~~~~~
>>>>        |           TCP_REPAIR
>>>> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11:
>>>> note: each undeclared identifier is reported only once for each
>>>> function it appears in
>>>> /data/users/ast/net/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:29:11:
>>>> error: ‘TCP_REPAIR_OFF_NO_WP’ undeclared (first use in this function);
>>>> did you mean ‘TCP_REPAIR_OPTIONS’?
>>>>     29 |  repair = TCP_REPAIR_OFF_NO_WP;
>>>>        |           ^~~~~~~~~~~~~~~~~~~~
>>>>        |           TCP_REPAIR_OPTIONS
>>>>
>>>> Clearly /usr/include/linux/tcp.h is too old.
>>>> Suggestions?
>>>
>>> Sorry for the inconvenience. I see that tcp.h header is missing under
>>> linux/tools/include/uapi/.
>>
>> How about we just add the couple defines needed to sockmap_basic.c I don't
>> see a need to pull in all of tcp.h just for a couple defines that wont
>> change anyways.
> 
> Looking back at how this happened. test_progs.h pulls in netinet/tcp.h:
> 
> # 19 "/home/jkbs/src/linux/tools/testing/selftests/bpf/test_progs.h" 2
> # 1 "/usr/include/netinet/tcp.h" 1 3 4
> # 92 "/usr/include/netinet/tcp.h" 3 4
> 
> A glibc header, which gained TCP_REPAIR_* constants in 2.29 [0]:
> 
> $ git describe --contains 5cd7dbdea13eb302620491ef44837b17e9d39c5a
> glibc-2.29~510
> 
> Pulling in linux/tcp.h would conflict with struct definitions in
> netinet/tcp.h. So redefining the possibly missing constants, like John
> suggests, is the right way out.
> 
> I'm not sure, though, how to protect against such mistakes in the
> future. Any ideas?

We've usually been pulling in header copies into tools/include/. Last bigger one
was in commit 13a748ea6df1 ("bpf: Sync asm-generic/socket.h to tools/") to name
one example, but redefining the way John did for smaller things is also okay,
especially if a header has further dependencies which would then also be needed
under tools/ infra.

> [0] https://sourceware.org/git/?p=glibc.git;a=commit;h=5cd7dbdea13eb302620491ef44837b17e9d39c5a
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
new file mode 100644
index 000000000000..07f5b462c2ef
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -0,0 +1,74 @@ 
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Cloudflare
+
+#include "test_progs.h"
+
+static int connected_socket_v4(void)
+{
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_port = htons(80),
+		.sin_addr = { inet_addr("127.0.0.1") },
+	};
+	socklen_t len = sizeof(addr);
+	int s, repair, err;
+
+	s = socket(AF_INET, SOCK_STREAM, 0);
+	if (CHECK_FAIL(s == -1))
+		goto error;
+
+	repair = TCP_REPAIR_ON;
+	err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair));
+	if (CHECK_FAIL(err))
+		goto error;
+
+	err = connect(s, (struct sockaddr *)&addr, len);
+	if (CHECK_FAIL(err))
+		goto error;
+
+	repair = TCP_REPAIR_OFF_NO_WP;
+	err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair));
+	if (CHECK_FAIL(err))
+		goto error;
+
+	return s;
+error:
+	perror(__func__);
+	close(s);
+	return -1;
+}
+
+/* Create a map, populate it with one socket, and free the map. */
+static void test_sockmap_create_update_free(enum bpf_map_type map_type)
+{
+	const int zero = 0;
+	int s, map, err;
+
+	s = connected_socket_v4();
+	if (CHECK_FAIL(s == -1))
+		return;
+
+	map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0);
+	if (CHECK_FAIL(map == -1)) {
+		perror("bpf_create_map");
+		goto out;
+	}
+
+	err = bpf_map_update_elem(map, &zero, &s, BPF_NOEXIST);
+	if (CHECK_FAIL(err)) {
+		perror("bpf_map_update");
+		goto out;
+	}
+
+out:
+	close(map);
+	close(s);
+}
+
+void test_sockmap_basic(void)
+{
+	if (test__start_subtest("sockmap create_update_free"))
+		test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKMAP);
+	if (test__start_subtest("sockhash create_update_free"))
+		test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKHASH);
+}