Message ID | 20200311191513.3954203-1-andriin@fb.com |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Headers | show |
Series | [bpf-next] selftests/bpf: make tcp_rtt test more robust to failures | expand |
On 03/11, Andrii Nakryiko wrote: > Switch to non-blocking accept and wait for server thread to exit before > proceeding. I noticed that sometimes tcp_rtt server thread failure would > "spill over" into other tests (that would run after tcp_rtt), probably just > because server thread exits much later and tcp_rtt doesn't wait for it. > > Fixes: 8a03222f508b ("selftests/bpf: test_progs: fix client/server race in tcp_rtt") > Signed-off-by: Andrii Nakryiko <andriin@fb.com> > --- > .../selftests/bpf/prog_tests/tcp_rtt.c | 30 +++++++++++-------- > 1 file changed, 18 insertions(+), 12 deletions(-) > > diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c > index f4cd60d6fba2..d235eea0de27 100644 > --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c > +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c > @@ -188,7 +188,7 @@ static int start_server(void) > }; > int fd; > > - fd = socket(AF_INET, SOCK_STREAM, 0); > + fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0); > if (fd < 0) { > log_err("Failed to create server socket"); > return -1; > @@ -205,6 +205,7 @@ static int start_server(void) > > static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER; > static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER; > +static volatile bool server_done = false; > > static void *server_thread(void *arg) > { > @@ -222,23 +223,22 @@ static void *server_thread(void *arg) > > if (CHECK_FAIL(err < 0)) { > perror("Failed to listed on socket"); > - return NULL; > + return ERR_PTR(err); > } > > - client_fd = accept(fd, (struct sockaddr *)&addr, &len); > + while (!server_done) { > + client_fd = accept(fd, (struct sockaddr *)&addr, &len); > + if (client_fd == -1 && errno == EAGAIN) > + continue; > + break; > + } > if (CHECK_FAIL(client_fd < 0)) { > perror("Failed to accept client"); > - return NULL; > + return ERR_PTR(err); > } > > - /* Wait for the next connection (that never arrives) > - * to keep this thread alive to prevent calling > - * close() on client_fd. > - */ > - if (CHECK_FAIL(accept(fd, (struct sockaddr *)&addr, &len) >= 0)) { > - perror("Unexpected success in second accept"); > - return NULL; > - } > + while (!server_done) > + usleep(50); > > close(client_fd); > > @@ -249,6 +249,7 @@ void test_tcp_rtt(void) > { > int server_fd, cgroup_fd; > pthread_t tid; > + void *server_res; > > cgroup_fd = test__join_cgroup("/tcp_rtt"); > if (CHECK_FAIL(cgroup_fd < 0)) > @@ -267,6 +268,11 @@ void test_tcp_rtt(void) > pthread_mutex_unlock(&server_started_mtx); > > CHECK_FAIL(run_test(cgroup_fd, server_fd)); > + > + server_done = true; [..] > + pthread_join(tid, &server_res); > + CHECK_FAIL(IS_ERR(server_res)); I wonder if we add (move) close(server_fd) before pthread_join(), can we fix this issue without using non-blocking socket? The accept() should return as soon as server_fd is closed so it's essentially your 'server_done'. > + > close_server_fd: > close(server_fd); > close_cgroup_fd: > -- > 2.17.1 >
On Wed, Mar 11, 2020 at 1:41 PM Stanislav Fomichev <sdf@fomichev.me> wrote: > > On 03/11, Andrii Nakryiko wrote: > > Switch to non-blocking accept and wait for server thread to exit before > > proceeding. I noticed that sometimes tcp_rtt server thread failure would > > "spill over" into other tests (that would run after tcp_rtt), probably just > > because server thread exits much later and tcp_rtt doesn't wait for it. > > > > Fixes: 8a03222f508b ("selftests/bpf: test_progs: fix client/server race in tcp_rtt") > > Signed-off-by: Andrii Nakryiko <andriin@fb.com> > > --- > > .../selftests/bpf/prog_tests/tcp_rtt.c | 30 +++++++++++-------- > > 1 file changed, 18 insertions(+), 12 deletions(-) > > > > diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c > > index f4cd60d6fba2..d235eea0de27 100644 > > --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c > > +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c > > @@ -188,7 +188,7 @@ static int start_server(void) > > }; > > int fd; > > > > - fd = socket(AF_INET, SOCK_STREAM, 0); > > + fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0); > > if (fd < 0) { > > log_err("Failed to create server socket"); > > return -1; > > @@ -205,6 +205,7 @@ static int start_server(void) > > > > static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER; > > static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER; > > +static volatile bool server_done = false; > > > > static void *server_thread(void *arg) > > { > > @@ -222,23 +223,22 @@ static void *server_thread(void *arg) > > > > if (CHECK_FAIL(err < 0)) { > > perror("Failed to listed on socket"); > > - return NULL; > > + return ERR_PTR(err); > > } > > > > - client_fd = accept(fd, (struct sockaddr *)&addr, &len); > > + while (!server_done) { > > + client_fd = accept(fd, (struct sockaddr *)&addr, &len); > > + if (client_fd == -1 && errno == EAGAIN) > > + continue; > > + break; > > + } > > if (CHECK_FAIL(client_fd < 0)) { > > perror("Failed to accept client"); > > - return NULL; > > + return ERR_PTR(err); > > } > > > > - /* Wait for the next connection (that never arrives) > > - * to keep this thread alive to prevent calling > > - * close() on client_fd. > > - */ > > - if (CHECK_FAIL(accept(fd, (struct sockaddr *)&addr, &len) >= 0)) { > > - perror("Unexpected success in second accept"); > > - return NULL; > > - } > > + while (!server_done) > > + usleep(50); > > > > close(client_fd); > > > > @@ -249,6 +249,7 @@ void test_tcp_rtt(void) > > { > > int server_fd, cgroup_fd; > > pthread_t tid; > > + void *server_res; > > > > cgroup_fd = test__join_cgroup("/tcp_rtt"); > > if (CHECK_FAIL(cgroup_fd < 0)) > > @@ -267,6 +268,11 @@ void test_tcp_rtt(void) > > pthread_mutex_unlock(&server_started_mtx); > > > > CHECK_FAIL(run_test(cgroup_fd, server_fd)); > > + > > + server_done = true; > > [..] > > + pthread_join(tid, &server_res); > > + CHECK_FAIL(IS_ERR(server_res)); > > I wonder if we add (move) close(server_fd) before pthread_join(), can we > fix this issue without using non-blocking socket? The accept() should > return as soon as server_fd is closed so it's essentially your > 'server_done'. That was my first attempt. Amazingly, closing listening socket FD doesn't unblock accept()... > > > + > > close_server_fd: > > close(server_fd); > > close_cgroup_fd: > > -- > > 2.17.1 > >
On 03/11, Andrii Nakryiko wrote: > On Wed, Mar 11, 2020 at 1:41 PM Stanislav Fomichev <sdf@fomichev.me> wrote: > > > > On 03/11, Andrii Nakryiko wrote: > > [..] > > > + pthread_join(tid, &server_res); > > > + CHECK_FAIL(IS_ERR(server_res)); > > > > I wonder if we add (move) close(server_fd) before pthread_join(), can we > > fix this issue without using non-blocking socket? The accept() should > > return as soon as server_fd is closed so it's essentially your > > 'server_done'. > > That was my first attempt. Amazingly, closing listening socket FD > doesn't unblock accept()... Ugh :-( In this case, feel free to slap: Reviewed-by: Stanislav Fomichev <sdf@google.com> My only other (minor) suggestion was to add a small delay in the first loop: while (!server_done) { accept() if (!err) { udelay(50) <-- continue } } But I suppose that shouldn't be that big of a deal..
On Wed, Mar 11, 2020 at 3:14 PM Stanislav Fomichev <sdf@fomichev.me> wrote: > > On 03/11, Andrii Nakryiko wrote: > > On Wed, Mar 11, 2020 at 1:41 PM Stanislav Fomichev <sdf@fomichev.me> wrote: > > > > > > On 03/11, Andrii Nakryiko wrote: > > > [..] > > > > + pthread_join(tid, &server_res); > > > > + CHECK_FAIL(IS_ERR(server_res)); > > > > > > I wonder if we add (move) close(server_fd) before pthread_join(), can we > > > fix this issue without using non-blocking socket? The accept() should > > > return as soon as server_fd is closed so it's essentially your > > > 'server_done'. > > > > That was my first attempt. Amazingly, closing listening socket FD > > doesn't unblock accept()... > Ugh :-( > > In this case, feel free to slap: > Reviewed-by: Stanislav Fomichev <sdf@google.com> > > My only other (minor) suggestion was to add a small delay in the first > loop: > > while (!server_done) { > accept() > if (!err) { > udelay(50) <-- > continue > } > } > > But I suppose that shouldn't be that big of a deal.. It's actually bad, I'll fix it. Not sure how I missed that one... Thanks!
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index f4cd60d6fba2..d235eea0de27 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -188,7 +188,7 @@ static int start_server(void) }; int fd; - fd = socket(AF_INET, SOCK_STREAM, 0); + fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0); if (fd < 0) { log_err("Failed to create server socket"); return -1; @@ -205,6 +205,7 @@ static int start_server(void) static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER; +static volatile bool server_done = false; static void *server_thread(void *arg) { @@ -222,23 +223,22 @@ static void *server_thread(void *arg) if (CHECK_FAIL(err < 0)) { perror("Failed to listed on socket"); - return NULL; + return ERR_PTR(err); } - client_fd = accept(fd, (struct sockaddr *)&addr, &len); + while (!server_done) { + client_fd = accept(fd, (struct sockaddr *)&addr, &len); + if (client_fd == -1 && errno == EAGAIN) + continue; + break; + } if (CHECK_FAIL(client_fd < 0)) { perror("Failed to accept client"); - return NULL; + return ERR_PTR(err); } - /* Wait for the next connection (that never arrives) - * to keep this thread alive to prevent calling - * close() on client_fd. - */ - if (CHECK_FAIL(accept(fd, (struct sockaddr *)&addr, &len) >= 0)) { - perror("Unexpected success in second accept"); - return NULL; - } + while (!server_done) + usleep(50); close(client_fd); @@ -249,6 +249,7 @@ void test_tcp_rtt(void) { int server_fd, cgroup_fd; pthread_t tid; + void *server_res; cgroup_fd = test__join_cgroup("/tcp_rtt"); if (CHECK_FAIL(cgroup_fd < 0)) @@ -267,6 +268,11 @@ void test_tcp_rtt(void) pthread_mutex_unlock(&server_started_mtx); CHECK_FAIL(run_test(cgroup_fd, server_fd)); + + server_done = true; + pthread_join(tid, &server_res); + CHECK_FAIL(IS_ERR(server_res)); + close_server_fd: close(server_fd); close_cgroup_fd:
Switch to non-blocking accept and wait for server thread to exit before proceeding. I noticed that sometimes tcp_rtt server thread failure would "spill over" into other tests (that would run after tcp_rtt), probably just because server thread exits much later and tcp_rtt doesn't wait for it. Fixes: 8a03222f508b ("selftests/bpf: test_progs: fix client/server race in tcp_rtt") Signed-off-by: Andrii Nakryiko <andriin@fb.com> --- .../selftests/bpf/prog_tests/tcp_rtt.c | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-)