diff mbox series

[ovs-dev,RFC] lib/ovs-thread: Expand stack when more memory is available.

Message ID 20221019130146.2461673-1-mkp@redhat.com
State Changes Requested
Headers show
Series [ovs-dev,RFC] lib/ovs-thread: Expand stack when more memory is available. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Mike Pattrick Oct. 19, 2022, 1:01 p.m. UTC
Previously the minimum thread stack size was always set to 512 kB to
help accomidate smaller OpenWRT based systems. Often these devices
don't have a lot of total system memory, so such a limit makes sense.

The default under x86-64 linux is 2MB, this limit is not always enough
to reach the recursion limits in xlate_resubmit_resource_check(),
resulting in a segfault instead of a recoverable error. This can happen
even when the stack size is set up for unlimited expansion when the
virtaul memory areas of handler threads abut eachother, preventing any
further expansion.

The solution proposed here is to set a minimum of 4MB on systems with
more than 4GB of total ram.

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2104779
Signed-off-by: Mike Pattrick <mkp@redhat.com>
---
 lib/ovs-thread.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 lib/ovs-thread.h |  1 +
 2 files changed, 45 insertions(+), 2 deletions(-)

Comments

Flavio Leitner Oct. 19, 2022, 1:29 p.m. UTC | #1
Hi Mike,

Thanks for the patch.

Does this patch need to change this line too?
https://github.com/openvswitch/ovs/blob/31db0e043119cf597d720d94f70ec19cf5b8b7d4/rhel/usr_lib_systemd_system_ovs-vswitchd.service.in#L18


Wouldn't it be better to have a config option that we
can change at runtime? Or perhaps leave it to use the
system's default unless configured to cap the amount?

BTW, I think we use Reported-at: tag instead of Bugzilla:.


fbl

On Wed, Oct 19, 2022 at 09:01:46AM -0400, Mike Pattrick wrote:
> Previously the minimum thread stack size was always set to 512 kB to
> help accomidate smaller OpenWRT based systems. Often these devices
> don't have a lot of total system memory, so such a limit makes sense.
> 
> The default under x86-64 linux is 2MB, this limit is not always enough
> to reach the recursion limits in xlate_resubmit_resource_check(),
> resulting in a segfault instead of a recoverable error. This can happen
> even when the stack size is set up for unlimited expansion when the
> virtaul memory areas of handler threads abut eachother, preventing any
> further expansion.
> 
> The solution proposed here is to set a minimum of 4MB on systems with
> more than 4GB of total ram.
> 
> Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2104779
> Signed-off-by: Mike Pattrick <mkp@redhat.com>
> ---
>  lib/ovs-thread.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
>  lib/ovs-thread.h |  1 +
>  2 files changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c
> index 78ed3e970..dbe4a036f 100644
> --- a/lib/ovs-thread.c
> +++ b/lib/ovs-thread.c
> @@ -478,10 +478,21 @@ ovs_thread_create(const char *name, void *(*start)(void *), void *arg)
>       * requires approximately 384 kB according to the following analysis:
>       * https://mail.openvswitch.org/pipermail/ovs-dev/2016-January/308592.html
>       *
> -     * We use 512 kB to give us some margin of error. */
> +     * We use at least 512 kB to give us some margin of error.
> +     *
> +     * However, this can cause issues on larger systems with complex
> +     * OpenFlow tables. A default stack size of 2MB can result in segfaults
> +     * if a lot of clones and resubmits are used. So if the system memory
> +     * exceeds some limit then use a 4 MB stack.
> +     * */
>      pthread_attr_t attr;
>      pthread_attr_init(&attr);
> -    set_min_stack_size(&attr, 512 * 1024);
> +
> +    if (system_memory() >> 30 > 4) {
> +        set_min_stack_size(&attr, 4096 * 1024);
> +    } else {
> +        set_min_stack_size(&attr, 512 * 1024);
> +    }
>  
>      error = pthread_create(&thread, &attr, ovsthread_wrapper, aux);
>      if (error) {
> @@ -680,6 +691,37 @@ count_total_cores(void)
>      return n_cores > 0 ? n_cores : 0;
>  }
>  
> +/* Returns the total system memory in bytes, or 0 if the
> + * number cannot be determined. */
> +uint64_t
> +system_memory(void)
> +{
> +    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
> +    static uint64_t memory;
> +
> +    if (ovsthread_once_start(&once)) {
> +#if defined(_WIN32)
> +        MEMORYSTATUSEX statex;
> +
> +        statex.dwLength = sizeof statex;
> +        GlobalMemoryStatusEx(&statex);
> +        memory = statex.ullTotalPhys;
> +#elif defined(__linux__)
> +        long int page_count = sysconf(_SC_PHYS_PAGES);
> +        long int page_size = sysconf(_SC_PAGESIZE);
> +
> +        if (page_count > 0 && page_size > 0) {
> +            memory = page_count * page_size;
> +        } else {
> +            memory = 0;
> +        }
> +#endif
> +        ovsthread_once_done(&once);
> +    }
> +
> +    return memory;
> +}
> +
>  /* Returns 'true' if current thread is PMD thread. */
>  bool
>  thread_is_pmd(void)
> diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h
> index aac5e19c9..2ce66b721 100644
> --- a/lib/ovs-thread.h
> +++ b/lib/ovs-thread.h
> @@ -523,6 +523,7 @@ bool may_fork(void);
>  
>  int count_cpu_cores(void);
>  int count_total_cores(void);
> +uint64_t system_memory(void);
>  bool thread_is_pmd(void);
>  
>  #endif /* ovs-thread.h */
> -- 
> 2.31.1
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Mike Pattrick Oct. 19, 2022, 1:48 p.m. UTC | #2
On Wed, Oct 19, 2022 at 9:30 AM Flavio Leitner <fbl@sysclose.org> wrote:
>
>
> Hi Mike,
>
> Thanks for the patch.
>
> Does this patch need to change this line too?
> https://github.com/openvswitch/ovs/blob/31db0e043119cf597d720d94f70ec19cf5b8b7d4/rhel/usr_lib_systemd_system_ovs-vswitchd.service.in#L18
>
>
> Wouldn't it be better to have a config option that we
> can change at runtime? Or perhaps leave it to use the
> system's default unless configured to cap the amount?

What I'm worried about here is if OVN is used and a few other things
like dnat/snat, the resulting openflow rules can cause a segfault with
the system default 2MB stack. The stack conditions aren't really
detectable during runtime so increasing the default seemed reasonable
to me. It also doesn't seem trivial to me to determine if any given
set of OpenFlow rules will or won't cause an explosion in stack usage.

>
> BTW, I think we use Reported-at: tag instead of Bugzilla:.

You're right! I don't know where I came up with that tag.

-M

>
>
> fbl
>
> On Wed, Oct 19, 2022 at 09:01:46AM -0400, Mike Pattrick wrote:
> > Previously the minimum thread stack size was always set to 512 kB to
> > help accomidate smaller OpenWRT based systems. Often these devices
> > don't have a lot of total system memory, so such a limit makes sense.
> >
> > The default under x86-64 linux is 2MB, this limit is not always enough
> > to reach the recursion limits in xlate_resubmit_resource_check(),
> > resulting in a segfault instead of a recoverable error. This can happen
> > even when the stack size is set up for unlimited expansion when the
> > virtaul memory areas of handler threads abut eachother, preventing any
> > further expansion.
> >
> > The solution proposed here is to set a minimum of 4MB on systems with
> > more than 4GB of total ram.
> >
> > Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2104779
> > Signed-off-by: Mike Pattrick <mkp@redhat.com>
> > ---
> >  lib/ovs-thread.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
> >  lib/ovs-thread.h |  1 +
> >  2 files changed, 45 insertions(+), 2 deletions(-)
> >
> > diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c
> > index 78ed3e970..dbe4a036f 100644
> > --- a/lib/ovs-thread.c
> > +++ b/lib/ovs-thread.c
> > @@ -478,10 +478,21 @@ ovs_thread_create(const char *name, void *(*start)(void *), void *arg)
> >       * requires approximately 384 kB according to the following analysis:
> >       * https://mail.openvswitch.org/pipermail/ovs-dev/2016-January/308592.html
> >       *
> > -     * We use 512 kB to give us some margin of error. */
> > +     * We use at least 512 kB to give us some margin of error.
> > +     *
> > +     * However, this can cause issues on larger systems with complex
> > +     * OpenFlow tables. A default stack size of 2MB can result in segfaults
> > +     * if a lot of clones and resubmits are used. So if the system memory
> > +     * exceeds some limit then use a 4 MB stack.
> > +     * */
> >      pthread_attr_t attr;
> >      pthread_attr_init(&attr);
> > -    set_min_stack_size(&attr, 512 * 1024);
> > +
> > +    if (system_memory() >> 30 > 4) {
> > +        set_min_stack_size(&attr, 4096 * 1024);
> > +    } else {
> > +        set_min_stack_size(&attr, 512 * 1024);
> > +    }
> >
> >      error = pthread_create(&thread, &attr, ovsthread_wrapper, aux);
> >      if (error) {
> > @@ -680,6 +691,37 @@ count_total_cores(void)
> >      return n_cores > 0 ? n_cores : 0;
> >  }
> >
> > +/* Returns the total system memory in bytes, or 0 if the
> > + * number cannot be determined. */
> > +uint64_t
> > +system_memory(void)
> > +{
> > +    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
> > +    static uint64_t memory;
> > +
> > +    if (ovsthread_once_start(&once)) {
> > +#if defined(_WIN32)
> > +        MEMORYSTATUSEX statex;
> > +
> > +        statex.dwLength = sizeof statex;
> > +        GlobalMemoryStatusEx(&statex);
> > +        memory = statex.ullTotalPhys;
> > +#elif defined(__linux__)
> > +        long int page_count = sysconf(_SC_PHYS_PAGES);
> > +        long int page_size = sysconf(_SC_PAGESIZE);
> > +
> > +        if (page_count > 0 && page_size > 0) {
> > +            memory = page_count * page_size;
> > +        } else {
> > +            memory = 0;
> > +        }
> > +#endif
> > +        ovsthread_once_done(&once);
> > +    }
> > +
> > +    return memory;
> > +}
> > +
> >  /* Returns 'true' if current thread is PMD thread. */
> >  bool
> >  thread_is_pmd(void)
> > diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h
> > index aac5e19c9..2ce66b721 100644
> > --- a/lib/ovs-thread.h
> > +++ b/lib/ovs-thread.h
> > @@ -523,6 +523,7 @@ bool may_fork(void);
> >
> >  int count_cpu_cores(void);
> >  int count_total_cores(void);
> > +uint64_t system_memory(void);
> >  bool thread_is_pmd(void);
> >
> >  #endif /* ovs-thread.h */
> > --
> > 2.31.1
> >
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
> --
> fbl
>
Flavio Leitner Oct. 19, 2022, 2:50 p.m. UTC | #3
On Wed, Oct 19, 2022 at 09:48:18AM -0400, Mike Pattrick wrote:
> On Wed, Oct 19, 2022 at 9:30 AM Flavio Leitner <fbl@sysclose.org> wrote:
> >
> >
> > Hi Mike,
> >
> > Thanks for the patch.
> >
> > Does this patch need to change this line too?
> > https://github.com/openvswitch/ovs/blob/31db0e043119cf597d720d94f70ec19cf5b8b7d4/rhel/usr_lib_systemd_system_ovs-vswitchd.service.in#L18
> >
> >
> > Wouldn't it be better to have a config option that we
> > can change at runtime? Or perhaps leave it to use the
> > system's default unless configured to cap the amount?
> 
> What I'm worried about here is if OVN is used and a few other things
> like dnat/snat, the resulting openflow rules can cause a segfault with
> the system default 2MB stack. The stack conditions aren't really
> detectable during runtime so increasing the default seemed reasonable
> to me. It also doesn't seem trivial to me to determine if any given
> set of OpenFlow rules will or won't cause an explosion in stack usage.


I agree. The issue is that if there is a crash then the only
option is to recompile and that is super difficult to do when
dealing with production environments.

Another thing to consider is that this may not be OVS' job to
deal with because OVS should rely on OS defaults, when possible.
However, sometimes OS defaults are too high for other reasons,
so we may want to cap it at OVS level.

I am not sure about users upgrading from previous versions
because if before it was limited and now it uses the OS default,
it may use more memory.

fbl

> > BTW, I think we use Reported-at: tag instead of Bugzilla:.
> 
> You're right! I don't know where I came up with that tag.
> 
> -M
> 
> >
> >
> > fbl
> >
> > On Wed, Oct 19, 2022 at 09:01:46AM -0400, Mike Pattrick wrote:
> > > Previously the minimum thread stack size was always set to 512 kB to
> > > help accomidate smaller OpenWRT based systems. Often these devices
> > > don't have a lot of total system memory, so such a limit makes sense.
> > >
> > > The default under x86-64 linux is 2MB, this limit is not always enough
> > > to reach the recursion limits in xlate_resubmit_resource_check(),
> > > resulting in a segfault instead of a recoverable error. This can happen
> > > even when the stack size is set up for unlimited expansion when the
> > > virtaul memory areas of handler threads abut eachother, preventing any
> > > further expansion.
> > >
> > > The solution proposed here is to set a minimum of 4MB on systems with
> > > more than 4GB of total ram.
> > >
> > > Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2104779
> > > Signed-off-by: Mike Pattrick <mkp@redhat.com>
> > > ---
> > >  lib/ovs-thread.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
> > >  lib/ovs-thread.h |  1 +
> > >  2 files changed, 45 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c
> > > index 78ed3e970..dbe4a036f 100644
> > > --- a/lib/ovs-thread.c
> > > +++ b/lib/ovs-thread.c
> > > @@ -478,10 +478,21 @@ ovs_thread_create(const char *name, void *(*start)(void *), void *arg)
> > >       * requires approximately 384 kB according to the following analysis:
> > >       * https://mail.openvswitch.org/pipermail/ovs-dev/2016-January/308592.html
> > >       *
> > > -     * We use 512 kB to give us some margin of error. */
> > > +     * We use at least 512 kB to give us some margin of error.
> > > +     *
> > > +     * However, this can cause issues on larger systems with complex
> > > +     * OpenFlow tables. A default stack size of 2MB can result in segfaults
> > > +     * if a lot of clones and resubmits are used. So if the system memory
> > > +     * exceeds some limit then use a 4 MB stack.
> > > +     * */
> > >      pthread_attr_t attr;
> > >      pthread_attr_init(&attr);
> > > -    set_min_stack_size(&attr, 512 * 1024);
> > > +
> > > +    if (system_memory() >> 30 > 4) {
> > > +        set_min_stack_size(&attr, 4096 * 1024);
> > > +    } else {
> > > +        set_min_stack_size(&attr, 512 * 1024);
> > > +    }
> > >
> > >      error = pthread_create(&thread, &attr, ovsthread_wrapper, aux);
> > >      if (error) {
> > > @@ -680,6 +691,37 @@ count_total_cores(void)
> > >      return n_cores > 0 ? n_cores : 0;
> > >  }
> > >
> > > +/* Returns the total system memory in bytes, or 0 if the
> > > + * number cannot be determined. */
> > > +uint64_t
> > > +system_memory(void)
> > > +{
> > > +    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
> > > +    static uint64_t memory;
> > > +
> > > +    if (ovsthread_once_start(&once)) {
> > > +#if defined(_WIN32)
> > > +        MEMORYSTATUSEX statex;
> > > +
> > > +        statex.dwLength = sizeof statex;
> > > +        GlobalMemoryStatusEx(&statex);
> > > +        memory = statex.ullTotalPhys;
> > > +#elif defined(__linux__)
> > > +        long int page_count = sysconf(_SC_PHYS_PAGES);
> > > +        long int page_size = sysconf(_SC_PAGESIZE);
> > > +
> > > +        if (page_count > 0 && page_size > 0) {
> > > +            memory = page_count * page_size;
> > > +        } else {
> > > +            memory = 0;
> > > +        }
> > > +#endif
> > > +        ovsthread_once_done(&once);
> > > +    }
> > > +
> > > +    return memory;
> > > +}
> > > +
> > >  /* Returns 'true' if current thread is PMD thread. */
> > >  bool
> > >  thread_is_pmd(void)
> > > diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h
> > > index aac5e19c9..2ce66b721 100644
> > > --- a/lib/ovs-thread.h
> > > +++ b/lib/ovs-thread.h
> > > @@ -523,6 +523,7 @@ bool may_fork(void);
> > >
> > >  int count_cpu_cores(void);
> > >  int count_total_cores(void);
> > > +uint64_t system_memory(void);
> > >  bool thread_is_pmd(void);
> > >
> > >  #endif /* ovs-thread.h */
> > > --
> > > 2.31.1
> > >
> > > _______________________________________________
> > > dev mailing list
> > > dev@openvswitch.org
> > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >
> > --
> > fbl
> >
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Eelco Chaudron Oct. 19, 2022, 3:13 p.m. UTC | #4
On 19 Oct 2022, at 15:01, Mike Pattrick wrote:

> Previously the minimum thread stack size was always set to 512 kB to
> help accomidate smaller OpenWRT based systems. Often these devices
> don't have a lot of total system memory, so such a limit makes sense.
>
> The default under x86-64 linux is 2MB, this limit is not always enough
> to reach the recursion limits in xlate_resubmit_resource_check(),
> resulting in a segfault instead of a recoverable error. This can happen
> even when the stack size is set up for unlimited expansion when the
> virtaul memory areas of handler threads abut eachother, preventing any
> further expansion.
>
> The solution proposed here is to set a minimum of 4MB on systems with
> more than 4GB of total ram.
>
> Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2104779
> Signed-off-by: Mike Pattrick <mkp@redhat.com>
> ---
>  lib/ovs-thread.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
>  lib/ovs-thread.h |  1 +
>  2 files changed, 45 insertions(+), 2 deletions(-)
>
> diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c
> index 78ed3e970..dbe4a036f 100644
> --- a/lib/ovs-thread.c
> +++ b/lib/ovs-thread.c
> @@ -478,10 +478,21 @@ ovs_thread_create(const char *name, void *(*start)(void *), void *arg)
>       * requires approximately 384 kB according to the following analysis:
>       * https://mail.openvswitch.org/pipermail/ovs-dev/2016-January/308592.html
>       *
> -     * We use 512 kB to give us some margin of error. */
> +     * We use at least 512 kB to give us some margin of error.
> +     *
> +     * However, this can cause issues on larger systems with complex
> +     * OpenFlow tables. A default stack size of 2MB can result in segfaults
> +     * if a lot of clones and resubmits are used. So if the system memory
> +     * exceeds some limit then use a 4 MB stack.

Any reason to choose 4G? Asking as users might be asking why after an upgrade I use more memory, for example, the microshift project.

I think it would be better to make this configurable on the command line.

I did not research but are there kernel-specific ways to maximise the stack distance to allow the stack to easily grow? Guess this is pthread related also.

> +     * */
>      pthread_attr_t attr;
>      pthread_attr_init(&attr);
> -    set_min_stack_size(&attr, 512 * 1024);
> +
> +    if (system_memory() >> 30 > 4) {
> +        set_min_stack_size(&attr, 4096 * 1024);
> +    } else {
> +        set_min_stack_size(&attr, 512 * 1024);
> +    }
>
>      error = pthread_create(&thread, &attr, ovsthread_wrapper, aux);
>      if (error) {
> @@ -680,6 +691,37 @@ count_total_cores(void)
>      return n_cores > 0 ? n_cores : 0;
>  }
>
> +/* Returns the total system memory in bytes, or 0 if the
> + * number cannot be determined. */
> +uint64_t
> +system_memory(void)
> +{
> +    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
> +    static uint64_t memory;
> +
> +    if (ovsthread_once_start(&once)) {
> +#if defined(_WIN32)
> +        MEMORYSTATUSEX statex;
> +
> +        statex.dwLength = sizeof statex;
> +        GlobalMemoryStatusEx(&statex);
> +        memory = statex.ullTotalPhys;
> +#elif defined(__linux__)
> +        long int page_count = sysconf(_SC_PHYS_PAGES);
> +        long int page_size = sysconf(_SC_PAGESIZE);
> +
> +        if (page_count > 0 && page_size > 0) {
> +            memory = page_count * page_size;
> +        } else {
> +            memory = 0;
> +        }

Under Linux there is also the sysinfo() call, but not sure what would be preferred.

> +#endif
> +        ovsthread_once_done(&once);
> +    }
> +

Guess on BDS we would return an initialized variable. We probably need a wrapper for that also.

> +    return memory;
> +}
> +
>  /* Returns 'true' if current thread is PMD thread. */
>  bool
>  thread_is_pmd(void)
> diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h
> index aac5e19c9..2ce66b721 100644
> --- a/lib/ovs-thread.h
> +++ b/lib/ovs-thread.h
> @@ -523,6 +523,7 @@ bool may_fork(void);
>
>  int count_cpu_cores(void);
>  int count_total_cores(void);
> +uint64_t system_memory(void);
>  bool thread_is_pmd(void);
>
>  #endif /* ovs-thread.h */
> -- 
> 2.31.1
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Ilya Maximets Oct. 19, 2022, 4:27 p.m. UTC | #5
On 10/19/22 15:48, Mike Pattrick wrote:
> On Wed, Oct 19, 2022 at 9:30 AM Flavio Leitner <fbl@sysclose.org> wrote:
>>
>>
>> Hi Mike,
>>
>> Thanks for the patch.
>>
>> Does this patch need to change this line too?
>> https://github.com/openvswitch/ovs/blob/31db0e043119cf597d720d94f70ec19cf5b8b7d4/rhel/usr_lib_systemd_system_ovs-vswitchd.service.in#L18
>>
>>
>> Wouldn't it be better to have a config option that we
>> can change at runtime? Or perhaps leave it to use the
>> system's default unless configured to cap the amount?
> 
> What I'm worried about here is if OVN is used and a few other things
> like dnat/snat, the resulting openflow rules can cause a segfault with
> the system default 2MB stack. The stack conditions aren't really
> detectable during runtime so increasing the default seemed reasonable
> to me. It also doesn't seem trivial to me to determine if any given
> set of OpenFlow rules will or won't cause an explosion in stack usage.

Hi, Mike.

I was thinking in the past about a bit different approach to the problem.
I have a guess that most of the stack usage is coming from 'struct flow'
and some of the stab[] memory allocations on the stack and these are huge.
Can easily take a few KB of space each.
So, I wonder, if you have a coredump or a reproducer, maybe you could
confirm or disprove that theory?  That would be very helpful

If the theory will turn out to be true, the solution might be to move
all that to dynamically allocated memory instead of trying to guess the
sufficient upper limit for a stack size.
(Moving these to a heap might be a good idea even if they are not a
root cause of the current problem, I think.)

What do you think?

Best regards, Ilya Maximets.
Mike Pattrick Oct. 19, 2022, 10:13 p.m. UTC | #6
On Wed, Oct 19, 2022 at 12:27 PM Ilya Maximets <i.maximets@ovn.org> wrote:
>
> On 10/19/22 15:48, Mike Pattrick wrote:
> > On Wed, Oct 19, 2022 at 9:30 AM Flavio Leitner <fbl@sysclose.org> wrote:
> >>
> >>
> >> Hi Mike,
> >>
> >> Thanks for the patch.
> >>
> >> Does this patch need to change this line too?
> >> https://github.com/openvswitch/ovs/blob/31db0e043119cf597d720d94f70ec19cf5b8b7d4/rhel/usr_lib_systemd_system_ovs-vswitchd.service.in#L18
> >>
> >>
> >> Wouldn't it be better to have a config option that we
> >> can change at runtime? Or perhaps leave it to use the
> >> system's default unless configured to cap the amount?
> >
> > What I'm worried about here is if OVN is used and a few other things
> > like dnat/snat, the resulting openflow rules can cause a segfault with
> > the system default 2MB stack. The stack conditions aren't really
> > detectable during runtime so increasing the default seemed reasonable
> > to me. It also doesn't seem trivial to me to determine if any given
> > set of OpenFlow rules will or won't cause an explosion in stack usage.
>
> Hi, Mike.
>
> I was thinking in the past about a bit different approach to the problem.
> I have a guess that most of the stack usage is coming from 'struct flow'
> and some of the stab[] memory allocations on the stack and these are huge.
> Can easily take a few KB of space each.
> So, I wonder, if you have a coredump or a reproducer, maybe you could
> confirm or disprove that theory?  That would be very helpful

I do have a coredump handy, the functions with struct flow and stub
like clone_xlate_actions() and patch_port_output() only account for
around 20% of the used stack space. do_xlate_actions() accounts for
over half of the stack used. I'll try reducing the stack used in these
three functions as an alternative to increasing the stack allocated.


Cheers,
M

>
> If the theory will turn out to be true, the solution might be to move
> all that to dynamically allocated memory instead of trying to guess the
> sufficient upper limit for a stack size.
> (Moving these to a heap might be a good idea even if they are not a
> root cause of the current problem, I think.)
>
> What do you think?
>
> Best regards, Ilya Maximets.
>
Ilya Maximets Oct. 20, 2022, 2:34 p.m. UTC | #7
On 10/20/22 00:13, Mike Pattrick wrote:
> On Wed, Oct 19, 2022 at 12:27 PM Ilya Maximets <i.maximets@ovn.org> wrote:
>>
>> On 10/19/22 15:48, Mike Pattrick wrote:
>>> On Wed, Oct 19, 2022 at 9:30 AM Flavio Leitner <fbl@sysclose.org> wrote:
>>>>
>>>>
>>>> Hi Mike,
>>>>
>>>> Thanks for the patch.
>>>>
>>>> Does this patch need to change this line too?
>>>> https://github.com/openvswitch/ovs/blob/31db0e043119cf597d720d94f70ec19cf5b8b7d4/rhel/usr_lib_systemd_system_ovs-vswitchd.service.in#L18
>>>>
>>>>
>>>> Wouldn't it be better to have a config option that we
>>>> can change at runtime? Or perhaps leave it to use the
>>>> system's default unless configured to cap the amount?
>>>
>>> What I'm worried about here is if OVN is used and a few other things
>>> like dnat/snat, the resulting openflow rules can cause a segfault with
>>> the system default 2MB stack. The stack conditions aren't really
>>> detectable during runtime so increasing the default seemed reasonable
>>> to me. It also doesn't seem trivial to me to determine if any given
>>> set of OpenFlow rules will or won't cause an explosion in stack usage.
>>
>> Hi, Mike.
>>
>> I was thinking in the past about a bit different approach to the problem.
>> I have a guess that most of the stack usage is coming from 'struct flow'
>> and some of the stab[] memory allocations on the stack and these are huge.
>> Can easily take a few KB of space each.
>> So, I wonder, if you have a coredump or a reproducer, maybe you could
>> confirm or disprove that theory?  That would be very helpful
> 
> I do have a coredump handy, the functions with struct flow and stub
> like clone_xlate_actions() and patch_port_output() only account for
> around 20% of the used stack space. do_xlate_actions() accounts for
> over half of the stack used. I'll try reducing the stack used in these
> three functions as an alternative to increasing the stack allocated.

OK.  That makes sense.  Thanks!

do_xlate_actions() might be a bit tricky as the main stack usage is not
really from that particular function, but from all the small functions
that compiler inlines into it.

gcc -O2 -fstack-usage:
do_xlate_actions  720  dynamic,bounded

gcc -O2 -fstack-usage -fno-inline:
do_xlate_actions  288 dynamic,bounded


Luckily, there are at least few low hanging fruits like a monster
'struct flow_tnl flow_tnl' in the xlate_sample_action().

Best regards, Ilya Maximets.

> 
> 
> Cheers,
> M
> 
>>
>> If the theory will turn out to be true, the solution might be to move
>> all that to dynamically allocated memory instead of trying to guess the
>> sufficient upper limit for a stack size.
>> (Moving these to a heap might be a good idea even if they are not a
>> root cause of the current problem, I think.)
>>
>> What do you think?
>>
>> Best regards, Ilya Maximets.
>>
>
Eelco Chaudron Oct. 20, 2022, 2:48 p.m. UTC | #8
On 20 Oct 2022, at 16:34, Ilya Maximets wrote:

> On 10/20/22 00:13, Mike Pattrick wrote:
>> On Wed, Oct 19, 2022 at 12:27 PM Ilya Maximets <i.maximets@ovn.org> wrote:
>>>
>>> On 10/19/22 15:48, Mike Pattrick wrote:
>>>> On Wed, Oct 19, 2022 at 9:30 AM Flavio Leitner <fbl@sysclose.org> wrote:
>>>>>
>>>>>
>>>>> Hi Mike,
>>>>>
>>>>> Thanks for the patch.
>>>>>
>>>>> Does this patch need to change this line too?
>>>>> https://github.com/openvswitch/ovs/blob/31db0e043119cf597d720d94f70ec19cf5b8b7d4/rhel/usr_lib_systemd_system_ovs-vswitchd.service.in#L18
>>>>>
>>>>>
>>>>> Wouldn't it be better to have a config option that we
>>>>> can change at runtime? Or perhaps leave it to use the
>>>>> system's default unless configured to cap the amount?
>>>>
>>>> What I'm worried about here is if OVN is used and a few other things
>>>> like dnat/snat, the resulting openflow rules can cause a segfault with
>>>> the system default 2MB stack. The stack conditions aren't really
>>>> detectable during runtime so increasing the default seemed reasonable
>>>> to me. It also doesn't seem trivial to me to determine if any given
>>>> set of OpenFlow rules will or won't cause an explosion in stack usage.
>>>
>>> Hi, Mike.
>>>
>>> I was thinking in the past about a bit different approach to the problem.
>>> I have a guess that most of the stack usage is coming from 'struct flow'
>>> and some of the stab[] memory allocations on the stack and these are huge.
>>> Can easily take a few KB of space each.
>>> So, I wonder, if you have a coredump or a reproducer, maybe you could
>>> confirm or disprove that theory?  That would be very helpful
>>
>> I do have a coredump handy, the functions with struct flow and stub
>> like clone_xlate_actions() and patch_port_output() only account for
>> around 20% of the used stack space. do_xlate_actions() accounts for
>> over half of the stack used. I'll try reducing the stack used in these
>> three functions as an alternative to increasing the stack allocated.
>
> OK.  That makes sense.  Thanks!
>
> do_xlate_actions() might be a bit tricky as the main stack usage is not
> really from that particular function, but from all the small functions
> that compiler inlines into it.
>
> gcc -O2 -fstack-usage:
> do_xlate_actions  720  dynamic,bounded
>
> gcc -O2 -fstack-usage -fno-inline:
> do_xlate_actions  288 dynamic,bounded
>
>
> Luckily, there are at least few low hanging fruits like a monster
> 'struct flow_tnl flow_tnl' in the xlate_sample_action().

Also take a look at my last comment here, https://patchwork.ozlabs.org/project/openvswitch/patch/20221018173430.647165-2-amusil@redhat.com/, I think this will also save about 1K+ per loop.

> Best regards, Ilya Maximets.
>
>>
>>
>> Cheers,
>> M
>>
>>>
>>> If the theory will turn out to be true, the solution might be to move
>>> all that to dynamically allocated memory instead of trying to guess the
>>> sufficient upper limit for a stack size.
>>> (Moving these to a heap might be a good idea even if they are not a
>>> root cause of the current problem, I think.)
>>>
>>> What do you think?
>>>
>>> Best regards, Ilya Maximets.
>>>
>>
diff mbox series

Patch

diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c
index 78ed3e970..dbe4a036f 100644
--- a/lib/ovs-thread.c
+++ b/lib/ovs-thread.c
@@ -478,10 +478,21 @@  ovs_thread_create(const char *name, void *(*start)(void *), void *arg)
      * requires approximately 384 kB according to the following analysis:
      * https://mail.openvswitch.org/pipermail/ovs-dev/2016-January/308592.html
      *
-     * We use 512 kB to give us some margin of error. */
+     * We use at least 512 kB to give us some margin of error.
+     *
+     * However, this can cause issues on larger systems with complex
+     * OpenFlow tables. A default stack size of 2MB can result in segfaults
+     * if a lot of clones and resubmits are used. So if the system memory
+     * exceeds some limit then use a 4 MB stack.
+     * */
     pthread_attr_t attr;
     pthread_attr_init(&attr);
-    set_min_stack_size(&attr, 512 * 1024);
+
+    if (system_memory() >> 30 > 4) {
+        set_min_stack_size(&attr, 4096 * 1024);
+    } else {
+        set_min_stack_size(&attr, 512 * 1024);
+    }
 
     error = pthread_create(&thread, &attr, ovsthread_wrapper, aux);
     if (error) {
@@ -680,6 +691,37 @@  count_total_cores(void)
     return n_cores > 0 ? n_cores : 0;
 }
 
+/* Returns the total system memory in bytes, or 0 if the
+ * number cannot be determined. */
+uint64_t
+system_memory(void)
+{
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+    static uint64_t memory;
+
+    if (ovsthread_once_start(&once)) {
+#if defined(_WIN32)
+        MEMORYSTATUSEX statex;
+
+        statex.dwLength = sizeof statex;
+        GlobalMemoryStatusEx(&statex);
+        memory = statex.ullTotalPhys;
+#elif defined(__linux__)
+        long int page_count = sysconf(_SC_PHYS_PAGES);
+        long int page_size = sysconf(_SC_PAGESIZE);
+
+        if (page_count > 0 && page_size > 0) {
+            memory = page_count * page_size;
+        } else {
+            memory = 0;
+        }
+#endif
+        ovsthread_once_done(&once);
+    }
+
+    return memory;
+}
+
 /* Returns 'true' if current thread is PMD thread. */
 bool
 thread_is_pmd(void)
diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h
index aac5e19c9..2ce66b721 100644
--- a/lib/ovs-thread.h
+++ b/lib/ovs-thread.h
@@ -523,6 +523,7 @@  bool may_fork(void);
 
 int count_cpu_cores(void);
 int count_total_cores(void);
+uint64_t system_memory(void);
 bool thread_is_pmd(void);
 
 #endif /* ovs-thread.h */