diff mbox series

[ovs-dev,v3,python] Don't mix system poll/monkeypatched select

Message ID 20210611142923.474384-1-twilson@redhat.com
State Deferred
Headers show
Series [ovs-dev,v3,python] Don't mix system poll/monkeypatched select | expand

Commit Message

Terry Wilson June 11, 2021, 2:29 p.m. UTC
This is a partial revert of c1aa16d19, but keeps its test.

Applications that use eventlet cannot use poll() without blocking.
To fix an issue where the check_connection_completion() code would
not detect connection errors when using select(), we switched to
using the system poll() for checking the connection while leaving
the select-based poller the default. This mixes monkeypatched and
unpatched code and caused eventlet-based code to block every time
we connect.

According to the connect(2) man page, you can detect connect()
errors when using select():

  After select(2) indicates writability, use getsockopt(2) to read
  the SO_ERROR option at level SOL_SOCKET to determine whether
  connect() completed successfully (SO_ERROR is zero) or
  unsuccessfully (SO_ERROR is one of the usual error codes listed
  here, explaining the reason for the failure)

so this patch does that instead.

Signed-off-by: Terry Wilson <twilson@redhat.com>
---
 python/ovs/poller.py      | 35 ++---------------------------------
 python/ovs/socket_util.py |  9 ++++++---
 2 files changed, 8 insertions(+), 36 deletions(-)

Comments

Numan Siddique June 14, 2021, 3:47 p.m. UTC | #1
On Fri, Jun 11, 2021 at 10:30 AM Terry Wilson <twilson@redhat.com> wrote:
>
> This is a partial revert of c1aa16d19, but keeps its test.
>
> Applications that use eventlet cannot use poll() without blocking.
> To fix an issue where the check_connection_completion() code would
> not detect connection errors when using select(), we switched to
> using the system poll() for checking the connection while leaving
> the select-based poller the default. This mixes monkeypatched and
> unpatched code and caused eventlet-based code to block every time
> we connect.
>
> According to the connect(2) man page, you can detect connect()
> errors when using select():
>
>   After select(2) indicates writability, use getsockopt(2) to read
>   the SO_ERROR option at level SOL_SOCKET to determine whether
>   connect() completed successfully (SO_ERROR is zero) or
>   unsuccessfully (SO_ERROR is one of the usual error codes listed
>   here, explaining the reason for the failure)
>
> so this patch does that instead.
>
> Signed-off-by: Terry Wilson <twilson@redhat.com>

Thanks for addressing this.

LGTM

Acked-by: Numan Siddique <numans@ovn.org>

Thanks
Numan



> ---
>  python/ovs/poller.py      | 35 ++---------------------------------
>  python/ovs/socket_util.py |  9 ++++++---
>  2 files changed, 8 insertions(+), 36 deletions(-)
>
> diff --git a/python/ovs/poller.py b/python/ovs/poller.py
> index 3624ec865..7b3238d6d 100644
> --- a/python/ovs/poller.py
> +++ b/python/ovs/poller.py
> @@ -31,22 +31,14 @@ except ImportError:
>      SSL = None
>
>  try:
> -    from eventlet import patcher as eventlet_patcher
> +    import eventlet.patcher
>
>      def _using_eventlet_green_select():
> -        return eventlet_patcher.is_monkey_patched(select)
> +        return eventlet.patcher.is_monkey_patched(select)
>  except:
> -    eventlet_patcher = None
> -
>      def _using_eventlet_green_select():
>          return False
>
> -try:
> -    from gevent import monkey as gevent_monkey
> -except:
> -    gevent_monkey = None
> -
> -
>  vlog = ovs.vlog.Vlog("poller")
>
>  POLLIN = 0x001
> @@ -265,26 +257,3 @@ class Poller(object):
>      def __reset(self):
>          self.poll = SelectPoll()
>          self.timeout = -1
> -
> -
> -def get_system_poll():
> -    """Returns the original select.poll() object. If select.poll is
> -    monkey patched by eventlet or gevent library, it gets the original
> -    select.poll and returns an object of it using the
> -    eventlet.patcher.original/gevent.monkey.get_original functions.
> -
> -    As a last resort, if there is any exception it returns the
> -    SelectPoll() object.
> -    """
> -    try:
> -        if _using_eventlet_green_select():
> -            _system_poll = eventlet_patcher.original("select").poll
> -        elif gevent_monkey and gevent_monkey.is_object_patched(
> -                'select', 'poll'):
> -            _system_poll = gevent_monkey.get_original('select', 'poll')
> -        else:
> -            _system_poll = select.poll
> -    except:
> -        _system_poll = SelectPoll
> -
> -    return _system_poll()
> diff --git a/python/ovs/socket_util.py b/python/ovs/socket_util.py
> index 3faa64e9d..7dd000916 100644
> --- a/python/ovs/socket_util.py
> +++ b/python/ovs/socket_util.py
> @@ -159,8 +159,8 @@ def make_unix_socket(style, nonblock, bind_path, connect_path, short=False):
>
>
>  def check_connection_completion(sock):
> +    p = ovs.poller.SelectPoll()
>      if sys.platform == "win32":
> -        p = ovs.poller.SelectPoll()
>          event = winutils.get_new_event(None, False, True, None)
>          # Receive notification of readiness for writing, of completed
>          # connection or multipoint join operation, and of socket closure.
> @@ -170,12 +170,15 @@ def check_connection_completion(sock):
>                                   win32file.FD_CLOSE)
>          p.register(event, ovs.poller.POLLOUT)
>      else:
> -        p = ovs.poller.get_system_poll()
>          p.register(sock, ovs.poller.POLLOUT)
>      pfds = p.poll(0)
>      if len(pfds) == 1:
>          revents = pfds[0][1]
> -        if revents & ovs.poller.POLLERR or revents & ovs.poller.POLLHUP:
> +        if revents & ovs.poller.POLLOUT:
> +            # This is how select() indicates a connect() error
> +            return sock.getsockopt(socket.SOL_SOCKET, socket.SO_ERROR)
> +        elif revents & ovs.poller.POLLERR or revents & ovs.poller.POLLHUP:
> +            # This is how poll() indicates a connect() error
>              try:
>                  # The following should raise an exception.
>                  sock.send("\0".encode(), socket.MSG_DONTWAIT)
> --
> 2.26.3
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Ilya Maximets June 15, 2021, 8:06 p.m. UTC | #2
On 6/11/21 4:29 PM, Terry Wilson wrote:
> This is a partial revert of c1aa16d19, but keeps its test.
> 
> Applications that use eventlet cannot use poll() without blocking.
> To fix an issue where the check_connection_completion() code would
> not detect connection errors when using select(), we switched to
> using the system poll() for checking the connection while leaving
> the select-based poller the default. This mixes monkeypatched and
> unpatched code and caused eventlet-based code to block every time
> we connect.
> 
> According to the connect(2) man page, you can detect connect()
> errors when using select():
> 
>   After select(2) indicates writability, use getsockopt(2) to read
>   the SO_ERROR option at level SOL_SOCKET to determine whether
>   connect() completed successfully (SO_ERROR is zero) or
>   unsuccessfully (SO_ERROR is one of the usual error codes listed
>   here, explaining the reason for the failure)
> 
> so this patch does that instead.
> 
> Signed-off-by: Terry Wilson <twilson@redhat.com>
> ---
>  python/ovs/poller.py      | 35 ++---------------------------------
>  python/ovs/socket_util.py |  9 ++++++---
>  2 files changed, 8 insertions(+), 36 deletions(-)
> 
> diff --git a/python/ovs/poller.py b/python/ovs/poller.py
> index 3624ec865..7b3238d6d 100644
> --- a/python/ovs/poller.py
> +++ b/python/ovs/poller.py
> @@ -31,22 +31,14 @@ except ImportError:
>      SSL = None
>  
>  try:
> -    from eventlet import patcher as eventlet_patcher
> +    import eventlet.patcher
>  
>      def _using_eventlet_green_select():
> -        return eventlet_patcher.is_monkey_patched(select)
> +        return eventlet.patcher.is_monkey_patched(select)
>  except:
> -    eventlet_patcher = None
> -
>      def _using_eventlet_green_select():
>          return False
>  
> -try:
> -    from gevent import monkey as gevent_monkey
> -except:
> -    gevent_monkey = None
> -
> -
>  vlog = ovs.vlog.Vlog("poller")
>  
>  POLLIN = 0x001
> @@ -265,26 +257,3 @@ class Poller(object):
>      def __reset(self):
>          self.poll = SelectPoll()
>          self.timeout = -1
> -
> -
> -def get_system_poll():
> -    """Returns the original select.poll() object. If select.poll is
> -    monkey patched by eventlet or gevent library, it gets the original
> -    select.poll and returns an object of it using the
> -    eventlet.patcher.original/gevent.monkey.get_original functions.
> -
> -    As a last resort, if there is any exception it returns the
> -    SelectPoll() object.
> -    """
> -    try:
> -        if _using_eventlet_green_select():
> -            _system_poll = eventlet_patcher.original("select").poll
> -        elif gevent_monkey and gevent_monkey.is_object_patched(
> -                'select', 'poll'):
> -            _system_poll = gevent_monkey.get_original('select', 'poll')
> -        else:
> -            _system_poll = select.poll
> -    except:
> -        _system_poll = SelectPoll
> -
> -    return _system_poll()
> diff --git a/python/ovs/socket_util.py b/python/ovs/socket_util.py
> index 3faa64e9d..7dd000916 100644
> --- a/python/ovs/socket_util.py
> +++ b/python/ovs/socket_util.py
> @@ -159,8 +159,8 @@ def make_unix_socket(style, nonblock, bind_path, connect_path, short=False):
>  
>  
>  def check_connection_completion(sock):
> +    p = ovs.poller.SelectPoll()
>      if sys.platform == "win32":
> -        p = ovs.poller.SelectPoll()
>          event = winutils.get_new_event(None, False, True, None)
>          # Receive notification of readiness for writing, of completed
>          # connection or multipoint join operation, and of socket closure.
> @@ -170,12 +170,15 @@ def check_connection_completion(sock):
>                                   win32file.FD_CLOSE)
>          p.register(event, ovs.poller.POLLOUT)
>      else:
> -        p = ovs.poller.get_system_poll()
>          p.register(sock, ovs.poller.POLLOUT)
>      pfds = p.poll(0)
>      if len(pfds) == 1:
>          revents = pfds[0][1]
> -        if revents & ovs.poller.POLLERR or revents & ovs.poller.POLLHUP:
> +        if revents & ovs.poller.POLLOUT:
> +            # This is how select() indicates a connect() error

This wording doesn't sound correct to me as it might be an error and might
be not, IIUC.  POLLOUT here just means that something happened.  It might
be an error or the actual POLLOUT.
(Ugh... select()/poll() are so inconsistent across different platforms)

> +            return sock.getsockopt(socket.SOL_SOCKET, socket.SO_ERROR)

Looking at the history, it seems that ESX doesn't support SO_ERROR (at least
some old versions), so we need to avoid use of it.
See commit d6cedfd9d29d ("socket-util: Avoid using SO_ERROR.")

We, likely, need to treat this in the same path as error. i.e. invoke send()
and catch the exception.  However, for the POLLOUT case we should not
log and return the error if send() succeeds.  But this doesn't sound like
a good idea to send NULL bytes to everyone on success...

So, some options here:

1. Keep as is, i.e. system poll()
   --> python will be blocked by poll()

2. Fully revert c1aa16d19, i.e. just use select()
   --> connections will be treated as established even if they are not
       until the actual read/write.

3. Use select() + getsockopt(SO_ERRROR), effectively revert d6cedfd9d29d
   --> give up on systems that doesn't support SO_ERRROR. (ESX?)

4. Use select() + getsockopt(SO_ERRROR), keep returning 0 (success) on
   ENOPROTOOPT
   --> systems that doesn't support might think that connections are
       established while they are not.

5. Try to detect if SO_ERRROR supported or not and use system poll() or
   select() + getsockopt(SO_ERRROR) based on that fact.
   --> better performance on systems that supports and same as now on
       systems that doesn't.

Option 3 is, probably, OK, but I don't know.
Option 4 might be the best choice, but I'm not quiet sure how to detect.
Is checking for ENOPROTOOPT enough (same actually applies to option 3
as I'm not sure if ESX actually reports ENOPROTOOPT)?

Also, we're using select() in C code for windows.  Do we have the same
issue there?  Alin, do you know if check_connection_completion() correctly
detects connection failure on windows, e.g. if 'Check stream open block'
tests are working?

Any thoughts?

Ben, what do you think?

Best regards, Ilya Maximets.
Ilya Maximets June 16, 2021, 10:24 a.m. UTC | #3
On 6/15/21 10:06 PM, Ilya Maximets wrote:
> On 6/11/21 4:29 PM, Terry Wilson wrote:
>> This is a partial revert of c1aa16d19, but keeps its test.
>>
>> Applications that use eventlet cannot use poll() without blocking.
>> To fix an issue where the check_connection_completion() code would
>> not detect connection errors when using select(), we switched to
>> using the system poll() for checking the connection while leaving
>> the select-based poller the default. This mixes monkeypatched and
>> unpatched code and caused eventlet-based code to block every time
>> we connect.

One more question:
Why exactly this happens?  I mean, we're calling poll(0) that should
not block, why eventlet is blocked in this case?

>>
>> According to the connect(2) man page, you can detect connect()
>> errors when using select():
>>
>>   After select(2) indicates writability, use getsockopt(2) to read
>>   the SO_ERROR option at level SOL_SOCKET to determine whether
>>   connect() completed successfully (SO_ERROR is zero) or
>>   unsuccessfully (SO_ERROR is one of the usual error codes listed
>>   here, explaining the reason for the failure)
>>
>> so this patch does that instead.
>>
>> Signed-off-by: Terry Wilson <twilson@redhat.com>
>> ---
>>  python/ovs/poller.py      | 35 ++---------------------------------
>>  python/ovs/socket_util.py |  9 ++++++---
>>  2 files changed, 8 insertions(+), 36 deletions(-)
>>
>> diff --git a/python/ovs/poller.py b/python/ovs/poller.py
>> index 3624ec865..7b3238d6d 100644
>> --- a/python/ovs/poller.py
>> +++ b/python/ovs/poller.py
>> @@ -31,22 +31,14 @@ except ImportError:
>>      SSL = None
>>  
>>  try:
>> -    from eventlet import patcher as eventlet_patcher
>> +    import eventlet.patcher
>>  
>>      def _using_eventlet_green_select():
>> -        return eventlet_patcher.is_monkey_patched(select)
>> +        return eventlet.patcher.is_monkey_patched(select)
>>  except:
>> -    eventlet_patcher = None
>> -
>>      def _using_eventlet_green_select():
>>          return False
>>  
>> -try:
>> -    from gevent import monkey as gevent_monkey
>> -except:
>> -    gevent_monkey = None
>> -
>> -
>>  vlog = ovs.vlog.Vlog("poller")
>>  
>>  POLLIN = 0x001
>> @@ -265,26 +257,3 @@ class Poller(object):
>>      def __reset(self):
>>          self.poll = SelectPoll()
>>          self.timeout = -1
>> -
>> -
>> -def get_system_poll():
>> -    """Returns the original select.poll() object. If select.poll is
>> -    monkey patched by eventlet or gevent library, it gets the original
>> -    select.poll and returns an object of it using the
>> -    eventlet.patcher.original/gevent.monkey.get_original functions.
>> -
>> -    As a last resort, if there is any exception it returns the
>> -    SelectPoll() object.
>> -    """
>> -    try:
>> -        if _using_eventlet_green_select():
>> -            _system_poll = eventlet_patcher.original("select").poll
>> -        elif gevent_monkey and gevent_monkey.is_object_patched(
>> -                'select', 'poll'):
>> -            _system_poll = gevent_monkey.get_original('select', 'poll')
>> -        else:
>> -            _system_poll = select.poll
>> -    except:
>> -        _system_poll = SelectPoll
>> -
>> -    return _system_poll()
>> diff --git a/python/ovs/socket_util.py b/python/ovs/socket_util.py
>> index 3faa64e9d..7dd000916 100644
>> --- a/python/ovs/socket_util.py
>> +++ b/python/ovs/socket_util.py
>> @@ -159,8 +159,8 @@ def make_unix_socket(style, nonblock, bind_path, connect_path, short=False):
>>  
>>  
>>  def check_connection_completion(sock):
>> +    p = ovs.poller.SelectPoll()
>>      if sys.platform == "win32":
>> -        p = ovs.poller.SelectPoll()
>>          event = winutils.get_new_event(None, False, True, None)
>>          # Receive notification of readiness for writing, of completed
>>          # connection or multipoint join operation, and of socket closure.
>> @@ -170,12 +170,15 @@ def check_connection_completion(sock):
>>                                   win32file.FD_CLOSE)
>>          p.register(event, ovs.poller.POLLOUT)
>>      else:
>> -        p = ovs.poller.get_system_poll()
>>          p.register(sock, ovs.poller.POLLOUT)
>>      pfds = p.poll(0)
>>      if len(pfds) == 1:
>>          revents = pfds[0][1]
>> -        if revents & ovs.poller.POLLERR or revents & ovs.poller.POLLHUP:
>> +        if revents & ovs.poller.POLLOUT:
>> +            # This is how select() indicates a connect() error
> 
> This wording doesn't sound correct to me as it might be an error and might
> be not, IIUC.  POLLOUT here just means that something happened.  It might
> be an error or the actual POLLOUT.
> (Ugh... select()/poll() are so inconsistent across different platforms)
> 
>> +            return sock.getsockopt(socket.SOL_SOCKET, socket.SO_ERROR)
> 
> Looking at the history, it seems that ESX doesn't support SO_ERROR (at least
> some old versions), so we need to avoid use of it.
> See commit d6cedfd9d29d ("socket-util: Avoid using SO_ERROR.")
> 
> We, likely, need to treat this in the same path as error. i.e. invoke send()
> and catch the exception.  However, for the POLLOUT case we should not
> log and return the error if send() succeeds.  But this doesn't sound like
> a good idea to send NULL bytes to everyone on success...
> 
> So, some options here:
> 
> 1. Keep as is, i.e. system poll()
>    --> python will be blocked by poll()
> 
> 2. Fully revert c1aa16d19, i.e. just use select()
>    --> connections will be treated as established even if they are not
>        until the actual read/write.
> 
> 3. Use select() + getsockopt(SO_ERRROR), effectively revert d6cedfd9d29d
>    --> give up on systems that doesn't support SO_ERRROR. (ESX?)
> 
> 4. Use select() + getsockopt(SO_ERRROR), keep returning 0 (success) on
>    ENOPROTOOPT
>    --> systems that doesn't support might think that connections are
>        established while they are not.
> 
> 5. Try to detect if SO_ERRROR supported or not and use system poll() or
>    select() + getsockopt(SO_ERRROR) based on that fact.
>    --> better performance on systems that supports and same as now on
>        systems that doesn't.
> 
> Option 3 is, probably, OK, but I don't know.
> Option 4 might be the best choice, but I'm not quiet sure how to detect.
> Is checking for ENOPROTOOPT enough (same actually applies to option 3
> as I'm not sure if ESX actually reports ENOPROTOOPT)?
> 
> Also, we're using select() in C code for windows.  Do we have the same
> issue there?  Alin, do you know if check_connection_completion() correctly
> detects connection failure on windows, e.g. if 'Check stream open block'
> tests are working?
> 
> Any thoughts?
> 
> Ben, what do you think?
> 
> Best regards, Ilya Maximets.
>
Ben Pfaff June 22, 2021, 4:51 p.m. UTC | #4
On Tue, Jun 15, 2021 at 10:06:53PM +0200, Ilya Maximets wrote:
> 3. Use select() + getsockopt(SO_ERRROR), effectively revert d6cedfd9d29d
>    --> give up on systems that doesn't support SO_ERRROR. (ESX?)

d6cedfd9d29d is the commit titled "socket-util: Avoid using SO_ERROR."
from 2012.  This was right after the Nicira acquisition when we were
trying to port OVS to ESX.  We dropped that effort later and took a
different strategy on ESX.  You can kind of see that since there are no
references to ESX in the log from 2014 to 2019 and the one in 2019 is
commit e6fc4e2ce97e ("Remove ESX references.").

So, in short, ESX doesn't matter, go ahead and use SO_ERROR.
Ilya Maximets June 22, 2021, 5:13 p.m. UTC | #5
On 6/22/21 6:51 PM, Ben Pfaff wrote:
> On Tue, Jun 15, 2021 at 10:06:53PM +0200, Ilya Maximets wrote:
>> 3. Use select() + getsockopt(SO_ERRROR), effectively revert d6cedfd9d29d
>>    --> give up on systems that doesn't support SO_ERRROR. (ESX?)
> 
> d6cedfd9d29d is the commit titled "socket-util: Avoid using SO_ERROR."
> from 2012.  This was right after the Nicira acquisition when we were
> trying to port OVS to ESX.  We dropped that effort later and took a
> different strategy on ESX.  You can kind of see that since there are no
> references to ESX in the log from 2014 to 2019 and the one in 2019 is
> commit e6fc4e2ce97e ("Remove ESX references.").
> 
> So, in short, ESX doesn't matter, go ahead and use SO_ERROR.
> 

Good to know.  Thanks!

So, IIUC, if we'll revert following 3 commits, the issue should be fixed:

1. c1aa16d191d2 ("ovs python: ovs.stream.open_block() returns success even if the remote is unreachable")
2. 24f974c481bc ("socket-util: Remove get_socket_error().")
3. d6cedfd9d29d ("socket-util: Avoid using SO_ERROR.")

We should keep test cases from the commit #1, though.

This can be done as a single patch that reverts 3 above with an
explanation, why we're doing that.

Terry, what do you think?

Best regards, Ilya Maximets.
Terry Wilson July 1, 2021, 6:27 a.m. UTC | #6
Sounds good to me. As you said before, the call to poll(0) itself doesn't
appear to block eventlet. Further inspection showed that the issue that we
were seeing was actually caused by cpu-bound code in python-ovs that was
blocking eventlet on connection in __do_parse_update() and, if not using
the c json extension, Parser.feed(). The short story is that eventlet
recommends putting time.sleep(0) calls in cpu-heavy code without I/O to
cooperatively yield to eventlet greenthreads. I'll post something with a
little  more analysis and a patch tomorrow when I'm not typing on my phone.

With that said, I do still like the SO_ERROR code better than hacking
around select/poll for eventlet.



On Tue, Jun 22, 2021, 12:13 PM Ilya Maximets <i.maximets@ovn.org> wrote:

> On 6/22/21 6:51 PM, Ben Pfaff wrote:
> > On Tue, Jun 15, 2021 at 10:06:53PM +0200, Ilya Maximets wrote:
> >> 3. Use select() + getsockopt(SO_ERRROR), effectively revert d6cedfd9d29d
> >>    --> give up on systems that doesn't support SO_ERRROR. (ESX?)
> >
> > d6cedfd9d29d is the commit titled "socket-util: Avoid using SO_ERROR."
> > from 2012.  This was right after the Nicira acquisition when we were
> > trying to port OVS to ESX.  We dropped that effort later and took a
> > different strategy on ESX.  You can kind of see that since there are no
> > references to ESX in the log from 2014 to 2019 and the one in 2019 is
> > commit e6fc4e2ce97e ("Remove ESX references.").
> >
> > So, in short, ESX doesn't matter, go ahead and use SO_ERROR.
> >
>
> Good to know.  Thanks!
>
> So, IIUC, if we'll revert following 3 commits, the issue should be fixed:
>
> 1. c1aa16d191d2 ("ovs python: ovs.stream.open_block() returns success even
> if the remote is unreachable")
> 2. 24f974c481bc ("socket-util: Remove get_socket_error().")
> 3. d6cedfd9d29d ("socket-util: Avoid using SO_ERROR.")
>
> We should keep test cases from the commit #1, though.
>
> This can be done as a single patch that reverts 3 above with an
> explanation, why we're doing that.
>
> Terry, what do you think?
>
> Best regards, Ilya Maximets.
>
>
Ilya Maximets July 16, 2021, 2:29 p.m. UTC | #7
On 7/1/21 8:27 AM, Terry Wilson wrote:
> Sounds good to me. As you said before, the call to poll(0) itself doesn't appear to block eventlet. Further inspection showed that the issue that we were seeing was actually caused by cpu-bound code in python-ovs that was blocking eventlet on connection in __do_parse_update() and, if not using the c json extension, Parser.feed(). The short story is that eventlet recommends putting time.sleep(0) calls in cpu-heavy code without I/O to cooperatively yield to eventlet greenthreads. I'll post something with a little  more analysis and a patch tomorrow when I'm not typing on my phone.
> 
> With that said, I do still like the SO_ERROR code better than hacking around select/poll for eventlet.

Thanks for getting to the bottom of the issue!
Since it's not essential to revert these patches right now,
I'll keep them.  In case we'll find another issue with this
code in the future, we'll know what to do. :)

> 
> 
> 
> On Tue, Jun 22, 2021, 12:13 PM Ilya Maximets <i.maximets@ovn.org <mailto:i.maximets@ovn.org>> wrote:
> 
>     On 6/22/21 6:51 PM, Ben Pfaff wrote:
>     > On Tue, Jun 15, 2021 at 10:06:53PM +0200, Ilya Maximets wrote:
>     >> 3. Use select() + getsockopt(SO_ERRROR), effectively revert d6cedfd9d29d
>     >>    --> give up on systems that doesn't support SO_ERRROR. (ESX?)
>     >
>     > d6cedfd9d29d is the commit titled "socket-util: Avoid using SO_ERROR."
>     > from 2012.  This was right after the Nicira acquisition when we were
>     > trying to port OVS to ESX.  We dropped that effort later and took a
>     > different strategy on ESX.  You can kind of see that since there are no
>     > references to ESX in the log from 2014 to 2019 and the one in 2019 is
>     > commit e6fc4e2ce97e ("Remove ESX references.").
>     >
>     > So, in short, ESX doesn't matter, go ahead and use SO_ERROR.
>     >
> 
>     Good to know.  Thanks!
> 
>     So, IIUC, if we'll revert following 3 commits, the issue should be fixed:
> 
>     1. c1aa16d191d2 ("ovs python: ovs.stream.open_block() returns success even if the remote is unreachable")
>     2. 24f974c481bc ("socket-util: Remove get_socket_error().")
>     3. d6cedfd9d29d ("socket-util: Avoid using SO_ERROR.")
> 
>     We should keep test cases from the commit #1, though.
> 
>     This can be done as a single patch that reverts 3 above with an
>     explanation, why we're doing that.
> 
>     Terry, what do you think?
> 
>     Best regards, Ilya Maximets.
>
diff mbox series

Patch

diff --git a/python/ovs/poller.py b/python/ovs/poller.py
index 3624ec865..7b3238d6d 100644
--- a/python/ovs/poller.py
+++ b/python/ovs/poller.py
@@ -31,22 +31,14 @@  except ImportError:
     SSL = None
 
 try:
-    from eventlet import patcher as eventlet_patcher
+    import eventlet.patcher
 
     def _using_eventlet_green_select():
-        return eventlet_patcher.is_monkey_patched(select)
+        return eventlet.patcher.is_monkey_patched(select)
 except:
-    eventlet_patcher = None
-
     def _using_eventlet_green_select():
         return False
 
-try:
-    from gevent import monkey as gevent_monkey
-except:
-    gevent_monkey = None
-
-
 vlog = ovs.vlog.Vlog("poller")
 
 POLLIN = 0x001
@@ -265,26 +257,3 @@  class Poller(object):
     def __reset(self):
         self.poll = SelectPoll()
         self.timeout = -1
-
-
-def get_system_poll():
-    """Returns the original select.poll() object. If select.poll is
-    monkey patched by eventlet or gevent library, it gets the original
-    select.poll and returns an object of it using the
-    eventlet.patcher.original/gevent.monkey.get_original functions.
-
-    As a last resort, if there is any exception it returns the
-    SelectPoll() object.
-    """
-    try:
-        if _using_eventlet_green_select():
-            _system_poll = eventlet_patcher.original("select").poll
-        elif gevent_monkey and gevent_monkey.is_object_patched(
-                'select', 'poll'):
-            _system_poll = gevent_monkey.get_original('select', 'poll')
-        else:
-            _system_poll = select.poll
-    except:
-        _system_poll = SelectPoll
-
-    return _system_poll()
diff --git a/python/ovs/socket_util.py b/python/ovs/socket_util.py
index 3faa64e9d..7dd000916 100644
--- a/python/ovs/socket_util.py
+++ b/python/ovs/socket_util.py
@@ -159,8 +159,8 @@  def make_unix_socket(style, nonblock, bind_path, connect_path, short=False):
 
 
 def check_connection_completion(sock):
+    p = ovs.poller.SelectPoll()
     if sys.platform == "win32":
-        p = ovs.poller.SelectPoll()
         event = winutils.get_new_event(None, False, True, None)
         # Receive notification of readiness for writing, of completed
         # connection or multipoint join operation, and of socket closure.
@@ -170,12 +170,15 @@  def check_connection_completion(sock):
                                  win32file.FD_CLOSE)
         p.register(event, ovs.poller.POLLOUT)
     else:
-        p = ovs.poller.get_system_poll()
         p.register(sock, ovs.poller.POLLOUT)
     pfds = p.poll(0)
     if len(pfds) == 1:
         revents = pfds[0][1]
-        if revents & ovs.poller.POLLERR or revents & ovs.poller.POLLHUP:
+        if revents & ovs.poller.POLLOUT:
+            # This is how select() indicates a connect() error
+            return sock.getsockopt(socket.SOL_SOCKET, socket.SO_ERROR)
+        elif revents & ovs.poller.POLLERR or revents & ovs.poller.POLLHUP:
+            # This is how poll() indicates a connect() error
             try:
                 # The following should raise an exception.
                 sock.send("\0".encode(), socket.MSG_DONTWAIT)