diff mbox series

[1/2] cifs: change wait_for_free_request to take number of requests to wait for

Message ID 20190220061145.28241-2-lsahlber@redhat.com
State New
Headers show
Series Simplify handling of credits in compound_send_recv() | expand

Commit Message

Ronnie Sahlberg Feb. 20, 2019, 6:11 a.m. UTC
Change wait_for_free_request to allow specifying how many requests(credits)
to wait for.
Pass 1 from all callsites.

This should not change any behavior but will allow future callers to
atomically wait for >1 requests.

Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
---
 fs/cifs/cifsglob.h  |  4 ++--
 fs/cifs/smb2ops.c   |  2 +-
 fs/cifs/transport.c | 25 ++++++++++++++-----------
 3 files changed, 17 insertions(+), 14 deletions(-)

Comments

Pavel Shilovsky Feb. 21, 2019, 1:39 a.m. UTC | #1
вт, 19 февр. 2019 г. в 22:12, Ronnie Sahlberg <lsahlber@redhat.com>:
>
> Change wait_for_free_request to allow specifying how many requests(credits)
> to wait for.
> Pass 1 from all callsites.
>
> This should not change any behavior but will allow future callers to
> atomically wait for >1 requests.
>
> Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
> ---
>  fs/cifs/cifsglob.h  |  4 ++--
>  fs/cifs/smb2ops.c   |  2 +-
>  fs/cifs/transport.c | 25 ++++++++++++++-----------
>  3 files changed, 17 insertions(+), 14 deletions(-)
>
> diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
> index 5bf463c2d9dc..1b3f62eee4d0 100644
> --- a/fs/cifs/cifsglob.h
> +++ b/fs/cifs/cifsglob.h
> @@ -732,13 +732,13 @@ in_flight(struct TCP_Server_Info *server)
>  }
>
>  static inline bool
> -has_credits(struct TCP_Server_Info *server, int *credits)
> +has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
>  {
>         int num;
>         spin_lock(&server->req_lock);
>         num = *credits;
>         spin_unlock(&server->req_lock);
> -       return num > 0;
> +       return num >= num_credits;
>  }
>
>  static inline void
> diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
> index 080929a64c64..372a3563b79b 100644
> --- a/fs/cifs/smb2ops.c
> +++ b/fs/cifs/smb2ops.c
> @@ -185,7 +185,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
>                         spin_unlock(&server->req_lock);
>                         cifs_num_waiters_inc(server);
>                         rc = wait_event_killable(server->request_q,
> -                                       has_credits(server, &server->credits));
> +                                has_credits(server, &server->credits, 1));
>                         cifs_num_waiters_dec(server);
>                         if (rc)
>                                 return rc;
> diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
> index d4f1224fb7cb..8ce5b6e787c6 100644
> --- a/fs/cifs/transport.c
> +++ b/fs/cifs/transport.c
> @@ -478,7 +478,8 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
>
>  static int
>  wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
> -                     int *credits, unsigned int *instance)
> +                     int num_credits, int *credits,
> +                     unsigned int *instance)
>  {
>         int rc;
>
> @@ -495,11 +496,11 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
>         }
>
>         while (1) {
> -               if (*credits <= 0) {
> +               if (*credits < num_credits) {
>                         spin_unlock(&server->req_lock);
>                         cifs_num_waiters_inc(server);
>                         rc = wait_event_killable(server->request_q,
> -                                                has_credits(server, credits));
> +                                has_credits(server, credits, num_credits));
>                         cifs_num_waiters_dec(server);
>                         if (rc)
>                                 return rc;
> @@ -517,8 +518,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
>
>                         /* update # of requests on the wire to server */
>                         if (timeout != CIFS_BLOCKING_OP) {
> -                               *credits -= 1;
> -                               server->in_flight++;
> +                               *credits -= num_credits;
> +                               server->in_flight += num_credits;
>                                 *instance = server->reconnect_instance;
>                         }
>                         spin_unlock(&server->req_lock);
> @@ -530,7 +531,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
>
>  static int
>  wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
> -                     const int optype, unsigned int *instance)
> +                     int num, const int optype, unsigned int *instance)

Waiting for more than one credit without a timeout is a source of
possibility to be stuck. We already have 'timeout' argument which
contains some part of flags passed to transport layer from PDU. We
might consider renaming 'optype' argument into flags and use 'timeout'
with its true meaning - timeout of the waiting for credits.

Once implemented, we can define some default period of time which we
can *afford* to wait for credits and then fall back to sequential code
if we timed out.

I also suggest to add another function wait_for_compound_credits and
make wait_for_free_request calls it - doesn't require changes to all
the places where the latter is being used.

>  {
>         int *val;
>
> @@ -538,7 +539,7 @@ wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
>         /* Since an echo is already inflight, no need to wait to send another */
>         if (*val <= 0 && optype == CIFS_ECHO_OP)
>                 return -EAGAIN;
> -       return wait_for_free_credits(server, timeout, val, instance);
> +       return wait_for_free_credits(server, timeout, num, val, instance);
>  }
>
>  int
> @@ -646,7 +647,8 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
>         optype = flags & CIFS_OP_MASK;
>
>         if ((flags & CIFS_HAS_CREDITS) == 0) {
> -               rc = wait_for_free_request(server, timeout, optype, &instance);
> +               rc = wait_for_free_request(server, timeout, 1, optype,
> +                                          &instance);
>                 if (rc)
>                         return rc;
>                 credits.value = 1;
> @@ -923,7 +925,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
>          * Ensure we obtain 1 credit per request in the compound chain.
>          */
>         for (i = 0; i < num_rqst; i++) {
> -               rc = wait_for_free_request(ses->server, timeout, optype,
> +               rc = wait_for_free_request(ses->server, timeout, 1, optype,
>                                            &instance);
>
>                 if (rc == 0) {
> @@ -1212,7 +1214,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
>                 return -EIO;
>         }
>
> -       rc = wait_for_free_request(ses->server, timeout, 0, &credits.instance);
> +       rc = wait_for_free_request(ses->server, timeout, 1, 0,
> +                                  &credits.instance);
>         if (rc)
>                 return rc;
>
> @@ -1354,7 +1357,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
>                 return -EIO;
>         }
>
> -       rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0,
> +       rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 1, 0,
>                                    &instance);
>         if (rc)
>                 return rc;
> --
> 2.13.6
>

--
Best regards,
Pavel Shilovsky
ronnie sahlberg Feb. 28, 2019, 5:12 a.m. UTC | #2
On Thu, Feb 21, 2019 at 11:39 AM Pavel Shilovsky <piastryyy@gmail.com> wrote:
>
> вт, 19 февр. 2019 г. в 22:12, Ronnie Sahlberg <lsahlber@redhat.com>:
> >
> > Change wait_for_free_request to allow specifying how many requests(credits)
> > to wait for.
> > Pass 1 from all callsites.
> >
> > This should not change any behavior but will allow future callers to
> > atomically wait for >1 requests.
> >
> > Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
> > ---
> >  fs/cifs/cifsglob.h  |  4 ++--
> >  fs/cifs/smb2ops.c   |  2 +-
> >  fs/cifs/transport.c | 25 ++++++++++++++-----------
> >  3 files changed, 17 insertions(+), 14 deletions(-)
> >
> > diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
> > index 5bf463c2d9dc..1b3f62eee4d0 100644
> > --- a/fs/cifs/cifsglob.h
> > +++ b/fs/cifs/cifsglob.h
> > @@ -732,13 +732,13 @@ in_flight(struct TCP_Server_Info *server)
> >  }
> >
> >  static inline bool
> > -has_credits(struct TCP_Server_Info *server, int *credits)
> > +has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
> >  {
> >         int num;
> >         spin_lock(&server->req_lock);
> >         num = *credits;
> >         spin_unlock(&server->req_lock);
> > -       return num > 0;
> > +       return num >= num_credits;
> >  }
> >
> >  static inline void
> > diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
> > index 080929a64c64..372a3563b79b 100644
> > --- a/fs/cifs/smb2ops.c
> > +++ b/fs/cifs/smb2ops.c
> > @@ -185,7 +185,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
> >                         spin_unlock(&server->req_lock);
> >                         cifs_num_waiters_inc(server);
> >                         rc = wait_event_killable(server->request_q,
> > -                                       has_credits(server, &server->credits));
> > +                                has_credits(server, &server->credits, 1));
> >                         cifs_num_waiters_dec(server);
> >                         if (rc)
> >                                 return rc;
> > diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
> > index d4f1224fb7cb..8ce5b6e787c6 100644
> > --- a/fs/cifs/transport.c
> > +++ b/fs/cifs/transport.c
> > @@ -478,7 +478,8 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
> >
> >  static int
> >  wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
> > -                     int *credits, unsigned int *instance)
> > +                     int num_credits, int *credits,
> > +                     unsigned int *instance)
> >  {
> >         int rc;
> >
> > @@ -495,11 +496,11 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
> >         }
> >
> >         while (1) {
> > -               if (*credits <= 0) {
> > +               if (*credits < num_credits) {
> >                         spin_unlock(&server->req_lock);
> >                         cifs_num_waiters_inc(server);
> >                         rc = wait_event_killable(server->request_q,
> > -                                                has_credits(server, credits));
> > +                                has_credits(server, credits, num_credits));
> >                         cifs_num_waiters_dec(server);
> >                         if (rc)
> >                                 return rc;
> > @@ -517,8 +518,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
> >
> >                         /* update # of requests on the wire to server */
> >                         if (timeout != CIFS_BLOCKING_OP) {
> > -                               *credits -= 1;
> > -                               server->in_flight++;
> > +                               *credits -= num_credits;
> > +                               server->in_flight += num_credits;
> >                                 *instance = server->reconnect_instance;
> >                         }
> >                         spin_unlock(&server->req_lock);
> > @@ -530,7 +531,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
> >
> >  static int
> >  wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
> > -                     const int optype, unsigned int *instance)
> > +                     int num, const int optype, unsigned int *instance)
>
> Waiting for more than one credit without a timeout is a source of
> possibility to be stuck. We already have 'timeout' argument which
> contains some part of flags passed to transport layer from PDU. We
> might consider renaming 'optype' argument into flags and use 'timeout'
> with its true meaning - timeout of the waiting for credits.
>
> Once implemented, we can define some default period of time which we
> can *afford* to wait for credits and then fall back to sequential code
> if we timed out.
>
> I also suggest to add another function wait_for_compound_credits and
> make wait_for_free_request calls it - doesn't require changes to all
> the places where the latter is being used.

Thanks, I am reworking it and will resend shortly.
Passing flags down into these functions instead of timeout/optype
makes sense and I will do that.

I am not sure we need to add a timeout to represent a deadline after
we return -EAGAIN or similar.
But for sure we need to prevent waiting atomically for >1 credits to
block forever.
This could happen in two scenarios, the first would be if the number
of free credits are persistently pegged at <=1 credits
due to persistent and massive number of threads causing single credits
SMB2 I/O.
I.e. we are constantly starved for multi-credit requests.

The second would be if we get stuck at <=1 available credits even when
the session is idle.
This would be a bug and the best is probably to let the reconnect
eventually happen and thus force the credits to be in sync again
between server and client.
So I don't think we need t do anything explicit for that case here.

The first case is a genuine case, but I think we can handle that in a
different way instead of switching back to a "grab one credit at a
time" like we have now.
In that scenario we would have a different problem that is  there
could be so many concurrent calls for compounded operations requiring
>=3 credits each
that we eventually consume MAX_CREDITS number of credits across these
threads. Each thread having already allocated one of more credits less
that it needs for the full compound
and the the threads are now all deadlocked each waiting for one or
more credits that will never become available.
Making the allocation of credits for a compound allocate all of them
atomically solves this type of deadlock, but would be prone to the
credits starvation issue.

The way I propose to solve the credits starvation problem for
compounds is something like this (I haven't coded it yet so I dont
know exactly what it will look like)
* After the session has been fully established, we should be able to
assume we WILL have a whole bunch of credits available.
Modulo a hypothetical situation where the sever will just never grant
us more than 1 or 2 credits in total. I don't think this is a
reasonable situation we need to worry too much about and maybe the
right thing to do here is just to block all compound
operations until we get a session reconnect where hopefully the server
will be more reasonable.
* We can define a new MAX_CREDITS_FOR_COMPOUND which would be the
largest number of credits that we will ever create a single compound
for.
I think that would be 4 at this stage.
* In the loop in wait_for_free_credits(), If this is a request for a
single credit AND iff it is a normal operation, i.e. &CIFS_OP_MASK ==
0
then we will NOT grant the single credit but continue looping.

I.e. basically reserving the last MAX_CREDITS_FOR_COMPOUND credits
before we completely run out to only be available for compounds.
That way we do guarantee that compound requests will be making
progress and will not be completely starved by non-compound requests..



>
> >  {
> >         int *val;
> >
> > @@ -538,7 +539,7 @@ wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
> >         /* Since an echo is already inflight, no need to wait to send another */
> >         if (*val <= 0 && optype == CIFS_ECHO_OP)
> >                 return -EAGAIN;
> > -       return wait_for_free_credits(server, timeout, val, instance);
> > +       return wait_for_free_credits(server, timeout, num, val, instance);
> >  }
> >
> >  int
> > @@ -646,7 +647,8 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
> >         optype = flags & CIFS_OP_MASK;
> >
> >         if ((flags & CIFS_HAS_CREDITS) == 0) {
> > -               rc = wait_for_free_request(server, timeout, optype, &instance);
> > +               rc = wait_for_free_request(server, timeout, 1, optype,
> > +                                          &instance);
> >                 if (rc)
> >                         return rc;
> >                 credits.value = 1;
> > @@ -923,7 +925,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
> >          * Ensure we obtain 1 credit per request in the compound chain.
> >          */
> >         for (i = 0; i < num_rqst; i++) {
> > -               rc = wait_for_free_request(ses->server, timeout, optype,
> > +               rc = wait_for_free_request(ses->server, timeout, 1, optype,
> >                                            &instance);
> >
> >                 if (rc == 0) {
> > @@ -1212,7 +1214,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
> >                 return -EIO;
> >         }
> >
> > -       rc = wait_for_free_request(ses->server, timeout, 0, &credits.instance);
> > +       rc = wait_for_free_request(ses->server, timeout, 1, 0,
> > +                                  &credits.instance);
> >         if (rc)
> >                 return rc;
> >
> > @@ -1354,7 +1357,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
> >                 return -EIO;
> >         }
> >
> > -       rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0,
> > +       rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 1, 0,
> >                                    &instance);
> >         if (rc)
> >                 return rc;
> > --
> > 2.13.6
> >
>
> --
> Best regards,
> Pavel Shilovsky
Pavel Shilovsky March 6, 2019, 2:15 a.m. UTC | #3
On Wed, Feb 27, 2019 at 09:12 PM, ronnie sahlberg
<ronniesahlberg@gmail.com> wrote:
>
> On Thu, Feb 21, 2019 at 11:39 AM Pavel Shilovsky <piastryyy@gmail.com> wrote:
> >
...
> >
> > Waiting for more than one credit without a timeout is a source of
> > possibility to be stuck. We already have 'timeout' argument which
> > contains some part of flags passed to transport layer from PDU. We
> > might consider renaming 'optype' argument into flags and use 'timeout'
> > with its true meaning - timeout of the waiting for credits.
> >
> > Once implemented, we can define some default period of time which we
> > can *afford* to wait for credits and then fall back to sequential code
> > if we timed out.
> >
> > I also suggest to add another function wait_for_compound_credits and
> > make wait_for_free_request calls it - doesn't require changes to all
> > the places where the latter is being used.
>
> Thanks, I am reworking it and will resend shortly.
> Passing flags down into these functions instead of timeout/optype
> makes sense and I will do that.
>
> I am not sure we need to add a timeout to represent a deadline after
> we return -EAGAIN or similar.
> But for sure we need to prevent waiting atomically for >1 credits to
> block forever.

(adding Jeremy to comment and cc'ing samba-technical)

Thanks for taking a look at this.

If we don't set a timeout on waiting for several credits at once, we
may end up waiting forever. It is only a matter of what is a
possibility to be stuck. We can try to guess if the server is going to
grant us more or not in future by analyzing the number of requests in
flight or other parameters but we can't say for sure because the spec
doesn't say anything about minimal number of credits. This means that
under certain conditions the server may end up giving us a very few
credits that won't be enough to do compounding.

Those conditions might be different but from the top of the head I can
give an example of a highly loaded file server with thousands of
active connections that experiences CPU and/or memory pressure and
decides to limit the number of credits per connection. That's why I do
think that the ability to set a timeout on waiting is essential. We
can start with something between 100ms and 1s (and even making it
auto-tunable - let's say a couple round trips to the server) and then
decide what is the best (or even make it configurable).

> This could happen in two scenarios, the first would be if the number
> of free credits are persistently pegged at <=1 credits
> due to persistent and massive number of threads causing single credits
> SMB2 I/O.
> I.e. we are constantly starved for multi-credit requests.
>
> The second would be if we get stuck at <=1 available credits even when
> the session is idle.
> This would be a bug and the best is probably to let the reconnect
> eventually happen and thus force the credits to be in sync again
> between server and client.
> So I don't think we need t do anything explicit for that case here.
>
> The first case is a genuine case, but I think we can handle that in a
> different way instead of switching back to a "grab one credit at a
> time" like we have now.
> In that scenario we would have a different problem that is  there
> could be so many concurrent calls for compounded operations requiring
> >=3 credits each
> that we eventually consume MAX_CREDITS number of credits across these
> threads. Each thread having already allocated one of more credits less
> that it needs for the full compound
> and the the threads are now all deadlocked each waiting for one or
> more credits that will never become available.
> Making the allocation of credits for a compound allocate all of them
> atomically solves this type of deadlock, but would be prone to the
> credits starvation issue.

The approach of grabbing one credit in a time was a temporary work
around that didn't require a lot of code changes and was easier to be
pushed to stable kernels. There are no doubts that it left a
possibility to be stuck and we need a proper solution to handle
compound requests.

> The way I propose to solve the credits starvation problem for
> compounds is something like this (I haven't coded it yet so I dont
> know exactly what it will look like)
> * After the session has been fully established, we should be able to
> assume we WILL have a whole bunch of credits available.
> Modulo a hypothetical situation where the sever will just never grant
> us more than 1 or 2 credits in total. I don't think this is a
> reasonable situation we need to worry too much about and maybe the
> right thing to do here is just to block all compound
> operations until we get a session reconnect where hopefully the server
> will be more reasonable.
> * We can define a new MAX_CREDITS_FOR_COMPOUND which would be the
> largest number of credits that we will ever create a single compound
> for.
> I think that would be 4 at this stage.

Note, that we reserve 1 credit for echo request and 1 credit for
oplock/lease break ack, so those operations won't be stuck waiting for
credits. If the maximum number of requests in the compounded chain is
4, then 5 credits granted by the server will make our client useless
for certain operations.

> * In the loop in wait_for_free_credits(), If this is a request for a
> single credit AND iff it is a normal operation, i.e. &CIFS_OP_MASK ==
> 0
> then we will NOT grant the single credit but continue looping.
>
> I.e. basically reserving the last MAX_CREDITS_FOR_COMPOUND credits
> before we completely run out to only be available for compounds.
> That way we do guarantee that compound requests will be making
> progress and will not be completely starved by non-compound requests..
>

Trying to reserve MAX_CREDITS_FOR_COMPOUNDING increases latency of
single-credit requests. In some workloads compounding won't be even
used (let's say it is pure IO workload), so instead of using all the
bandwidth available to the client (and the credit bandwidth might be
pretty narrow for highly loaded servers) we will end up using only a
part of it.

But it is not only about latency of a single-credit request, it is
also about latency of a compounded request itself. Instead of waiting
for all the credits for a long period of time we can start making
progress by doing sequential operations. If the round trip time is
small, the penalty of indefinite waiting instead of going sequential
will be rather substantial. That's why waiting only for some
reasonable period of time seems like a good option to try: if we have
already waited for some reasonable time, let's stop waiting, return
special error code and let the upper layer process the request
sequentially.

So, I think that compounding is a great optimization and it is
important to try using it as much as possible across the client code.
In the same time, the SMB3 client should be capable to handle any
workload or scenario that doesn't contradict the spec.

Thoughts?


--
Best regards,
Pavel Shilovsky
ronnie sahlberg March 6, 2019, 4:36 a.m. UTC | #4
On Wed, Mar 6, 2019 at 12:15 PM Pavel Shilovsky <piastryyy@gmail.com> wrote:
>
> On Wed, Feb 27, 2019 at 09:12 PM, ronnie sahlberg
> <ronniesahlberg@gmail.com> wrote:
> >
> > On Thu, Feb 21, 2019 at 11:39 AM Pavel Shilovsky <piastryyy@gmail.com> wrote:
> > >
> ...
> > >
> > > Waiting for more than one credit without a timeout is a source of
> > > possibility to be stuck. We already have 'timeout' argument which
> > > contains some part of flags passed to transport layer from PDU. We
> > > might consider renaming 'optype' argument into flags and use 'timeout'
> > > with its true meaning - timeout of the waiting for credits.
> > >
> > > Once implemented, we can define some default period of time which we
> > > can *afford* to wait for credits and then fall back to sequential code
> > > if we timed out.
> > >
> > > I also suggest to add another function wait_for_compound_credits and
> > > make wait_for_free_request calls it - doesn't require changes to all
> > > the places where the latter is being used.
> >
> > Thanks, I am reworking it and will resend shortly.
> > Passing flags down into these functions instead of timeout/optype
> > makes sense and I will do that.
> >
> > I am not sure we need to add a timeout to represent a deadline after
> > we return -EAGAIN or similar.
> > But for sure we need to prevent waiting atomically for >1 credits to
> > block forever.
>
> (adding Jeremy to comment and cc'ing samba-technical)
>
> Thanks for taking a look at this.
>
> If we don't set a timeout on waiting for several credits at once, we
> may end up waiting forever. It is only a matter of what is a
> possibility to be stuck. We can try to guess if the server is going to
> grant us more or not in future by analyzing the number of requests in
> flight or other parameters but we can't say for sure because the spec
> doesn't say anything about minimal number of credits. This means that
> under certain conditions the server may end up giving us a very few
> credits that won't be enough to do compounding.
>
> Those conditions might be different but from the top of the head I can
> give an example of a highly loaded file server with thousands of
> active connections that experiences CPU and/or memory pressure and
> decides to limit the number of credits per connection. That's why I do
> think that the ability to set a timeout on waiting is essential. We
> can start with something between 100ms and 1s (and even making it
> auto-tunable - let's say a couple round trips to the server) and then
> decide what is the best (or even make it configurable).
>
> > This could happen in two scenarios, the first would be if the number
> > of free credits are persistently pegged at <=1 credits
> > due to persistent and massive number of threads causing single credits
> > SMB2 I/O.
> > I.e. we are constantly starved for multi-credit requests.
> >
> > The second would be if we get stuck at <=1 available credits even when
> > the session is idle.
> > This would be a bug and the best is probably to let the reconnect
> > eventually happen and thus force the credits to be in sync again
> > between server and client.
> > So I don't think we need t do anything explicit for that case here.
> >
> > The first case is a genuine case, but I think we can handle that in a
> > different way instead of switching back to a "grab one credit at a
> > time" like we have now.
> > In that scenario we would have a different problem that is  there
> > could be so many concurrent calls for compounded operations requiring
> > >=3 credits each
> > that we eventually consume MAX_CREDITS number of credits across these
> > threads. Each thread having already allocated one of more credits less
> > that it needs for the full compound
> > and the the threads are now all deadlocked each waiting for one or
> > more credits that will never become available.
> > Making the allocation of credits for a compound allocate all of them
> > atomically solves this type of deadlock, but would be prone to the
> > credits starvation issue.
>
> The approach of grabbing one credit in a time was a temporary work
> around that didn't require a lot of code changes and was easier to be
> pushed to stable kernels. There are no doubts that it left a
> possibility to be stuck and we need a proper solution to handle
> compound requests.
>
> > The way I propose to solve the credits starvation problem for
> > compounds is something like this (I haven't coded it yet so I dont
> > know exactly what it will look like)
> > * After the session has been fully established, we should be able to
> > assume we WILL have a whole bunch of credits available.
> > Modulo a hypothetical situation where the sever will just never grant
> > us more than 1 or 2 credits in total. I don't think this is a
> > reasonable situation we need to worry too much about and maybe the
> > right thing to do here is just to block all compound
> > operations until we get a session reconnect where hopefully the server
> > will be more reasonable.
> > * We can define a new MAX_CREDITS_FOR_COMPOUND which would be the
> > largest number of credits that we will ever create a single compound
> > for.
> > I think that would be 4 at this stage.
>
> Note, that we reserve 1 credit for echo request and 1 credit for
> oplock/lease break ack, so those operations won't be stuck waiting for
> credits. If the maximum number of requests in the compounded chain is
> 4, then 5 credits granted by the server will make our client useless
> for certain operations.
>
> > * In the loop in wait_for_free_credits(), If this is a request for a
> > single credit AND iff it is a normal operation, i.e. &CIFS_OP_MASK ==
> > 0
> > then we will NOT grant the single credit but continue looping.
> >
> > I.e. basically reserving the last MAX_CREDITS_FOR_COMPOUND credits
> > before we completely run out to only be available for compounds.
> > That way we do guarantee that compound requests will be making
> > progress and will not be completely starved by non-compound requests..
> >
>
> Trying to reserve MAX_CREDITS_FOR_COMPOUNDING increases latency of
> single-credit requests. In some workloads compounding won't be even
> used (let's say it is pure IO workload), so instead of using all the
> bandwidth available to the client (and the credit bandwidth might be
> pretty narrow for highly loaded servers) we will end up using only a
> part of it.
>
> But it is not only about latency of a single-credit request, it is
> also about latency of a compounded request itself. Instead of waiting
> for all the credits for a long period of time we can start making
> progress by doing sequential operations. If the round trip time is
> small, the penalty of indefinite waiting instead of going sequential
> will be rather substantial. That's why waiting only for some
> reasonable period of time seems like a good option to try: if we have
> already waited for some reasonable time, let's stop waiting, return
> special error code and let the upper layer process the request
> sequentially.
>
> So, I think that compounding is a great optimization and it is
> important to try using it as much as possible across the client code.
> In the same time, the SMB3 client should be capable to handle any
> workload or scenario that doesn't contradict the spec.
>
> Thoughts?

Thanks. I have added a, for now, fixed timeout with a rather large value and
logging a message when we give up waiting for a multi-credit request.

These messages should be good for identifying if/when this happens.

As for switching from multi-credit compound sequences to to
non-compounded serialized code,
I don't think we should do that because we would have to duplicate all
these codepaths. And only to solve
a pathological issue where performance would already be so poor that
the share is basically inoperable IMHO.


This would only be an issue IFF we have a server that for very long
timeperiods pegs us with less than
a handful of credits even on an idle session.
I think it is fair to wait very long, even indefinitely in this case.
What we might do is eventually perform a full
reconnect and hope for a miracle or log something to point we need to
re-balance our servers.


Again, I am not sure if we should switched to single threaded
un-compouded mode in that situation.
I would consider it a pathological case where performance and behavior
would have degraded so
much that the smb share is almost inoperable.

I would consider this similar to behavior where due to packetloss TCP
pegs us permanently at a congestion window of 3kb.
The solution there is not better packet-loss recovery in TCP or tuning
re-transmission timeouts but fix the underlying network issue
and prevent this pathological case from happening in the first place.


Those are my thoughts. This would be a case where we can log what is
wrong but the fix should be to prevent the server from
doing this in the first place.

regards
ronnie sahlberg

>
>
> --
> Best regards,
> Pavel Shilovsky
diff mbox series

Patch

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5bf463c2d9dc..1b3f62eee4d0 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -732,13 +732,13 @@  in_flight(struct TCP_Server_Info *server)
 }
 
 static inline bool
-has_credits(struct TCP_Server_Info *server, int *credits)
+has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
 {
 	int num;
 	spin_lock(&server->req_lock);
 	num = *credits;
 	spin_unlock(&server->req_lock);
-	return num > 0;
+	return num >= num_credits;
 }
 
 static inline void
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 080929a64c64..372a3563b79b 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -185,7 +185,7 @@  smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
 			spin_unlock(&server->req_lock);
 			cifs_num_waiters_inc(server);
 			rc = wait_event_killable(server->request_q,
-					has_credits(server, &server->credits));
+				 has_credits(server, &server->credits, 1));
 			cifs_num_waiters_dec(server);
 			if (rc)
 				return rc;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index d4f1224fb7cb..8ce5b6e787c6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -478,7 +478,8 @@  smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
 
 static int
 wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
-		      int *credits, unsigned int *instance)
+		      int num_credits, int *credits,
+		      unsigned int *instance)
 {
 	int rc;
 
@@ -495,11 +496,11 @@  wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
 	}
 
 	while (1) {
-		if (*credits <= 0) {
+		if (*credits < num_credits) {
 			spin_unlock(&server->req_lock);
 			cifs_num_waiters_inc(server);
 			rc = wait_event_killable(server->request_q,
-						 has_credits(server, credits));
+				 has_credits(server, credits, num_credits));
 			cifs_num_waiters_dec(server);
 			if (rc)
 				return rc;
@@ -517,8 +518,8 @@  wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
 
 			/* update # of requests on the wire to server */
 			if (timeout != CIFS_BLOCKING_OP) {
-				*credits -= 1;
-				server->in_flight++;
+				*credits -= num_credits;
+				server->in_flight += num_credits;
 				*instance = server->reconnect_instance;
 			}
 			spin_unlock(&server->req_lock);
@@ -530,7 +531,7 @@  wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
 
 static int
 wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
-		      const int optype, unsigned int *instance)
+		      int num, const int optype, unsigned int *instance)
 {
 	int *val;
 
@@ -538,7 +539,7 @@  wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
 	/* Since an echo is already inflight, no need to wait to send another */
 	if (*val <= 0 && optype == CIFS_ECHO_OP)
 		return -EAGAIN;
-	return wait_for_free_credits(server, timeout, val, instance);
+	return wait_for_free_credits(server, timeout, num, val, instance);
 }
 
 int
@@ -646,7 +647,8 @@  cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
 	optype = flags & CIFS_OP_MASK;
 
 	if ((flags & CIFS_HAS_CREDITS) == 0) {
-		rc = wait_for_free_request(server, timeout, optype, &instance);
+		rc = wait_for_free_request(server, timeout, 1, optype,
+					   &instance);
 		if (rc)
 			return rc;
 		credits.value = 1;
@@ -923,7 +925,7 @@  compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
 	 * Ensure we obtain 1 credit per request in the compound chain.
 	 */
 	for (i = 0; i < num_rqst; i++) {
-		rc = wait_for_free_request(ses->server, timeout, optype,
+		rc = wait_for_free_request(ses->server, timeout, 1, optype,
 					   &instance);
 
 		if (rc == 0) {
@@ -1212,7 +1214,8 @@  SendReceive(const unsigned int xid, struct cifs_ses *ses,
 		return -EIO;
 	}
 
-	rc = wait_for_free_request(ses->server, timeout, 0, &credits.instance);
+	rc = wait_for_free_request(ses->server, timeout, 1, 0,
+				   &credits.instance);
 	if (rc)
 		return rc;
 
@@ -1354,7 +1357,7 @@  SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 		return -EIO;
 	}
 
-	rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0,
+	rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 1, 0,
 				   &instance);
 	if (rc)
 		return rc;