diff mbox

scsi-generic: bugfixes for 'SCSIRequest' conversion

Message ID 1290586723-8724-1-git-send-email-nab@linux-iscsi.org
State New
Headers show

Commit Message

Nicholas A. Bellinger Nov. 24, 2010, 8:18 a.m. UTC
From: Nicholas Bellinger <nab@linux-iscsi.org>

This patch adds a handful of bugfixes for scsi-generic
that where added into:

commit a4194b3f79a85e111f000788ddec05d465748851
Author: Hannes Reinecke <hare@suse.de>
Date:   Mon Nov 22 15:39:33 2010 -0800

    scsi: Use 'SCSIRequest' directly

this includes:

*) Fix incorrect errno usage in switch() statement within
   scsi_command_complete()

*) Remove bogus scsi_command_complete() for residual case
   within scsi_read_complete()

*) Remove incorrect '-' sign from return in scsi_send_command()

Tested with .37-rc2 TCM_Loop FILEIO backstores on KVM host into
Debian Lenny v2.6.26 KVM guest with an xfs filesystem mount.

Signed-off-by: Nicholas A. Bellinger nab@linux-iscsi.org>
---
 hw/scsi-generic.c |   14 ++++++--------
 1 files changed, 6 insertions(+), 8 deletions(-)

Comments

Nicholas A. Bellinger Nov. 24, 2010, 8:57 a.m. UTC | #1
On Wed, 2010-11-24 at 09:57 +0100, Hannes Reinecke wrote:
> On 11/24/2010 09:18 AM, Nicholas A. Bellinger wrote:
> > From: Nicholas Bellinger <nab@linux-iscsi.org>
> > 
> > This patch adds a handful of bugfixes for scsi-generic
> > that where added into:
> > 
> > commit a4194b3f79a85e111f000788ddec05d465748851
> > Author: Hannes Reinecke <hare@suse.de>
> > Date:   Mon Nov 22 15:39:33 2010 -0800
> > 
> >     scsi: Use 'SCSIRequest' directly
> > 
> > this includes:
> > 
> > *) Fix incorrect errno usage in switch() statement within
> >    scsi_command_complete()
> > 
> > *) Remove bogus scsi_command_complete() for residual case
> >    within scsi_read_complete()
> > 
> > *) Remove incorrect '-' sign from return in scsi_send_command()
> > 
> > Tested with .37-rc2 TCM_Loop FILEIO backstores on KVM host into
> > Debian Lenny v2.6.26 KVM guest with an xfs filesystem mount.
> > 
> > Signed-off-by: Nicholas A. Bellinger nab@linux-iscsi.org>
> > ---
> >  hw/scsi-generic.c |   14 ++++++--------
> >  1 files changed, 6 insertions(+), 8 deletions(-)
> > 
> > diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c
> > index 7d30115..dc277cc 100644
> > --- a/hw/scsi-generic.c
> > +++ b/hw/scsi-generic.c
> > @@ -146,13 +146,13 @@ static void scsi_command_complete(void *opaque, int ret)
> >  
> >      if (ret != 0) {
> >          switch(ret) {
> > -            case ENODEV:
> > +            case -ENODEV:
> >                  s->senselen = scsi_set_sense(s, SENSE_CODE(LUN_NOT_SUPPORTED));
> >                  break;
> > -            case EINVAL:
> > +            case -EINVAL:
> >                  s->senselen = scsi_set_sense(s, SENSE_CODE(INVALID_FIELD));
> >                  break;
> > -            case EBADR:
> > +            case -EBADR:
> >                  s->senselen = scsi_set_sense(s, SENSE_CODE(TARGET_FAILURE));
> >                  break;
> >              default:
> Oh. Correct. Although we could do a 'switch (-ret)' here.
> 
> > @@ -230,12 +230,10 @@ static void scsi_read_complete(void * opaque, int ret)
> >          return;
> >      }
> >      len = r->io_header.dxfer_len - r->io_header.resid;
> > -    DPRINTF("Data ready tag=0x%x len=%d\n", r->req.tag, len);
> > +    DPRINTF("Data ready tag=0x%x remaining len=%d\n", r->req.tag, len);
> >  
> >      r->len = -1;
> >      r->req.bus->complete(&r->req, SCSI_REASON_DATA, len);
> > -    if (len == 0)
> > -        scsi_command_complete(r, 0);
> >  }
> >  
> >  /* Read more data from scsi device into buffer.  */
> 
> Yes, that's correct (after a fashion).
> Difference is that we're having to do one more callback into
> scsi_read_data() with this hunk. But ok, ack.
> 
> > @@ -375,7 +373,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *cmd)
> >      }
> >      scsi_req_fixup(&r->req);
> >  
> > -    DPRINTF("Command: lun=%d tag=0x%x len %zd data=0x%02x", lun, tag,
> > +    DPRINTF("Command: lun=%d tag=0x%x len %zd data=0x%02x", req->lun, req->tag,
> >              r->req.cmd.xfer, cmd[0]);
> >  
> >  #ifdef DEBUG_SCSI
> > @@ -414,7 +412,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *cmd)
> >      r->len = r->req.cmd.xfer;
> >      if (r->req.cmd.mode == SCSI_XFER_TO_DEV) {
> >          r->len = 0;
> > -        return -r->req.cmd.xfer;
> > +        return r->req.cmd.xfer;
> >      }
> >  
> >      return r->req.cmd.xfer;
> NACK.
> Returning a negative value here means we're about to execute a write.
> Cf comment at the start of the function:
> 
> /* Execute a scsi command.  Returns the length of the data
>    expected by the command.  This will be Positive for data
>    transfers from the device (eg. disk reads), negative for
>    transfers to the device (eg. disk writes),
>    and zero if the command does not transfer any data.  */
> 

OK, changint this back in megasas-upstream-v1 here along with the same
bug that was added to the the outgoing hw/scsi-bsg.c..

But yeah, this is a really confusing interface and seems like it really
should be fixed, right..?  Unless there is some legacy reason that it
makes it difficult to do so..?

--nab
Hannes Reinecke Nov. 24, 2010, 8:57 a.m. UTC | #2
On 11/24/2010 09:18 AM, Nicholas A. Bellinger wrote:
> From: Nicholas Bellinger <nab@linux-iscsi.org>
> 
> This patch adds a handful of bugfixes for scsi-generic
> that where added into:
> 
> commit a4194b3f79a85e111f000788ddec05d465748851
> Author: Hannes Reinecke <hare@suse.de>
> Date:   Mon Nov 22 15:39:33 2010 -0800
> 
>     scsi: Use 'SCSIRequest' directly
> 
> this includes:
> 
> *) Fix incorrect errno usage in switch() statement within
>    scsi_command_complete()
> 
> *) Remove bogus scsi_command_complete() for residual case
>    within scsi_read_complete()
> 
> *) Remove incorrect '-' sign from return in scsi_send_command()
> 
> Tested with .37-rc2 TCM_Loop FILEIO backstores on KVM host into
> Debian Lenny v2.6.26 KVM guest with an xfs filesystem mount.
> 
> Signed-off-by: Nicholas A. Bellinger nab@linux-iscsi.org>
> ---
>  hw/scsi-generic.c |   14 ++++++--------
>  1 files changed, 6 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c
> index 7d30115..dc277cc 100644
> --- a/hw/scsi-generic.c
> +++ b/hw/scsi-generic.c
> @@ -146,13 +146,13 @@ static void scsi_command_complete(void *opaque, int ret)
>  
>      if (ret != 0) {
>          switch(ret) {
> -            case ENODEV:
> +            case -ENODEV:
>                  s->senselen = scsi_set_sense(s, SENSE_CODE(LUN_NOT_SUPPORTED));
>                  break;
> -            case EINVAL:
> +            case -EINVAL:
>                  s->senselen = scsi_set_sense(s, SENSE_CODE(INVALID_FIELD));
>                  break;
> -            case EBADR:
> +            case -EBADR:
>                  s->senselen = scsi_set_sense(s, SENSE_CODE(TARGET_FAILURE));
>                  break;
>              default:
Oh. Correct. Although we could do a 'switch (-ret)' here.

> @@ -230,12 +230,10 @@ static void scsi_read_complete(void * opaque, int ret)
>          return;
>      }
>      len = r->io_header.dxfer_len - r->io_header.resid;
> -    DPRINTF("Data ready tag=0x%x len=%d\n", r->req.tag, len);
> +    DPRINTF("Data ready tag=0x%x remaining len=%d\n", r->req.tag, len);
>  
>      r->len = -1;
>      r->req.bus->complete(&r->req, SCSI_REASON_DATA, len);
> -    if (len == 0)
> -        scsi_command_complete(r, 0);
>  }
>  
>  /* Read more data from scsi device into buffer.  */

Yes, that's correct (after a fashion).
Difference is that we're having to do one more callback into
scsi_read_data() with this hunk. But ok, ack.

> @@ -375,7 +373,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *cmd)
>      }
>      scsi_req_fixup(&r->req);
>  
> -    DPRINTF("Command: lun=%d tag=0x%x len %zd data=0x%02x", lun, tag,
> +    DPRINTF("Command: lun=%d tag=0x%x len %zd data=0x%02x", req->lun, req->tag,
>              r->req.cmd.xfer, cmd[0]);
>  
>  #ifdef DEBUG_SCSI
> @@ -414,7 +412,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *cmd)
>      r->len = r->req.cmd.xfer;
>      if (r->req.cmd.mode == SCSI_XFER_TO_DEV) {
>          r->len = 0;
> -        return -r->req.cmd.xfer;
> +        return r->req.cmd.xfer;
>      }
>  
>      return r->req.cmd.xfer;
NACK.
Returning a negative value here means we're about to execute a write.
Cf comment at the start of the function:

/* Execute a scsi command.  Returns the length of the data
   expected by the command.  This will be Positive for data
   transfers from the device (eg. disk reads), negative for
   transfers to the device (eg. disk writes),
   and zero if the command does not transfer any data.  */

Cheers,

Hannes
Kevin Wolf Nov. 24, 2010, 9:04 a.m. UTC | #3
Am 24.11.2010 09:57, schrieb Hannes Reinecke:
> On 11/24/2010 09:18 AM, Nicholas A. Bellinger wrote:
>> From: Nicholas Bellinger <nab@linux-iscsi.org>
>>
>> This patch adds a handful of bugfixes for scsi-generic
>> that where added into:
>>
>> commit a4194b3f79a85e111f000788ddec05d465748851
>> Author: Hannes Reinecke <hare@suse.de>
>> Date:   Mon Nov 22 15:39:33 2010 -0800
>>
>>     scsi: Use 'SCSIRequest' directly
>>
>> this includes:
>>
>> *) Fix incorrect errno usage in switch() statement within
>>    scsi_command_complete()
>>
>> *) Remove bogus scsi_command_complete() for residual case
>>    within scsi_read_complete()
>>
>> *) Remove incorrect '-' sign from return in scsi_send_command()
>>
>> Tested with .37-rc2 TCM_Loop FILEIO backstores on KVM host into
>> Debian Lenny v2.6.26 KVM guest with an xfs filesystem mount.
>>
>> Signed-off-by: Nicholas A. Bellinger nab@linux-iscsi.org>

Hannes, can you fold the necessary parts of this into the next version
of your series, so that we don't break things first and fix them only later?

Kevin
Hannes Reinecke Nov. 24, 2010, 10:16 a.m. UTC | #4
On 11/24/2010 10:04 AM, Kevin Wolf wrote:
> Am 24.11.2010 09:57, schrieb Hannes Reinecke:
>> On 11/24/2010 09:18 AM, Nicholas A. Bellinger wrote:
>>> From: Nicholas Bellinger <nab@linux-iscsi.org>
>>>
>>> This patch adds a handful of bugfixes for scsi-generic
>>> that where added into:
>>>
>>> commit a4194b3f79a85e111f000788ddec05d465748851
>>> Author: Hannes Reinecke <hare@suse.de>
>>> Date:   Mon Nov 22 15:39:33 2010 -0800
>>>
>>>     scsi: Use 'SCSIRequest' directly
>>>
>>> this includes:
>>>
>>> *) Fix incorrect errno usage in switch() statement within
>>>    scsi_command_complete()
>>>
>>> *) Remove bogus scsi_command_complete() for residual case
>>>    within scsi_read_complete()
>>>
>>> *) Remove incorrect '-' sign from return in scsi_send_command()
>>>
>>> Tested with .37-rc2 TCM_Loop FILEIO backstores on KVM host into
>>> Debian Lenny v2.6.26 KVM guest with an xfs filesystem mount.
>>>
>>> Signed-off-by: Nicholas A. Bellinger nab@linux-iscsi.org>
> 
> Hannes, can you fold the necessary parts of this into the next version
> of your series, so that we don't break things first and fix them only later?
> 
I'll be folding the first one into my series.

The second one is actually a genuine error, and a rather old one to
boot. It went in with this commit:

commit 89c0f6438d16ebceccdcd096bbc0b5536146a443
Author: aurel32 <aurel32@c046a42c-6fe2-441c-8c8c-71466251a162>
Date:   Fri Oct 17 08:08:56 2008 +0000
Subject: scsi-generic: correct error management

So I'd rather have it submitted separately.
And a second opinion is _definitely_ required here.
nab, can you do the honours?

The third is bogus anyway.

Cheers,

Hannes
Kevin Wolf Nov. 24, 2010, 10:34 a.m. UTC | #5
Am 24.11.2010 11:16, schrieb Hannes Reinecke:
> The second one is actually a genuine error, and a rather old one to
> boot. It went in with this commit:
> 
> commit 89c0f6438d16ebceccdcd096bbc0b5536146a443
> Author: aurel32 <aurel32@c046a42c-6fe2-441c-8c8c-71466251a162>
> Date:   Fri Oct 17 08:08:56 2008 +0000
> Subject: scsi-generic: correct error management
> 
> So I'd rather have it submitted separately.
> And a second opinion is _definitely_ required here.
> nab, can you do the honours?

The commit message of this commit says it's a workaround for a problem
with lsi:

> - when a read is aborted due to a mark/EOF/EOD/EOM, the len reported to
> controller can be 0. LSI controller emulation doesn't know how to manage
> this. A workaround found is to call the completion routine with
> SCSI_REASON_DONE just after calling it with SCSI_REASON_DATA with len=0.

Are you sure that it's not needed any more?

Kevin
Hannes Reinecke Nov. 24, 2010, 10:46 a.m. UTC | #6
On 11/24/2010 11:34 AM, Kevin Wolf wrote:
> Am 24.11.2010 11:16, schrieb Hannes Reinecke:
>> The second one is actually a genuine error, and a rather old one to
>> boot. It went in with this commit:
>>
>> commit 89c0f6438d16ebceccdcd096bbc0b5536146a443
>> Author: aurel32 <aurel32@c046a42c-6fe2-441c-8c8c-71466251a162>
>> Date:   Fri Oct 17 08:08:56 2008 +0000
>> Subject: scsi-generic: correct error management
>>
>> So I'd rather have it submitted separately.
>> And a second opinion is _definitely_ required here.
>> nab, can you do the honours?
> 
> The commit message of this commit says it's a workaround for a problem
> with lsi:
> 
>> - when a read is aborted due to a mark/EOF/EOD/EOM, the len reported to
>> controller can be 0. LSI controller emulation doesn't know how to manage
>> this. A workaround found is to call the completion routine with
>> SCSI_REASON_DONE just after calling it with SCSI_REASON_DATA with len=0.
> 
> Are you sure that it's not needed any more?
> 
Don't ask me. I didn't do the patch, and my knowledge of lsi HBA
internals is scanty.
Nic, can you comment here?

Cheers,

Hannes
Nicholas A. Bellinger Nov. 24, 2010, 10:53 a.m. UTC | #7
On Wed, 2010-11-24 at 11:16 +0100, Hannes Reinecke wrote:
> On 11/24/2010 10:04 AM, Kevin Wolf wrote:
> > Am 24.11.2010 09:57, schrieb Hannes Reinecke:
> >> On 11/24/2010 09:18 AM, Nicholas A. Bellinger wrote:
> >>> From: Nicholas Bellinger <nab@linux-iscsi.org>
> >>>
> >>> This patch adds a handful of bugfixes for scsi-generic
> >>> that where added into:
> >>>
> >>> commit a4194b3f79a85e111f000788ddec05d465748851
> >>> Author: Hannes Reinecke <hare@suse.de>
> >>> Date:   Mon Nov 22 15:39:33 2010 -0800
> >>>
> >>>     scsi: Use 'SCSIRequest' directly
> >>>
> >>> this includes:
> >>>
> >>> *) Fix incorrect errno usage in switch() statement within
> >>>    scsi_command_complete()
> >>>
> >>> *) Remove bogus scsi_command_complete() for residual case
> >>>    within scsi_read_complete()
> >>>
> >>> *) Remove incorrect '-' sign from return in scsi_send_command()
> >>>
> >>> Tested with .37-rc2 TCM_Loop FILEIO backstores on KVM host into
> >>> Debian Lenny v2.6.26 KVM guest with an xfs filesystem mount.
> >>>
> >>> Signed-off-by: Nicholas A. Bellinger nab@linux-iscsi.org>
> > 
> > Hannes, can you fold the necessary parts of this into the next version
> > of your series, so that we don't break things first and fix them only later?
> > 
> I'll be folding the first one into my series.
> 
> The second one is actually a genuine error, and a rather old one to
> boot. It went in with this commit:
> 
> commit 89c0f6438d16ebceccdcd096bbc0b5536146a443
> Author: aurel32 <aurel32@c046a42c-6fe2-441c-8c8c-71466251a162>
> Date:   Fri Oct 17 08:08:56 2008 +0000
> Subject: scsi-generic: correct error management
> 
> So I'd rather have it submitted separately.
> And a second opinion is _definitely_ required here.
> nab, can you do the honours?
> 

Yep, so it appears that commit 89c0f6438d16 did introduce the bogus
'double complete' in scsi_read_complete, which I think was intended to
handle residual counts for TYPE_TYPE...

 /* Cancel a pending data transfer.  */
@@ -251,6 +257,8 @@ static void scsi_read_complete(void * opaque, int ret)

     r->len = -1;
     s->completion(s->opaque, SCSI_REASON_DATA, r->tag, len);
+    if (len == 0)
+        scsi_command_complete(r, 0);
 }

I am currently under the assumption for this and bsg_read_complete that
s->completion(..., len) is handling the residual count back to block.

Is this correct..?

--nab
Benjamin Herrenschmidt Nov. 25, 2010, 8:46 a.m. UTC | #8
On Wed, 2010-11-24 at 11:46 +0100, Hannes Reinecke wrote:
> >> - when a read is aborted due to a mark/EOF/EOD/EOM, the len
> reported to
> >> controller can be 0. LSI controller emulation doesn't know how to
> manage
> >> this. A workaround found is to call the completion routine with
> >> SCSI_REASON_DONE just after calling it with SCSI_REASON_DATA with
> len=0.
> > 
> > Are you sure that it's not needed any more?
> > 
> Don't ask me. I didn't do the patch, and my knowledge of lsi HBA
> internals is scanty.
> Nic, can you comment here? 

Well, writing an HBA myself, it took me a while to figure out what I'm
supposed to expect from the layer :-)

So far tho, it appears that I can (at least with scsi-disk) rely on
always been eventually called with SCSI_REASON_DONE so my code (and
maybe the usb-msd code too, I haven't verified) relies on that to
complete requests... Is that incorrect ?

Cheers,
Ben.
Gerd Hoffmann Nov. 25, 2010, 8:50 a.m. UTC | #9
On 11/25/10 09:46, Benjamin Herrenschmidt wrote:

> So far tho, it appears that I can (at least with scsi-disk) rely on
> always been eventually called with SCSI_REASON_DONE so my code (and
> maybe the usb-msd code too, I haven't verified) relies on that to
> complete requests... Is that incorrect ?

Yes.

cheers,
   Gerd
Benjamin Herrenschmidt Nov. 25, 2010, 9:01 a.m. UTC | #10
On Thu, 2010-11-25 at 09:50 +0100, Gerd Hoffmann wrote:
> On 11/25/10 09:46, Benjamin Herrenschmidt wrote:
> 
> > So far tho, it appears that I can (at least with scsi-disk) rely on
> > always been eventually called with SCSI_REASON_DONE so my code (and
> > maybe the usb-msd code too, I haven't verified) relies on that to
> > complete requests... Is that incorrect ?
> 
> Yes.

Well, so far it works :-) But I suppose I must be lucky.. I must admit
that it's very unclear how that SCSI "stack" is meant to be used from
the HBA standpoint.

Right now, I've somewhat come up with:

  - client request occurs
  - call device send_command()
  - if result is 0, assume my complete() was called with 
    SCSI_REASON_DONE
  - else, use sign of result for transfer direction, store the
    absolute value as the total expected transfer len and call
    the device scsi_data_read()/write() and wait for complete()
  - when complete() is called:
    - if SCSI_REASON_DONE, complete client request
    - else perform the client "DMA" for "arg" bytes
    - call the device scsi_data_read()/write() again

So far it seems to work with scsi-disk but maybe I miss something in
which case I would very much enjoy being corrected :-)

Cheers,
Ben.
Hannes Reinecke Nov. 25, 2010, 9:06 a.m. UTC | #11
On 11/25/2010 10:01 AM, Benjamin Herrenschmidt wrote:
> On Thu, 2010-11-25 at 09:50 +0100, Gerd Hoffmann wrote:
>> On 11/25/10 09:46, Benjamin Herrenschmidt wrote:
>>
>>> So far tho, it appears that I can (at least with scsi-disk) rely on
>>> always been eventually called with SCSI_REASON_DONE so my code (and
>>> maybe the usb-msd code too, I haven't verified) relies on that to
>>> complete requests... Is that incorrect ?
>>
>> Yes.
> 
> Well, so far it works :-) But I suppose I must be lucky.. I must admit
> that it's very unclear how that SCSI "stack" is meant to be used from
> the HBA standpoint.
> 
> Right now, I've somewhat come up with:
> 
>   - client request occurs
>   - call device send_command()
>   - if result is 0, assume my complete() was called with 
>     SCSI_REASON_DONE
>   - else, use sign of result for transfer direction, store the
>     absolute value as the total expected transfer len and call
>     the device scsi_data_read()/write() and wait for complete()
>   - when complete() is called:
>     - if SCSI_REASON_DONE, complete client request
>     - else perform the client "DMA" for "arg" bytes
>     - call the device scsi_data_read()/write() again
> 
No, this is exactly as I'm expecting the SCSI layer to work.
So from the light of this the patch to scsi-generic is valid.
And it really looks like papering over a bug in the lsi HBA code.

Gerd?

Cheers,

Hannes
Benjamin Herrenschmidt Nov. 25, 2010, 9:07 a.m. UTC | #12
On Thu, 2010-11-25 at 10:06 +0100, Hannes Reinecke wrote:
> No, this is exactly as I'm expecting the SCSI layer to work.
> So from the light of this the patch to scsi-generic is valid.
> And it really looks like papering over a bug in the lsi HBA code.

Ok. I have no special case tho for a complete() coming for data with 0
in arg. I will just skip the DMA part and call read/write again until I
get SCSI_REASON_DONE.

Cheers,
Ben.
Gerd Hoffmann Nov. 25, 2010, 9:25 a.m. UTC | #13
>> Right now, I've somewhat come up with:
>>
>>    - client request occurs
>>    - call device send_command()
>>    - if result is 0, assume my complete() was called with
>>      SCSI_REASON_DONE
>>    - else, use sign of result for transfer direction, store the
>>      absolute value as the total expected transfer len and call
>>      the device scsi_data_read()/write() and wait for complete()
>>    - when complete() is called:
>>      - if SCSI_REASON_DONE, complete client request
>>      - else perform the client "DMA" for "arg" bytes
>>      - call the device scsi_data_read()/write() again
>>
> No, this is exactly as I'm expecting the SCSI layer to work.

Yes.

> So from the light of this the patch to scsi-generic is valid.

Yes.

> And it really looks like papering over a bug in the lsi HBA code.

Indeed.  If the patch breaks lsi I'd agree that it is lsi's fault. 
Anyone tested whenever current upstream lsi actually breaks with the 
patch applied?

I've attempted to convert lsi to use iovecs a while ago, and *that* 
version definitively had problems with short transfers, so maybe this is 
a leftover from those days where scsi hacking was based on that branch ...

cheers,
   Gerd
Benjamin Herrenschmidt Dec. 16, 2010, 1:58 a.m. UTC | #14
> > The commit message of this commit says it's a workaround for a problem
> > with lsi:
> > 
> >> - when a read is aborted due to a mark/EOF/EOD/EOM, the len reported to
> >> controller can be 0. LSI controller emulation doesn't know how to manage
> >> this. A workaround found is to call the completion routine with
> >> SCSI_REASON_DONE just after calling it with SCSI_REASON_DATA with len=0.
> > 
> > Are you sure that it's not needed any more?
> > 
> Don't ask me. I didn't do the patch, and my knowledge of lsi HBA
> internals is scanty.
> Nic, can you comment here?

Back to this topic...

On my current WIP code, currently based on qemu upstream commit
f711df67d611e4762966a249742a5f7499e19f99, I've just observed the
following behaviour:

Use -cdrom /dev/cdrom (which points to /dev/sr0)

No disk in the drive, scsi-disk kicks in. test-unit-ready properly says
there's no medium, so far so good.

Now, my (buggy) guest code tries to do a READ_10. At this point, qemu
hangs. What happens is that my scsi complete callback get called with
reason 1 (DATA) and arg 0. I then perform no data copy, and call back
sdev->info->read_data() which eventually calls me back again with reason
1 and arg 0, etc... in a loop.

I haven't had a chance yet to look at what's happening in the guts of
scsi-disk, but here, either I'm supposed to special case DATA with 0
bytes requested in my HBA driver, or there's a bug in scsi-disk.

I can work around it by doing a cancel_io in that case and then
completing the request is if reason had been 0 (DONE) but that's of
course just a band-aid.

I'll let you know what I find out when I get a chance to look at this
again (hopefully soon).

Cheers,
Ben.
Benjamin Herrenschmidt Dec. 21, 2010, 1:49 a.m. UTC | #15
> Yep, so it appears that commit 89c0f6438d16 did introduce the bogus
> 'double complete' in scsi_read_complete, which I think was intended to
> handle residual counts for TYPE_TYPE...
> 
>  /* Cancel a pending data transfer.  */
> @@ -251,6 +257,8 @@ static void scsi_read_complete(void * opaque, int ret)
> 
>      r->len = -1;
>      s->completion(s->opaque, SCSI_REASON_DATA, r->tag, len);
> +    if (len == 0)
> +        scsi_command_complete(r, 0);
>  }
> 
> I am currently under the assumption for this and bsg_read_complete that
> s->completion(..., len) is handling the residual count back to block.
> 
> Is this correct..?

So I just debugged a crash where loading my vscsi driver kills qemu
(segfault) after trying to complete a command twice with scsi-generic.

Removing the above hunk fixes it. So this is a genuine fix that should
be applied (asap even :-)

I still have an odd problem with scsi-disk.c where reading from an
empty cdrom drive crashes it, I'll debug that later.

Cheers,
Ben.
Nicholas A. Bellinger Dec. 23, 2010, 9:58 p.m. UTC | #16
On Tue, 2010-12-21 at 12:49 +1100, Benjamin Herrenschmidt wrote:
> > Yep, so it appears that commit 89c0f6438d16 did introduce the bogus
> > 'double complete' in scsi_read_complete, which I think was intended to
> > handle residual counts for TYPE_TYPE...
> > 
> >  /* Cancel a pending data transfer.  */
> > @@ -251,6 +257,8 @@ static void scsi_read_complete(void * opaque, int ret)
> > 
> >      r->len = -1;
> >      s->completion(s->opaque, SCSI_REASON_DATA, r->tag, len);
> > +    if (len == 0)
> > +        scsi_command_complete(r, 0);
> >  }
> > 
> > I am currently under the assumption for this and bsg_read_complete that
> > s->completion(..., len) is handling the residual count back to block.
> > 
> > Is this correct..?
> 
> So I just debugged a crash where loading my vscsi driver kills qemu
> (segfault) after trying to complete a command twice with scsi-generic.
> 
> Removing the above hunk fixes it. So this is a genuine fix that should
> be applied (asap even :-)
> 

Hi Ben,

Thanks for verifying this one.  Kevin, please make sure this original
patch to drop the bogus double complete gets picked up.

Best Regards,

--nab

> I still have an odd problem with scsi-disk.c where reading from an
> empty cdrom drive crashes it, I'll debug that later.
> 
> Cheers,
> Ben.
> 
>
Kevin Wolf Jan. 13, 2011, 2:59 p.m. UTC | #17
Am 23.12.2010 22:58, schrieb Nicholas A. Bellinger:
> On Tue, 2010-12-21 at 12:49 +1100, Benjamin Herrenschmidt wrote:
>>> Yep, so it appears that commit 89c0f6438d16 did introduce the bogus
>>> 'double complete' in scsi_read_complete, which I think was intended to
>>> handle residual counts for TYPE_TYPE...
>>>
>>>  /* Cancel a pending data transfer.  */
>>> @@ -251,6 +257,8 @@ static void scsi_read_complete(void * opaque, int ret)
>>>
>>>      r->len = -1;
>>>      s->completion(s->opaque, SCSI_REASON_DATA, r->tag, len);
>>> +    if (len == 0)
>>> +        scsi_command_complete(r, 0);
>>>  }
>>>
>>> I am currently under the assumption for this and bsg_read_complete that
>>> s->completion(..., len) is handling the residual count back to block.
>>>
>>> Is this correct..?
>>
>> So I just debugged a crash where loading my vscsi driver kills qemu
>> (segfault) after trying to complete a command twice with scsi-generic.
>>
>> Removing the above hunk fixes it. So this is a genuine fix that should
>> be applied (asap even :-)
>>
> 
> Hi Ben,
> 
> Thanks for verifying this one.  Kevin, please make sure this original
> patch to drop the bogus double complete gets picked up.

Wasn't the original patch NACKed by Hannes in parts? Can you re-post a
patch that includes only this specific fix?

Kevin
diff mbox

Patch

diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c
index 7d30115..dc277cc 100644
--- a/hw/scsi-generic.c
+++ b/hw/scsi-generic.c
@@ -146,13 +146,13 @@  static void scsi_command_complete(void *opaque, int ret)
 
     if (ret != 0) {
         switch(ret) {
-            case ENODEV:
+            case -ENODEV:
                 s->senselen = scsi_set_sense(s, SENSE_CODE(LUN_NOT_SUPPORTED));
                 break;
-            case EINVAL:
+            case -EINVAL:
                 s->senselen = scsi_set_sense(s, SENSE_CODE(INVALID_FIELD));
                 break;
-            case EBADR:
+            case -EBADR:
                 s->senselen = scsi_set_sense(s, SENSE_CODE(TARGET_FAILURE));
                 break;
             default:
@@ -230,12 +230,10 @@  static void scsi_read_complete(void * opaque, int ret)
         return;
     }
     len = r->io_header.dxfer_len - r->io_header.resid;
-    DPRINTF("Data ready tag=0x%x len=%d\n", r->req.tag, len);
+    DPRINTF("Data ready tag=0x%x remaining len=%d\n", r->req.tag, len);
 
     r->len = -1;
     r->req.bus->complete(&r->req, SCSI_REASON_DATA, len);
-    if (len == 0)
-        scsi_command_complete(r, 0);
 }
 
 /* Read more data from scsi device into buffer.  */
@@ -375,7 +373,7 @@  static int32_t scsi_send_command(SCSIRequest *req, uint8_t *cmd)
     }
     scsi_req_fixup(&r->req);
 
-    DPRINTF("Command: lun=%d tag=0x%x len %zd data=0x%02x", lun, tag,
+    DPRINTF("Command: lun=%d tag=0x%x len %zd data=0x%02x", req->lun, req->tag,
             r->req.cmd.xfer, cmd[0]);
 
 #ifdef DEBUG_SCSI
@@ -414,7 +412,7 @@  static int32_t scsi_send_command(SCSIRequest *req, uint8_t *cmd)
     r->len = r->req.cmd.xfer;
     if (r->req.cmd.mode == SCSI_XFER_TO_DEV) {
         r->len = 0;
-        return -r->req.cmd.xfer;
+        return r->req.cmd.xfer;
     }
 
     return r->req.cmd.xfer;