Patchwork [1/4] block: Allow devices to indicate whether discarded blocks are zeroed

login
register
mail settings
Submitter Martin K. Petersen
Date Nov. 21, 2009, 2:45 a.m.
Message ID <1258771524-26673-2-git-send-email-martin.petersen@oracle.com>
Download mbox | patch
Permalink /patch/38953/
State Not Applicable
Delegated to: David Miller
Headers show

Comments

Martin K. Petersen - Nov. 21, 2009, 2:45 a.m.
The discard ioctl is used by mkfs utilities to clear a block device
prior to putting metadata down.  However, not all devices return zeroed
blocks after a discard.  Some drives return stale data, potentially
containing old superblocks.  It is therefore important to know whether
discarded blocks are properly zeroed.

Both ATA and SCSI drives have configuration bits that indicate whether
zeroes are returned after a discard operation.  Implement a block level
interface that allows this information to be bubbled up the stack.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 block/blk-settings.c   |    2 ++
 block/blk-sysfs.c      |   11 +++++++++++
 include/linux/blkdev.h |    1 +
 3 files changed, 14 insertions(+), 0 deletions(-)
Christoph Hellwig - Nov. 21, 2009, 10:13 a.m.
On Fri, Nov 20, 2009 at 09:45:21PM -0500, Martin K. Petersen wrote:
> The discard ioctl is used by mkfs utilities to clear a block device
> prior to putting metadata down.  However, not all devices return zeroed
> blocks after a discard.  Some drives return stale data, potentially
> containing old superblocks.  It is therefore important to know whether
> discarded blocks are properly zeroed.

At least for mkfs.xfs we make sure to still zero the important areas
after the TRIM ioctl anyway.

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ric Wheeler - Nov. 21, 2009, 12:50 p.m.
On 11/20/2009 09:45 PM, Martin K. Petersen wrote:
> The discard ioctl is used by mkfs utilities to clear a block device
> prior to putting metadata down.  However, not all devices return zeroed
> blocks after a discard.  Some drives return stale data, potentially
> containing old superblocks.  It is therefore important to know whether
> discarded blocks are properly zeroed.
>
> Both ATA and SCSI drives have configuration bits that indicate whether
> zeroes are returned after a discard operation.  Implement a block level
> interface that allows this information to be bubbled up the stack.
>
> Signed-off-by: Martin K. Petersen<martin.petersen@oracle.com>
> ---
>   block/blk-settings.c   |    2 ++
>   block/blk-sysfs.c      |   11 +++++++++++
>   include/linux/blkdev.h |    1 +
>   3 files changed, 14 insertions(+), 0 deletions(-)
>
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 7f986ca..1027e30 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -100,6 +100,7 @@ void blk_set_default_limits(struct queue_limits *lim)
>   	lim->discard_granularity = 0;
>   	lim->discard_alignment = 0;
>   	lim->discard_misaligned = 0;
> +	lim->discard_zeroes_data = -1;
>    


Did you mean to set this to -1 ?

ric


>   	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
>   	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY>>  PAGE_SHIFT);
>   	lim->alignment_offset = 0;
> @@ -543,6 +544,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
>
>   	t->io_min = max(t->io_min, b->io_min);
>   	t->no_cluster |= b->no_cluster;
> +	t->discard_zeroes_data&= b->discard_zeroes_data;
>
>   	/* Bottom device offset aligned? */
>   	if (offset&&
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 3147145..1f1d2b6 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -136,6 +136,11 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
>   	return queue_var_show(q->limits.max_discard_sectors<<  9, page);
>   }
>
> +static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
> +{
> +	return queue_var_show(q->limits.discard_zeroes_data == 1 ? 1 : 0, page);
> +}
> +
>   static ssize_t
>   queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
>   {
> @@ -313,6 +318,11 @@ static struct queue_sysfs_entry queue_discard_max_entry = {
>   	.show = queue_discard_max_show,
>   };
>
> +static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
> +	.attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
> +	.show = queue_discard_zeroes_data_show,
> +};
> +
>   static struct queue_sysfs_entry queue_nonrot_entry = {
>   	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
>   	.show = queue_nonrot_show,
> @@ -350,6 +360,7 @@ static struct attribute *default_attrs[] = {
>   	&queue_io_opt_entry.attr,
>   	&queue_discard_granularity_entry.attr,
>   	&queue_discard_max_entry.attr,
> +	&queue_discard_zeroes_data_entry.attr,
>   	&queue_nonrot_entry.attr,
>   	&queue_nomerges_entry.attr,
>   	&queue_rq_affinity_entry.attr,
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 3b67221..e605945 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -322,6 +322,7 @@ struct queue_limits {
>   	unsigned char		misaligned;
>   	unsigned char		discard_misaligned;
>   	unsigned char		no_cluster;
> +	signed char		discard_zeroes_data;
>   };
>
>   struct request_queue
>    

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matthew Wilcox - Nov. 21, 2009, 7:58 p.m.
On Sat, Nov 21, 2009 at 05:13:56AM -0500, Christoph Hellwig wrote:
> On Fri, Nov 20, 2009 at 09:45:21PM -0500, Martin K. Petersen wrote:
> > The discard ioctl is used by mkfs utilities to clear a block device
> > prior to putting metadata down.  However, not all devices return zeroed
> > blocks after a discard.  Some drives return stale data, potentially
> > containing old superblocks.  It is therefore important to know whether
> > discarded blocks are properly zeroed.
> 
> At least for mkfs.xfs we make sure to still zero the important areas
> after the TRIM ioctl anyway.

Could you change that to zero _before_ the TRIM?
Martin K. Petersen - Nov. 21, 2009, 8:17 p.m.
>>>>> "Ric" == Ric Wheeler <rwheeler@redhat.com> writes:

>> + lim->discard_zeroes_data = -1;

Ric> Did you mean to set this to -1 ?

Yep, it's a "not set yet" value for the stacking to work properly in
this case.
Mark Lord - Nov. 22, 2009, 2:43 a.m.
Matthew Wilcox wrote:
> On Sat, Nov 21, 2009 at 05:13:56AM -0500, Christoph Hellwig wrote:
>> On Fri, Nov 20, 2009 at 09:45:21PM -0500, Martin K. Petersen wrote:
>>> The discard ioctl is used by mkfs utilities to clear a block device
>>> prior to putting metadata down.  However, not all devices return zeroed
>>> blocks after a discard.  Some drives return stale data, potentially
>>> containing old superblocks.  It is therefore important to know whether
>>> discarded blocks are properly zeroed.
>> At least for mkfs.xfs we make sure to still zero the important areas
>> after the TRIM ioctl anyway.
> 
> Could you change that to zero _before_ the TRIM?
..


Hopefully not for the drives that *don't* guarantee zeros after TRIM.
Eg. most existing ones, including all of the Indilinx chipset drives.

Cheers

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ric Wheeler - Nov. 23, 2009, 4:37 p.m.
On 11/21/2009 09:43 PM, Mark Lord wrote:
> Matthew Wilcox wrote:
>> On Sat, Nov 21, 2009 at 05:13:56AM -0500, Christoph Hellwig wrote:
>>> On Fri, Nov 20, 2009 at 09:45:21PM -0500, Martin K. Petersen wrote:
>>>> The discard ioctl is used by mkfs utilities to clear a block device
>>>> prior to putting metadata down.  However, not all devices return 
>>>> zeroed
>>>> blocks after a discard.  Some drives return stale data, potentially
>>>> containing old superblocks.  It is therefore important to know whether
>>>> discarded blocks are properly zeroed.
>>> At least for mkfs.xfs we make sure to still zero the important areas
>>> after the TRIM ioctl anyway.
>>
>> Could you change that to zero _before_ the TRIM?
> ..
>
>
> Hopefully not for the drives that *don't* guarantee zeros after TRIM.
> Eg. most existing ones, including all of the Indilinx chipset drives.
>
> Cheers
>

Note that writing to a block after a discard (write or trim) changes the 
state of that block back to allocated (mapped) in the target. Basically, 
that would seem to make the trim redundant and irrelevant.

You can zero before the discard but that can still fail is the device 
returns garbage for a trim'ed block I think,

Ric

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg Freemyer - Nov. 23, 2009, 4:54 p.m.
On Mon, Nov 23, 2009 at 11:37 AM, Ric Wheeler <rwheeler@redhat.com> wrote:
> On 11/21/2009 09:43 PM, Mark Lord wrote:
>>
>> Matthew Wilcox wrote:
>>>
>>> On Sat, Nov 21, 2009 at 05:13:56AM -0500, Christoph Hellwig wrote:
>>>>
>>>> On Fri, Nov 20, 2009 at 09:45:21PM -0500, Martin K. Petersen wrote:
>>>>>
>>>>> The discard ioctl is used by mkfs utilities to clear a block device
>>>>> prior to putting metadata down.  However, not all devices return zeroed
>>>>> blocks after a discard.  Some drives return stale data, potentially
>>>>> containing old superblocks.  It is therefore important to know whether
>>>>> discarded blocks are properly zeroed.
>>>>
>>>> At least for mkfs.xfs we make sure to still zero the important areas
>>>> after the TRIM ioctl anyway.
>>>
>>> Could you change that to zero _before_ the TRIM?
>>
>> ..
>>
>>
>> Hopefully not for the drives that *don't* guarantee zeros after TRIM.
>> Eg. most existing ones, including all of the Indilinx chipset drives.
>>
>> Cheers
>>
>
> Note that writing to a block after a discard (write or trim) changes the
> state of that block back to allocated (mapped) in the target. Basically,
> that would seem to make the trim redundant and irrelevant.
>
> You can zero before the discard but that can still fail is the device
> returns garbage for a trim'ed block I think,
>
> Ric

I understood the mkfs process to be:

- discard / trim 100% of the block device
- write zeros to the relatively small group of blocks that the
filesystem assumes will be zero'd.

Seems like the only reasonable way to handle it.

Greg
Ric Wheeler - Nov. 23, 2009, 5:02 p.m.
On 11/23/2009 11:54 AM, Greg Freemyer wrote:
> On Mon, Nov 23, 2009 at 11:37 AM, Ric Wheeler<rwheeler@redhat.com>  wrote:
>> On 11/21/2009 09:43 PM, Mark Lord wrote:
>>>
>>> Matthew Wilcox wrote:
>>>>
>>>> On Sat, Nov 21, 2009 at 05:13:56AM -0500, Christoph Hellwig wrote:
>>>>>
>>>>> On Fri, Nov 20, 2009 at 09:45:21PM -0500, Martin K. Petersen wrote:
>>>>>>
>>>>>> The discard ioctl is used by mkfs utilities to clear a block device
>>>>>> prior to putting metadata down.  However, not all devices return zeroed
>>>>>> blocks after a discard.  Some drives return stale data, potentially
>>>>>> containing old superblocks.  It is therefore important to know whether
>>>>>> discarded blocks are properly zeroed.
>>>>>
>>>>> At least for mkfs.xfs we make sure to still zero the important areas
>>>>> after the TRIM ioctl anyway.
>>>>
>>>> Could you change that to zero _before_ the TRIM?
>>>
>>> ..
>>>
>>>
>>> Hopefully not for the drives that *don't* guarantee zeros after TRIM.
>>> Eg. most existing ones, including all of the Indilinx chipset drives.
>>>
>>> Cheers
>>>
>>
>> Note that writing to a block after a discard (write or trim) changes the
>> state of that block back to allocated (mapped) in the target. Basically,
>> that would seem to make the trim redundant and irrelevant.
>>
>> You can zero before the discard but that can still fail is the device
>> returns garbage for a trim'ed block I think,
>>
>> Ric
>
> I understood the mkfs process to be:
>
> - discard / trim 100% of the block device
> - write zeros to the relatively small group of blocks that the
> filesystem assumes will be zero'd.
>
> Seems like the only reasonable way to handle it.
>
> Greg

I agree - the discard of the whole device is a good idea.

I just want to make clear that "discard block X; write block X with zeroed data" 
undoes the discard in general :-)

ric

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig - Nov. 23, 2009, 5:03 p.m.
On Mon, Nov 23, 2009 at 12:02:48PM -0500, Ric Wheeler wrote:
> I agree - the discard of the whole device is a good idea.
>
> I just want to make clear that "discard block X; write block X with 
> zeroed data" undoes the discard in general :-)

We can skip the writing of zeroes if we know the device returns zeroed
blocks after a trim.  Martin's patch exports that information to
userspace, and once we have a nice enough interface (e.g. blkid or an
ioctl) we can actually use it in mkfs to optimize the writing of zeroes
away.  Raw growling in sysfs is a bit too nasty to add it to mkfs for
those few blocks IMHO.

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig - Nov. 23, 2009, 5:05 p.m.
On Sat, Nov 21, 2009 at 12:58:03PM -0700, Matthew Wilcox wrote:
> On Sat, Nov 21, 2009 at 05:13:56AM -0500, Christoph Hellwig wrote:
> > On Fri, Nov 20, 2009 at 09:45:21PM -0500, Martin K. Petersen wrote:
> > > The discard ioctl is used by mkfs utilities to clear a block device
> > > prior to putting metadata down.  However, not all devices return zeroed
> > > blocks after a discard.  Some drives return stale data, potentially
> > > containing old superblocks.  It is therefore important to know whether
> > > discarded blocks are properly zeroed.
> > 
> > At least for mkfs.xfs we make sure to still zero the important areas
> > after the TRIM ioctl anyway.
> 
> Could you change that to zero _before_ the TRIM?

I could easily, but it would be a very bad idea.

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Sandeen - Nov. 23, 2009, 5:50 p.m.
Christoph Hellwig wrote:
> On Mon, Nov 23, 2009 at 12:02:48PM -0500, Ric Wheeler wrote:
>> I agree - the discard of the whole device is a good idea.
>>
>> I just want to make clear that "discard block X; write block X with 
>> zeroed data" undoes the discard in general :-)
> 
> We can skip the writing of zeroes if we know the device returns zeroed
> blocks after a trim.  Martin's patch exports that information to
> userspace, and once we have a nice enough interface (e.g. blkid or an
> ioctl) we can actually use it in mkfs to optimize the writing of zeroes
> away.  Raw growling in sysfs is a bit too nasty to add it to mkfs for
> those few blocks IMHO.
> 

well, in testing, I've found that not all devices which claim to return
0s post-trim, do.

For example one ssd I have, if I pattern 1M of 0xaf, then trim from
512k to 1M, and read it back (all IO done direct) I get:

00000000  af af af af af af af af  af af af af af af af af
*
00080000  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00
*
000a9000  af af af af af af af af  af af af af af af af af
*
00100000

with scraps of data left over at 0xA9000.

So for the very few blocks that we zero at mkfs (thinking xfs here,
anyway) I'd really rather just zero them post-trim to be safe, at least
for now.

-Eric

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 7f986ca..1027e30 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -100,6 +100,7 @@  void blk_set_default_limits(struct queue_limits *lim)
 	lim->discard_granularity = 0;
 	lim->discard_alignment = 0;
 	lim->discard_misaligned = 0;
+	lim->discard_zeroes_data = -1;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -543,6 +544,7 @@  int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 	t->io_min = max(t->io_min, b->io_min);
 	t->no_cluster |= b->no_cluster;
+	t->discard_zeroes_data &= b->discard_zeroes_data;
 
 	/* Bottom device offset aligned? */
 	if (offset &&
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3147145..1f1d2b6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -136,6 +136,11 @@  static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
 	return queue_var_show(q->limits.max_discard_sectors << 9, page);
 }
 
+static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.discard_zeroes_data == 1 ? 1 : 0, page);
+}
+
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -313,6 +318,11 @@  static struct queue_sysfs_entry queue_discard_max_entry = {
 	.show = queue_discard_max_show,
 };
 
+static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
+	.attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
+	.show = queue_discard_zeroes_data_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nonrot_show,
@@ -350,6 +360,7 @@  static struct attribute *default_attrs[] = {
 	&queue_io_opt_entry.attr,
 	&queue_discard_granularity_entry.attr,
 	&queue_discard_max_entry.attr,
+	&queue_discard_zeroes_data_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3b67221..e605945 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -322,6 +322,7 @@  struct queue_limits {
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
 	unsigned char		no_cluster;
+	signed char		discard_zeroes_data;
 };
 
 struct request_queue