Patchwork [RFC] Don't do page stablization if !CONFIG_BLKDEV_INTEGRITY

login
register
mail settings
Submitter Martin K. Petersen
Date March 8, 2012, 3:45 a.m.
Message ID <yq1zkbrhi2l.fsf@sermon.lab.mkp.net>
Download mbox | patch
Permalink /patch/145445/
State New
Headers show

Comments

Martin K. Petersen - March 8, 2012, 3:45 a.m.
>>>>> "Boaz" == Boaz Harrosh <bharrosh@panasas.com> writes:

Boaz> As I stated many times before, the device should have a property
Boaz> that says if it needs stable pages or not. The candidates for
Boaz> stable pages are:

Boaz> 	- DIF/DIX enabled devices
Boaz> 	- RAID-1/4/5/6 devices
Boaz> 	- iscsi devices with data digest signature
Boaz> 	- Any other checksum enabled block device.

Boaz> A fedora distro will have CONFIG_BLKDEV_INTEGRITY set then you are
Boaz> always out of luck, even with devices that can care less.

Boaz> Please submit a proper patch, even a temporary mount option. But
Boaz> this is ABI. The best is to find where to export it as part of the
Boaz> device's properties sysfs dir.

We could do something like this:

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Boaz Harrosh - March 8, 2012, 4:37 a.m.
On 03/07/2012 07:45 PM, Martin K. Petersen wrote:
>>>>>> "Boaz" == Boaz Harrosh <bharrosh@panasas.com> writes:
> 
> Boaz> As I stated many times before, the device should have a property
> Boaz> that says if it needs stable pages or not. The candidates for
> Boaz> stable pages are:
> 
> Boaz> 	- DIF/DIX enabled devices
> Boaz> 	- RAID-1/4/5/6 devices
> Boaz> 	- iscsi devices with data digest signature
> Boaz> 	- Any other checksum enabled block device.
> 
> Boaz> A fedora distro will have CONFIG_BLKDEV_INTEGRITY set then you are
> Boaz> always out of luck, even with devices that can care less.
> 
> Boaz> Please submit a proper patch, even a temporary mount option. But
> Boaz> this is ABI. The best is to find where to export it as part of the
> Boaz> device's properties sysfs dir.
> 
> We could do something like this:
> 

Yes, this one is perfect.

Combined with Darrick's patch to actually inspect the flag at the filesystem level
is the solution I want.

When submitted I will also send a patch to set .needs_stable_pages in iscsi
when needed.

Thanks, Martin
Boaz

> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 5680b91..442a0df 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -125,6 +125,7 @@ void blk_set_default_limits(struct queue_limits *lim)
>  	lim->io_opt = 0;
>  	lim->misaligned = 0;
>  	lim->cluster = 1;
> +	lim->needs_stable_pages = false;
>  }
>  EXPORT_SYMBOL(blk_set_default_limits);
>  
> @@ -571,6 +572,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
>  	t->cluster &= b->cluster;
>  	t->discard_zeroes_data &= b->discard_zeroes_data;
>  
> +	t->needs_stable_pages &= b->needs_stable_pages;
> +
>  	/* Physical block size a multiple of the logical block size? */
>  	if (t->physical_block_size & (t->logical_block_size - 1)) {
>  		t->physical_block_size = t->logical_block_size;
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 5b85d91..d464aca 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -161,6 +161,11 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
>  	return queue_var_show(queue_discard_zeroes_data(q), page);
>  }
>  
> +static ssize_t queue_needs_stable_pages_show(struct request_queue *q, char *page)
> +{
> +	return queue_var_show(q->limits.needs_stable_pages, page);
> +}
> +
>  static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
>  {
>  	return sprintf(page, "%llu\n",
> @@ -364,6 +369,11 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
>  	.show = queue_discard_zeroes_data_show,
>  };
>  
> +static struct queue_sysfs_entry queue_needs_stable_pages_entry = {
> +	.attr = {.name = "needs_stable_pages", .mode = S_IRUGO },
> +	.show = queue_needs_stable_pages_show,
> +};
> +
>  static struct queue_sysfs_entry queue_write_same_max_entry = {
>  	.attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
>  	.show = queue_write_same_max_show,
> @@ -416,6 +426,7 @@ static struct attribute *default_attrs[] = {
>  	&queue_discard_granularity_entry.attr,
>  	&queue_discard_max_entry.attr,
>  	&queue_discard_zeroes_data_entry.attr,
> +	&queue_needs_stable_pages_entry.attr,
>  	&queue_write_same_max_entry.attr,
>  	&queue_nonrot_entry.attr,
>  	&queue_nomerges_entry.attr,
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index 26eff46..146bed4 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -1752,10 +1752,11 @@ static void sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffe
>  		return;
>  	}
>  
> -	if (scsi_host_dif_capable(sdp->host, type))
> +	if (scsi_host_dif_capable(sdp->host, type)) {
>  		sd_printk(KERN_NOTICE, sdkp,
>  			  "Enabling DIF Type %u protection\n", type);
> -	else
> +		sdkp->disk->queue->limits.needs_stable_pages = true;
> +	} else
>  		sd_printk(KERN_NOTICE, sdkp,
>  			  "Disabling DIF Type %u protection\n", type);
>  }
> diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
> index 0cb39ff..9dc330c 100644
> --- a/drivers/scsi/sd_dif.c
> +++ b/drivers/scsi/sd_dif.c
> @@ -338,6 +338,8 @@ void sd_dif_config_host(struct scsi_disk *sdkp)
>  	sd_printk(KERN_NOTICE, sdkp,
>  		  "Enabling DIX %s protection\n", disk->integrity->name);
>  
> +	disk->queue->limits.needs_stable_pages = true;
> +
>  	/* Signal to block layer that we support sector tagging */
>  	if (dif && type && sdkp->ATO) {
>  		if (type == SD_DIF_TYPE3_PROTECTION)
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 92956b7..a5a33db 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -266,6 +266,8 @@ struct queue_limits {
>  	unsigned char		discard_misaligned;
>  	unsigned char		cluster;
>  	unsigned char		discard_zeroes_data;
> +
> +	bool			needs_stable_pages;
>  };
>  
>  struct request_queue {

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sage Weil - March 8, 2012, 6:27 a.m.
Hi Boaz, Martin, Ted,

On Wed, 7 Mar 2012, Boaz Harrosh wrote:
> On 03/07/2012 07:45 PM, Martin K. Petersen wrote:
> >>>>>> "Boaz" == Boaz Harrosh <bharrosh@panasas.com> writes:
> > 
> > Boaz> As I stated many times before, the device should have a property
> > Boaz> that says if it needs stable pages or not. The candidates for
> > Boaz> stable pages are:
> > 
> > Boaz> 	- DIF/DIX enabled devices
> > Boaz> 	- RAID-1/4/5/6 devices
> > Boaz> 	- iscsi devices with data digest signature
> > Boaz> 	- Any other checksum enabled block device.
> > 
> > Boaz> A fedora distro will have CONFIG_BLKDEV_INTEGRITY set then you are
> > Boaz> always out of luck, even with devices that can care less.
> > 
> > Boaz> Please submit a proper patch, even a temporary mount option. But
> > Boaz> this is ABI. The best is to find where to export it as part of the
> > Boaz> device's properties sysfs dir.
> > 
> > We could do something like this:
> > 
> 
> Yes, this one is perfect.
> 
> Combined with Darrick's patch to actually inspect the flag at the 
> filesystem level is the solution I want.

This avoids the problem for devices that don't need stable pages, but 
doesn't help for those that do (btrfs, raid, iscsi, dif/dix, etc.).  It 
seems to me like a more elegant solution would be to COW the page in the 
address_space so that you get stable writeback pages without blocking.  
That's clearly more complex, and I'm sure there are a range of issues 
involved in making that work, but I would hope that it would be doable 
with generic MM infrastructure so that everyone would benefit.

I would love to talk to some MM people at LSF about what it would take to 
make this work...

sage



> 
> When submitted I will also send a patch to set .needs_stable_pages in 
> iscsi when needed.
> 
> Thanks, Martin
> Boaz
> 
> > diff --git a/block/blk-settings.c b/block/blk-settings.c
> > index 5680b91..442a0df 100644
> > --- a/block/blk-settings.c
> > +++ b/block/blk-settings.c
> > @@ -125,6 +125,7 @@ void blk_set_default_limits(struct queue_limits *lim)
> >  	lim->io_opt = 0;
> >  	lim->misaligned = 0;
> >  	lim->cluster = 1;
> > +	lim->needs_stable_pages = false;
> >  }
> >  EXPORT_SYMBOL(blk_set_default_limits);
> >  
> > @@ -571,6 +572,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
> >  	t->cluster &= b->cluster;
> >  	t->discard_zeroes_data &= b->discard_zeroes_data;
> >  
> > +	t->needs_stable_pages &= b->needs_stable_pages;
> > +
> >  	/* Physical block size a multiple of the logical block size? */
> >  	if (t->physical_block_size & (t->logical_block_size - 1)) {
> >  		t->physical_block_size = t->logical_block_size;
> > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> > index 5b85d91..d464aca 100644
> > --- a/block/blk-sysfs.c
> > +++ b/block/blk-sysfs.c
> > @@ -161,6 +161,11 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
> >  	return queue_var_show(queue_discard_zeroes_data(q), page);
> >  }
> >  
> > +static ssize_t queue_needs_stable_pages_show(struct request_queue *q, char *page)
> > +{
> > +	return queue_var_show(q->limits.needs_stable_pages, page);
> > +}
> > +
> >  static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
> >  {
> >  	return sprintf(page, "%llu\n",
> > @@ -364,6 +369,11 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
> >  	.show = queue_discard_zeroes_data_show,
> >  };
> >  
> > +static struct queue_sysfs_entry queue_needs_stable_pages_entry = {
> > +	.attr = {.name = "needs_stable_pages", .mode = S_IRUGO },
> > +	.show = queue_needs_stable_pages_show,
> > +};
> > +
> >  static struct queue_sysfs_entry queue_write_same_max_entry = {
> >  	.attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
> >  	.show = queue_write_same_max_show,
> > @@ -416,6 +426,7 @@ static struct attribute *default_attrs[] = {
> >  	&queue_discard_granularity_entry.attr,
> >  	&queue_discard_max_entry.attr,
> >  	&queue_discard_zeroes_data_entry.attr,
> > +	&queue_needs_stable_pages_entry.attr,
> >  	&queue_write_same_max_entry.attr,
> >  	&queue_nonrot_entry.attr,
> >  	&queue_nomerges_entry.attr,
> > diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> > index 26eff46..146bed4 100644
> > --- a/drivers/scsi/sd.c
> > +++ b/drivers/scsi/sd.c
> > @@ -1752,10 +1752,11 @@ static void sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffe
> >  		return;
> >  	}
> >  
> > -	if (scsi_host_dif_capable(sdp->host, type))
> > +	if (scsi_host_dif_capable(sdp->host, type)) {
> >  		sd_printk(KERN_NOTICE, sdkp,
> >  			  "Enabling DIF Type %u protection\n", type);
> > -	else
> > +		sdkp->disk->queue->limits.needs_stable_pages = true;
> > +	} else
> >  		sd_printk(KERN_NOTICE, sdkp,
> >  			  "Disabling DIF Type %u protection\n", type);
> >  }
> > diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
> > index 0cb39ff..9dc330c 100644
> > --- a/drivers/scsi/sd_dif.c
> > +++ b/drivers/scsi/sd_dif.c
> > @@ -338,6 +338,8 @@ void sd_dif_config_host(struct scsi_disk *sdkp)
> >  	sd_printk(KERN_NOTICE, sdkp,
> >  		  "Enabling DIX %s protection\n", disk->integrity->name);
> >  
> > +	disk->queue->limits.needs_stable_pages = true;
> > +
> >  	/* Signal to block layer that we support sector tagging */
> >  	if (dif && type && sdkp->ATO) {
> >  		if (type == SD_DIF_TYPE3_PROTECTION)
> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index 92956b7..a5a33db 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -266,6 +266,8 @@ struct queue_limits {
> >  	unsigned char		discard_misaligned;
> >  	unsigned char		cluster;
> >  	unsigned char		discard_zeroes_data;
> > +
> > +	bool			needs_stable_pages;
> >  };
> >  
> >  struct request_queue {
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o - March 8, 2012, 3:43 p.m.
On Wed, Mar 07, 2012 at 10:27:43PM -0800, Sage Weil wrote:
> 
> This avoids the problem for devices that don't need stable pages, but 
> doesn't help for those that do (btrfs, raid, iscsi, dif/dix, etc.).  It 
> seems to me like a more elegant solution would be to COW the page in the 
> address_space so that you get stable writeback pages without blocking.  
> That's clearly more complex, and I'm sure there are a range of issues 
> involved in making that work, but I would hope that it would be doable 
> with generic MM infrastructure so that everyone would benefit.

Well, even doing a COW (or anything that involves messing with page
tables) is not free.  So even if we can make the cost of stable
writeback pages cheaper, if we can completely avoid the cost, this
would be good.  I'd also rather fix the performance regression sooner
rather than later, and I suspect the COW solution is not something
that could be prepared in time for the upcoming merge window.

Martin, would you be willing to try to get your patch submitted for
the upcoming merge window?  Or I'd be willing to carry your patch and
then rework Darrick's to use the exported flag, and carry it in my
tree, maybe that would be better.

> I would love to talk to some MM people at LSF about what it would take to 
> make this work...

Sure, long term, I'm much more sympathetic to iSCSI than I am about
DIF/DIX (which due to drive manufacturer's pricing strategies I don't
think it will ever become mainstream --- which was my main concern;
why should we be inflicting pretty severe performance regressions for
the common case, just to improve things for obscure high-end hardware?
It's similar to how Solaris trashed 1 and 2 processor performance,
because they were optimizing things for their high margin SunFires).
So getting something which makes page stablization not suck so much in
the long term seems like a fine goal.  But even though we've worked on
improve SMP scalability in a way that didn't sacrifice 2 and 4-way
processors, we've still supported compiling with !CONFIG_SMP....

     		       	   	      - Ted

> 
> sage
> 
> 
> 
> > 
> > When submitted I will also send a patch to set .needs_stable_pages in 
> > iscsi when needed.
> > 
> > Thanks, Martin
> > Boaz
> > 
> > > diff --git a/block/blk-settings.c b/block/blk-settings.c
> > > index 5680b91..442a0df 100644
> > > --- a/block/blk-settings.c
> > > +++ b/block/blk-settings.c
> > > @@ -125,6 +125,7 @@ void blk_set_default_limits(struct queue_limits *lim)
> > >  	lim->io_opt = 0;
> > >  	lim->misaligned = 0;
> > >  	lim->cluster = 1;
> > > +	lim->needs_stable_pages = false;
> > >  }
> > >  EXPORT_SYMBOL(blk_set_default_limits);
> > >  
> > > @@ -571,6 +572,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
> > >  	t->cluster &= b->cluster;
> > >  	t->discard_zeroes_data &= b->discard_zeroes_data;
> > >  
> > > +	t->needs_stable_pages &= b->needs_stable_pages;
> > > +
> > >  	/* Physical block size a multiple of the logical block size? */
> > >  	if (t->physical_block_size & (t->logical_block_size - 1)) {
> > >  		t->physical_block_size = t->logical_block_size;
> > > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> > > index 5b85d91..d464aca 100644
> > > --- a/block/blk-sysfs.c
> > > +++ b/block/blk-sysfs.c
> > > @@ -161,6 +161,11 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
> > >  	return queue_var_show(queue_discard_zeroes_data(q), page);
> > >  }
> > >  
> > > +static ssize_t queue_needs_stable_pages_show(struct request_queue *q, char *page)
> > > +{
> > > +	return queue_var_show(q->limits.needs_stable_pages, page);
> > > +}
> > > +
> > >  static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
> > >  {
> > >  	return sprintf(page, "%llu\n",
> > > @@ -364,6 +369,11 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
> > >  	.show = queue_discard_zeroes_data_show,
> > >  };
> > >  
> > > +static struct queue_sysfs_entry queue_needs_stable_pages_entry = {
> > > +	.attr = {.name = "needs_stable_pages", .mode = S_IRUGO },
> > > +	.show = queue_needs_stable_pages_show,
> > > +};
> > > +
> > >  static struct queue_sysfs_entry queue_write_same_max_entry = {
> > >  	.attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
> > >  	.show = queue_write_same_max_show,
> > > @@ -416,6 +426,7 @@ static struct attribute *default_attrs[] = {
> > >  	&queue_discard_granularity_entry.attr,
> > >  	&queue_discard_max_entry.attr,
> > >  	&queue_discard_zeroes_data_entry.attr,
> > > +	&queue_needs_stable_pages_entry.attr,
> > >  	&queue_write_same_max_entry.attr,
> > >  	&queue_nonrot_entry.attr,
> > >  	&queue_nomerges_entry.attr,
> > > diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> > > index 26eff46..146bed4 100644
> > > --- a/drivers/scsi/sd.c
> > > +++ b/drivers/scsi/sd.c
> > > @@ -1752,10 +1752,11 @@ static void sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffe
> > >  		return;
> > >  	}
> > >  
> > > -	if (scsi_host_dif_capable(sdp->host, type))
> > > +	if (scsi_host_dif_capable(sdp->host, type)) {
> > >  		sd_printk(KERN_NOTICE, sdkp,
> > >  			  "Enabling DIF Type %u protection\n", type);
> > > -	else
> > > +		sdkp->disk->queue->limits.needs_stable_pages = true;
> > > +	} else
> > >  		sd_printk(KERN_NOTICE, sdkp,
> > >  			  "Disabling DIF Type %u protection\n", type);
> > >  }
> > > diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
> > > index 0cb39ff..9dc330c 100644
> > > --- a/drivers/scsi/sd_dif.c
> > > +++ b/drivers/scsi/sd_dif.c
> > > @@ -338,6 +338,8 @@ void sd_dif_config_host(struct scsi_disk *sdkp)
> > >  	sd_printk(KERN_NOTICE, sdkp,
> > >  		  "Enabling DIX %s protection\n", disk->integrity->name);
> > >  
> > > +	disk->queue->limits.needs_stable_pages = true;
> > > +
> > >  	/* Signal to block layer that we support sector tagging */
> > >  	if (dif && type && sdkp->ATO) {
> > >  		if (type == SD_DIF_TYPE3_PROTECTION)
> > > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > > index 92956b7..a5a33db 100644
> > > --- a/include/linux/blkdev.h
> > > +++ b/include/linux/blkdev.h
> > > @@ -266,6 +266,8 @@ struct queue_limits {
> > >  	unsigned char		discard_misaligned;
> > >  	unsigned char		cluster;
> > >  	unsigned char		discard_zeroes_data;
> > > +
> > > +	bool			needs_stable_pages;
> > >  };
> > >  
> > >  struct request_queue {
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> > 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin K. Petersen - March 8, 2012, 4:36 p.m.
>>>>> "Ted" == Ted Ts'o <tytso@mit.edu> writes:

Ted> Martin, would you be willing to try to get your patch submitted for
Ted> the upcoming merge window?  Or I'd be willing to carry your patch
Ted> and then rework Darrick's to use the exported flag, and carry it in
Ted> my tree, maybe that would be better.

There's probably going to be some conflicts due to both topology updates
and the write same changes I have pending. So it's probably best that I
submit this patch as part of my kits for Jens and James. Should go out
today.


Ted> why should we be inflicting pretty severe performance regressions
Ted> for the common case, just to improve things for obscure high-end
Ted> hardware?

I'm perfectly ok with that now that we have established that there are
real world workloads that do suffer with the wait in place.
Sage Weil - March 8, 2012, 4:43 p.m.
On Thu, 8 Mar 2012, Ted Ts'o wrote:
> On Wed, Mar 07, 2012 at 10:27:43PM -0800, Sage Weil wrote:
> > 
> > This avoids the problem for devices that don't need stable pages, but 
> > doesn't help for those that do (btrfs, raid, iscsi, dif/dix, etc.).  It 
> > seems to me like a more elegant solution would be to COW the page in the 
> > address_space so that you get stable writeback pages without blocking.  
> > That's clearly more complex, and I'm sure there are a range of issues 
> > involved in making that work, but I would hope that it would be doable 
> > with generic MM infrastructure so that everyone would benefit.
> 
> Well, even doing a COW (or anything that involves messing with page
> tables) is not free.  So even if we can make the cost of stable
> writeback pages cheaper, if we can completely avoid the cost, this
> would be good.  I'd also rather fix the performance regression sooner
> rather than later, and I suspect the COW solution is not something
> that could be prepared in time for the upcoming merge window.

Definitely.  This patch looks like a fine approach for your situation. I 
just don't want the subject to come up without talking about a general 
solution.  And it's very interesting to hear about a (simple) workload 
that is affected by the wait_on_page_writeback().

Thanks-
sage



> Martin, would you be willing to try to get your patch submitted for
> the upcoming merge window?  Or I'd be willing to carry your patch and
> then rework Darrick's to use the exported flag, and carry it in my
> tree, maybe that would be better.
> 
> > I would love to talk to some MM people at LSF about what it would take to 
> > make this work...
> 
> Sure, long term, I'm much more sympathetic to iSCSI than I am about
> DIF/DIX (which due to drive manufacturer's pricing strategies I don't
> think it will ever become mainstream --- which was my main concern;
> why should we be inflicting pretty severe performance regressions for
> the common case, just to improve things for obscure high-end hardware?
> It's similar to how Solaris trashed 1 and 2 processor performance,
> because they were optimizing things for their high margin SunFires).
> So getting something which makes page stablization not suck so much in
> the long term seems like a fine goal.  But even though we've worked on
> improve SMP scalability in a way that didn't sacrifice 2 and 4-way
> processors, we've still supported compiling with !CONFIG_SMP....
> 
>      		       	   	      - Ted
> 
> > 
> > sage
> > 
> > 
> > 
> > > 
> > > When submitted I will also send a patch to set .needs_stable_pages in 
> > > iscsi when needed.
> > > 
> > > Thanks, Martin
> > > Boaz
> > > 
> > > > diff --git a/block/blk-settings.c b/block/blk-settings.c
> > > > index 5680b91..442a0df 100644
> > > > --- a/block/blk-settings.c
> > > > +++ b/block/blk-settings.c
> > > > @@ -125,6 +125,7 @@ void blk_set_default_limits(struct queue_limits *lim)
> > > >  	lim->io_opt = 0;
> > > >  	lim->misaligned = 0;
> > > >  	lim->cluster = 1;
> > > > +	lim->needs_stable_pages = false;
> > > >  }
> > > >  EXPORT_SYMBOL(blk_set_default_limits);
> > > >  
> > > > @@ -571,6 +572,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
> > > >  	t->cluster &= b->cluster;
> > > >  	t->discard_zeroes_data &= b->discard_zeroes_data;
> > > >  
> > > > +	t->needs_stable_pages &= b->needs_stable_pages;
> > > > +
> > > >  	/* Physical block size a multiple of the logical block size? */
> > > >  	if (t->physical_block_size & (t->logical_block_size - 1)) {
> > > >  		t->physical_block_size = t->logical_block_size;
> > > > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> > > > index 5b85d91..d464aca 100644
> > > > --- a/block/blk-sysfs.c
> > > > +++ b/block/blk-sysfs.c
> > > > @@ -161,6 +161,11 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
> > > >  	return queue_var_show(queue_discard_zeroes_data(q), page);
> > > >  }
> > > >  
> > > > +static ssize_t queue_needs_stable_pages_show(struct request_queue *q, char *page)
> > > > +{
> > > > +	return queue_var_show(q->limits.needs_stable_pages, page);
> > > > +}
> > > > +
> > > >  static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
> > > >  {
> > > >  	return sprintf(page, "%llu\n",
> > > > @@ -364,6 +369,11 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
> > > >  	.show = queue_discard_zeroes_data_show,
> > > >  };
> > > >  
> > > > +static struct queue_sysfs_entry queue_needs_stable_pages_entry = {
> > > > +	.attr = {.name = "needs_stable_pages", .mode = S_IRUGO },
> > > > +	.show = queue_needs_stable_pages_show,
> > > > +};
> > > > +
> > > >  static struct queue_sysfs_entry queue_write_same_max_entry = {
> > > >  	.attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
> > > >  	.show = queue_write_same_max_show,
> > > > @@ -416,6 +426,7 @@ static struct attribute *default_attrs[] = {
> > > >  	&queue_discard_granularity_entry.attr,
> > > >  	&queue_discard_max_entry.attr,
> > > >  	&queue_discard_zeroes_data_entry.attr,
> > > > +	&queue_needs_stable_pages_entry.attr,
> > > >  	&queue_write_same_max_entry.attr,
> > > >  	&queue_nonrot_entry.attr,
> > > >  	&queue_nomerges_entry.attr,
> > > > diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> > > > index 26eff46..146bed4 100644
> > > > --- a/drivers/scsi/sd.c
> > > > +++ b/drivers/scsi/sd.c
> > > > @@ -1752,10 +1752,11 @@ static void sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffe
> > > >  		return;
> > > >  	}
> > > >  
> > > > -	if (scsi_host_dif_capable(sdp->host, type))
> > > > +	if (scsi_host_dif_capable(sdp->host, type)) {
> > > >  		sd_printk(KERN_NOTICE, sdkp,
> > > >  			  "Enabling DIF Type %u protection\n", type);
> > > > -	else
> > > > +		sdkp->disk->queue->limits.needs_stable_pages = true;
> > > > +	} else
> > > >  		sd_printk(KERN_NOTICE, sdkp,
> > > >  			  "Disabling DIF Type %u protection\n", type);
> > > >  }
> > > > diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
> > > > index 0cb39ff..9dc330c 100644
> > > > --- a/drivers/scsi/sd_dif.c
> > > > +++ b/drivers/scsi/sd_dif.c
> > > > @@ -338,6 +338,8 @@ void sd_dif_config_host(struct scsi_disk *sdkp)
> > > >  	sd_printk(KERN_NOTICE, sdkp,
> > > >  		  "Enabling DIX %s protection\n", disk->integrity->name);
> > > >  
> > > > +	disk->queue->limits.needs_stable_pages = true;
> > > > +
> > > >  	/* Signal to block layer that we support sector tagging */
> > > >  	if (dif && type && sdkp->ATO) {
> > > >  		if (type == SD_DIF_TYPE3_PROTECTION)
> > > > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > > > index 92956b7..a5a33db 100644
> > > > --- a/include/linux/blkdev.h
> > > > +++ b/include/linux/blkdev.h
> > > > @@ -266,6 +266,8 @@ struct queue_limits {
> > > >  	unsigned char		discard_misaligned;
> > > >  	unsigned char		cluster;
> > > >  	unsigned char		discard_zeroes_data;
> > > > +
> > > > +	bool			needs_stable_pages;
> > > >  };
> > > >  
> > > >  struct request_queue {
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > 
> > > 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Lutomirski - March 15, 2012, 2:10 a.m.
On 03/08/2012 08:43 AM, Sage Weil wrote:
> On Thu, 8 Mar 2012, Ted Ts'o wrote:
>> On Wed, Mar 07, 2012 at 10:27:43PM -0800, Sage Weil wrote:
>>>
>>> This avoids the problem for devices that don't need stable pages, but 
>>> doesn't help for those that do (btrfs, raid, iscsi, dif/dix, etc.).  It 
>>> seems to me like a more elegant solution would be to COW the page in the 
>>> address_space so that you get stable writeback pages without blocking.  
>>> That's clearly more complex, and I'm sure there are a range of issues 
>>> involved in making that work, but I would hope that it would be doable 
>>> with generic MM infrastructure so that everyone would benefit.
>>
>> Well, even doing a COW (or anything that involves messing with page
>> tables) is not free.  So even if we can make the cost of stable
>> writeback pages cheaper, if we can completely avoid the cost, this
>> would be good.  I'd also rather fix the performance regression sooner
>> rather than later, and I suspect the COW solution is not something
>> that could be prepared in time for the upcoming merge window.
> 
> Definitely.  This patch looks like a fine approach for your situation. I 
> just don't want the subject to come up without talking about a general 
> solution.  And it's very interesting to hear about a (simple) workload 
> that is affected by the wait_on_page_writeback().

I'll add a simple workload.  I have a soft real-time program that has
two threads.  One of them fallocates some files, mmaps them, mlocks
them, and touches all the pages to prefault them.  (This thread has no
real-time constraints -- it just needs to keep up.)  The other thread
writes to the files.

On Windows, this works very well.  On Linux without stable pages, it
almost works.  With stable pages, it's a complete disaster.  No amount
of minimizing the amount of time that pages under writeback can cause
writers to sleep will help -- writers *must not wait for io* when
writing mlocked, prefaulted pages for my code to work.

(The other issue involves file_update_time.  I'll send a fix eventually.)

FWIW, it would be really nice if there was a way to lock a mapping so
hard that accesses are guaranteed to not even cause soft faults.  We're
far from being able to do that now, though.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Boaz Harrosh - March 15, 2012, 4:46 a.m.
On 03/14/2012 07:10 PM, Andy Lutomirski wrote:
> On 03/08/2012 08:43 AM, Sage Weil wrote:
>> On Thu, 8 Mar 2012, Ted Ts'o wrote:
>>> On Wed, Mar 07, 2012 at 10:27:43PM -0800, Sage Weil wrote:
>>>>
>>>> This avoids the problem for devices that don't need stable pages, but 
>>>> doesn't help for those that do (btrfs, raid, iscsi, dif/dix, etc.).  It 
>>>> seems to me like a more elegant solution would be to COW the page in the 
>>>> address_space so that you get stable writeback pages without blocking.  
>>>> That's clearly more complex, and I'm sure there are a range of issues 
>>>> involved in making that work, but I would hope that it would be doable 
>>>> with generic MM infrastructure so that everyone would benefit.
>>>
>>> Well, even doing a COW (or anything that involves messing with page
>>> tables) is not free.  So even if we can make the cost of stable
>>> writeback pages cheaper, if we can completely avoid the cost, this
>>> would be good.  I'd also rather fix the performance regression sooner
>>> rather than later, and I suspect the COW solution is not something
>>> that could be prepared in time for the upcoming merge window.
>>
>> Definitely.  This patch looks like a fine approach for your situation. I 
>> just don't want the subject to come up without talking about a general 
>> solution.  And it's very interesting to hear about a (simple) workload 
>> that is affected by the wait_on_page_writeback().
> 
> I'll add a simple workload.  I have a soft real-time program that has
> two threads.  One of them fallocates some files, mmaps them, mlocks
> them, and touches all the pages to prefault them.  (This thread has no
> real-time constraints -- it just needs to keep up.)  The other thread
> writes to the files.
> 
> On Windows, this works very well.  On Linux without stable pages, it
> almost works.  With stable pages, it's a complete disaster.  No amount
> of minimizing the amount of time that pages under writeback can cause
> writers to sleep will help -- writers *must not wait for io* when
> writing mlocked, prefaulted pages for my code to work.
> 

Right, this is Windows shit. If your goal is to never wait, IO
as fast as possible, and use the least amount of CPU time
then it's exactly the opposite of what you want to do.
You want to do async Direct IO.

Also as Dave Chinner said "Double/ring buffering with async
IO dispatch"

BTW Even On windows there are much better ways to do this. Also
there ring buffering with async direct IO.

> (The other issue involves file_update_time.  I'll send a fix eventually.)
> 
> FWIW, it would be really nice if there was a way to lock a mapping so
> hard that accesses are guaranteed to not even cause soft faults.  We're
> far from being able to do that now, though.
> 
> --Andy

Cheers
Boaz
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Lutomirski - March 15, 2012, 5:02 a.m.
On Wed, Mar 14, 2012 at 9:46 PM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 03/14/2012 07:10 PM, Andy Lutomirski wrote:
>> On 03/08/2012 08:43 AM, Sage Weil wrote:
>> I'll add a simple workload.  I have a soft real-time program that has
>> two threads.  One of them fallocates some files, mmaps them, mlocks
>> them, and touches all the pages to prefault them.  (This thread has no
>> real-time constraints -- it just needs to keep up.)  The other thread
>> writes to the files.
>>
>> On Windows, this works very well.  On Linux without stable pages, it
>> almost works.  With stable pages, it's a complete disaster.  No amount
>> of minimizing the amount of time that pages under writeback can cause
>> writers to sleep will help -- writers *must not wait for io* when
>> writing mlocked, prefaulted pages for my code to work.
>>
>
> Right, this is Windows shit. If your goal is to never wait, IO
> as fast as possible, and use the least amount of CPU time
> then it's exactly the opposite of what you want to do.
> You want to do async Direct IO.

I don't care if the io is as fast as possible and I don't care about
cpu time (that's what multicore is for).  I care about real-time
threads not waiting.  And I'm not sure why the superior performance on
Windows is "shit".

In any case, there's no possible way that async direct I/O is as
low-latency as memcpy for short writes.

>
> Also as Dave Chinner said "Double/ring buffering with async
> IO dispatch"

That's certainly an option.  In fact, any program that has issues with
write latency due to stable pages or any other cause can do this.  But
it's more code, it's more likely to lose data on an application crash,
and it's annoying since the kernel *already* has a perfectly good page
cache and mmap mechanism that ought to work.

Also, double buffering is kind of a PITA when other threads might be
trying to real the same file at the same time.  That means I have to
get them all to find the same mapping, which can't live on an ext4
filesystem, requiring a daemon.

So I still claim my use case is valid, and I think Linux should handle
it better than it does now.  The proposed fix of getting rid of stable
pages when they're not needed is good enough for me.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5680b91..442a0df 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -125,6 +125,7 @@  void blk_set_default_limits(struct queue_limits *lim)
 	lim->io_opt = 0;
 	lim->misaligned = 0;
 	lim->cluster = 1;
+	lim->needs_stable_pages = false;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
@@ -571,6 +572,8 @@  int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->cluster &= b->cluster;
 	t->discard_zeroes_data &= b->discard_zeroes_data;
 
+	t->needs_stable_pages &= b->needs_stable_pages;
+
 	/* Physical block size a multiple of the logical block size? */
 	if (t->physical_block_size & (t->logical_block_size - 1)) {
 		t->physical_block_size = t->logical_block_size;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5b85d91..d464aca 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -161,6 +161,11 @@  static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
 	return queue_var_show(queue_discard_zeroes_data(q), page);
 }
 
+static ssize_t queue_needs_stable_pages_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.needs_stable_pages, page);
+}
+
 static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
 {
 	return sprintf(page, "%llu\n",
@@ -364,6 +369,11 @@  static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
 	.show = queue_discard_zeroes_data_show,
 };
 
+static struct queue_sysfs_entry queue_needs_stable_pages_entry = {
+	.attr = {.name = "needs_stable_pages", .mode = S_IRUGO },
+	.show = queue_needs_stable_pages_show,
+};
+
 static struct queue_sysfs_entry queue_write_same_max_entry = {
 	.attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
 	.show = queue_write_same_max_show,
@@ -416,6 +426,7 @@  static struct attribute *default_attrs[] = {
 	&queue_discard_granularity_entry.attr,
 	&queue_discard_max_entry.attr,
 	&queue_discard_zeroes_data_entry.attr,
+	&queue_needs_stable_pages_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 26eff46..146bed4 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1752,10 +1752,11 @@  static void sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffe
 		return;
 	}
 
-	if (scsi_host_dif_capable(sdp->host, type))
+	if (scsi_host_dif_capable(sdp->host, type)) {
 		sd_printk(KERN_NOTICE, sdkp,
 			  "Enabling DIF Type %u protection\n", type);
-	else
+		sdkp->disk->queue->limits.needs_stable_pages = true;
+	} else
 		sd_printk(KERN_NOTICE, sdkp,
 			  "Disabling DIF Type %u protection\n", type);
 }
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index 0cb39ff..9dc330c 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -338,6 +338,8 @@  void sd_dif_config_host(struct scsi_disk *sdkp)
 	sd_printk(KERN_NOTICE, sdkp,
 		  "Enabling DIX %s protection\n", disk->integrity->name);
 
+	disk->queue->limits.needs_stable_pages = true;
+
 	/* Signal to block layer that we support sector tagging */
 	if (dif && type && sdkp->ATO) {
 		if (type == SD_DIF_TYPE3_PROTECTION)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 92956b7..a5a33db 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -266,6 +266,8 @@  struct queue_limits {
 	unsigned char		discard_misaligned;
 	unsigned char		cluster;
 	unsigned char		discard_zeroes_data;
+
+	bool			needs_stable_pages;
 };
 
 struct request_queue {