[V1,02/17] ext4: Add the basic function for inline data support.

Message ID 1319614468-11227-2-git-send-email-tm@tao.ma
State Superseded, archived

Commit Message

Tao Ma Oct. 26, 2011, 7:34 a.m. UTC
From: Tao Ma <boyu.mt@taobao.com>

Implement inline data with an xattr. This idea was inspired by Andreas.
We now use the xattr "system.data" to store the inline data.
For inode_size = 256, we currently use all the space between i_extra_isize
and inode_size. For inode_size > 256, we use half of that space.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ext4/ext4.h  |    5 ++
 fs/ext4/inode.c |    9 ++-
 fs/ext4/xattr.c |  216 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/xattr.h |   68 +++++++++++++++++
 4 files changed, 295 insertions(+), 3 deletions(-)

Comments

Andreas Dilger Oct. 26, 2011, 8:36 a.m. UTC | #1
On 2011-10-26, at 1:34 AM, Tao Ma wrote:
> Implement inline data with xattr. This idea is inspired by Andreas.
> So now we use "system.data" to store xattr.  For inode_size = 256,
> currently we uses all the space between i_extra_isize and inode_size.
> For inode_size > 256, we use half of that space.
> 
> +#define EXT4_XATTR_SYSTEM_DATA_NAME		"data"

Did you check whether the "data" string takes 4 bytes or 8 in the
xattr header?  I believe it is storing the NUL termination (and
the code is using "sizeof(EXT4_XATTR_SYSTEM_DATA_NAME)" = 5), so
I believe it will round up to the next 4-byte boundary.  Using a
3-byte name will save a bit of space, like "system.dat".
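
(A minimal userspace sketch of the entry-size arithmetic under discussion,
assuming the EXT4_XATTR_LEN()/EXT4_XATTR_PAD definitions from
fs/ext4/xattr.h and a 16-byte struct ext4_xattr_entry; it is not code from
the patch itself.)

#include <stdio.h>
#include <string.h>

#define ENTRY_HDR_SIZE		16	/* sizeof(struct ext4_xattr_entry) */
#define EXT4_XATTR_PAD		4
#define EXT4_XATTR_ROUND	(EXT4_XATTR_PAD - 1)
#define EXT4_XATTR_LEN(name_len) \
	(((name_len) + EXT4_XATTR_ROUND + ENTRY_HDR_SIZE) & ~EXT4_XATTR_ROUND)

int main(void)
{
	/* strlen("data") == 4: name already 4-byte aligned, 20-byte entry */
	printf("strlen: %u\n", EXT4_XATTR_LEN((unsigned)strlen("data")));
	/* sizeof("data") == 5 (includes the NUL): rounds up to a 24-byte entry */
	printf("sizeof: %zu\n", EXT4_XATTR_LEN(sizeof("data")));
	return 0;
}

With a 4-byte name the entry stays at 20 bytes; counting the NUL (5 bytes)
pads the entry out to 24.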

> +void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
> +			    void *buffer, loff_t pos, unsigned len)
> +{
> +	header = IHDR(inode, ext4_raw_inode(iloc));
> +	entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
> +					    EXT4_I(inode)->i_inline_off);
> +	memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
> +	       buffer + pos, len);
> +}
> +
> +int ext4_init_inline_data(handle_t *handle, struct inode *inode,
> +			  struct ext4_iloc *iloc)
> +{
> +	size = ext4_get_max_inline_size(inode);
> +	value = kzalloc(size, GFP_NOFS);
> +	if (!value)
> +		return -ENOMEM;
> +
> +	error = ext4_xattr_ibody_set(handle, inode, &i, &is);
> +}

Since file data is changed very rarely, instead of consuming the full
xattr space that may not be needed, wouldn't it be better to change
ext4_write_inline_data() to just do the ext4_xattr_ibody_set() to save
the exact-sized buffer into the xattr?  That will allow other xattrs
to be stored in this space as well as the inline data.


Cheers, Andreas





Tao Ma Oct. 26, 2011, 2:38 p.m. UTC | #2
On 10/26/2011 04:36 PM, Andreas Dilger wrote:
> On 2011-10-26, at 1:34 AM, Tao Ma wrote:
>> Implement inline data with xattr. This idea is inspired by Andreas.
>> So now we use "system.data" to store xattr.  For inode_size = 256,
>> currently we uses all the space between i_extra_isize and inode_size.
>> For inode_size > 256, we use half of that space.
>>
>> +#define EXT4_XATTR_SYSTEM_DATA_NAME		"data"
> 
> Did you check whether the "data" string takes 4 bytes or 8 in the
> xattr header?  I believe it is storing the NUL termination (and
> the code is using "sizeof(EXT4_XATTR_SYSTEM_DATA_NAME)" = 5), so
> I believe it will round up to the next 4-byte boundary.  Using a
> 3-byte name will save a bit of space, like "system.dat".
Actually it will take 4 bytes instead of 8, since ext4_xattr_set_entry
uses memcpy to copy the name, with the namelen initialized from
strlen("data"). So "data" is fine here. As for the calculation of
max_inline_size, yes, it uses sizeof(EXT4_XATTR_SYSTEM_DATA_NAME), which
should be changed to strlen. Thanks for the advice.
> 
>> +void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
>> +			    void *buffer, loff_t pos, unsigned len)
>> +{
>> +	header = IHDR(inode, ext4_raw_inode(iloc));
>> +	entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
>> +					    EXT4_I(inode)->i_inline_off);
>> +	memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
>> +	       buffer + pos, len);
>> +}
>> +
>> +int ext4_init_inline_data(handle_t *handle, struct inode *inode,
>> +			  struct ext4_iloc *iloc)
>> +{
>> +	size = ext4_get_max_inline_size(inode);
>> +	value = kzalloc(size, GFP_NOFS);
>> +	if (!value)
>> +		return -ENOMEM;
>> +
>> +	error = ext4_xattr_ibody_set(handle, inode, &i, &is);
>> +}
> 
> Since file data is changed very rarely, instead of consuming the full
> xattr space that may not be needed, wouldn't it be better to change
> ext4_write_inline_data() to just to the ext4_xattr_ibody_set() to save
> the exact-sized buffer into the xattr?  That will allow other xattrs
> to be stored in this space as well as the inline data.
I am just worried about the cpu usage. You know, the xattr values in
ext4 have to be packed, so if we change the content of an inline file
frequently (say by appending), the inline xattr value will be removed and
re-added frequently, which would consume many cpu cycles. What's more, the
other xattr values also have to be moved if they are not aligned to the
end of the inode. I am not sure whether that is good for performance or
not. Another side effect is that we would have to write the whole inline
data every time, since a new xattr value replaces the old one on every
write, while the current solution just needs to memcpy the appended bytes.

Thanks
Tao
Andreas Dilger Oct. 26, 2011, 10:28 p.m. UTC | #3
On 2011-10-26, at 8:38 AM, Tao Ma wrote:
> On 10/26/2011 04:36 PM, Andreas Dilger wrote:
>> On 2011-10-26, at 1:34 AM, Tao Ma wrote:
>>> +void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
>>> +			    void *buffer, loff_t pos, unsigned len)
>>> +{
>>> +	header = IHDR(inode, ext4_raw_inode(iloc));
>>> +	entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
>>> +					    EXT4_I(inode)->i_inline_off);
>>> +	memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
>>> +	       buffer + pos, len);
>>> +}
>>> +
>>> +int ext4_init_inline_data(handle_t *handle, struct inode *inode,
>>> +			  struct ext4_iloc *iloc)
>>> +{
>>> +	size = ext4_get_max_inline_size(inode);
>>> +	value = kzalloc(size, GFP_NOFS);
>>> +	if (!value)
>>> +		return -ENOMEM;
>>> +
>>> +	error = ext4_xattr_ibody_set(handle, inode, &i, &is);
>>> +}
>> 
>> Since file data is changed very rarely, instead of consuming the full
>> xattr space that may not be needed, wouldn't it be better to change
>> ext4_write_inline_data() to just to the ext4_xattr_ibody_set() to save
>> the exact-sized buffer into the xattr?  That will allow other xattrs
>> to be stored in this space as well as the inline data.
> 
> I am just worried about the cpu usage. You know, the xattr values in
> ext4 has to be packed so if we change the content of an inline file
> frequently(say append), the inline xattr value will be removed and added
> frequently which should consume much cpu cycles.  What's more, the other
> xattr values has to be moved also if they are not aligned to the end of
> the inode. I am not sure whether it is good for performance or not.

I'd also guess it isn't the most CPU efficient mechanism, but the main
question is whether this extra CPU usage is even noticeable compared
to the IO time?  Even with the added CPU usage, there is a dramatic
reduction in the IO (no external block to write), so it would always
be a net win to do it that way.

> Another side effect is that we have to write the whole inline data every
> time as a new xattr value replace every time while the current solution
> just needs to memcpy the appended bytes.

What about only storing a new xattr if the file size is increasing, or
when it is truncated to zero?  If the write is <= existing xattr size
then it can use the same mechanism as today (inline overwrite of the
xattr buffer, and update of the xattr checksum).  That avoids overhead
for the case of repeatedly writing a small same-size value into the file.
If some application is appending 1 byte at a time to a file, I think
the CPU overhead in the xattr code is the least of their worries.
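
(A rough sketch of the write policy proposed here, for illustration only:
overwrite in place while the data still fits, and only re-store the xattr
when the inline size has to grow. The two ext4_* calls are the ones quoted
above from the patch; the other helper name is made up.)

static int inline_data_store(handle_t *handle, struct inode *inode,
			     struct ext4_iloc *iloc, void *buf,
			     loff_t pos, unsigned int len)
{
	unsigned int cur_size = EXT4_I(inode)->i_inline_size;

	if (pos + len <= cur_size) {
		/* same-size or shorter write: memcpy into the existing value */
		ext4_write_inline_data(inode, iloc, buf, pos, len);
		return 0;
	}

	/* growing write: re-store an exact-sized xattr value (hypothetical) */
	return inline_data_grow_and_store(handle, inode, iloc, buf, pos + len);
}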

The main reason I don't like to consume all of the xattr space right
away is that this will cause OTHER xattrs to immediately be pushed
into the external xattr block (e.g. selinux, security, etc) and then
we will be even worse off than before (file data in inode, xattr in
external block, and added complexity for no benefit).

Cheers, Andreas





Tao Ma Oct. 27, 2011, 12:51 a.m. UTC | #4
On 10/27/2011 06:28 AM, Andreas Dilger wrote:
> On 2011-10-26, at 8:38 AM, Tao Ma wrote:
>> On 10/26/2011 04:36 PM, Andreas Dilger wrote:
>>> On 2011-10-26, at 1:34 AM, Tao Ma wrote:
>>>> +void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
>>>> +			    void *buffer, loff_t pos, unsigned len)
>>>> +{
>>>> +	header = IHDR(inode, ext4_raw_inode(iloc));
>>>> +	entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
>>>> +					    EXT4_I(inode)->i_inline_off);
>>>> +	memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
>>>> +	       buffer + pos, len);
>>>> +}
>>>> +
>>>> +int ext4_init_inline_data(handle_t *handle, struct inode *inode,
>>>> +			  struct ext4_iloc *iloc)
>>>> +{
>>>> +	size = ext4_get_max_inline_size(inode);
>>>> +	value = kzalloc(size, GFP_NOFS);
>>>> +	if (!value)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	error = ext4_xattr_ibody_set(handle, inode, &i, &is);
>>>> +}
>>>
>>> Since file data is changed very rarely, instead of consuming the full
>>> xattr space that may not be needed, wouldn't it be better to change
>>> ext4_write_inline_data() to just to the ext4_xattr_ibody_set() to save
>>> the exact-sized buffer into the xattr?  That will allow other xattrs
>>> to be stored in this space as well as the inline data.
>>
>> I am just worried about the cpu usage. You know, the xattr values in
>> ext4 has to be packed so if we change the content of an inline file
>> frequently(say append), the inline xattr value will be removed and added
>> frequently which should consume much cpu cycles.  What's more, the other
>> xattr values has to be moved also if they are not aligned to the end of
>> the inode. I am not sure whether it is good for performance or not.
> 
> I'd also guess it isn't the most CPU efficient mechanism, but the main
> question is whether this extra CPU usage is even noticeable compared
> to the IO time?  Even with the added CPU usage, there is a dramatic
> reduction in the IO (no external block to write), so it would always
> be a net win to do it that way.
It seems so. Anyway, I will do some tests for file appending to see how
much these two methods differ.
> 
>> Another side effect is that we have to write the whole inline data every
>> time as a new xattr value replace every time while the current solution
>> just needs to memcpy the appended bytes.
> 
> What about only storing a new xattr if the file size is increasing, or
> when it is truncated to zero?  If the write is <= existing xattr size
> then it can use the same mechanism as today (inline overwrite of the
> xattr buffer, and update of the xattr checksum).  That avoids overhead
> for the case of repeatedly writing a small same-size value into the file.
> If some application is appending 1 byte at a time to a file, I think
> the CPU overhead in the xattr code is the least of their worries.
> 
> The main reason I don't like to consume all of the xattr space right
> away is that this will cause OTHER xattrs to immediately be pushed
> into the external xattr block (e.g. selinux, security, etc) and then
> we will be even worse off than before (file data in inode, xattr in
> external block, and added complexity for no benefit).
To be honest, with inode size = 256, we don't have much space left in
the inode. With the current i_extra_isize of 28, we have only 92 bytes
left for xattrs (4 bytes for the xattr header magic and 4 bytes for the
gap between the ext4_xattr_entry and the value: 256 - 128 - 28 - 4 - 4).
So considering that one ext4_xattr_entry takes 16 bytes, and with the
minimum namelen of 4, if we support 2 entries (one for inline data and
one for a real xattr), the entries will take 40 bytes and only 52 bytes
are left. I don't think those bytes are enough for 2 xattr values. ;) So
why not take all of them (72 bytes)? For inode size > 256, the inline
data will only take half of the space left and leave the rest for other
xattrs. Does it make sense?
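
(A quick cross-check of that budget, using only the numbers given above:
128-byte fixed inode, i_extra_isize of 28, 4-byte header magic, 4-byte
gap, and 20 bytes per entry with a 4-byte name.)

#include <stdio.h>

int main(void)
{
	int inode_size = 256, fixed = 128, extra_isize = 28;
	int hdr_magic = 4, gap = 4;
	int entry = 16 + 4;	/* entry header plus padded 4-byte name */

	int avail = inode_size - fixed - extra_isize - hdr_magic - gap;

	printf("bytes left for xattrs:    %d\n", avail);		/* 92 */
	printf("value space, one entry:   %d\n", avail - entry);	/* 72 */
	printf("value space, two entries: %d\n", avail - 2 * entry);	/* 52 */
	return 0;
}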

btw, I have no idea what a normal acl xattr takes, but if it takes more
than 10 bytes, it will make the inline dir almost useless, since we have
to store dot and dotdot first and then the real file names. Too small a
space isn't good and just adds the overhead of converting from inline to
an external block.

Thanks
Tao
Andreas Dilger Oct. 27, 2011, 9:57 a.m. UTC | #6
On 2011-10-26, at 6:51 PM, Tao Ma <tm@tao.ma> wrote:
> On 10/27/2011 06:28 AM, Andreas Dilger wrote:
>> On 2011-10-26, at 8:38 AM, Tao Ma wrote:
>>> On 10/26/2011 04:36 PM, Andreas Dilger wrote:
>>>> Since file data is changed very rarely, instead of consuming the full
>>>> xattr space that may not be needed, wouldn't it be better to change
>>>> ext4_write_inline_data() to just to the ext4_xattr_ibody_set() to save
>>>> the exact-sized buffer into the xattr?  That will allow other xattrs
>>>> to be stored in this space as well as the inline data.
>>> 
>>> I am just worried about the cpu usage. You know, the xattr values in
>>> ext4 has to be packed so if we change the content of an inline file
>>> frequently (say append), the inline xattr value will be removed and added

Given the small size of the space, it seems unlikely that apps would be growing the size of the file many times before it overflowed the inode xattr space, so I don't think this is a valid concern. I think such small files will normally be written once at the proper size, or if they are written repeatedly the offset will not change. 

>>> frequently which should consume much cpu cycles.  What's more, the other
>>> xattr values has to be moved also if they are not aligned to the end of
>>> the inode. I am not sure whether it is good for performance or not.
>> 
>> I'd also guess it isn't the most CPU efficient mechanism, but the main
>> question is whether this extra CPU usage is even noticeable compared
>> to the IO time?  Even with the added CPU usage, there is a dramatic
>> reduction in the IO (no external block to write), so it would always
>> be a net win to do it that way.

> It seems so. anyway, I will do some tests for file appending to see how
> much these 2 methods differs.

Great. 

>>> Another side effect is that we have to write the whole inline data every
>>> time as a new xattr value replace every time while the current solution
>>> just needs to memcpy the appended bytes.
>> 
>> What about only storing a new xattr if the file size is increasing, or
>> when it is truncated to zero?  If the write is <= existing xattr size
>> then it can use the same mechanism as today (inline overwrite of the
>> xattr buffer, and update of the xattr checksum).  That avoids overhead
>> for the case of repeatedly writing a small same-size value into the file.
>> If some application is appending 1 byte at a time to a file, I think
>> the CPU overhead in the xattr code is the least of their worries.
>> 
>> The main reason I don't like to consume all of the xattr space right
>> away is that this will cause OTHER xattrs to immediately be pushed
>> into the external xattr block (e.g. selinux, security, etc) and then
>> we will be even worse off than before (file data in inode, xattr in
>> external block, and added complexity for no benefit).

> To be honest, with inode size = 256, we don't have much spaces left in
> the inode. With current i_extra_isize 28, we have only 92 bytes left for
> xattrs(4 bytes for the xattr header magic and 4 bytes for the gap
> between ext4_xattr_entry and the value, 256 - 128 - 28 - 4 - 4). So
> considering one ext4_xattr_entry have 16 bytes and with the miminum
> namelen of 4, if we support 2 entries(one for inline data and one for a
> real xattr), these will take 40 bytes. And only 52 bytes are left. I
> don't think these bytes are enough for 2 xattr values. ;)

This is enough for an empty dir (24 bytes) and another 28 bytes for a security xattr. 

> So why not
> take all of them(72 bytes)? As for inode size > 256, the inline data
> will only takes half of the spaces left and leaves the space for other
> xattrs. Does it make sense?

No, because if ANY other xattr exists it will be pushed to an external block, and then if this data xattr grows (much more likely than the other xattr changing) it won't fit into the inode, and now performance is permanently worse than before. 

> btw, I have no idea of what a normal acl xattr takes, but if it takes
> more than 10 bytes, it will almost make the inline dir almost no use,
> since we have to store dot and dotdot first and then the real file
> names. Too small space isn't good but adds overhead of converting from
> inline to external block.

In our environment we use at least 512-byte inodes on the metadata server, but I still don't want half of that space wasted on this xattr if so much of it is not needed.

Cheers, Andreas
Tao Ma Oct. 27, 2011, 2:53 p.m. UTC | #7
On 10/27/2011 05:57 PM, Andreas Dilger wrote:
> On 2011-10-26, at 6:51 PM, Tao Ma <tm@tao.ma> wrote:
>> On 10/27/2011 06:28 AM, Andreas Dilger wrote:
>>> On 2011-10-26, at 8:38 AM, Tao Ma wrote:
>>>> On 10/26/2011 04:36 PM, Andreas Dilger wrote:
>>>>> Since file data is changed very rarely, instead of consuming the full
>>>>> xattr space that may not be needed, wouldn't it be better to change
>>>>> ext4_write_inline_data() to just to the ext4_xattr_ibody_set() to save
>>>>> the exact-sized buffer into the xattr?  That will allow other xattrs
>>>>> to be stored in this space as well as the inline data.
>>>>
>>>> I am just worried about the cpu usage. You know, the xattr values in
>>>> ext4 has to be packed so if we change the content of an inline file
>>>> frequently (say append), the inline xattr value will be removed and added
> 
> Given the small size of the space, it seems unlikely that apps would be growing the size of the file many times before it overflowed the inode xattr space, so I don't think this is a valid concern. I think such small files will normally be written once at the proper size, or if they are written repeatedly the offset will not change. 
OK.
> 
>>>> frequently which should consume much cpu cycles.  What's more, the other
>>>> xattr values has to be moved also if they are not aligned to the end of
>>>> the inode. I am not sure whether it is good for performance or not.
>>>
>>> I'd also guess it isn't the most CPU efficient mechanism, but the main
>>> question is whether this extra CPU usage is even noticeable compared
>>> to the IO time?  Even with the added CPU usage, there is a dramatic
>>> reduction in the IO (no external block to write), so it would always
>>> be a net win to do it that way.
> 
>> It seems so. anyway, I will do some tests for file appending to see how
>> much these 2 methods differs.
> 
> Great. 
> 
>>>> Another side effect is that we have to write the whole inline data every
>>>> time as a new xattr value replace every time while the current solution
>>>> just needs to memcpy the appended bytes.
>>>
>>> What about only storing a new xattr if the file size is increasing, or
>>> when it is truncated to zero?  If the write is <= existing xattr size
>>> then it can use the same mechanism as today (inline overwrite of the
>>> xattr buffer, and update of the xattr checksum).  That avoids overhead
>>> for the case of repeatedly writing a small same-size value into the file.
>>> If some application is appending 1 byte at a time to a file, I think
>>> the CPU overhead in the xattr code is the least of their worries.
>>>
>>> The main reason I don't like to consume all of the xattr space right
>>> away is that this will cause OTHER xattrs to immediately be pushed
>>> into the external xattr block (e.g. selinux, security, etc) and then
>>> we will be even worse off than before (file data in inode, xattr in
>>> external block, and added complexity for no benefit).
> 
>> To be honest, with inode size = 256, we don't have much spaces left in
>> the inode. With current i_extra_isize 28, we have only 92 bytes left for
>> xattrs(4 bytes for the xattr header magic and 4 bytes for the gap
>> between ext4_xattr_entry and the value, 256 - 128 - 28 - 4 - 4). So
>> considering one ext4_xattr_entry have 16 bytes and with the miminum
>> namelen of 4, if we support 2 entries(one for inline data and one for a
>> real xattr), these will take 40 bytes. And only 52 bytes are left. I
>> don't think these bytes are enough for 2 xattr values. ;)
> 
> This is enough for an empty dir (24 bytes) and another 28 bytes for a security xattr. 
> 
>> So why not
>> take all of them(72 bytes)? As for inode size > 256, the inline data
>> will only takes half of the spaces left and leaves the space for other
>> xattrs. Does it make sense?
> 
> No, because if ANY other xattr exists it will be pushed to an external block, and then if this data xattr grows (much more likely than the other xattr changing) it won't fit into the inode, and now performance is permanently worse than before. 
OK, since it seems that Lustre uses xattrs heavily, I will try my best to
avoid a performance regression for xattr operations.
> 
>> btw, I have no idea of what a normal acl xattr takes, but if it takes
>> more than 10 bytes, it will almost make the inline dir almost no use,
>> since we have to store dot and dotdot first and then the real file
>> names. Too small space isn't good but adds overhead of converting from
>> inline to external block.
> 
> In our environment we use at least 512-byte inodes on the metadata server, but I still don't want half if that space wasted on this xattr if so much is not needed. 
Thanks for the info.

btw, I have another idea: use the unused extent space for storing inline
data, like what we do for a symlink. I would still use an xattr entry to
indicate whether the inode has inline data or not. If yes, the initial
xattr value len will be zero while the extent space (60 bytes) will be
used to store the inline data. And if the file size grows larger than 60,
we begin to insert xattr values. In that case, we support inline data and
don't use too much space after i_extra_isize. What do you think of it?
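
(For illustration, a hypothetical read path for that layout, assuming the
first 60 bytes live in the unused i_block[] area as with fast symlinks and
only the overflow is kept in the "system.data" value; this is not code
from the posted series.)

static void inline_data_read(struct inode *inode, struct ext4_iloc *iloc,
			     void *buffer, size_t len)
{
	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
	size_t in_iblock = min_t(size_t, len, 60);	/* sizeof(i_block) */

	/* the first 60 bytes come from the extent/i_block area */
	memcpy(buffer, raw_inode->i_block, in_iblock);

	/* anything beyond that would live in the system.data xattr value */
	if (len > in_iblock)
		ext4_read_inline_data(inode, iloc, buffer + in_iblock,
				      len - in_iblock);
}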

Thanks
Tao
Andreas Dilger Nov. 2, 2011, 9:16 p.m. UTC | #8
On 2011-10-27, at 8:53 AM, Tao Ma wrote:
> On 10/27/2011 05:57 PM, Andreas Dilger wrote:
>> if ANY other xattr exists it will be pushed to an external block, and
>> then if this data xattr grows (much more likely than the other xattr
>> changing) it won't fit into the inode, and now performance is
>> permanently worse than before.
> 
> OK, since it seems that lustre uses xattr heavily, I will try my best to
> avoid the performance regression for xattr operations.

I don't even think it is very much a Lustre problem, since it always
stores file data in a separate filesystem from the metadata, and
60-byte files are going to have terrible performance either way.

My main concern is for SELinux (enabled by default on most systems
today).  If the "small file" data is stored in the xattr space, and
this pushes the SELinux xattr to an external block, we have added
code complexity and a gratuitous format change (data in inode and
metadata in block, instead of metadata in inode and data in block)
with no real benefit at all.

>> In our environment we use at least 512-byte inodes on the metadata server, but I still don't want half if that space wasted on this xattr if so much is not needed.
> 
> btw, I have another idea about using the not-used extent space for
> storing inline data like what we do for a symlink. So I will still use a
> xattr entry to indicate whether the inode will have inline data or not.
> If yes, the initialized xattr value len will be zero while the extent
> space(60 bytes) will be used to store the inline data. And if the file
> size is larger than 60, it will begin to insert xattr values. In such
> case, we supports inline data and don't use too much space after the
> i_extra_isize. What do you think of it?

I think this is an interesting idea.  Since only the "data" xattr could
use this space, it gives us an extra 60 bytes of space to be used in
the inode and does not consume the xattr space.  The main drawback is
that this would add special case handling based on the xattr name, but
I think it is worthwhile to investigate how complex that code is and
what kind of performance improvement it gives.

Looking at my FC13 installation, it seems like a large number of
files could benefit from just 60 bytes of inline storage.  So more
than 10% of all of the files on the filesystem would fit in i_blocks.
This filesystem includes a lot of source and build files, but I also
think this is pretty typical of normal Linux usage.

# find / -xdev -type f -size -61c | wc -l
35661
# find / -xdev -type f | wc -l
335515

The "fsstats" tool is useful for collecting interesting data like this:
(http://www.pdsi-scidac.org/fsstats/files/fsstats-1.4.5.tar.gz)
and it shows the same is true for directories as well:


directory size (entries): Range of entries, count of directories in range,
 number of dirs in range as a % of total num of dirs, number of dirs in
 this range or smaller as a % total number of dirs, total entries in range,
 number of entries in range as a % of total number of entries, number of
 entries in this range or smaller as a % of total number of entries.

  count=33476 avg=11.38 ents
  min=0.00 ents max=3848.00 ents
  [   0-   1 ents]: 11257 (33.63%) ( 33.63%)   9968.00 ents ( 2.62%) (  2.62%)
  [   2-   3 ents]:  7080 (21.15%) ( 54.78%)  16608.00 ents ( 4.36%) (  6.97%)
  [   4-   7 ents]:  5793 (17.30%) ( 72.08%)  30674.00 ents ( 8.05%) ( 15.02%)
  [   8-  15 ents]:  3971 (11.86%) ( 83.94%)  43315.00 ents (11.37%) ( 26.39%)
  [  16-  31 ents]:  2731 ( 8.16%) ( 92.10%)  59612.00 ents (15.64%) ( 42.04%)
  [  32-  63 ents]:  1610 ( 4.81%) ( 96.91%)  69326.00 ents (18.19%) ( 60.23%)
  [  64- 127 ents]:   705 ( 2.11%) ( 99.02%)  61633.00 ents (16.17%) ( 76.40%)
  [ 128- 255 ents]:   236 ( 0.70%) ( 99.72%)  40005.00 ents (10.50%) ( 86.90%)
  [ 256- 511 ents]:    66 ( 0.20%) ( 99.92%)  21923.00 ents ( 5.75%) ( 92.66%)
  [ 512-1023 ents]:    19 ( 0.06%) ( 99.98%)  14249.00 ents ( 3.74%) ( 96.40%)
  [1024-2047 ents]:     6 ( 0.02%) ( 99.99%)   7756.00 ents ( 2.04%) ( 98.43%)
  [2048-4095 ents]:     2 ( 0.01%) (100.00%)   5979.00 ents ( 1.57%) (100.00%)

A simple test of the performance gains might be running "file" on
everything in /etc and /usr, and measuring this with blktrace to
see what kind of seek reduction is seen from not doing seeks to
read the small files from an external block.



I think it is still useful to try to store the data in the large inode
xattr space if it is larger than i_blocks, especially for larger inodes,
but if there is not enough space for all the xattrs to fit into the
inode, I think "data" should be the first one to be pushed out of the
inode since that changes the format back to a normal ext* file.


We might also consider a reiserfs-like approach where multiple small
files could be packed into the same shared xattr block, but then the
xattr name would need to change from "data" to e.g. "inode.generation"
so that it can be located within the block shared between inodes.

Tail packing is more complex, so such a change would only make sense
if real-world testing showed a benefit.  There is already the concept
of shared external xattr blocks, so maybe it isn't too bad.  Together
with bigalloc, it might make sense to be able to pack many small files
into one cluster if there is a binomial distribution of file sizes?

Cheers, Andreas
--
Andreas Dilger 
Principal Engineer
Whamcloud, Inc.



Tao Ma Nov. 3, 2011, 4:23 a.m. UTC | #9
On 11/03/2011 05:16 AM, Andreas Dilger wrote:
> On 2011-10-27, at 8:53 AM, Tao Ma wrote:
>> On 10/27/2011 05:57 PM, Andreas Dilger wrote:
>>> if ANY other xattr exists it will be pushed to an external block, and
>>> then if this data xattr grows (much more likely than the other xattr
>>> changing) it won't fit into the inode, and now performance is
>>> permanently worse than before.
>>
>> OK, since it seems that lustre uses xattr heavily, I will try my best to
>> avoid the performance regression for xattr operations.
> 
> I don't even think it is very much a Lustre problem, since it always
> stores file data in a separate filesystem from the metadata, and
> 60-byte files are going to have terrible performance either way.
> 
> My main concern is for SELinux (enabled by default on most systems
> today).  If the "small file" data is stored in the xattr space, and
> this pushes the SELinux xattr to an external block, we have added
> code complexity and a gratuitous format change (data in inode and
> metadata in block, instead of metadata in inode and data in block)
> with no real benefit at all.
Fair enough. Thanks for the explanation.
> 
>>> In our environment we use at least 512-byte inodes on the metadata server, but I still don't want half if that space wasted on this xattr if so much is not needed.
>>
>> btw, I have another idea about using the not-used extent space for
>> storing inline data like what we do for a symlink. So I will still use a
>> xattr entry to indicate whether the inode will have inline data or not.
>> If yes, the initialized xattr value len will be zero while the extent
>> space(60 bytes) will be used to store the inline data. And if the file
>> size is larger than 60, it will begin to insert xattr values. In such
>> case, we supports inline data and don't use too much space after the
>> i_extra_isize. What do you think of it?
> 
> I think this is an interesting idea.  Since only the "data" xattr could
> use this space, it gives us an extra 60 bytes of space to be used in
> the inode and does not consume the xattr space.  The main drawback is
> that this would add special case handling based on the xattr name, but
> I think it is worthwhile to investigate how complex that code is and
> what kind of performance improvement it gives.
> 
> Looking at my FC13 installation, it seems like a large number of
> files could benefit from just 60 bytes of inline storage.  So more
> than 10% of all of the files on the filesystem would fit in i_blocks.
> This filesystem includes a lot of source and build files, but I also
> think this is pretty typical of normal Linux usage.
> 
> # find / -xdev -type f -size -61c | wc -l
> 35661
> # find / -xdev -type f | wc -l
> 335515
> 
> The "fsstats" tool is useful for collecting interesting data like this:
> (http://www.pdsi-scidac.org/fsstats/files/fsstats-1.4.5.tar.gz)
> and it shows the same is true for directories as well:
> 
> 
> directory size (entries): Range of entries, count of directories in range,
>  number of dirs in range as a % of total num of dirs, number of dirs in
>  this range or smaller as a % total number of dirs, total entries in range,
>  number of entries in range as a % of total number of entries, number of
>  entries in this range or smaller as a % of total number of entries.
> 
>   count=33476 avg=11.38 ents
>   min=0.00 ents max=3848.00 ents
>   [   0-   1 ents]: 11257 (33.63%) ( 33.63%)   9968.00 ents ( 2.62%) (  2.62%)
>   [   2-   3 ents]:  7080 (21.15%) ( 54.78%)  16608.00 ents ( 4.36%) (  6.97%)
>   [   4-   7 ents]:  5793 (17.30%) ( 72.08%)  30674.00 ents ( 8.05%) ( 15.02%)
>   [   8-  15 ents]:  3971 (11.86%) ( 83.94%)  43315.00 ents (11.37%) ( 26.39%)
>   [  16-  31 ents]:  2731 ( 8.16%) ( 92.10%)  59612.00 ents (15.64%) ( 42.04%)
>   [  32-  63 ents]:  1610 ( 4.81%) ( 96.91%)  69326.00 ents (18.19%) ( 60.23%)
>   [  64- 127 ents]:   705 ( 2.11%) ( 99.02%)  61633.00 ents (16.17%) ( 76.40%)
>   [ 128- 255 ents]:   236 ( 0.70%) ( 99.72%)  40005.00 ents (10.50%) ( 86.90%)
>   [ 256- 511 ents]:    66 ( 0.20%) ( 99.92%)  21923.00 ents ( 5.75%) ( 92.66%)
>   [ 512-1023 ents]:    19 ( 0.06%) ( 99.98%)  14249.00 ents ( 3.74%) ( 96.40%)
>   [1024-2047 ents]:     6 ( 0.02%) ( 99.99%)   7756.00 ents ( 2.04%) ( 98.43%)
>   [2048-4095 ents]:     2 ( 0.01%) (100.00%)   5979.00 ents ( 1.57%) (100.00%)
> 
> A simple test of the performance gains might be running "file" on
> everything in /etc and /usr, and measuring this with blktrace to
> see what kind of seek reduction is seen from not doing seeks to
> read the small files from an external block.
Thanks for the test tools, and yes, I will try to test it when I finish
my V2.
> I think it is still useful to try to store the data in the large inode
> xattr space if it is larger than i_blocks, especially for larger inodes,
> but if there is not enough space for all the xattrs to fit into the
> inode, I think "data" should be the first one to be pushed out of the
> inode since that changes the format back to a normal ext* file.
Oh, that means we would need to change the normal way we handle xattr
sets. I am not sure about that. Maybe we need an option in sysfs that can
be tuned so that the user can tell us whether the inline file content or
the xattr is more important. :)
> 
> 
> We might also consider a reiserfs-like approach where multiple small
> files could be packed into the same shared xattr block, but then the
> xattr name would need to change from "data" to e.g. "inode.generation"
> so that it can be located within the block shared between inodes.
I'd like to defer that to a later version, since it means that we still
need to do 2 I/Os for a small file, the same as a normal ext* file.
Having said that, it would be helpful for a sequence of small-file reads
(file iteration or tar) if the underlying block device does a good
readahead job. And it should help in the bigalloc case I mentioned in
another thread about changing extent length to clusters (the case where
we tar a kernel source and do 'sync', which is very time-consuming on a
bigalloc volume).
> 
> Tail packing is more complex, so such a change would only make sense
> if real-world testing showed a benefit.  There is already the concept
> of shared external xattr blocks, so maybe it isn't too bad.  Together
> with bigalloc, it might make sense to be able to pack many small files
> into one cluster if there is a binomial distribution of file sizes?
It is a bit more complicated than the current code base, so let me first
implement all the suggestions above and then try to investigate whether
it really helps.

Thanks
Tao

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b7d7bd0..9a60193 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -846,6 +846,10 @@  struct ext4_inode_info {
 	/* on-disk additional length */
 	__u16 i_extra_isize;
 
+	/* Indicate the inline data space. */
+	u16 i_inline_off;
+	u16 i_inline_size;
+
 #ifdef CONFIG_QUOTA
 	/* quota space reservation, managed internally by quota code */
 	qsize_t i_reserved_quota;
@@ -1261,6 +1265,7 @@  enum {
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
 	EXT4_STATE_NEWENTRY,		/* File just added to dir */
 	EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
+	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6638f0e..017e119 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3386,12 +3386,15 @@  static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
 
 static inline void ext4_iget_extra_inode(struct inode *inode,
 					 struct ext4_inode *raw_inode,
-					 struct ext4_inode_info *ei)
+					 struct ext4_inode_info *ei,
+					 struct ext4_iloc *iloc)
 {
 	__le32 *magic = (void *)raw_inode +
 			EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
-	if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+	if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
 		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+		ext4_find_inline_data(inode, iloc);
+	}
 }
 
 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
@@ -3505,7 +3508,7 @@  struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			ei->i_extra_isize = sizeof(struct ext4_inode) -
 					    EXT4_GOOD_OLD_INODE_SIZE;
 		} else
-			ext4_iget_extra_inode(inode, raw_inode, ei);
+			ext4_iget_extra_inode(inode, raw_inode, ei, &iloc);
 	} else
 		ei->i_extra_isize = 0;
 
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c757adc..37418a9 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -944,6 +944,222 @@  ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
+#define EXT4_XATTR_SYSTEM_DATA_NAME		"data"
+
+/*
+ * the max inline size we can have for an inline inode.
+ *
+ * Currently we uses the maximum free space if inode size is 256
+ * and half of the space left if inode size > 256.
+ * This can be tuned later and the codes should work with
+ * the old size since if an inode has been initialized already,
+ * it will uses the value it detected.
+ */
+int ext4_get_max_inline_size(struct inode *inode)
+{
+	if (EXT4_SB(inode->i_sb)->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE)
+		return 0;
+
+	if (EXT4_I(inode)->i_inline_off)
+		return EXT4_I(inode)->i_inline_size;
+
+	if (EXT4_SB(inode->i_sb)->s_inode_size == 256)
+		return EXT4_XATTR_SIZE(EXT4_SB(inode->i_sb)->s_inode_size -
+			EXT4_GOOD_OLD_INODE_SIZE -
+			EXT4_I(inode)->i_extra_isize -
+			sizeof(struct ext4_xattr_ibody_header) -
+			EXT4_XATTR_LEN(sizeof(EXT4_XATTR_SYSTEM_DATA_NAME)) -
+			EXT4_XATTR_ROUND);
+
+	return EXT4_XATTR_SIZE((EXT4_SB(inode->i_sb)->s_inode_size -
+		EXT4_GOOD_OLD_INODE_SIZE - EXT4_I(inode)->i_extra_isize) / 2);
+}
+
+int ext4_has_inline_data(struct inode *inode)
+{
+	return EXT4_I(inode)->i_inline_off;
+}
+
+int ext4_find_inline_data(struct inode *inode, struct ext4_iloc *iloc)
+{
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+		.iloc = *iloc,
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM_DATA,
+		.name = EXT4_XATTR_SYSTEM_DATA_NAME,
+	};
+	int error;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return 0;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		return error;
+	if (!is.s.not_found) {
+		EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+						(void *)ext4_raw_inode(iloc));
+		EXT4_I(inode)->i_inline_size =
+				le32_to_cpu(is.s.here->e_value_size);
+		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+	}
+	return 0;
+}
+
+void* ext4_get_inline_data_pos(struct inode *inode, struct ext4_iloc *iloc)
+{
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_ibody_header *header;
+
+	BUG_ON(!EXT4_I(inode)->i_inline_off);
+
+	header = IHDR(inode, ext4_raw_inode(iloc));
+	entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
+					    EXT4_I(inode)->i_inline_off);
+
+	return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
+
+}
+
+void ext4_read_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+			   void *buffer, size_t len)
+{
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_ibody_header *header;
+
+	BUG_ON(!EXT4_I(inode)->i_inline_off);
+
+	header = IHDR(inode, ext4_raw_inode(iloc));
+	entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
+					    EXT4_I(inode)->i_inline_off);
+	BUG_ON(len > EXT4_I(inode)->i_inline_size);
+
+	memcpy(buffer,
+	      (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
+}
+
+void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+			    void *buffer, loff_t pos, unsigned len)
+{
+	struct ext4_xattr_entry *entry;
+	struct ext4_xattr_ibody_header *header;
+
+	BUG_ON(!EXT4_I(inode)->i_inline_off);
+	BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
+
+	header = IHDR(inode, ext4_raw_inode(iloc));
+	entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
+					    EXT4_I(inode)->i_inline_off);
+	memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
+	       buffer + pos, len);
+}
+
+int ext4_init_inline_data(handle_t *handle, struct inode *inode,
+			  struct ext4_iloc *iloc)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int size, error;
+	char *value;
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+		.iloc = *iloc,
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM_DATA,
+		.name = EXT4_XATTR_SYSTEM_DATA_NAME,
+	};
+
+	if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+		return -ENOSPC;
+
+	BUG_ON(ei->i_inline_off != 0);
+
+	size = ext4_get_max_inline_size(inode);
+	/*
+	 * XXX: We'd better try to check whether we can insert it into inode
+	 * before allocating the value.
+	 */
+	value = kzalloc(size, GFP_NOFS);
+	if (!value)
+		return -ENOMEM;
+
+	i.value = value;
+	i.value_len = size;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto out;
+
+	BUG_ON(!is.s.not_found);
+
+	error = ext4_journal_get_write_access(handle, iloc->bh);
+	if (error)
+		goto out;
+
+	error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+	if (error) {
+		if (error == -ENOSPC)
+			ext4_clear_inode_state(inode,
+					       EXT4_STATE_MAY_INLINE_DATA);
+		goto out;
+	}
+
+	get_bh(iloc->bh);
+	error = ext4_mark_iloc_dirty(handle, inode, iloc);
+
+	EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+				      (void *)ext4_raw_inode(iloc));
+	EXT4_I(inode)->i_inline_size = size;
+	ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+out:
+	kfree(value);
+	return error;
+}
+
+int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = 0, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM_DATA,
+		.name = EXT4_XATTR_SYSTEM_DATA_NAME,
+		.value = NULL,
+		.value_len = 0,
+	};
+	int error;
+
+	if (!ei->i_inline_off)
+		return 0;
+
+	error = ext4_get_inode_loc(inode, &is.iloc);
+	if (error)
+		return error;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto out;
+
+	error = ext4_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto out;
+
+	error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+	if (error)
+		goto out;
+
+	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+	EXT4_I(inode)->i_inline_off = 0;
+	EXT4_I(inode)->i_inline_size = 0;
+	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+out:
+	return error;
+}
+
 /*
  * ext4_xattr_set_handle()
  *
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 25b7387..ca3b05b 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -21,6 +21,7 @@ 
 #define EXT4_XATTR_INDEX_TRUSTED		4
 #define	EXT4_XATTR_INDEX_LUSTRE			5
 #define EXT4_XATTR_INDEX_SECURITY	        6
+#define EXT4_XATTR_INDEX_SYSTEM_DATA		7
 
 struct ext4_xattr_header {
 	__le32	h_magic;	/* magic number for identification */
@@ -88,6 +89,18 @@  extern void ext4_exit_xattr(void);
 
 extern const struct xattr_handler *ext4_xattr_handlers[];
 
+extern int ext4_has_inline_data(struct inode *inode);
+extern int ext4_get_max_inline_size(struct inode *inode);
+extern int ext4_find_inline_data(struct inode *inode, struct ext4_iloc *iloc);
+extern void* ext4_get_inline_data_pos(struct inode *inode,
+				      struct ext4_iloc *iloc);
+extern void ext4_read_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+				  void *buffer, size_t len);
+extern void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+				   void *buffer, loff_t pos, unsigned len);
+extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
+				 struct ext4_iloc *iloc);
+extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
 # else  /* CONFIG_EXT4_FS_XATTR */
 
 static inline int
@@ -141,6 +154,61 @@  ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 
 #define ext4_xattr_handlers	NULL
 
+static inline int ext4_find_inline_data(struct inode *inode,
+					struct ext4_iloc *iloc)
+{
+	return 0;
+}
+
+static inline void* ext4_get_inline_data_pos(struct inode *inode,
+					     struct ext4_iloc *iloc)
+{
+	return NULL;
+}
+
+static inline int ext4_has_inline_data(struct inode *inode)
+{
+	return 0;
+}
+
+static inline int ext4_get_max_inline_size(struct inode *inode)
+{
+	return 0;
+}
+
+static inline int ext4_find_inline_data(struct inode *inode,
+					struct ext4_iloc *iloc)
+{
+	return 0;
+}
+
+static inline void ext4_read_inline_data(struct inode *inode,
+					 struct ext4_iloc *iloc,
+					 void *buffer, size_t len)
+{
+	return;
+}
+
+static inline void ext4_write_inline_data(struct inode *inode,
+					  struct ext4_iloc *iloc,
+					  void *buffer, loff_t pos,
+					  unsigned len)
+{
+	return;
+}
+
+static inline int ext4_init_inline_data(handle_t *handle,
+					struct inode *inode,
+					struct ext4_iloc *iloc)
+{
+	return 0;
+}
+
+static inline int ext4_destroy_inline_data(handle_t *handle,
+					   struct inode *inode)
+{
+	return 0;
+}
 # endif  /* CONFIG_EXT4_FS_XATTR */
 
 #ifdef CONFIG_EXT4_FS_SECURITY