diff mbox

ext3: return 32/64-bit dir name hash according to usage type

Message ID 4F998F9F.6020802@redhat.com
State Not Applicable, archived
Headers show

Commit Message

Eric Sandeen April 26, 2012, 6:10 p.m. UTC
This is based on commit d1f5273e9adb40724a85272f248f210dc4ce919a
ext4: return 32/64-bit dir name hash according to usage type
by Fan Yong <yong.fan@whamcloud.com>

Traditionally ext2/3/4 has returned a 32-bit hash value from llseek()
to appease NFSv2, which can only handle a 32-bit cookie for seekdir()
and telldir().  However, this causes problems if there are 32-bit hash
collisions, since the NFSv2 server can get stuck resending the same
entries from the directory repeatedly.
    
Allow ext3 to return a full 64-bit hash (both major and minor) for
telldir to decrease the chance of hash collisions.

This patch also implements a new ext3_dir_llseek op, because with 64-bit
hashes, NFS will attempt to seek to a hash "offset" which is much
larger than ext3's s_maxbytes.  So for dx dirs, we call
generic_file_llseek_size() with the appropriate max hash value as the
maximum seekable size.  Otherwise we just pass through to
generic_file_llseek().
    
Patch-updated-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
Patch-updated-by: Eric Sandeen <sandeen@redhat.com>
(blame us if something is not correct)

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

J. Bruce Fields April 26, 2012, 6:26 p.m. UTC | #1
How are you testing this?

--b.

On Thu, Apr 26, 2012 at 01:10:39PM -0500, Eric Sandeen wrote:
> This is based on commit d1f5273e9adb40724a85272f248f210dc4ce919a
> ext4: return 32/64-bit dir name hash according to usage type
> by Fan Yong <yong.fan@whamcloud.com>
> 
> Traditionally ext2/3/4 has returned a 32-bit hash value from llseek()
> to appease NFSv2, which can only handle a 32-bit cookie for seekdir()
> and telldir().  However, this causes problems if there are 32-bit hash
> collisions, since the NFSv2 server can get stuck resending the same
> entries from the directory repeatedly.
>     
> Allow ext3 to return a full 64-bit hash (both major and minor) for
> telldir to decrease the chance of hash collisions.
> 
> This patch does implement a new ext3_dir_llseek op, because with 64-bit
> hashes, nfs will attempt to seek to a hash "offset" which is much
> larger than ext3's s_maxbytes.  So for dx dirs, we call
> generic_file_llseek_size() with the appropriate max hash value as the
> maximum seekable size.  Otherwise we just pass through to
> generic_file_llseek().
>     
> Patch-updated-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
> Patch-updated-by: Eric Sandeen <sandeen@redhat.com>
> (blame us if something is not correct)
> 
> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
> ---
> 
> diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
> index cc761ad..92490e9 100644
> --- a/fs/ext3/dir.c
> +++ b/fs/ext3/dir.c
> @@ -21,30 +21,15 @@
>   *
>   */
>  
> +#include <linux/compat.h>
>  #include "ext3.h"
>  
>  static unsigned char ext3_filetype_table[] = {
>  	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
>  };
>  
> -static int ext3_readdir(struct file *, void *, filldir_t);
>  static int ext3_dx_readdir(struct file * filp,
>  			   void * dirent, filldir_t filldir);
> -static int ext3_release_dir (struct inode * inode,
> -				struct file * filp);
> -
> -const struct file_operations ext3_dir_operations = {
> -	.llseek		= generic_file_llseek,
> -	.read		= generic_read_dir,
> -	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
> -	.unlocked_ioctl	= ext3_ioctl,
> -#ifdef CONFIG_COMPAT
> -	.compat_ioctl	= ext3_compat_ioctl,
> -#endif
> -	.fsync		= ext3_sync_file,	/* BKL held */
> -	.release	= ext3_release_dir,
> -};
> -
>  
>  static unsigned char get_dtype(struct super_block *sb, int filetype)
>  {
> @@ -55,6 +40,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
>  	return (ext3_filetype_table[filetype]);
>  }
>  
> +/**
> + * Check if the given dir-inode refers to an htree-indexed directory
> + * (or a directory which chould potentially get coverted to use htree
> + * indexing).
> + *
> + * Return 1 if it is a dx dir, 0 if not
> + */
> +static int is_dx_dir(struct inode *inode)
> +{
> +	struct super_block *sb = inode->i_sb;
> +
> +	if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
> +		     EXT3_FEATURE_COMPAT_DIR_INDEX) &&
> +	    ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
> +	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
> +		return 1;
> +
> +	return 0;
> +}
>  
>  int ext3_check_dir_entry (const char * function, struct inode * dir,
>  			  struct ext3_dir_entry_2 * de,
> @@ -94,18 +98,13 @@ static int ext3_readdir(struct file * filp,
>  	unsigned long offset;
>  	int i, stored;
>  	struct ext3_dir_entry_2 *de;
> -	struct super_block *sb;
>  	int err;
>  	struct inode *inode = filp->f_path.dentry->d_inode;
> +	struct super_block *sb = inode->i_sb;
>  	int ret = 0;
>  	int dir_has_error = 0;
>  
> -	sb = inode->i_sb;
> -
> -	if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
> -				    EXT3_FEATURE_COMPAT_DIR_INDEX) &&
> -	    ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
> -	     ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
> +	if (is_dx_dir(inode)) {
>  		err = ext3_dx_readdir(filp, dirent, filldir);
>  		if (err != ERR_BAD_DX_DIR) {
>  			ret = err;
> @@ -227,22 +226,87 @@ out:
>  	return ret;
>  }
>  
> +static inline int is_32bit_api(void)
> +{
> +#ifdef CONFIG_COMPAT
> +	return is_compat_task();
> +#else
> +	return (BITS_PER_LONG == 32);
> +#endif
> +}
> +
>  /*
>   * These functions convert from the major/minor hash to an f_pos
> - * value.
> + * value for dx directories
>   *
> - * Currently we only use major hash numer.  This is unfortunate, but
> - * on 32-bit machines, the same VFS interface is used for lseek and
> - * llseek, so if we use the 64 bit offset, then the 32-bit versions of
> - * lseek/telldir/seekdir will blow out spectacularly, and from within
> - * the ext2 low-level routine, we don't know if we're being called by
> - * a 64-bit version of the system call or the 32-bit version of the
> - * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
> - * cookie.  Sigh.
> + * Upper layer (for example NFS) should specify FMODE_32BITHASH or
> + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
> + * directly on both 32-bit and 64-bit nodes, under such case, neither
> + * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
>   */
> -#define hash2pos(major, minor)	(major >> 1)
> -#define pos2maj_hash(pos)	((pos << 1) & 0xffffffff)
> -#define pos2min_hash(pos)	(0)
> +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
> +{
> +	if ((filp->f_mode & FMODE_32BITHASH) ||
> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
> +		return major >> 1;
> +	else
> +		return ((__u64)(major >> 1) << 32) | (__u64)minor;
> +}
> +
> +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
> +{
> +	if ((filp->f_mode & FMODE_32BITHASH) ||
> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
> +		return (pos << 1) & 0xffffffff;
> +	else
> +		return ((pos >> 32) << 1) & 0xffffffff;
> +}
> +
> +static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
> +{
> +	if ((filp->f_mode & FMODE_32BITHASH) ||
> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
> +		return 0;
> +	else
> +		return pos & 0xffffffff;
> +}
> +
> +/*
> + * Return 32- or 64-bit end-of-file for dx directories
> + */
> +static inline loff_t ext3_get_htree_eof(struct file *filp)
> +{
> +	if ((filp->f_mode & FMODE_32BITHASH) ||
> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
> +		return EXT3_HTREE_EOF_32BIT;
> +	else
> +		return EXT3_HTREE_EOF_64BIT;
> +}
> +
> +
> +/*
> + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
> + * non-htree and htree directories, where the "offset" is in terms
> + * of the filename hash value instead of the byte offset.
> + *
> + * Because we may return a 64-bit hash that is well beyond s_maxbytes,
> + * we need to pass the max hash as the maximum allowable offset in
> + * the htree directory case.
> + *
> + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
> + *       will be invalid once the directory was converted into a dx directory
> + */
> +loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin)
> +{
> +	struct inode *inode = file->f_mapping->host;
> +	int dx_dir = is_dx_dir(inode);
> +
> +	if (likely(dx_dir))
> +		return generic_file_llseek_size(file, offset, origin,
> +					        ext3_get_htree_eof(file));
> +	else
> +		return generic_file_llseek(file, offset, origin);
> +}
>  
>  /*
>   * This structure holds the nodes of the red-black tree used to store
> @@ -303,15 +367,16 @@ static void free_rb_tree_fname(struct rb_root *root)
>  }
>  
>  
> -static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos)
> +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
> +							   loff_t pos)
>  {
>  	struct dir_private_info *p;
>  
>  	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
>  	if (!p)
>  		return NULL;
> -	p->curr_hash = pos2maj_hash(pos);
> -	p->curr_minor_hash = pos2min_hash(pos);
> +	p->curr_hash = pos2maj_hash(filp, pos);
> +	p->curr_minor_hash = pos2min_hash(filp, pos);
>  	return p;
>  }
>  
> @@ -401,7 +466,7 @@ static int call_filldir(struct file * filp, void * dirent,
>  		printk("call_filldir: called with null fname?!?\n");
>  		return 0;
>  	}
> -	curr_pos = hash2pos(fname->hash, fname->minor_hash);
> +	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
>  	while (fname) {
>  		error = filldir(dirent, fname->name,
>  				fname->name_len, curr_pos,
> @@ -426,13 +491,13 @@ static int ext3_dx_readdir(struct file * filp,
>  	int	ret;
>  
>  	if (!info) {
> -		info = ext3_htree_create_dir_info(filp->f_pos);
> +		info = ext3_htree_create_dir_info(filp, filp->f_pos);
>  		if (!info)
>  			return -ENOMEM;
>  		filp->private_data = info;
>  	}
>  
> -	if (filp->f_pos == EXT3_HTREE_EOF)
> +	if (filp->f_pos == ext3_get_htree_eof(filp))
>  		return 0;	/* EOF */
>  
>  	/* Some one has messed with f_pos; reset the world */
> @@ -440,8 +505,8 @@ static int ext3_dx_readdir(struct file * filp,
>  		free_rb_tree_fname(&info->root);
>  		info->curr_node = NULL;
>  		info->extra_fname = NULL;
> -		info->curr_hash = pos2maj_hash(filp->f_pos);
> -		info->curr_minor_hash = pos2min_hash(filp->f_pos);
> +		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
> +		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
>  	}
>  
>  	/*
> @@ -473,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp,
>  			if (ret < 0)
>  				return ret;
>  			if (ret == 0) {
> -				filp->f_pos = EXT3_HTREE_EOF;
> +				filp->f_pos = ext3_get_htree_eof(filp);
>  				break;
>  			}
>  			info->curr_node = rb_first(&info->root);
> @@ -493,7 +558,7 @@ static int ext3_dx_readdir(struct file * filp,
>  			info->curr_minor_hash = fname->minor_hash;
>  		} else {
>  			if (info->next_hash == ~0) {
> -				filp->f_pos = EXT3_HTREE_EOF;
> +				filp->f_pos = ext3_get_htree_eof(filp);
>  				break;
>  			}
>  			info->curr_hash = info->next_hash;
> @@ -512,3 +577,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
>  
>  	return 0;
>  }
> +
> +const struct file_operations ext3_dir_operations = {
> +	.llseek		= ext3_dir_llseek,
> +	.read		= generic_read_dir,
> +	.readdir	= ext3_readdir,
> +	.unlocked_ioctl = ext3_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= ext3_compat_ioctl,
> +#endif
> +	.fsync		= ext3_sync_file,
> +	.release	= ext3_release_dir,
> +};
> diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
> index b6515fd..fe5bef7 100644
> --- a/fs/ext3/ext3.h
> +++ b/fs/ext3/ext3.h
> @@ -920,7 +920,11 @@ struct dx_hash_info
>  	u32		*seed;
>  };
>  
> -#define EXT3_HTREE_EOF	0x7fffffff
> +
> +/* 32 and 64 bit signed EOF for dx directories */
> +#define EXT3_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
> +#define EXT3_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)
> +
>  
>  /*
>   * Control parameters used by ext3_htree_next_block
> diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
> index d10231d..ede315c 100644
> --- a/fs/ext3/hash.c
> +++ b/fs/ext3/hash.c
> @@ -198,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
>  		return -1;
>  	}
>  	hash = hash & ~1;
> -	if (hash == (EXT3_HTREE_EOF << 1))
> -		hash = (EXT3_HTREE_EOF-1) << 1;
> +	if (hash == (EXT3_HTREE_EOF_32BIT << 1))
> +		hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
>  	hinfo->hash = hash;
>  	hinfo->minor_hash = minor_hash;
>  	return 0;
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Sandeen April 26, 2012, 6:28 p.m. UTC | #2
On 4/26/12 1:26 PM, J. Bruce Fields wrote:
> How are you testing this?

Basically as was suggested in the first ext4 patch series:

I created a filesystem with 600,000 files in a dir:

for x in $(seq 1 600000); do touch $x; done

then exported and mounted that fs to localhost:

mount -t nfs localhost:/mnt/export /mnt/nfs

and then looked for dups:

ls -l | uniq -d

If you have more .... substantial nfs testing I could do I'm all ears :)

-Eric


> --b.
> 
> On Thu, Apr 26, 2012 at 01:10:39PM -0500, Eric Sandeen wrote:
>> This is based on commit d1f5273e9adb40724a85272f248f210dc4ce919a
>> ext4: return 32/64-bit dir name hash according to usage type
>> by Fan Yong <yong.fan@whamcloud.com>
>>
>> Traditionally ext2/3/4 has returned a 32-bit hash value from llseek()
>> to appease NFSv2, which can only handle a 32-bit cookie for seekdir()
>> and telldir().  However, this causes problems if there are 32-bit hash
>> collisions, since the NFSv2 server can get stuck resending the same
>> entries from the directory repeatedly.
>>     
>> Allow ext3 to return a full 64-bit hash (both major and minor) for
>> telldir to decrease the chance of hash collisions.
>>
>> This patch does implement a new ext3_dir_llseek op, because with 64-bit
>> hashes, nfs will attempt to seek to a hash "offset" which is much
>> larger than ext3's s_maxbytes.  So for dx dirs, we call
>> generic_file_llseek_size() with the appropriate max hash value as the
>> maximum seekable size.  Otherwise we just pass through to
>> generic_file_llseek().
>>     
>> Patch-updated-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
>> Patch-updated-by: Eric Sandeen <sandeen@redhat.com>
>> (blame us if something is not correct)
>>
>> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
>> ---
>>
>> diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
>> index cc761ad..92490e9 100644
>> --- a/fs/ext3/dir.c
>> +++ b/fs/ext3/dir.c
>> @@ -21,30 +21,15 @@
>>   *
>>   */
>>  
>> +#include <linux/compat.h>
>>  #include "ext3.h"
>>  
>>  static unsigned char ext3_filetype_table[] = {
>>  	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
>>  };
>>  
>> -static int ext3_readdir(struct file *, void *, filldir_t);
>>  static int ext3_dx_readdir(struct file * filp,
>>  			   void * dirent, filldir_t filldir);
>> -static int ext3_release_dir (struct inode * inode,
>> -				struct file * filp);
>> -
>> -const struct file_operations ext3_dir_operations = {
>> -	.llseek		= generic_file_llseek,
>> -	.read		= generic_read_dir,
>> -	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
>> -	.unlocked_ioctl	= ext3_ioctl,
>> -#ifdef CONFIG_COMPAT
>> -	.compat_ioctl	= ext3_compat_ioctl,
>> -#endif
>> -	.fsync		= ext3_sync_file,	/* BKL held */
>> -	.release	= ext3_release_dir,
>> -};
>> -
>>  
>>  static unsigned char get_dtype(struct super_block *sb, int filetype)
>>  {
>> @@ -55,6 +40,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
>>  	return (ext3_filetype_table[filetype]);
>>  }
>>  
>> +/**
>> + * Check if the given dir-inode refers to an htree-indexed directory
>> + * (or a directory which chould potentially get coverted to use htree
>> + * indexing).
>> + *
>> + * Return 1 if it is a dx dir, 0 if not
>> + */
>> +static int is_dx_dir(struct inode *inode)
>> +{
>> +	struct super_block *sb = inode->i_sb;
>> +
>> +	if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
>> +		     EXT3_FEATURE_COMPAT_DIR_INDEX) &&
>> +	    ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
>> +	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
>> +		return 1;
>> +
>> +	return 0;
>> +}
>>  
>>  int ext3_check_dir_entry (const char * function, struct inode * dir,
>>  			  struct ext3_dir_entry_2 * de,
>> @@ -94,18 +98,13 @@ static int ext3_readdir(struct file * filp,
>>  	unsigned long offset;
>>  	int i, stored;
>>  	struct ext3_dir_entry_2 *de;
>> -	struct super_block *sb;
>>  	int err;
>>  	struct inode *inode = filp->f_path.dentry->d_inode;
>> +	struct super_block *sb = inode->i_sb;
>>  	int ret = 0;
>>  	int dir_has_error = 0;
>>  
>> -	sb = inode->i_sb;
>> -
>> -	if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
>> -				    EXT3_FEATURE_COMPAT_DIR_INDEX) &&
>> -	    ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
>> -	     ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
>> +	if (is_dx_dir(inode)) {
>>  		err = ext3_dx_readdir(filp, dirent, filldir);
>>  		if (err != ERR_BAD_DX_DIR) {
>>  			ret = err;
>> @@ -227,22 +226,87 @@ out:
>>  	return ret;
>>  }
>>  
>> +static inline int is_32bit_api(void)
>> +{
>> +#ifdef CONFIG_COMPAT
>> +	return is_compat_task();
>> +#else
>> +	return (BITS_PER_LONG == 32);
>> +#endif
>> +}
>> +
>>  /*
>>   * These functions convert from the major/minor hash to an f_pos
>> - * value.
>> + * value for dx directories
>>   *
>> - * Currently we only use major hash numer.  This is unfortunate, but
>> - * on 32-bit machines, the same VFS interface is used for lseek and
>> - * llseek, so if we use the 64 bit offset, then the 32-bit versions of
>> - * lseek/telldir/seekdir will blow out spectacularly, and from within
>> - * the ext2 low-level routine, we don't know if we're being called by
>> - * a 64-bit version of the system call or the 32-bit version of the
>> - * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
>> - * cookie.  Sigh.
>> + * Upper layer (for example NFS) should specify FMODE_32BITHASH or
>> + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
>> + * directly on both 32-bit and 64-bit nodes, under such case, neither
>> + * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
>>   */
>> -#define hash2pos(major, minor)	(major >> 1)
>> -#define pos2maj_hash(pos)	((pos << 1) & 0xffffffff)
>> -#define pos2min_hash(pos)	(0)
>> +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
>> +{
>> +	if ((filp->f_mode & FMODE_32BITHASH) ||
>> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
>> +		return major >> 1;
>> +	else
>> +		return ((__u64)(major >> 1) << 32) | (__u64)minor;
>> +}
>> +
>> +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
>> +{
>> +	if ((filp->f_mode & FMODE_32BITHASH) ||
>> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
>> +		return (pos << 1) & 0xffffffff;
>> +	else
>> +		return ((pos >> 32) << 1) & 0xffffffff;
>> +}
>> +
>> +static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
>> +{
>> +	if ((filp->f_mode & FMODE_32BITHASH) ||
>> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
>> +		return 0;
>> +	else
>> +		return pos & 0xffffffff;
>> +}
>> +
>> +/*
>> + * Return 32- or 64-bit end-of-file for dx directories
>> + */
>> +static inline loff_t ext3_get_htree_eof(struct file *filp)
>> +{
>> +	if ((filp->f_mode & FMODE_32BITHASH) ||
>> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
>> +		return EXT3_HTREE_EOF_32BIT;
>> +	else
>> +		return EXT3_HTREE_EOF_64BIT;
>> +}
>> +
>> +
>> +/*
>> + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
>> + * non-htree and htree directories, where the "offset" is in terms
>> + * of the filename hash value instead of the byte offset.
>> + *
>> + * Because we may return a 64-bit hash that is well beyond s_maxbytes,
>> + * we need to pass the max hash as the maximum allowable offset in
>> + * the htree directory case.
>> + *
>> + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
>> + *       will be invalid once the directory was converted into a dx directory
>> + */
>> +loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin)
>> +{
>> +	struct inode *inode = file->f_mapping->host;
>> +	int dx_dir = is_dx_dir(inode);
>> +
>> +	if (likely(dx_dir))
>> +		return generic_file_llseek_size(file, offset, origin,
>> +					        ext3_get_htree_eof(file));
>> +	else
>> +		return generic_file_llseek(file, offset, origin);
>> +}
>>  
>>  /*
>>   * This structure holds the nodes of the red-black tree used to store
>> @@ -303,15 +367,16 @@ static void free_rb_tree_fname(struct rb_root *root)
>>  }
>>  
>>  
>> -static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos)
>> +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
>> +							   loff_t pos)
>>  {
>>  	struct dir_private_info *p;
>>  
>>  	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
>>  	if (!p)
>>  		return NULL;
>> -	p->curr_hash = pos2maj_hash(pos);
>> -	p->curr_minor_hash = pos2min_hash(pos);
>> +	p->curr_hash = pos2maj_hash(filp, pos);
>> +	p->curr_minor_hash = pos2min_hash(filp, pos);
>>  	return p;
>>  }
>>  
>> @@ -401,7 +466,7 @@ static int call_filldir(struct file * filp, void * dirent,
>>  		printk("call_filldir: called with null fname?!?\n");
>>  		return 0;
>>  	}
>> -	curr_pos = hash2pos(fname->hash, fname->minor_hash);
>> +	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
>>  	while (fname) {
>>  		error = filldir(dirent, fname->name,
>>  				fname->name_len, curr_pos,
>> @@ -426,13 +491,13 @@ static int ext3_dx_readdir(struct file * filp,
>>  	int	ret;
>>  
>>  	if (!info) {
>> -		info = ext3_htree_create_dir_info(filp->f_pos);
>> +		info = ext3_htree_create_dir_info(filp, filp->f_pos);
>>  		if (!info)
>>  			return -ENOMEM;
>>  		filp->private_data = info;
>>  	}
>>  
>> -	if (filp->f_pos == EXT3_HTREE_EOF)
>> +	if (filp->f_pos == ext3_get_htree_eof(filp))
>>  		return 0;	/* EOF */
>>  
>>  	/* Some one has messed with f_pos; reset the world */
>> @@ -440,8 +505,8 @@ static int ext3_dx_readdir(struct file * filp,
>>  		free_rb_tree_fname(&info->root);
>>  		info->curr_node = NULL;
>>  		info->extra_fname = NULL;
>> -		info->curr_hash = pos2maj_hash(filp->f_pos);
>> -		info->curr_minor_hash = pos2min_hash(filp->f_pos);
>> +		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
>> +		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
>>  	}
>>  
>>  	/*
>> @@ -473,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp,
>>  			if (ret < 0)
>>  				return ret;
>>  			if (ret == 0) {
>> -				filp->f_pos = EXT3_HTREE_EOF;
>> +				filp->f_pos = ext3_get_htree_eof(filp);
>>  				break;
>>  			}
>>  			info->curr_node = rb_first(&info->root);
>> @@ -493,7 +558,7 @@ static int ext3_dx_readdir(struct file * filp,
>>  			info->curr_minor_hash = fname->minor_hash;
>>  		} else {
>>  			if (info->next_hash == ~0) {
>> -				filp->f_pos = EXT3_HTREE_EOF;
>> +				filp->f_pos = ext3_get_htree_eof(filp);
>>  				break;
>>  			}
>>  			info->curr_hash = info->next_hash;
>> @@ -512,3 +577,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
>>  
>>  	return 0;
>>  }
>> +
>> +const struct file_operations ext3_dir_operations = {
>> +	.llseek		= ext3_dir_llseek,
>> +	.read		= generic_read_dir,
>> +	.readdir	= ext3_readdir,
>> +	.unlocked_ioctl = ext3_ioctl,
>> +#ifdef CONFIG_COMPAT
>> +	.compat_ioctl	= ext3_compat_ioctl,
>> +#endif
>> +	.fsync		= ext3_sync_file,
>> +	.release	= ext3_release_dir,
>> +};
>> diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
>> index b6515fd..fe5bef7 100644
>> --- a/fs/ext3/ext3.h
>> +++ b/fs/ext3/ext3.h
>> @@ -920,7 +920,11 @@ struct dx_hash_info
>>  	u32		*seed;
>>  };
>>  
>> -#define EXT3_HTREE_EOF	0x7fffffff
>> +
>> +/* 32 and 64 bit signed EOF for dx directories */
>> +#define EXT3_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
>> +#define EXT3_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)
>> +
>>  
>>  /*
>>   * Control parameters used by ext3_htree_next_block
>> diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
>> index d10231d..ede315c 100644
>> --- a/fs/ext3/hash.c
>> +++ b/fs/ext3/hash.c
>> @@ -198,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
>>  		return -1;
>>  	}
>>  	hash = hash & ~1;
>> -	if (hash == (EXT3_HTREE_EOF << 1))
>> -		hash = (EXT3_HTREE_EOF-1) << 1;
>> +	if (hash == (EXT3_HTREE_EOF_32BIT << 1))
>> +		hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
>>  	hinfo->hash = hash;
>>  	hinfo->minor_hash = minor_hash;
>>  	return 0;
>>

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara April 26, 2012, 6:57 p.m. UTC | #3
On Thu 26-04-12 13:10:39, Eric Sandeen wrote:
> This is based on commit d1f5273e9adb40724a85272f248f210dc4ce919a
> ext4: return 32/64-bit dir name hash according to usage type
> by Fan Yong <yong.fan@whamcloud.com>
> 
> Traditionally ext2/3/4 has returned a 32-bit hash value from llseek()
> to appease NFSv2, which can only handle a 32-bit cookie for seekdir()
> and telldir().  However, this causes problems if there are 32-bit hash
> collisions, since the NFSv2 server can get stuck resending the same
> entries from the directory repeatedly.
>     
> Allow ext3 to return a full 64-bit hash (both major and minor) for
> telldir to decrease the chance of hash collisions.
> 
> This patch does implement a new ext3_dir_llseek op, because with 64-bit
> hashes, nfs will attempt to seek to a hash "offset" which is much
> larger than ext3's s_maxbytes.  So for dx dirs, we call
> generic_file_llseek_size() with the appropriate max hash value as the
> maximum seekable size.  Otherwise we just pass through to
> generic_file_llseek().
>     
> Patch-updated-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
> Patch-updated-by: Eric Sandeen <sandeen@redhat.com>
> (blame us if something is not correct)
  Thanks Eric. The patch looks good so I've added it to my tree.

								Honza

> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
> ---
> 
> diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
> index cc761ad..92490e9 100644
> --- a/fs/ext3/dir.c
> +++ b/fs/ext3/dir.c
> @@ -21,30 +21,15 @@
>   *
>   */
>  
> +#include <linux/compat.h>
>  #include "ext3.h"
>  
>  static unsigned char ext3_filetype_table[] = {
>  	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
>  };
>  
> -static int ext3_readdir(struct file *, void *, filldir_t);
>  static int ext3_dx_readdir(struct file * filp,
>  			   void * dirent, filldir_t filldir);
> -static int ext3_release_dir (struct inode * inode,
> -				struct file * filp);
> -
> -const struct file_operations ext3_dir_operations = {
> -	.llseek		= generic_file_llseek,
> -	.read		= generic_read_dir,
> -	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
> -	.unlocked_ioctl	= ext3_ioctl,
> -#ifdef CONFIG_COMPAT
> -	.compat_ioctl	= ext3_compat_ioctl,
> -#endif
> -	.fsync		= ext3_sync_file,	/* BKL held */
> -	.release	= ext3_release_dir,
> -};
> -
>  
>  static unsigned char get_dtype(struct super_block *sb, int filetype)
>  {
> @@ -55,6 +40,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
>  	return (ext3_filetype_table[filetype]);
>  }
>  
> +/**
> + * Check if the given dir-inode refers to an htree-indexed directory
> + * (or a directory which chould potentially get coverted to use htree
> + * indexing).
> + *
> + * Return 1 if it is a dx dir, 0 if not
> + */
> +static int is_dx_dir(struct inode *inode)
> +{
> +	struct super_block *sb = inode->i_sb;
> +
> +	if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
> +		     EXT3_FEATURE_COMPAT_DIR_INDEX) &&
> +	    ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
> +	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
> +		return 1;
> +
> +	return 0;
> +}
>  
>  int ext3_check_dir_entry (const char * function, struct inode * dir,
>  			  struct ext3_dir_entry_2 * de,
> @@ -94,18 +98,13 @@ static int ext3_readdir(struct file * filp,
>  	unsigned long offset;
>  	int i, stored;
>  	struct ext3_dir_entry_2 *de;
> -	struct super_block *sb;
>  	int err;
>  	struct inode *inode = filp->f_path.dentry->d_inode;
> +	struct super_block *sb = inode->i_sb;
>  	int ret = 0;
>  	int dir_has_error = 0;
>  
> -	sb = inode->i_sb;
> -
> -	if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
> -				    EXT3_FEATURE_COMPAT_DIR_INDEX) &&
> -	    ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
> -	     ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
> +	if (is_dx_dir(inode)) {
>  		err = ext3_dx_readdir(filp, dirent, filldir);
>  		if (err != ERR_BAD_DX_DIR) {
>  			ret = err;
> @@ -227,22 +226,87 @@ out:
>  	return ret;
>  }
>  
> +static inline int is_32bit_api(void)
> +{
> +#ifdef CONFIG_COMPAT
> +	return is_compat_task();
> +#else
> +	return (BITS_PER_LONG == 32);
> +#endif
> +}
> +
>  /*
>   * These functions convert from the major/minor hash to an f_pos
> - * value.
> + * value for dx directories
>   *
> - * Currently we only use major hash numer.  This is unfortunate, but
> - * on 32-bit machines, the same VFS interface is used for lseek and
> - * llseek, so if we use the 64 bit offset, then the 32-bit versions of
> - * lseek/telldir/seekdir will blow out spectacularly, and from within
> - * the ext2 low-level routine, we don't know if we're being called by
> - * a 64-bit version of the system call or the 32-bit version of the
> - * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
> - * cookie.  Sigh.
> + * Upper layer (for example NFS) should specify FMODE_32BITHASH or
> + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
> + * directly on both 32-bit and 64-bit nodes, under such case, neither
> + * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
>   */
> -#define hash2pos(major, minor)	(major >> 1)
> -#define pos2maj_hash(pos)	((pos << 1) & 0xffffffff)
> -#define pos2min_hash(pos)	(0)
> +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
> +{
> +	if ((filp->f_mode & FMODE_32BITHASH) ||
> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
> +		return major >> 1;
> +	else
> +		return ((__u64)(major >> 1) << 32) | (__u64)minor;
> +}
> +
> +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
> +{
> +	if ((filp->f_mode & FMODE_32BITHASH) ||
> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
> +		return (pos << 1) & 0xffffffff;
> +	else
> +		return ((pos >> 32) << 1) & 0xffffffff;
> +}
> +
> +static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
> +{
> +	if ((filp->f_mode & FMODE_32BITHASH) ||
> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
> +		return 0;
> +	else
> +		return pos & 0xffffffff;
> +}
> +
> +/*
> + * Return 32- or 64-bit end-of-file for dx directories
> + */
> +static inline loff_t ext3_get_htree_eof(struct file *filp)
> +{
> +	if ((filp->f_mode & FMODE_32BITHASH) ||
> +	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
> +		return EXT3_HTREE_EOF_32BIT;
> +	else
> +		return EXT3_HTREE_EOF_64BIT;
> +}
> +
> +
> +/*
> + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
> + * non-htree and htree directories, where the "offset" is in terms
> + * of the filename hash value instead of the byte offset.
> + *
> + * Because we may return a 64-bit hash that is well beyond s_maxbytes,
> + * we need to pass the max hash as the maximum allowable offset in
> + * the htree directory case.
> + *
> + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
> + *       will be invalid once the directory was converted into a dx directory
> + */
> +loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin)
> +{
> +	struct inode *inode = file->f_mapping->host;
> +	int dx_dir = is_dx_dir(inode);
> +
> +	if (likely(dx_dir))
> +		return generic_file_llseek_size(file, offset, origin,
> +					        ext3_get_htree_eof(file));
> +	else
> +		return generic_file_llseek(file, offset, origin);
> +}
>  
>  /*
>   * This structure holds the nodes of the red-black tree used to store
> @@ -303,15 +367,16 @@ static void free_rb_tree_fname(struct rb_root *root)
>  }
>  
>  
> -static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos)
> +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
> +							   loff_t pos)
>  {
>  	struct dir_private_info *p;
>  
>  	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
>  	if (!p)
>  		return NULL;
> -	p->curr_hash = pos2maj_hash(pos);
> -	p->curr_minor_hash = pos2min_hash(pos);
> +	p->curr_hash = pos2maj_hash(filp, pos);
> +	p->curr_minor_hash = pos2min_hash(filp, pos);
>  	return p;
>  }
>  
> @@ -401,7 +466,7 @@ static int call_filldir(struct file * filp, void * dirent,
>  		printk("call_filldir: called with null fname?!?\n");
>  		return 0;
>  	}
> -	curr_pos = hash2pos(fname->hash, fname->minor_hash);
> +	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
>  	while (fname) {
>  		error = filldir(dirent, fname->name,
>  				fname->name_len, curr_pos,
> @@ -426,13 +491,13 @@ static int ext3_dx_readdir(struct file * filp,
>  	int	ret;
>  
>  	if (!info) {
> -		info = ext3_htree_create_dir_info(filp->f_pos);
> +		info = ext3_htree_create_dir_info(filp, filp->f_pos);
>  		if (!info)
>  			return -ENOMEM;
>  		filp->private_data = info;
>  	}
>  
> -	if (filp->f_pos == EXT3_HTREE_EOF)
> +	if (filp->f_pos == ext3_get_htree_eof(filp))
>  		return 0;	/* EOF */
>  
>  	/* Some one has messed with f_pos; reset the world */
> @@ -440,8 +505,8 @@ static int ext3_dx_readdir(struct file * filp,
>  		free_rb_tree_fname(&info->root);
>  		info->curr_node = NULL;
>  		info->extra_fname = NULL;
> -		info->curr_hash = pos2maj_hash(filp->f_pos);
> -		info->curr_minor_hash = pos2min_hash(filp->f_pos);
> +		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
> +		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
>  	}
>  
>  	/*
> @@ -473,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp,
>  			if (ret < 0)
>  				return ret;
>  			if (ret == 0) {
> -				filp->f_pos = EXT3_HTREE_EOF;
> +				filp->f_pos = ext3_get_htree_eof(filp);
>  				break;
>  			}
>  			info->curr_node = rb_first(&info->root);
> @@ -493,7 +558,7 @@ static int ext3_dx_readdir(struct file * filp,
>  			info->curr_minor_hash = fname->minor_hash;
>  		} else {
>  			if (info->next_hash == ~0) {
> -				filp->f_pos = EXT3_HTREE_EOF;
> +				filp->f_pos = ext3_get_htree_eof(filp);
>  				break;
>  			}
>  			info->curr_hash = info->next_hash;
> @@ -512,3 +577,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
>  
>  	return 0;
>  }
> +
> +const struct file_operations ext3_dir_operations = {
> +	.llseek		= ext3_dir_llseek,
> +	.read		= generic_read_dir,
> +	.readdir	= ext3_readdir,
> +	.unlocked_ioctl = ext3_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= ext3_compat_ioctl,
> +#endif
> +	.fsync		= ext3_sync_file,
> +	.release	= ext3_release_dir,
> +};
> diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
> index b6515fd..fe5bef7 100644
> --- a/fs/ext3/ext3.h
> +++ b/fs/ext3/ext3.h
> @@ -920,7 +920,11 @@ struct dx_hash_info
>  	u32		*seed;
>  };
>  
> -#define EXT3_HTREE_EOF	0x7fffffff
> +
> +/* 32 and 64 bit signed EOF for dx directories */
> +#define EXT3_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
> +#define EXT3_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)
> +
>  
>  /*
>   * Control parameters used by ext3_htree_next_block
> diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
> index d10231d..ede315c 100644
> --- a/fs/ext3/hash.c
> +++ b/fs/ext3/hash.c
> @@ -198,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
>  		return -1;
>  	}
>  	hash = hash & ~1;
> -	if (hash == (EXT3_HTREE_EOF << 1))
> -		hash = (EXT3_HTREE_EOF-1) << 1;
> +	if (hash == (EXT3_HTREE_EOF_32BIT << 1))
> +		hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
>  	hinfo->hash = hash;
>  	hinfo->minor_hash = minor_hash;
>  	return 0;
>
diff mbox

Patch

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index cc761ad..92490e9 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -21,30 +21,15 @@ 
  *
  */
 
+#include <linux/compat.h>
 #include "ext3.h"
 
 static unsigned char ext3_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int ext3_readdir(struct file *, void *, filldir_t);
 static int ext3_dx_readdir(struct file * filp,
 			   void * dirent, filldir_t filldir);
-static int ext3_release_dir (struct inode * inode,
-				struct file * filp);
-
-const struct file_operations ext3_dir_operations = {
-	.llseek		= generic_file_llseek,
-	.read		= generic_read_dir,
-	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
-	.unlocked_ioctl	= ext3_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= ext3_compat_ioctl,
-#endif
-	.fsync		= ext3_sync_file,	/* BKL held */
-	.release	= ext3_release_dir,
-};
-
 
 static unsigned char get_dtype(struct super_block *sb, int filetype)
 {
@@ -55,6 +40,25 @@  static unsigned char get_dtype(struct super_block *sb, int filetype)
 	return (ext3_filetype_table[filetype]);
 }
 
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which could potentially get converted to use htree
+ * indexing).
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
+		     EXT3_FEATURE_COMPAT_DIR_INDEX) &&
+	    ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
+	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+		return 1;
+
+	return 0;
+}
 
 int ext3_check_dir_entry (const char * function, struct inode * dir,
 			  struct ext3_dir_entry_2 * de,
@@ -94,18 +98,13 @@  static int ext3_readdir(struct file * filp,
 	unsigned long offset;
 	int i, stored;
 	struct ext3_dir_entry_2 *de;
-	struct super_block *sb;
 	int err;
 	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
 	int ret = 0;
 	int dir_has_error = 0;
 
-	sb = inode->i_sb;
-
-	if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
-				    EXT3_FEATURE_COMPAT_DIR_INDEX) &&
-	    ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
-	     ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
+	if (is_dx_dir(inode)) {
 		err = ext3_dx_readdir(filp, dirent, filldir);
 		if (err != ERR_BAD_DX_DIR) {
 			ret = err;
@@ -227,22 +226,87 @@  out:
 	return ret;
 }
 
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+	return is_compat_task();
+#else
+	return (BITS_PER_LONG == 32);
+#endif
+}
+
 /*
  * These functions convert from the major/minor hash to an f_pos
- * value.
+ * value for dx directories
  *
- * Currently we only use major hash numer.  This is unfortunate, but
- * on 32-bit machines, the same VFS interface is used for lseek and
- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
- * lseek/telldir/seekdir will blow out spectacularly, and from within
- * the ext2 low-level routine, we don't know if we're being called by
- * a 64-bit version of the system call or the 32-bit version of the
- * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
- * cookie.  Sigh.
+ * Upper layers (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly.  When ext3 is mounted directly on a 32-bit
+ * or 64-bit node, neither flag is set; in that case the hash width is
+ * chosen according to whether the caller is using the 32-bit API.
  */
-#define hash2pos(major, minor)	(major >> 1)
-#define pos2maj_hash(pos)	((pos << 1) & 0xffffffff)
-#define pos2min_hash(pos)	(0)
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return major >> 1;
+	else
+		return ((__u64)(major >> 1) << 32) | (__u64)minor;
+}
+
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return (pos << 1) & 0xffffffff;
+	else
+		return ((pos >> 32) << 1) & 0xffffffff;
+}
+
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return 0;
+	else
+		return pos & 0xffffffff;
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories
+ */
+static inline loff_t ext3_get_htree_eof(struct file *filp)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return EXT3_HTREE_EOF_32BIT;
+	else
+		return EXT3_HTREE_EOF_64BIT;
+}
+
+
+/*
+ * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
+ * non-htree and htree directories, where the "offset" is in terms
+ * of the filename hash value instead of the byte offset.
+ *
+ * Because we may return a 64-bit hash that is well beyond s_maxbytes,
+ * we need to pass the max hash as the maximum allowable offset in
+ * the htree directory case.
+ *
+ * NOTE: offsets obtained *before* EXT3_INDEX_FL is set on the directory
+ *       become invalid once the directory is converted into a dx directory
+ */
+loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_mapping->host;
+	int dx_dir = is_dx_dir(inode);
+
+	if (likely(dx_dir))
+		return generic_file_llseek_size(file, offset, origin,
+					        ext3_get_htree_eof(file));
+	else
+		return generic_file_llseek(file, offset, origin);
+}
 
 /*
  * This structure holds the nodes of the red-black tree used to store
@@ -303,15 +367,16 @@  static void free_rb_tree_fname(struct rb_root *root)
 }
 
 
-static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos)
+static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
+							   loff_t pos)
 {
 	struct dir_private_info *p;
 
 	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
 	if (!p)
 		return NULL;
-	p->curr_hash = pos2maj_hash(pos);
-	p->curr_minor_hash = pos2min_hash(pos);
+	p->curr_hash = pos2maj_hash(filp, pos);
+	p->curr_minor_hash = pos2min_hash(filp, pos);
 	return p;
 }
 
@@ -401,7 +466,7 @@  static int call_filldir(struct file * filp, void * dirent,
 		printk("call_filldir: called with null fname?!?\n");
 		return 0;
 	}
-	curr_pos = hash2pos(fname->hash, fname->minor_hash);
+	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
 	while (fname) {
 		error = filldir(dirent, fname->name,
 				fname->name_len, curr_pos,
@@ -426,13 +491,13 @@  static int ext3_dx_readdir(struct file * filp,
 	int	ret;
 
 	if (!info) {
-		info = ext3_htree_create_dir_info(filp->f_pos);
+		info = ext3_htree_create_dir_info(filp, filp->f_pos);
 		if (!info)
 			return -ENOMEM;
 		filp->private_data = info;
 	}
 
-	if (filp->f_pos == EXT3_HTREE_EOF)
+	if (filp->f_pos == ext3_get_htree_eof(filp))
 		return 0;	/* EOF */
 
 	/* Some one has messed with f_pos; reset the world */
@@ -440,8 +505,8 @@  static int ext3_dx_readdir(struct file * filp,
 		free_rb_tree_fname(&info->root);
 		info->curr_node = NULL;
 		info->extra_fname = NULL;
-		info->curr_hash = pos2maj_hash(filp->f_pos);
-		info->curr_minor_hash = pos2min_hash(filp->f_pos);
+		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
+		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
 	}
 
 	/*
@@ -473,7 +538,7 @@  static int ext3_dx_readdir(struct file * filp,
 			if (ret < 0)
 				return ret;
 			if (ret == 0) {
-				filp->f_pos = EXT3_HTREE_EOF;
+				filp->f_pos = ext3_get_htree_eof(filp);
 				break;
 			}
 			info->curr_node = rb_first(&info->root);
@@ -493,7 +558,7 @@  static int ext3_dx_readdir(struct file * filp,
 			info->curr_minor_hash = fname->minor_hash;
 		} else {
 			if (info->next_hash == ~0) {
-				filp->f_pos = EXT3_HTREE_EOF;
+				filp->f_pos = ext3_get_htree_eof(filp);
 				break;
 			}
 			info->curr_hash = info->next_hash;
@@ -512,3 +577,15 @@  static int ext3_release_dir (struct inode * inode, struct file * filp)
 
 	return 0;
 }
+
+const struct file_operations ext3_dir_operations = {
+	.llseek		= ext3_dir_llseek,
+	.read		= generic_read_dir,
+	.readdir	= ext3_readdir,
+	.unlocked_ioctl = ext3_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext3_compat_ioctl,
+#endif
+	.fsync		= ext3_sync_file,
+	.release	= ext3_release_dir,
+};
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
index b6515fd..fe5bef7 100644
--- a/fs/ext3/ext3.h
+++ b/fs/ext3/ext3.h
@@ -920,7 +920,11 @@  struct dx_hash_info
 	u32		*seed;
 };
 
-#define EXT3_HTREE_EOF	0x7fffffff
+
+/* EOF markers for dx directories: the largest signed 32- and 64-bit values */
+#define EXT3_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
+#define EXT3_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)
+
 
 /*
  * Control parameters used by ext3_htree_next_block
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index d10231d..ede315c 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -198,8 +198,8 @@  int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 		return -1;
 	}
 	hash = hash & ~1;
-	if (hash == (EXT3_HTREE_EOF << 1))
-		hash = (EXT3_HTREE_EOF-1) << 1;
+	if (hash == (EXT3_HTREE_EOF_32BIT << 1))
+		hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
 	hinfo->hash = hash;
 	hinfo->minor_hash = minor_hash;
 	return 0;