diff mbox series

[RFC,07/13] charsets: utf8: Hook-up utf-8 code to charsets library

Message ID 20180112071234.29470-8-krisman@collabora.co.uk
State Superseded, archived
Headers show
Series UTF-8 case insensitive lookups for EXT4 | expand

Commit Message

Gabriel Krisman Bertazi Jan. 12, 2018, 7:12 a.m. UTC
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
---
 lib/charsets/Makefile    |   2 +-
 lib/charsets/utf8_core.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 lib/charsets/utf8_core.c

Comments

Weber, Olaf (HPC Data Management & Storage) Jan. 12, 2018, 10:38 a.m. UTC | #1
Hi Gabriel,

A couple of comments inline below.

Olaf Weber

> -----Original Message-----
> From: Gabriel Krisman Bertazi [mailto:krisman@collabora.co.uk]
> Sent: Friday, January 12, 2018 08:12
> To: tytso@mit.edu; david@fromorbit.com; bpm@sgi.com; olaf@sgi.com
> Cc: linux-ext4@vger.kernel.org; linux-fsdevel@vger.kernel.org;
> kernel@lists.collabora.co.uk; alvaro.soliverez@collabora.co.uk; Gabriel
> Krisman Bertazi <krisman@collabora.co.uk>
> Subject: [PATCH RFC 07/13] charsets: utf8: Hook-up utf-8 code to charsets
> library
> 
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
> ---
>  lib/charsets/Makefile    |   2 +-
>  lib/charsets/utf8_core.c | 178
> +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 179 insertions(+), 1 deletion(-)
>  create mode 100644 lib/charsets/utf8_core.c
> 
> diff --git a/lib/charsets/Makefile b/lib/charsets/Makefile
> index 95389c4193b0..5e2fa7c20a47 100644
> --- a/lib/charsets/Makefile
> +++ b/lib/charsets/Makefile
> @@ -4,7 +4,7 @@ obj-$(CONFIG_CHARSETS) += charsets.o
> 
>  obj-$(CONFIG_CHARSETS) += ascii.o
> 
> -utf8-y += utf8norm.o
> +utf8-y += utf8_core.o utf8norm.o
>  obj-$(CONFIG_UTF8_NORMALIZATION) +=  utf8.o
> 
>  $(obj)/utf8norm.o: $(obj)/utf8data.h
> diff --git a/lib/charsets/utf8_core.c b/lib/charsets/utf8_core.c
> new file mode 100644
> index 000000000000..94427670e96e
> --- /dev/null
> +++ b/lib/charsets/utf8_core.c
> @@ -0,0 +1,178 @@
> +/*
> + * Copyright (c) 2017 Collabora Ltd.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it would be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + */
> +
> +#include <linux/charsets.h>
> +#include <linux/utf8norm.h>
> +#include <linux/slab.h>
> +#include <linux/parser.h>
> +#include <linux/string.h>
> +
> +static int utf8_strncmp(const struct charset *charset, const char *str1,
> +			const char *str2, int len)
> +{
> +	const struct utf8data *data = utf8nfkdi(charset->version);
> +	struct utf8cursor cur1, cur2;
> +	unsigned char c1, c2;
> +	int r, i;
> +
> +	r = utf8cursor(&cur1, data, str1);
> +	if (r < 0)
> +		return -EIO;
> +	r = utf8cursor(&cur2, data, str2);
> +	if (r < 0)
> +		return -EIO;
> +
> +	for (i = 0 ; i < len ; i++) {
> +		c1 = utf8byte(&cur1);
> +		c2 = utf8byte(&cur2);
> +
> +		if (!c1 || !c2 || c1 != c2)
> +			return 1;
> +
> +	}
> +
> +	return 0;
> +}

This function is broken, but the reasons why illustrate the traps and pitfalls of working
with utf8 code and limited length buffers.

As written, if str1 or str2 doesn't trip the len check, then utf8_strncmp() returns 1 (not equal).
It does this even if the strings are equal. The check in the loop would have to be something
like this instead:
		if (c1 != c2)
			return 1;
		if (!c1) /* implies !c2 as well */
			return 0;

But this is not the only problem. The 'len' limit applies to the input strings. So you need to tell
the utf8byte() routine that it applies. In other words, use utf8ncursor() which takes an additional
length parameter to set up the cursors.

With this change, utf8byte() will return 0 when it hits the end of the input string due to seeing a
null byte or having consumed all characters, provided that it is not in the middle of a utf8 sequence
or an incomplete sequence of Unicode characters.

Finally, note that utf8byte() returns an int, not a char. It does this for the same reasons getc() does.

So utf8_strncmp() becomes something like the following. I'm using EINVAL instead of EIO, and note
that -EINVAL does not imply that str1 and str2 are not equal when compared as a sequence of bytes.

static int utf8_strncmp(const struct charset *charset,
			const char *str1,
			const char *str2,
			int len)
{
	const struct utf8data *data = utf8nfkdi(charset->version);
	struct utf8cursor cur1;
	struct utf8cursor cur2;
	int c1;
	int c2;

	if (utf8ncursor(&cur1, data, str1, len) < 0)
		return -EINVAL;
	if (utf8ncursor(&cur2, data, str2, len) < 0)
		return -EINVAL;

	do {
		c1 = utf8byte(&cur1);
		c2 = utf8byte(&cur2);

		if (c1 < 0 || c2 < 0)
			return -EINVAL;
		if (c1 != c2)
			return 1;
	} while (c1);

	return 0;
}


> +
> +static int utf8_strncasecmp(const struct charset *charset, const char *str1,
> +			    const char *str2, int len)
> +{
> +	const struct utf8data *data = utf8nfkdicf(charset->version);
> +	struct utf8cursor cur1, cur2;
> +	unsigned char c1, c2;
> +	int r, i;
> +
> +	r = utf8cursor(&cur1, data, str1);
> +	if (r < 0)
> +		return -EIO;
> +
> +	r = utf8cursor(&cur2, data, str2);
> +	if (r < 0)
> +		return -EIO;
> +
> +	for (i = 0 ; i < len ; i++) {
> +		c1 = utf8byte(&cur1);
> +		c2 = utf8byte(&cur2);
> +
> +		if (!c1 || !c2 || c1 != c2)
> +			return 1;
> +	}
> +
> +	return 0;
> +}

Same comments as above apply here.

> +
> +int utf8_casefold(const struct charset *charset, const char *str, int len,
> +		  char **folded_str)
> +{
> +	const struct utf8data *data = utf8nfkdicf(charset->version);
> +	struct utf8cursor cur;
> +	int i;
> +	char buffer[1024];
> +
> +	if (utf8cursor(&cur, data, str))
> +		return -EIO;
> +
> +	for (i = 0; i < (1024-1); i++) {
> +		buffer[i] = utf8byte(&cur);
> +		if (!buffer[i])
> +			break;
> +	}
> +	buffer[i] = '\0';
> +	*folded_str = kstrdup(buffer, GFP_NOFS);
> +	if (!*folded_str)
> +		return -ENOMEM;
> +
> +	return i;
> +}

I'm not sure a 1K buffer on the stack will be welcome. Maybe just use
utf8nlen() to get the target size and eat the cost of doing the normalization
twice. An advantage of using utf8nlen() is that it will validate the input string
as well.

Here too you should use utf8ncursor() to account for the len parameter. 

/*
 * Produce a newly allocated copy of str as emitted by utf8byte() on a cursor
 * built from the utf8nfkdicf() table for charset->version.
 *
 * @charset:    supplies the version used to select the table.
 * @str:        input string; at most 'len' bytes of it are consumed.
 * @len:        limit passed to utf8nlen() and utf8ncursor().
 * @folded_str: on success, receives a kmalloc()ed (GFP_NOFS), NUL-terminated
 *              buffer that the caller must kfree().  It is not written on
 *              failure.
 *
 * Returns the length reported by utf8nlen() on success (the buffer is
 * allocated one byte larger for the terminator), -EINVAL if utf8nlen()
 * rejects the input, or -ENOMEM if the allocation fails.
 */
int utf8_casefold(const struct charset *charset,
		const char *str, int len,
		char **folded_str)
{
	const struct utf8data *data = utf8nfkdicf(charset->version);
	struct utf8cursor cur;
	char *s;
	int c;
	ssize_t size;

	/* First pass: validate the input and learn the output size. */
	size = utf8nlen(data, str, len);
	if (size < 0)
		return -EINVAL;
	s = kmalloc(size + 1, GFP_NOFS);
	if (!s)
		return -ENOMEM;
	*folded_str = s;
	/*
	 * utf8nlen() verified that str is well-formed, so
	 * utf8ncursor() and utf8byte() will not fail.
	 */
	utf8ncursor(&cur, data, str, len);
	/* Second pass: copy every byte, including the terminating 0. */
	do {
		c = utf8byte(&cur);
		*s++ = c;
	} while (c);

	return size;
}

The do-while loop could be written as follows as well, but IIRC that style is discouraged these days.

	while ((*s++ = utf8byte(&cur)))
		;


> +
> +int utf8_normalize(const struct charset *charset, const char *str, int len,
> +		   char **normalization)
> +{
> +	const struct utf8data *data = utf8nfkdi(charset->version);
> +	struct utf8cursor cur;
> +	int i;
> +	char buffer[1024];
> +
> +	if (utf8cursor(&cur, data, str))
> +		return -EIO;
> +
> +	for (i = 0; i < (1024-1); i++) {
> +		buffer[i] = utf8byte(&cur);
> +		if (!buffer[i])
> +			break;
> +	}
> +	buffer[i] = '\0';
> +	*normalization = kstrdup(buffer, GFP_NOFS);
> +	if (!*normalization)
> +		return -ENOMEM;
> +
> +	return i;
> +}

Similar here.

> +
> +static const struct charset_ops utf8_ops = {
> +	.strncmp = utf8_strncmp,
> +	.strncasecmp = utf8_strncasecmp,
> +	.casefold = utf8_casefold,
> +	.normalize = utf8_normalize,
> +};
> +
> +static struct charset *utf8_load_charset(void *pargs)
> +{
> +	int maj, min, rev;
> +	unsigned int age;
> +	struct charset *charset;
> +	substring_t *args = pargs;
> +
> +	if (match_int(&args[0], &maj) || match_int(&args[1], &min) ||
> +	    match_int(&args[2], &rev))
> +		return NULL;
> +
> +	age = UNICODE_AGE(maj, min, rev);
> +
> +	if (!utf8version_is_supported(age))
> +		return NULL;

Maybe utf8version_is_supported() should be changed to take 'maj', 'min', 'rev' as separate parameters.

Olaf

> +
> +	charset = kmalloc(sizeof(struct charset), GFP_KERNEL);
> +	if (!charset)
> +		return NULL;
> +
> +	charset->info = NULL;
> +	charset->version = age;
> +	charset->ops = &utf8_ops;
> +
> +	return charset;
> +}
> +
> +static struct charset_info utf8_info = {
> +	.name = "utf8",
> +	.match_token = "utf8-%d.%d.%d",
> +	.load_charset = utf8_load_charset,
> +};
> +
> +static int __init init_utf8(void)
> +{
> +	charset_register(&utf8_info);
> +	return 0;
> +}
> +
> +static void __exit exit_utf8(void)
> +{
> +}
> +
> +module_init(init_utf8);
> +module_exit(exit_utf8);
> +MODULE_AUTHOR("Gabriel Krisman Bertazi");
> +MODULE_DESCRIPTION("UTF-8 charset operations for filesystems");
> +MODULE_LICENSE("GPL");
> +
> --
> 2.15.1
Gabriel Krisman Bertazi Jan. 16, 2018, 4:50 p.m. UTC | #2
"Weber, Olaf (HPC Data Management & Storage)" <olaf.weber@hpe.com>
writes:

> But this is not the only problem. The 'len' limit applies to the input strings. So you need to tell
> the utf8byte() routine that it applies. In other words, use utf8ncursor() which takes an additional
> length parameter to set up the cursors.
>
> With this change, utf8byte() will return 0 when it hits the end of the input string due to seeing a
> null byte or having consumed all characters, provided that it is not in the middle of a utf8 sequence
> or an incomplete sequence of Unicode characters.
>
> Finally, note that utf8byte() returns an int, not a char. It does this for the same reasons getc() does.
>
> So utf8_strncmp() becomes something like the following. I'm using EINVAL instead of EIO, and note
> that -EINVAL does not imply that str1 and str2 are not equal when compared as a sequence of bytes.
>
> static int utf8_strncmp(const struct charset *charset,
> 			const char *str1,
> 			const char *str2,
> 			int len)
> {
> 	const struct utf8data *data = utf8nfkdi(charset->version);
> 	struct utf8cursor cur1;
> 	struct utf8cursor cur2;
> 	int c1;
> 	int c2;
>
> 	if (utf8ncursor(&cur1, data, str1, len) < 0)
> 		return -EINVAL;
> 	if (utf8ncursor(&cur2, data, str2, len) < 0)
> 		return -EINVAL;
>
> 	do {
> 		c1 = utf8byte(&cur1);
> 		c2 = utf8byte(&cur2);
>
> 		if (c1 < 0 || c2 < 0)
> 			return -EINVAL;
> 		if (c1 != c2)
> 			return 1;
> 	} while (c1);
>
> 	return 0;
> }

Hi Olaf,

Thanks for your review and deep explanations.

I get your point and I've added a test case to trigger it in the
test_ucd module.

One question that I have, on the other hand: Take the version you
shared, I want to avoid the -EINVAL for the case when strings s1
and s2 should match as equal, but strlen(s1) < strlen (s2).  In this
case:

strncmp (s1, s2, strlen (s2)) => Returns 0.  Matches Ok
strncmp (s1, s2, strlen (s1)) => Returns -EINVAL

I know -EINVAL doesn't mean they don't match, but this case seems too
error prone.

I suppose we just could:

 (1) let the caller deal with it, which is error prone.  Or,

 (2) Require two lens on strncmp, one for each string, Or,

 (3) use utf8cursor for the second string, which plays bad with non-null
 terminated strings, which is important for filesystems.

Do you see an alternative? I'm leaning towards option 2.  Are you ok
with that?
Weber, Olaf (HPC Data Management & Storage) Jan. 16, 2018, 10:19 p.m. UTC | #3
Hi Gabriel,

Comments inline

> -----Original Message-----
> From: Gabriel Krisman Bertazi [mailto:krisman@collabora.co.uk]
> Sent: Tuesday, January 16, 2018 17:51
> To: Weber, Olaf (HPC Data Management & Storage) <olaf.weber@hpe.com>
> Cc: tytso@mit.edu; david@fromorbit.com; linux-ext4@vger.kernel.org;
> linux-fsdevel@vger.kernel.org; kernel@lists.collabora.co.uk;
> alvaro.soliverez@collabora.co.uk
> Subject: Re: [PATCH RFC 07/13] charsets: utf8: Hook-up utf-8 code to
> charsets library
> 
> "Weber, Olaf (HPC Data Management & Storage)" <olaf.weber@hpe.com> writes:
> 
> > But this is not the only problem. The 'len' limit applies to the input strings. So you need to tell
> > the utf8byte() routine that it applies. In other words, use utf8ncursor() which takes an additional
> > length parameter to set up the cursors.
> >
> > With this change, utf8byte() will return 0 when it hits the end of the input string due to seeing a
> > null byte or having consumed all characters, provided that it is not in the middle of a utf8 sequence
> > or an incomplete sequence of Unicode characters.
> >
> > Finally, note that utf8byte() returns an int, not a char. It does this for the same reasons getc() does.
> >
> > So utf8_strncmp() becomes something like the following. I'm using EINVAL instead of EIO, and note
> > that -EINVAL does not imply that str1 and str2 are not equal when compared as a sequence of bytes.
> >
> > static int utf8_strncmp(const struct charset *charset,
> > 			const char *str1,
> > 			const char *str2,
> > 			int len)
> > {
> > 	const struct utf8data *data = utf8nfkdi(charset->version);
> > 	struct utf8cursor cur1;
> > 	struct utf8cursor cur2;
> > 	int c1;
> > 	int c2;
> >
> > 	if (utf8ncursor(&cur1, data, str1, len) < 0)
> > 		return -EINVAL;
> > 	if (utf8ncursor(&cur2, data, str2, len) < 0)
> > 		return -EINVAL;
> >
> > 	do {
> > 		c1 = utf8byte(&cur1);
> > 		c2 = utf8byte(&cur2);
> >
> > 		if (c1 < 0 || c2 < 0)
> > 			return -EINVAL;
> > 		if (c1 != c2)
> > 			return 1;
> > 	} while (c1);
> >
> > 	return 0;
> > }
> 
> Hi Olaf,
> 
> Thanks for your review and deep explanations.
> 
> I get your point and I've added a test case to trigger it in the
> test_ucd module.
> 
> One question that I have, on the other hand: Take the version you
> shared, I want to avoid the -EINVAL for the case when strings s1
> and s2 should match as equal, but strlen(s1) < strlen (s2).  In this
> case:
> 
> strncmp (s1, s2, strlen (s2)) => Returns 0.  Matches Ok
> strncmp (s1, s2, strlen (s1)) => Returns -EINVAL
> 
> I know -EINVAL doesn't mean they don't match, but this case seems too
> error prone.

If I understand your question correctly, the case of interest is

	strncmp(s1, s2, len), where len <= strlen(s1) and len <= strlen(s2)

As far as I can tell the code I sketched above handles that case in the
way you expect/want, when taking the complications introduced by
Unicode into account. Using utf8ncursor() ensures we do get an -EINVAL
if, and only if, we read beyond the end (len) of the source string as part of
the normalization process. But if we are at an acceptable boundary in the
source string when we see the end of the string, utf8byte() returns 0,
indicating a normal/non-error end of the scan.

I think it may be worth to write some tests to (hopefully) confirm that
the code really does what I intended it to do. The most likely case to
fail would be where you hit the len-imposed end after a codepoint
with CCC != 0.

> I suppose we just could:
> 
>  (1) let the caller deal with it, which is error prone.  Or,

The caller does have to do something when it gets -EINVAL. You have to
define the desired semantics of that case.

In the original XFS-filesystem code my choice was to treat invalid UTF-8
sequences as binary blobs for the sake of comparisons.

>  (2) Require two lens on strncmp, one for each string, Or,

As a general rule this is certainly correct: each string has its
own associated maximum length within which it should have
been null-terminated.  So whether you need one len per
string depends on the sources of the strings. In the original
XFS-based code there are some tricks related to this.

>  (3) use utf8cursor for the second string, which plays bad with non-null
>  terminated strings, which is important for filesystems.

I agree, if the string is not null-terminated, then utf8ncursor() is the only
viable interface.

> Do you see an alternative? I'm leaning towards option 2.  Are you ok
> with that?

I'd say the proposed XFS code did a variant of your option 2. It also used
the available interfaces in a way that attempted to avoid memory
allocation unless absolutely necessary.

At the time I worked under the assumptions that both allocating
memory and normalizing a string were expensive, but also that I
could not permanently or even semi-permanently store normalized
forms of directory entries.

So the XFS code as written made a copy of the normalized form of the entry
being worked on, but gets the normalized bytes of the on-disk directory
entries on the fly.

This also drove the design of utf8(n)cursor()/utf8byte(): I wanted to
avoid having to do memory management as much as possible, and also
needed to work with a limited (and fixed-size) stack footprint. Basically
these interfaces were written the way they are to make that (micro-)
optimization possible. Within those constraints I tried to make them
easy to use. I may not have succeeded.

Olaf
Gabriel Krisman Bertazi Jan. 23, 2018, 3:33 a.m. UTC | #4
"Weber, Olaf (HPC Data Management & Storage)" <olaf.weber@hpe.com>
writes:


>> One question that I have, on the other hand: Take the version you
>> shared, I want to avoid the -EINVAL for the case when strings s1
>> and s2 should match as equal, but strlen(s1) < strlen (s2).  In this
>> case:
>> 
>> strncmp (s1, s2, strlen (s2)) => Returns 0.  Matches Ok
>> strncmp (s1, s2, strlen (s1)) => Returns -EINVAL
>> 
>> I know -EINVAL doesn't mean they don't match, but this case seems too
>> error prone.
>
> If I understand your question correctly, the case of interest is
>
> 	strncmp(s1, s2, len), where len <= strlen(s1) and len <= strlen(s2)
>
> As far as I can tell the code I sketched above handles that case in the
> way you expect/want, when taking the complications introduced by
> Unicode into account. Using utf8ncursor() ensures we do get an -EINVAL
> if, and only if, we read beyond the end (len) of the source string as part of
> the normalization process. But if we are at an acceptable boundary in the
> source string when we see the end of the string, utf8byte() returns 0,
> indicating a normal/non-error end of the scan.

Hey Olaf,

Sorry for the delay.

It is not quite that scenario.  The version that requires only 1 length
fails when utf8ncursor receives a len that is smaller than one of the
strings, which is a common case when something decomposes to a larger
string:

Take this case, for instance:

s1 = {0xc2, 0xbc, 0x00},   /* 'VULGAR FRACTION ONE QUARTER' decomposes to */
s2 = {0x31, 0xe2, 0x81, 0x84, 0x34, 0x00},  /* 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' */

If we do strncmp(s1, s2, strlen(s2)), it works fine.  But if we use
strlen(s1) on the third parameter, it fails.  As far as I understand,
the issue happens because utf8lookup will read to the maximum of len
characters, aborting the lookup in the middle of a sequence. Since we
don't hit a leaf for that code-point, it assumes an invalid sequence and
utf8byte aborts.

The easiest way to solve it is by receiving the two lens in strncmp.

> I think it may be worth to write some tests to (hopefully) confirm that
> the code really does what I intended it to do. The most likely case to
> fail would be where you hit the len-imposed end after a codepoint
> with CCC != 0.

The only test I had for this scenario happened to have strlen(s1) ==
strlen(s2).  I added the following, which I think catches this scenario:

/* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'  decomposes to */
/* 'LETTER A' + 'COMBINING  OGONEK' + 'COMBINING DIAERESIS' */
  s1 = {0xc3, 0xa4, 0xCC, 0xA8, 0x00},
  s2 = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00},

I tested your version, and it works correctly for this scenario too, as
long as we set the len parameter to use the largest string, s2, instead
of s1.

>> I suppose we just could:
>> 
>>  (1) let the caller deal with it, which is error prone.  Or,
>
> The caller does have to do something when it gets -EINVAL. You have to
> define the desired semantics of that case.
>
> In the original XFS-filesystem code my choice was to treat invalid UTF-8
> sequences as binary blobs for the sake of comparisons.
>
>>  (2) Require two lens on strncmp, one for each string, Or,
>
> As a general rule this is certainly correct: each string has its
> own associated maximum length within which it should have
> been null-terminated.  So whether you need one len per
> string depends on the sources of the strings. In the original
> XFS-based code there are some tricks related to this.

I've applied this solution and it is solving every test case correctly,
including those I mentioned above.  Since it looks like the best
approach, I applied the other things you commented, and modified the
comparison functions code to receive 2 lens.  I should submit a v2
shortly, once I'm done with dealing with some changes to the fs part.

Thanks!
diff mbox series

Patch

diff --git a/lib/charsets/Makefile b/lib/charsets/Makefile
index 95389c4193b0..5e2fa7c20a47 100644
--- a/lib/charsets/Makefile
+++ b/lib/charsets/Makefile
@@ -4,7 +4,7 @@  obj-$(CONFIG_CHARSETS) += charsets.o
 
 obj-$(CONFIG_CHARSETS) += ascii.o
 
-utf8-y += utf8norm.o
+utf8-y += utf8_core.o utf8norm.o
 obj-$(CONFIG_UTF8_NORMALIZATION) +=  utf8.o
 
 $(obj)/utf8norm.o: $(obj)/utf8data.h
diff --git a/lib/charsets/utf8_core.c b/lib/charsets/utf8_core.c
new file mode 100644
index 000000000000..94427670e96e
--- /dev/null
+++ b/lib/charsets/utf8_core.c
@@ -0,0 +1,178 @@ 
+/*
+ * Copyright (c) 2017 Collabora Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/charsets.h>
+#include <linux/utf8norm.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/string.h>
+
+static int utf8_strncmp(const struct charset *charset, const char *str1,
+			const char *str2, int len)
+{
+	const struct utf8data *data = utf8nfkdi(charset->version);
+	struct utf8cursor cur1, cur2;
+	unsigned char c1, c2;
+	int r, i;
+
+	r = utf8cursor(&cur1, data, str1);
+	if (r < 0)
+		return -EIO;
+	r = utf8cursor(&cur2, data, str2);
+	if (r < 0)
+		return -EIO;
+
+	for (i = 0 ; i < len ; i++) {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (!c1 || !c2 || c1 != c2)
+			return 1;
+
+	}
+
+	return 0;
+}
+
+static int utf8_strncasecmp(const struct charset *charset, const char *str1,
+			    const char *str2, int len)
+{
+	const struct utf8data *data = utf8nfkdicf(charset->version);
+	struct utf8cursor cur1, cur2;
+	unsigned char c1, c2;
+	int r, i;
+
+	r = utf8cursor(&cur1, data, str1);
+	if (r < 0)
+		return -EIO;
+
+	r = utf8cursor(&cur2, data, str2);
+	if (r < 0)
+		return -EIO;
+
+	for (i = 0 ; i < len ; i++) {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (!c1 || !c2 || c1 != c2)
+			return 1;
+	}
+
+	return 0;
+}
+
+int utf8_casefold(const struct charset *charset, const char *str, int len,
+		  char **folded_str)
+{
+	const struct utf8data *data = utf8nfkdicf(charset->version);
+	struct utf8cursor cur;
+	int i;
+	char buffer[1024];
+
+	if (utf8cursor(&cur, data, str))
+		return -EIO;
+
+	for (i = 0; i < (1024-1); i++) {
+		buffer[i] = utf8byte(&cur);
+		if (!buffer[i])
+			break;
+	}
+	buffer[i] = '\0';
+	*folded_str = kstrdup(buffer, GFP_NOFS);
+	if (!*folded_str)
+		return -ENOMEM;
+
+	return i;
+}
+
+int utf8_normalize(const struct charset *charset, const char *str, int len,
+		   char **normalization)
+{
+	const struct utf8data *data = utf8nfkdi(charset->version);
+	struct utf8cursor cur;
+	int i;
+	char buffer[1024];
+
+	if (utf8cursor(&cur, data, str))
+		return -EIO;
+
+	for (i = 0; i < (1024-1); i++) {
+		buffer[i] = utf8byte(&cur);
+		if (!buffer[i])
+			break;
+	}
+	buffer[i] = '\0';
+	*normalization = kstrdup(buffer, GFP_NOFS);
+	if (!*normalization)
+		return -ENOMEM;
+
+	return i;
+}
+
+static const struct charset_ops utf8_ops = {
+	.strncmp = utf8_strncmp,
+	.strncasecmp = utf8_strncasecmp,
+	.casefold = utf8_casefold,
+	.normalize = utf8_normalize,
+};
+
+static struct charset *utf8_load_charset(void *pargs)
+{
+	int maj, min, rev;
+	unsigned int age;
+	struct charset *charset;
+	substring_t *args = pargs;
+
+	if (match_int(&args[0], &maj) || match_int(&args[1], &min) ||
+	    match_int(&args[2], &rev))
+		return NULL;
+
+	age = UNICODE_AGE(maj, min, rev);
+
+	if (!utf8version_is_supported(age))
+		return NULL;
+
+	charset = kmalloc(sizeof(struct charset), GFP_KERNEL);
+	if (!charset)
+		return NULL;
+
+	charset->info = NULL;
+	charset->version = age;
+	charset->ops = &utf8_ops;
+
+	return charset;
+}
+
+static struct charset_info utf8_info = {
+	.name = "utf8",
+	.match_token = "utf8-%d.%d.%d",
+	.load_charset = utf8_load_charset,
+};
+
+static int __init init_utf8(void)
+{
+	charset_register(&utf8_info);
+	return 0;
+}
+
+static void __exit exit_utf8(void)
+{
+}
+
+module_init(init_utf8);
+module_exit(exit_utf8);
+MODULE_AUTHOR("Gabriel Krisman Bertazi");
+MODULE_DESCRIPTION("UTF-8 charset operations for filesystems");
+MODULE_LICENSE("GPL");
+