diff mbox series

[e2fsprogs,3/9] libe2p: Helpers for configuring the encoding superblock fields

Message ID 20181015211220.27370-4-krisman@collabora.co.uk
State Superseded
Headers show
Series Support encoding awareness and casefold | expand

Commit Message

Gabriel Krisman Bertazi Oct. 15, 2018, 9:12 p.m. UTC
Implement helper functions to convert the encoding name and specific
parameters requested by the user on the command line into the format
that is written to disk.

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
---
 lib/e2p/Makefile.in  |  8 +++--
 lib/e2p/e2p.h        |  4 +++
 lib/e2p/encoding.c   | 76 ++++++++++++++++++++++++++++++++++++++++++++
 lib/ext2fs/ext2_fs.h | 13 ++++++++
 4 files changed, 99 insertions(+), 2 deletions(-)
 create mode 100644 lib/e2p/encoding.c

Comments

Theodore Ts'o Nov. 19, 2018, 4:27 a.m. UTC | #1
On Mon, Oct 15, 2018 at 05:12:14PM -0400, Gabriel Krisman Bertazi wrote:
> diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
> index f1c405b76339..df8ced088f38 100644
> --- a/lib/ext2fs/ext2_fs.h
> +++ b/lib/ext2fs/ext2_fs.h
> @@ -1127,4 +1127,17 @@ struct mmp_struct {
>   */
>  #define EXT4_INLINE_DATA_DOTDOT_SIZE	(4)
>  
> +#define EXT4_ENC_STRICT_MODE_FL		(1 << 0) /* Reject invalid sequences? */

Why the question mark?

> +#define UTF8_NORMALIZATION_TYPE_NFKD	(1 << 1)
> +#define UTF8_CASEFOLD_TYPE_NFKDCF	(1 << 4)
> +
> +static const struct ext4_sb_encoding_map {
> +	char *name;
> +	__u16 default_flags;
> +} ext4_encoding_map[] = {
> +	/* 0x0 */ { "ascii", 0x0},
> +	/* 0x1 */ {"utf8-10.0.0", UTF8_NORMALIZATION_TYPE_NFKD|UTF8_CASEFOLD_TYPE_NFKDCF},
> +	{0x0, 0x0},
> +};
> +
>  #endif	/* _LINUX_EXT2_FS_H */

What uses this?  I can't find any other references in either the kernel or
e2fsprogs patches.

						- Ted
Gabriel Krisman Bertazi Nov. 19, 2018, 3:28 p.m. UTC | #2
"Theodore Y. Ts'o" <tytso@mit.edu> writes:

> On Mon, Oct 15, 2018 at 05:12:14PM -0400, Gabriel Krisman Bertazi wrote:
>> +#define EXT4_ENC_STRICT_MODE_FL		(1 << 0) /* Reject invalid sequences? */
>
> Why the question mark?

Hi Ted,

The question mark is very redundant for a flag, I admit :).  It was meant
to say "Whether to reject invalid sequences" or something like that.  Will
fix in v2.

>> +#define UTF8_NORMALIZATION_TYPE_NFKD	(1 << 1)
>> +#define UTF8_CASEFOLD_TYPE_NFKDCF	(1 << 4)
>> +
>> +static const struct ext4_sb_encoding_map {
>> +	char *name;
>> +	__u16 default_flags;
>> +} ext4_encoding_map[] = {
>> +	/* 0x0 */ { "ascii", 0x0},
>> +	/* 0x1 */ {"utf8-10.0.0", UTF8_NORMALIZATION_TYPE_NFKD|UTF8_CASEFOLD_TYPE_NFKDCF},
>> +	{0x0, 0x0},
>> +};
>> +
>>  #endif	/* _LINUX_EXT2_FS_H */
>
> What uses this?  I can't find any other references in either the kernel or
> e2fsprogs patches.

Only the instance ext4_encoding_map, itself, is used in this patch and
in the next one, which modifies mke2fs.  It stores the string for
comparison with what the user passed on the command line.

I guess naming the structure is unnecessary, since we have only this
single const static instance.  I will change that in the v2, as well.

The current series doesn't include the huge utf8 stuff, which makes use
of the rest of the flags, but I will add that in v2 as well.


Thanks!
Theodore Ts'o Nov. 21, 2018, 4:32 a.m. UTC | #3
On Mon, Nov 19, 2018 at 10:28:48AM -0500, Gabriel Krisman Bertazi wrote:
> 
> >> +#define UTF8_NORMALIZATION_TYPE_NFKD	(1 << 1)
> >> +#define UTF8_CASEFOLD_TYPE_NFKDCF	(1 << 4)

Where do these values come from?  And why are they (1 << 1) and (1 << 4),
respectively?

I just noticed that these are used in utf8's default flags, which then
end up getting set in the superblock.  So if these are official ext4
code points, they should have a EXT4_ prefix, not a UTF8_ prefix.  It
also seems that it's not possible to set them in mke2fs (only the
"strict" flag can be set or unset in e2p_str2encoding_flags).

So are we going to support something other than NFKD, or not?  If it's
in the superblock, then we need to make sure the kernel does something
sane if they are something other than the default.  And if we are just
going to make it be a rule that all ext4 file systems with encoding
type utf8 v10 will be NFKD, then we shouldn't let it be configurable in
the superblock.

> >> +
> >> +static const struct ext4_sb_encoding_map {
> >> +	char *name;
> >> +	__u16 default_flags;
> >> +} ext4_encoding_map[] = {
> >> +	/* 0x0 */ { "ascii", 0x0},
> >> +	/* 0x1 */ {"utf8-10.0.0", UTF8_NORMALIZATION_TYPE_NFKD|UTF8_CASEFOLD_TYPE_NFKDCF},
 
It might be enough to just use "utf8-10.0".  Internally in the Unicode
standard, they only use the X.Y notation, and given that we're already
using the utf8 short-name, as opposed to something like "UTF-8
encoding of Unicode 10.0.0", it might be better to shorten it to utf-8.

I also noticed that Unicode 11.0 was released in June 2018.  For
people interested in scripts like Georgian Mtavruli (which has new
case folding rules, so it's not just academic on our part), Hanifi
Rohingya, Mayan Numerals, Historic Sanskrit etc., in their ext4 file
names, I'm sure they'll appreciate it.  :-)

Oh, and I think the FSF will be happier if we use Unicode 11.0, since
it also features (in addition to a number of new emoji) the
Copyleft Symbol.  :-)

					- Ted
Gabriel Krisman Bertazi Nov. 21, 2018, 7:33 p.m. UTC | #4
"Theodore Y. Ts'o" <tytso@mit.edu> writes:

> On Mon, Nov 19, 2018 at 10:28:48AM -0500, Gabriel Krisman Bertazi wrote:
>> 
>> >> +#define UTF8_NORMALIZATION_TYPE_NFKD	(1 << 1)
>> >> +#define UTF8_CASEFOLD_TYPE_NFKDCF	(1 << 4)
>
> Where do these values come from?  And why are they (1 << 1) and (1 << 4),
> respectively?
>
> I just noticed that these are used in utf8's default flags, which then
> end up getting set in the superblock.  So if these are official ext4
> code points, they should have a EXT4_ prefix, not a UTF8_ prefix.  It
> also seems that it's not possible to set them in mke2fs (only the
> "strict" flag can be set or unset in e2p_str2encoding_flags).

Hi,

They come from the nls.h kernel header.  These flags are passed to the
NLS system to describe the behavior of normalization/casefold functions.

In order to maintain compatibility with previous kernel users, the utf8
module (and others, eventually) still supports the "no
normalization/casefold" policy (which I call 'plain' in the kernel).
When I merged utf8n into utf8, it became up to a flag set when loading
the nls table to decide what kind of normalization, if any, should be
done.

> So are we going to support something other than NFKD, or not?  If it's
> in the superblock, then we need to make sure the kernel does something
> sane if they are something other than the default.  And if we are just
> going to make it be a rule that all ext4 file systems with encoding
> type utf8 v10 will be NFKD, then we shouldn't let it be configurable in
> the superblock.

The NLS code in the kernel supports PLAIN and NFKD, but there is no real
reason for ext4 users to request PLAIN at all, which is only for
backward compatibility with filesystems that used the utf8 module
beforehand, so it can't be configured in e2fsprogs.  It still makes
sense to store the normalization type in the superblock though, in case
we support other normalization forms in the future and need to do some
conversion.

That said, I am not planning to support other normalization forms in
ext4 in the future.

If the kernel (nls_load_version) finds any value other than TYPE_PLAIN
(0x0) or TYPE_NFKD in the superblock when loading the NLS table, it will
fail the table creation, which, in turn,  fails the mount operation.

If you agree with the design above, I will just fix the EXT4_ prefix.

>
>> >> +
>> >> +static const struct ext4_sb_encoding_map {
>> >> +	char *name;
>> >> +	__u16 default_flags;
>> >> +} ext4_encoding_map[] = {
>> >> +	/* 0x0 */ { "ascii", 0x0},
>> >> +	/* 0x1 */ {"utf8-10.0.0", UTF8_NORMALIZATION_TYPE_NFKD|UTF8_CASEFOLD_TYPE_NFKDCF},
>
> It might be enough to just use "utf8-10.0".  Internally in the Unicode
> standard, they only use the X.Y notation, and given that we're already
> using the utf8 short-name, as opposed to something like "UTF-8
> encoding of Unicode 10.0.0", it might be better to shorten it to utf-8.
>
> I also noticed that Unicode 11.0 was released in June 2018.  For
> people interested in scripts like Georgian Mtavruli (which has new
> case folding rules, so it's not just academic on our part), Hanifi
> Rohingya, Mayan Numerals, Historic Sanskrit etc., in their ext4 file
> names, I'm sure they'll appreciate it.  :-)
>
> Oh, and I think the FSF will be happier if we use Unicode 11.0, since
> it also features (in addition to a number of new emoji) the
> Copyleft Symbol.  :-)

I can do the update!
diff mbox series

Patch

diff --git a/lib/e2p/Makefile.in b/lib/e2p/Makefile.in
index 2b0aa1915130..68d534cdaf11 100644
--- a/lib/e2p/Makefile.in
+++ b/lib/e2p/Makefile.in
@@ -19,7 +19,8 @@  all::	e2p.pc
 OBJS=		feature.o fgetflags.o fsetflags.o fgetversion.o fsetversion.o \
 		getflags.o getversion.o hashstr.o iod.o ls.o ljs.o mntopts.o \
 		parse_num.o pe.o pf.o ps.o setflags.o setversion.o uuid.o \
-		ostype.o percent.o crypto_mode.o fgetproject.o fsetproject.o
+		ostype.o percent.o crypto_mode.o fgetproject.o fsetproject.o \
+		encoding.o
 
 SRCS=		$(srcdir)/feature.c $(srcdir)/fgetflags.c \
 		$(srcdir)/fsetflags.c $(srcdir)/fgetversion.c \
@@ -29,7 +30,7 @@  SRCS=		$(srcdir)/feature.c $(srcdir)/fgetflags.c \
 		$(srcdir)/pe.c $(srcdir)/pf.c $(srcdir)/ps.c \
 		$(srcdir)/setflags.c $(srcdir)/setversion.c $(srcdir)/uuid.c \
 		$(srcdir)/ostype.c $(srcdir)/percent.c $(srcdir)/crypto_mode.c \
-		$(srcdir)/fgetproject.c $(srcdir)/fsetproject.c
+		$(srcdir)/fgetproject.c $(srcdir)/fsetproject.c $(srcdir)/encoding.c
 HFILES= e2p.h
 
 LIBRARY= libe2p
@@ -147,6 +148,9 @@  getversion.o: $(srcdir)/getversion.c $(top_builddir)/lib/config.h \
 hashstr.o: $(srcdir)/hashstr.c $(top_builddir)/lib/config.h \
  $(top_builddir)/lib/dirpaths.h $(srcdir)/e2p.h \
  $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
+encoding.o: $(srcdir)/encoding.c $(top_builddir)/lib/config.h \
+ $(top_builddir)/lib/dirpaths.h $(srcdir)/e2p.h \
+ $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
 iod.o: $(srcdir)/iod.c $(top_builddir)/lib/config.h \
  $(top_builddir)/lib/dirpaths.h $(srcdir)/e2p.h \
  $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
diff --git a/lib/e2p/e2p.h b/lib/e2p/e2p.h
index d70b59a5d358..c39074abe8eb 100644
--- a/lib/e2p/e2p.h
+++ b/lib/e2p/e2p.h
@@ -80,3 +80,7 @@  unsigned int e2p_percent(int percent, unsigned int base);
 
 const char *e2p_encmode2string(int num);
 int e2p_string2encmode(char *string);
+
+int e2p_str2encoding(const char *string);
+const char *e2p_encoding2str(int encoding);
+int e2p_str2encoding_flags(int encoding, char *param, __u16 *flags);
diff --git a/lib/e2p/encoding.c b/lib/e2p/encoding.c
new file mode 100644
index 000000000000..6904db73b94c
--- /dev/null
+++ b/lib/e2p/encoding.c
@@ -0,0 +1,76 @@ 
+/*
+ * encoding.c --- convert between encoding magic numbers and strings
+ *
+ * Copyright (C) 2018  Collabora Ltd.
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Library
+ * General Public License, version 2.
+ * %End-Header%
+ */
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <stdio.h>
+
+#include "e2p.h"
+
+static const struct enc_flags {
+	__u16 flag;
+	char *param;
+} encoding_flags[] = {
+	{ EXT4_ENC_STRICT_MODE_FL, "strict" },
+	{0, NULL},
+};
+
+/* Return a non-negative number < 0xff indicating the encoding magic number
+ * or a negative value indicating error. */
+int e2p_str2encoding(const char *string)
+{
+	int i;
+
+	for (i = 0 ; ext4_encoding_map[i].name; i++)
+		if (!strcmp(string, ext4_encoding_map[i].name))
+			return i;
+
+	return -EINVAL;
+}
+
+const char *e2p_encoding2str(int encoding)
+{
+	return ext4_encoding_map[encoding].name;
+}
+
+int e2p_str2encoding_flags(int encoding, char *param, __u16 *flags)
+{
+	char *f = strtok(param, "-");
+	const struct enc_flags *fl;
+	int neg = 0;
+
+	while (f) {
+		neg = 0;
+		if (!strncmp ("no", f, 2)) {
+			neg = 1;
+			f += 2;
+		}
+
+		for (fl = encoding_flags; fl->param; fl++) {
+			if (!strcmp(fl->param, f)) {
+				if (neg)
+					*flags &= ~fl->flag;
+				else
+					*flags |= fl->flag;
+
+				goto next_flag;
+			}
+		}
+		return -EINVAL;
+	next_flag:
+		f = strtok(NULL, "-");
+	}
+	return 0;
+}
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index f1c405b76339..df8ced088f38 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -1127,4 +1127,17 @@  struct mmp_struct {
  */
 #define EXT4_INLINE_DATA_DOTDOT_SIZE	(4)
 
+#define EXT4_ENC_STRICT_MODE_FL		(1 << 0) /* Reject invalid sequences? */
+#define UTF8_NORMALIZATION_TYPE_NFKD	(1 << 1)
+#define UTF8_CASEFOLD_TYPE_NFKDCF	(1 << 4)
+
+static const struct ext4_sb_encoding_map {
+	char *name;
+	__u16 default_flags;
+} ext4_encoding_map[] = {
+	/* 0x0 */ { "ascii", 0x0},
+	/* 0x1 */ {"utf8-10.0.0", UTF8_NORMALIZATION_TYPE_NFKD|UTF8_CASEFOLD_TYPE_NFKDCF},
+	{0x0, 0x0},
+};
+
 #endif	/* _LINUX_EXT2_FS_H */