From patchwork Tue Jul 3 17:06:41 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938813 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCK1cZRz9s3C for ; Wed, 4 Jul 2018 03:07:25 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932834AbeGCRHX (ORCPT ); Tue, 3 Jul 2018 13:07:23 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33342 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753190AbeGCRHQ (ORCPT ); Tue, 3 Jul 2018 13:07:16 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 1062A287CA8 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 01/20] nls: Wrap uni2char/char2uni callers Date: Tue, 3 Jul 2018 13:06:41 -0400 Message-Id: <20180703170700.9306-2-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org Generated with the following Coccinelle script: @@ expression A, B, C, D; @@ ( - A->uni2char(B, C, D) + nls_uni2char(A, B, C, D) | - A->char2uni(B, C, D) + nls_char2uni(A, B, C, D) ) Signed-off-by: Gabriel Krisman Bertazi --- 
fs/befs/linuxvfs.c | 4 ++-- fs/cifs/cifs_unicode.c | 9 +++++---- fs/cifs/dir.c | 7 ++++--- fs/fat/dir.c | 8 ++++---- fs/fat/namei_vfat.c | 6 +++--- fs/hfs/trans.c | 9 +++++---- fs/hfsplus/unicode.c | 6 +++--- fs/isofs/joliet.c | 3 ++- fs/jfs/jfs_unicode.c | 7 +++---- fs/nls/nls_euc-jp.c | 4 ++-- fs/nls/nls_koi8-ru.c | 6 +++--- fs/ntfs/unistr.c | 8 ++++---- include/linux/nls.h | 13 +++++++++++++ 13 files changed, 53 insertions(+), 37 deletions(-) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 4700b4534439..0ba368fbfad4 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -542,7 +542,7 @@ befs_utf2nls(struct super_block *sb, const char *in, /* convert from Unicode to nls */ if (uni > MAX_WCHAR_T) goto conv_err; - unilen = nls->uni2char(uni, &result[o], in_len - o); + unilen = nls_uni2char(nls, uni, &result[o], in_len - o); if (unilen < 0) goto conv_err; } @@ -616,7 +616,7 @@ befs_nls2utf(struct super_block *sb, const char *in, for (i = o = 0; i < in_len; i += unilen, o += utflen) { /* convert from nls to unicode */ - unilen = nls->char2uni(&in[i], in_len - i, &uni); + unilen = nls_char2uni(nls, &in[i], in_len - i, &uni); if (unilen < 0) goto conv_err; diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index b380e0871372..3b5d48433f23 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -148,7 +148,7 @@ cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, return len; /* if character not one of seven in special remap set */ - len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); + len = nls_uni2char(cp, src_char, target, NLS_MAX_CHARSET_SIZE); if (len <= 0) goto surrogate_pair; @@ -292,7 +292,7 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len, } for (i = 0; len && *from; i++, from += charlen, len -= charlen) { - charlen = codepage->char2uni(from, len, &wchar_to); + charlen = nls_char2uni(codepage, from, len, &wchar_to); if (charlen < 1) { cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned 
%d\n", *from, charlen); @@ -518,7 +518,8 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, * as they use backslash as separator. */ if (dst_char == 0) { - charlen = cp->char2uni(source + i, srclen - i, &tmp); + charlen = nls_char2uni(cp, source + i, srclen - i, + &tmp); dst_char = cpu_to_le16(tmp); /* @@ -608,7 +609,7 @@ cifs_local_to_utf16_bytes(const char *from, int len, wchar_t wchar_to; for (i = 0; len && *from; i++, from += charlen, len -= charlen) { - charlen = codepage->char2uni(from, len, &wchar_to); + charlen = nls_char2uni(codepage, from, len, &wchar_to); /* Failed conversion defaults to a question mark */ if (charlen < 1) charlen = 1; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index ddae52bd1993..239877d9db69 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -911,7 +911,7 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q) hash = init_name_hash(dentry); for (i = 0; i < q->len; i += charlen) { - charlen = codepage->char2uni(&q->name[i], q->len - i, &c); + charlen = nls_char2uni(codepage, &q->name[i], q->len - i, &c); /* error out if we can't convert the character */ if (unlikely(charlen < 0)) return charlen; @@ -940,8 +940,9 @@ static int cifs_ci_compare(const struct dentry *dentry, for (i = 0; i < len; i += l1) { /* Convert characters in both strings to UTF-16. 
*/ - l1 = codepage->char2uni(&str[i], len - i, &c1); - l2 = codepage->char2uni(&name->name[i], name->len - i, &c2); + l1 = nls_char2uni(codepage, &str[i], len - i, &c1); + l2 = nls_char2uni(codepage, &name->name[i], name->len - i, + &c2); /* * If we can't convert either character, just declare it to diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 8e100c3bf72c..6dd8d386d0ef 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -153,7 +153,7 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { ec = *ip++; - charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE); + charlen = nls_uni2char(nls, ec, op, NLS_MAX_CHARSET_SIZE); if (charlen > 0) { op += charlen; len -= charlen; @@ -195,7 +195,7 @@ fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) { int charlen; - charlen = t->char2uni(c, clen, uni); + charlen = nls_char2uni(t, c, clen, uni); if (charlen < 0) { *uni = 0x003f; /* a question mark */ charlen = 1; @@ -210,7 +210,7 @@ fat_short2lower_uni(struct nls_table *t, unsigned char *c, int charlen; wchar_t wc; - charlen = t->char2uni(c, clen, &wc); + charlen = nls_char2uni(t, c, clen, &wc); if (charlen < 0) { *uni = 0x003f; /* a question mark */ charlen = 1; @@ -220,7 +220,7 @@ fat_short2lower_uni(struct nls_table *t, unsigned char *c, if (!nc) nc = *c; - charlen = t->char2uni(&nc, 1, uni); + charlen = nls_char2uni(t, &nc, 1, uni); if (charlen < 0) { *uni = 0x003f; /* a question mark */ charlen = 1; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 9a5469120caa..5f4f3fe059b8 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -289,7 +289,7 @@ static inline int to_shortname_char(struct nls_table *nls, return 1; } - len = nls->uni2char(*src, buf, buf_size); + len = nls_uni2char(nls, *src, buf, buf_size); if (len <= 0) { info->valid = 0; buf[0] = '_'; @@ -544,8 +544,8 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, ip += 5; i += 5; } else 
{ - charlen = nls->char2uni(ip, len - i, - (wchar_t *)op); + charlen = nls_char2uni(nls, ip, len - i, + (wchar_t *)op); if (charlen < 0) return -EINVAL; ip += charlen; diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index 39f5e343bf4d..2fae312edb31 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c @@ -49,7 +49,8 @@ int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in) while (srclen > 0) { if (nls_disk) { - size = nls_disk->char2uni(src, srclen, &ch); + size = nls_char2uni(nls_disk, src, srclen, + &ch); if (size <= 0) { ch = '?'; size = 1; @@ -62,7 +63,7 @@ int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in) } if (ch == '/') ch = ':'; - size = nls_io->uni2char(ch, dst, dstlen); + size = nls_uni2char(nls_io, ch, dst, dstlen); if (size < 0) { if (size == -ENAMETOOLONG) goto out; @@ -110,7 +111,7 @@ void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr wchar_t ch; while (srclen > 0) { - size = nls_io->char2uni(src, srclen, &ch); + size = nls_char2uni(nls_io, src, srclen, &ch); if (size < 0) { ch = '?'; size = 1; @@ -120,7 +121,7 @@ void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr if (ch == ':') ch = '/'; if (nls_disk) { - size = nls_disk->uni2char(ch, dst, dstlen); + size = nls_uni2char(nls_disk, ch, dst, dstlen); if (size < 0) { if (size == -ENAMETOOLONG) goto out; diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index dfa90c21948f..4f2908169535 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -190,7 +190,7 @@ int hfsplus_uni2asc(struct super_block *sb, c0 = ':'; break; } - res = nls->uni2char(c0, op, len); + res = nls_uni2char(nls, c0, op, len); if (res < 0) { if (res == -ENAMETOOLONG) goto out; @@ -233,7 +233,7 @@ int hfsplus_uni2asc(struct super_block *sb, cc = c0; } done: - res = nls->uni2char(cc, op, len); + res = nls_uni2char(nls, cc, op, len); if (res < 0) { if (res == -ENAMETOOLONG) goto out; @@ -256,7 +256,7 @@ int 
hfsplus_uni2asc(struct super_block *sb, static inline int asc2unichar(struct super_block *sb, const char *astr, int len, wchar_t *uc) { - int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc); + int size = nls_char2uni(HFSPLUS_SB(sb)->nls, astr, len, uc); if (size <= 0) { *uc = '?'; size = 1; diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index be8b6a9d0b92..56fac73b27a5 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -25,7 +25,8 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) while ((ch = get_unaligned(ip)) && len) { int llen; - llen = nls->uni2char(be16_to_cpu(ch), op, NLS_MAX_CHARSET_SIZE); + llen = nls_uni2char(nls, be16_to_cpu(ch), op, + NLS_MAX_CHARSET_SIZE); if (llen > 0) op += llen; else diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c index 0148e2e4d97a..4ca88ef661e9 100644 --- a/fs/jfs/jfs_unicode.c +++ b/fs/jfs/jfs_unicode.c @@ -41,9 +41,8 @@ int jfs_strfromUCS_le(char *to, const __le16 * from, for (i = 0; (i < len) && from[i]; i++) { int charlen; charlen = - codepage->uni2char(le16_to_cpu(from[i]), - &to[outlen], - NLS_MAX_CHARSET_SIZE); + nls_uni2char(codepage, le16_to_cpu(from[i]), + &to[outlen], NLS_MAX_CHARSET_SIZE); if (charlen > 0) outlen += charlen; else @@ -88,7 +87,7 @@ static int jfs_strtoUCS(wchar_t * to, const unsigned char *from, int len, if (codepage) { for (i = 0; len && *from; i++, from += charlen, len -= charlen) { - charlen = codepage->char2uni(from, len, &to[i]); + charlen = nls_char2uni(codepage, from, len, &to[i]); if (charlen < 1) { jfs_err("jfs_strtoUCS: char2uni returned %d.", charlen); diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c index 162b3f160353..eec257545f04 100644 --- a/fs/nls/nls_euc-jp.c +++ b/fs/nls/nls_euc-jp.c @@ -413,7 +413,7 @@ static int uni2char(const wchar_t uni, if (!p_nls) return -EINVAL; - if ((n = p_nls->uni2char(uni, out, boundlen)) < 0) + if ((n = nls_uni2char(p_nls, uni, out, boundlen)) < 0) return n; /* translate SJIS into EUC-JP */ @@ 
-543,7 +543,7 @@ static int char2uni(const unsigned char *rawstring, int boundlen, sjis_temp[1] = 0x00; } - if ( (n = p_nls->char2uni(sjis_temp, sizeof(sjis_temp), uni)) < 0) + if ( (n = nls_char2uni(p_nls, sjis_temp, sizeof(sjis_temp), uni)) < 0) return n; return euc_offset; diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c index a80a741a8676..32781252110d 100644 --- a/fs/nls/nls_koi8-ru.c +++ b/fs/nls/nls_koi8-ru.c @@ -28,12 +28,12 @@ static int uni2char(const wchar_t uni, else if (uni == 0x255d || uni == 0x256c) return 0; else - return p_nls->uni2char(uni, out, boundlen); + return nls_uni2char(p_nls, uni, out, boundlen); return 1; } else /* fast path */ - return p_nls->uni2char(uni, out, boundlen); + return nls_uni2char(p_nls, uni, out, boundlen); } static int char2uni(const unsigned char *rawstring, int boundlen, @@ -47,7 +47,7 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return 1; } - n = p_nls->char2uni(rawstring, boundlen, uni); + n = nls_char2uni(p_nls, rawstring, boundlen, uni); return n; } diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c index 005ca4b0f132..e0a5f33441df 100644 --- a/fs/ntfs/unistr.c +++ b/fs/ntfs/unistr.c @@ -269,8 +269,8 @@ int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); if (likely(ucs)) { for (i = o = 0; i < ins_len; i += wc_len) { - wc_len = nls->char2uni(ins + i, ins_len - i, - &wc); + wc_len = nls_char2uni(nls, ins + i, + ins_len - i, &wc); if (likely(wc_len >= 0 && o < NTFS_MAX_NAME_LEN)) { if (likely(wc)) { @@ -355,8 +355,8 @@ int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, goto mem_err_out; } for (i = o = 0; i < ins_len; i++) { -retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, - ns_len - o); +retry: wc = nls_uni2char(nls, le16_to_cpu(ins[i]), + ns + o, ns_len - o); if (wc > 0) { o += wc; continue; diff --git a/include/linux/nls.h b/include/linux/nls.h index 499e486b3722..5073ecd57279 100644 --- a/include/linux/nls.h 
+++ b/include/linux/nls.h @@ -59,6 +59,19 @@ extern int utf8s_to_utf16s(const u8 *s, int len, extern int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, u8 *s, int maxlen); +static inline int nls_uni2char(const struct nls_table *table, wchar_t uni, + unsigned char *out, int boundlen) +{ + return table->uni2char(uni, out, boundlen); +} + +static inline int nls_char2uni(const struct nls_table *table, + const unsigned char *rawstring, + int boundlen, wchar_t *uni) +{ + return table->char2uni(rawstring, boundlen, uni); +} + static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c) { unsigned char nc = t->charset2lower[c]; From patchwork Tue Jul 3 17:06:42 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938814 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCP56lYz9s1B for ; Wed, 4 Jul 2018 03:07:29 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753588AbeGCRH0 (ORCPT ); Tue, 3 Jul 2018 13:07:26 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33344 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752997AbeGCRHT (ORCPT ); Tue, 3 Jul 2018 13:07:19 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 36AFF2605C7 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, 
kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 02/20] nls: Wrap charset field access Date: Tue, 3 Jul 2018 13:06:42 -0400 Message-Id: <20180703170700.9306-3-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org Replace direct accesses to the charset field of struct nls_table with the nls_charset_name() helper, which will simplify further patches. No behavior changes intended. @@ struct nls_table *c; @@ - c->charset + nls_charset_name(c) Signed-off-by: Gabriel Krisman Bertazi --- fs/befs/linuxvfs.c | 4 ++-- fs/cifs/cifs_unicode.c | 6 +++--- fs/cifs/cifsfs.c | 2 +- fs/cifs/connect.c | 2 +- fs/fat/inode.c | 6 ++++-- fs/hfs/super.c | 6 ++++-- fs/hfsplus/options.c | 2 +- fs/isofs/inode.c | 5 +++-- fs/jfs/jfs_unicode.c | 2 +- fs/jfs/super.c | 3 ++- fs/nls/nls_base.c | 2 +- fs/ntfs/inode.c | 2 +- fs/ntfs/super.c | 6 +++--- fs/ntfs/unistr.c | 5 +++-- fs/udf/super.c | 3 ++- include/linux/nls.h | 5 +++++ 16 files changed, 37 insertions(+), 24 deletions(-) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 0ba368fbfad4..8b7af0a9011a 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -555,7 +555,7 @@ befs_utf2nls(struct super_block *sb, const char *in, conv_err: befs_error(sb, "Name using character set %s contains a character that " - "cannot be converted to unicode.", nls->charset); + "cannot be converted to unicode.", nls_charset_name(nls)); befs_debug(sb, "<--- %s", __func__); kfree(result); return -EILSEQ; @@ -635,7 +635,7 @@ befs_nls2utf(struct super_block *sb, const char *in, conv_err: befs_error(sb, "Name using character set %s contains a character that " - "cannot be converted to unicode.", nls->charset); + "cannot be converted to unicode.", nls_charset_name(nls)); befs_debug(sb, "<--- %s", __func__); kfree(result); return -EILSEQ; diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 
3b5d48433f23..ca0a514ddad6 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -156,7 +156,7 @@ cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, surrogate_pair: /* convert SURROGATE_PAIR and IVS */ - if (strcmp(cp->charset, "utf8")) + if (strcmp(nls_charset_name(cp), "utf8")) goto unknown; len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6); if (len <= 0) @@ -271,7 +271,7 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len, wchar_t wchar_to; /* needed to quiet sparse */ /* special case for utf8 to handle no plane0 chars */ - if (!strcmp(codepage->charset, "utf8")) { + if (!strcmp(nls_charset_name(codepage), "utf8")) { /* * convert utf8 -> utf16, we assume we have enough space * as caller should have assumed conversion does not overflow @@ -530,7 +530,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, goto ctoUTF16; /* convert SURROGATE_PAIR */ - if (strcmp(cp->charset, "utf8") || !wchar_to) + if (strcmp(nls_charset_name(cp), "utf8") || !wchar_to) goto unknown; if (*(source + i) & 0x80) { charlen = utf8_to_utf32(source + i, 6, &u); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d5aa7ae917bf..6bd2774d7ccd 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -401,7 +401,7 @@ cifs_show_nls(struct seq_file *s, struct nls_table *cur) /* Display iocharset= option if it's not default charset */ def = load_nls_default(); if (def != cur) - seq_printf(s, ",iocharset=%s", cur->charset); + seq_printf(s, ",iocharset=%s", nls_charset_name(cur)); unload_nls(def); } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a57da1b88bdf..cc8cf8fda6ee 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3154,7 +3154,7 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) old->mnt_dir_mode != new->mnt_dir_mode) return 0; - if (strcmp(old->local_nls->charset, new->local_nls->charset)) + if (strcmp(nls_charset_name(old->local_nls), nls_charset_name(new->local_nls))) 
return 0; if (old->actimeo != new->actimeo) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 065dc919a0ce..5b8cf1498a38 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -948,10 +948,12 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",allow_utime=%04o", opts->allow_utime); if (sbi->nls_disk) /* strip "cp" prefix from displayed option */ - seq_printf(m, ",codepage=%s", &sbi->nls_disk->charset[2]); + seq_printf(m, ",codepage=%s", + &nls_charset_name(sbi->nls_disk)[2]); if (isvfat) { if (sbi->nls_io) - seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); + seq_printf(m, ",iocharset=%s", + nls_charset_name(sbi->nls_io)); switch (opts->shortname) { case VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95: diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 173876782f73..b16ca01180a5 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -151,9 +151,11 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root) if (sbi->session >= 0) seq_printf(seq, ",session=%u", sbi->session); if (sbi->nls_disk) - seq_printf(seq, ",codepage=%s", sbi->nls_disk->charset); + seq_printf(seq, ",codepage=%s", + nls_charset_name(sbi->nls_disk)); if (sbi->nls_io) - seq_printf(seq, ",iocharset=%s", sbi->nls_io->charset); + seq_printf(seq, ",iocharset=%s", + nls_charset_name(sbi->nls_io)); if (sbi->s_quiet) seq_printf(seq, ",quiet"); return 0; diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 047e05c57560..2d6644465566 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -230,7 +230,7 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root) if (sbi->session >= 0) seq_printf(seq, ",session=%u", sbi->session); if (sbi->nls) - seq_printf(seq, ",nls=%s", sbi->nls->charset); + seq_printf(seq, ",nls=%s", nls_charset_name(sbi->nls)); if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) seq_puts(seq, ",nodecompose"); if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 
ec3fba7d492f..2f1b87a6107b 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -519,8 +519,9 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) #ifdef CONFIG_JOLIET if (sbi->s_nls_iocharset && - strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0) - seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset); + strcmp(nls_charset_name(sbi->s_nls_iocharset), CONFIG_NLS_DEFAULT) != 0) + seq_printf(m, ",iocharset=%s", + nls_charset_name(sbi->s_nls_iocharset)); #endif return 0; } diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c index 4ca88ef661e9..1e89b3b8caa7 100644 --- a/fs/jfs/jfs_unicode.c +++ b/fs/jfs/jfs_unicode.c @@ -92,7 +92,7 @@ static int jfs_strtoUCS(wchar_t * to, const unsigned char *from, int len, jfs_err("jfs_strtoUCS: char2uni returned %d.", charlen); jfs_err("charset = %s, char = 0x%x", - codepage->charset, *from); + nls_charset_name(codepage), *from); return charlen; } } diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 1b9264fd54b6..1f4542e94261 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -736,7 +736,8 @@ static int jfs_show_options(struct seq_file *seq, struct dentry *root) if (sbi->flag & JFS_DISCARD) seq_printf(seq, ",discard=%u", sbi->minblks_trim); if (sbi->nls_tab) - seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset); + seq_printf(seq, ",iocharset=%s", + nls_charset_name(sbi->nls_tab)); if (sbi->flag & JFS_ERR_CONTINUE) seq_printf(seq, ",errors=continue"); if (sbi->flag & JFS_ERR_PANIC) diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 52ccd34b1e79..e5d083b6e2b2 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -277,7 +277,7 @@ static struct nls_table *find_nls(char *charset) struct nls_table *nls; spin_lock(&nls_lock); for (nls = tables; nls; nls = nls->next) { - if (!strcmp(nls->charset, charset)) + if (!strcmp(nls_charset_name(nls), charset)) break; if (nls->alias && !strcmp(nls->alias, charset)) break; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 
decaf75d1cd5..0f1cd52cef0f 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2313,7 +2313,7 @@ int ntfs_show_options(struct seq_file *sf, struct dentry *root) seq_printf(sf, ",fmask=0%o", vol->fmask); seq_printf(sf, ",dmask=0%o", vol->dmask); } - seq_printf(sf, ",nls=%s", vol->nls_map->charset); + seq_printf(sf, ",nls=%s", nls_charset_name(vol->nls_map)); if (NVolCaseSensitive(vol)) seq_printf(sf, ",case_sensitive"); if (NVolShowSystemFiles(vol)) diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index bb7159f697f2..1c68c33e9816 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -224,7 +224,7 @@ static bool parse_options(ntfs_volume *vol, char *opt) } ntfs_error(vol->sb, "NLS character set %s not " "found. Using previous one %s.", - v, old_nls->charset); + v, nls_charset_name(old_nls)); nls_map = old_nls; } else /* nls_map */ { unload_nls(old_nls); @@ -274,7 +274,7 @@ static bool parse_options(ntfs_volume *vol, char *opt) "on remount."); return false; } /* else (!vol->nls_map) */ - ntfs_debug("Using NLS character set %s.", nls_map->charset); + ntfs_debug("Using NLS character set %s.", nls_charset_name(nls_map)); vol->nls_map = nls_map; } else /* (!nls_map) */ { if (!vol->nls_map) { @@ -285,7 +285,7 @@ static bool parse_options(ntfs_volume *vol, char *opt) return false; } ntfs_debug("Using default NLS character set (%s).", - vol->nls_map->charset); + nls_charset_name(vol->nls_map)); } } if (mft_zone_multiplier != -1) { diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c index e0a5f33441df..a30911979a55 100644 --- a/fs/ntfs/unistr.c +++ b/fs/ntfs/unistr.c @@ -297,7 +297,7 @@ int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, if (wc_len < 0) { ntfs_error(vol->sb, "Name using character set %s contains " "characters that cannot be converted to " - "Unicode.", nls->charset); + "Unicode.", nls_charset_name(nls)); i = -EILSEQ; } else /* if (o >= NTFS_MAX_NAME_LEN) */ { ntfs_error(vol->sb, "Name is too long (maximum length for a " @@ -386,7 +386,8 @@ retry: wc = 
nls_uni2char(nls, le16_to_cpu(ins[i]), conversion_err: ntfs_error(vol->sb, "Unicode name contains characters that cannot be " "converted to character set %s. You might want to " - "try to use the mount option nls=utf8.", nls->charset); + "try to use the mount option nls=utf8.", + nls_charset_name(nls)); if (ns != *outs) kfree(ns); if (wc != -ENAMETOOLONG) diff --git a/fs/udf/super.c b/fs/udf/super.c index 0c504c8031d3..6bee50cd4b0e 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -365,7 +365,8 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root) if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) seq_puts(seq, ",utf8"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map) - seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset); + seq_printf(seq, ",iocharset=%s", + nls_charset_name(sbi->s_nls_map)); return 0; } diff --git a/include/linux/nls.h b/include/linux/nls.h index 5073ecd57279..cacbcd7d63e6 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -72,6 +72,11 @@ static inline int nls_char2uni(const struct nls_table *table, return table->char2uni(rawstring, boundlen, uni); } +static inline const char *nls_charset_name(const struct nls_table *table) +{ + return table->charset; +} + static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c) { unsigned char nc = t->charset2lower[c]; From patchwork Tue Jul 3 17:06:43 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938815 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org 
[209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCQ3lQ9z9s3Z for ; Wed, 4 Jul 2018 03:07:30 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S933994AbeGCRHZ (ORCPT ); Tue, 3 Jul 2018 13:07:25 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33354 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751735AbeGCRHX (ORCPT ); Tue, 3 Jul 2018 13:07:23 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 68BC9287CA8 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 03/20] nls: Wrap charset hooks in ops structure Date: Tue, 3 Jul 2018 13:06:43 -0400 Message-Id: <20180703170700.9306-4-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org With the exception of the struct declaration, this patch was generated by the following Coccinelle script: @nlstable@ identifier p; expression uni2char_fn; expression char2uni_fn; @@ static struct nls_table p = { - .char2uni = char2uni_fn, - .uni2char = uni2char_fn, + .ops = &charset_ops, }; @createops@ identifier nlstable.p; expression nlstable.uni2char_fn; expression nlstable.char2uni_fn; @@ +static const struct nls_ops charset_ops = { + .uni2char = uni2char_fn, + .char2uni = char2uni_fn, +}; + static struct nls_table p = {}; @@ struct nls_table *c; @@ ( - c->uni2char + c->ops->uni2char | - c->char2uni + c->ops->char2uni ) Signed-off-by: Gabriel Krisman Bertazi --- fs/nls/mac-celtic.c | 8 ++++++-- fs/nls/mac-centeuro.c | 8 ++++++-- fs/nls/mac-croatian.c | 8 ++++++-- fs/nls/mac-cyrillic.c | 8 ++++++-- fs/nls/mac-gaelic.c | 8 ++++++-- 
fs/nls/mac-greek.c | 8 ++++++-- fs/nls/mac-iceland.c | 8 ++++++-- fs/nls/mac-inuit.c | 8 ++++++-- fs/nls/mac-roman.c | 8 ++++++-- fs/nls/mac-romanian.c | 8 ++++++-- fs/nls/mac-turkish.c | 8 ++++++-- fs/nls/nls_ascii.c | 8 ++++++-- fs/nls/nls_base.c | 8 ++++++-- fs/nls/nls_cp1250.c | 8 ++++++-- fs/nls/nls_cp1251.c | 8 ++++++-- fs/nls/nls_cp1255.c | 8 ++++++-- fs/nls/nls_cp437.c | 8 ++++++-- fs/nls/nls_cp737.c | 8 ++++++-- fs/nls/nls_cp775.c | 8 ++++++-- fs/nls/nls_cp850.c | 8 ++++++-- fs/nls/nls_cp852.c | 8 ++++++-- fs/nls/nls_cp855.c | 8 ++++++-- fs/nls/nls_cp857.c | 8 ++++++-- fs/nls/nls_cp860.c | 8 ++++++-- fs/nls/nls_cp861.c | 8 ++++++-- fs/nls/nls_cp862.c | 8 ++++++-- fs/nls/nls_cp863.c | 8 ++++++-- fs/nls/nls_cp864.c | 8 ++++++-- fs/nls/nls_cp865.c | 8 ++++++-- fs/nls/nls_cp866.c | 8 ++++++-- fs/nls/nls_cp869.c | 8 ++++++-- fs/nls/nls_cp874.c | 8 ++++++-- fs/nls/nls_cp932.c | 8 ++++++-- fs/nls/nls_cp936.c | 8 ++++++-- fs/nls/nls_cp949.c | 8 ++++++-- fs/nls/nls_cp950.c | 8 ++++++-- fs/nls/nls_euc-jp.c | 8 ++++++-- fs/nls/nls_iso8859-1.c | 8 ++++++-- fs/nls/nls_iso8859-13.c | 8 ++++++-- fs/nls/nls_iso8859-14.c | 8 ++++++-- fs/nls/nls_iso8859-15.c | 8 ++++++-- fs/nls/nls_iso8859-2.c | 8 ++++++-- fs/nls/nls_iso8859-3.c | 8 ++++++-- fs/nls/nls_iso8859-4.c | 8 ++++++-- fs/nls/nls_iso8859-5.c | 8 ++++++-- fs/nls/nls_iso8859-6.c | 8 ++++++-- fs/nls/nls_iso8859-7.c | 8 ++++++-- fs/nls/nls_iso8859-9.c | 8 ++++++-- fs/nls/nls_koi8-r.c | 8 ++++++-- fs/nls/nls_koi8-ru.c | 8 ++++++-- fs/nls/nls_koi8-u.c | 8 ++++++-- fs/nls/nls_utf8.c | 8 ++++++-- fs/udf/unicode.c | 4 ++-- include/linux/nls.h | 16 ++++++++++------ 54 files changed, 324 insertions(+), 112 deletions(-) diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c index 266c2d7d50bd..1b59b04f26f2 100644 --- a/fs/nls/mac-celtic.c +++ b/fs/nls/mac-celtic.c @@ -577,10 +577,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + 
.uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "macceltic", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c index 9789c6057551..d5b8f38f97b6 100644 --- a/fs/nls/mac-centeuro.c +++ b/fs/nls/mac-centeuro.c @@ -507,10 +507,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "maccenteuro", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c index bb19e7a07d43..32de6accd526 100644 --- a/fs/nls/mac-croatian.c +++ b/fs/nls/mac-croatian.c @@ -577,10 +577,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "maccroatian", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c index 2a7dea36acba..34d5c1c05ff1 100644 --- a/fs/nls/mac-cyrillic.c +++ b/fs/nls/mac-cyrillic.c @@ -472,10 +472,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "maccyrillic", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/mac-gaelic.c 
b/fs/nls/mac-gaelic.c index 77b001653588..2aabf5213176 100644 --- a/fs/nls/mac-gaelic.c +++ b/fs/nls/mac-gaelic.c @@ -542,10 +542,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "macgaelic", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c index 1eccf499e2eb..df62909ef57e 100644 --- a/fs/nls/mac-greek.c +++ b/fs/nls/mac-greek.c @@ -472,10 +472,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "macgreek", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c index cbd0875c6d69..8daa68b995bc 100644 --- a/fs/nls/mac-iceland.c +++ b/fs/nls/mac-iceland.c @@ -577,10 +577,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "maciceland", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c index fba8357aaf03..b0799693502a 100644 --- a/fs/nls/mac-inuit.c +++ b/fs/nls/mac-inuit.c @@ -507,10 +507,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static 
struct nls_table table = { .charset = "macinuit", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c index b6a98a5208cd..ba358b864b05 100644 --- a/fs/nls/mac-roman.c +++ b/fs/nls/mac-roman.c @@ -612,10 +612,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "macroman", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c index 25547f023638..7a8a7f9a0bbc 100644 --- a/fs/nls/mac-romanian.c +++ b/fs/nls/mac-romanian.c @@ -577,10 +577,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "macromanian", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c index b5454bc7b7fa..eb3c5e53ec88 100644 --- a/fs/nls/mac-turkish.c +++ b/fs/nls/mac-turkish.c @@ -577,10 +577,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "macturkish", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c index a2620650d5e4..6bad3e779284 100644 --- a/fs/nls/nls_ascii.c +++ 
b/fs/nls/nls_ascii.c @@ -142,10 +142,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "ascii", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index e5d083b6e2b2..0bb0acf6893f 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -520,10 +520,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table default_table = { .charset = "default", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c index ace3e19d3407..08902e86fc8e 100644 --- a/fs/nls/nls_cp1250.c +++ b/fs/nls/nls_cp1250.c @@ -323,10 +323,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp1250", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c index 9273ddfd08a1..2bb88c8cc5bf 100644 --- a/fs/nls/nls_cp1251.c +++ b/fs/nls/nls_cp1251.c @@ -277,10 +277,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp1251", - .uni2char = uni2char, - .char2uni = char2uni, + 
.ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c index 1caf5dfed85b..c6bf8d575c5b 100644 --- a/fs/nls/nls_cp1255.c +++ b/fs/nls/nls_cp1255.c @@ -358,11 +358,15 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp1255", .alias = "iso8859-8", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c index 7ddb830da3fd..0f3f8bdbb62b 100644 --- a/fs/nls/nls_cp437.c +++ b/fs/nls/nls_cp437.c @@ -363,10 +363,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp437", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c index c593f683a0cd..9383359ca25f 100644 --- a/fs/nls/nls_cp737.c +++ b/fs/nls/nls_cp737.c @@ -326,10 +326,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp737", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c index 554c863745f2..6c787b9079ed 100644 --- a/fs/nls/nls_cp775.c +++ b/fs/nls/nls_cp775.c @@ -295,10 +295,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t 
*uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp775", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c index 56cccd14b40b..50a57138a571 100644 --- a/fs/nls/nls_cp850.c +++ b/fs/nls/nls_cp850.c @@ -291,10 +291,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp850", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c index 7cdc05ac1d40..0cbb199f1cd5 100644 --- a/fs/nls/nls_cp852.c +++ b/fs/nls/nls_cp852.c @@ -313,10 +313,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp852", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c index 7426eea05663..530b77c86363 100644 --- a/fs/nls/nls_cp855.c +++ b/fs/nls/nls_cp855.c @@ -275,10 +275,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp855", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp857.c 
b/fs/nls/nls_cp857.c index 098309733ebd..0db642ec6f45 100644 --- a/fs/nls/nls_cp857.c +++ b/fs/nls/nls_cp857.c @@ -277,10 +277,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp857", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c index 84224478e731..44a40dac26bd 100644 --- a/fs/nls/nls_cp860.c +++ b/fs/nls/nls_cp860.c @@ -340,10 +340,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp860", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c index dc873e4be092..50e08174fc48 100644 --- a/fs/nls/nls_cp861.c +++ b/fs/nls/nls_cp861.c @@ -363,10 +363,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp861", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c index d5263e3c5566..3505f3437972 100644 --- a/fs/nls/nls_cp862.c +++ b/fs/nls/nls_cp862.c @@ -397,10 +397,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = 
{ .charset = "cp862", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c index 051c9832e36a..e3489cdc0c04 100644 --- a/fs/nls/nls_cp863.c +++ b/fs/nls/nls_cp863.c @@ -357,10 +357,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp863", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c index 97eb1273b2f7..d4185bc7f1bf 100644 --- a/fs/nls/nls_cp864.c +++ b/fs/nls/nls_cp864.c @@ -383,10 +383,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp864", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c index 111214228525..9f468944e577 100644 --- a/fs/nls/nls_cp865.c +++ b/fs/nls/nls_cp865.c @@ -363,10 +363,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp865", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c index ffdcbc3fc38d..ee46fd5a76b1 100644 --- a/fs/nls/nls_cp866.c +++ b/fs/nls/nls_cp866.c @@ -281,10 +281,14 @@ static int char2uni(const 
unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp866", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c index 3b5a34589354..da29a4a53e1d 100644 --- a/fs/nls/nls_cp869.c +++ b/fs/nls/nls_cp869.c @@ -291,10 +291,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp869", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c index 8dfaa10710fa..642659b9ed89 100644 --- a/fs/nls/nls_cp874.c +++ b/fs/nls/nls_cp874.c @@ -249,11 +249,15 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp874", .alias = "tis-620", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c index 67b7398e8483..3e7bdefdca90 100644 --- a/fs/nls/nls_cp932.c +++ b/fs/nls/nls_cp932.c @@ -7907,11 +7907,15 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return -EINVAL; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp932", .alias = "sjis", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, 
.charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c index c96546cfec9f..b1fa2918992b 100644 --- a/fs/nls/nls_cp936.c +++ b/fs/nls/nls_cp936.c @@ -11085,11 +11085,15 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return n; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp936", .alias = "gb2312", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c index 199171e97aa4..1d334095d86c 100644 --- a/fs/nls/nls_cp949.c +++ b/fs/nls/nls_cp949.c @@ -13920,11 +13920,15 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return n; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp949", .alias = "euc-kr", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c index 8e1418708209..d936160a48f9 100644 --- a/fs/nls/nls_cp950.c +++ b/fs/nls/nls_cp950.c @@ -9456,11 +9456,15 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return n; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "cp950", .alias = "big5", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c index eec257545f04..0af73982738b 100644 --- a/fs/nls/nls_euc-jp.c +++ b/fs/nls/nls_euc-jp.c @@ -549,10 +549,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return euc_offset; } +static const struct nls_ops charset_ops 
= { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "euc-jp", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, }; static int __init init_nls_euc_jp(void) diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c index 69ac020d43b1..6212b2925fa0 100644 --- a/fs/nls/nls_iso8859-1.c +++ b/fs/nls/nls_iso8859-1.c @@ -233,10 +233,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "iso8859-1", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c index afb3f8f275f0..8f0a23109207 100644 --- a/fs/nls/nls_iso8859-13.c +++ b/fs/nls/nls_iso8859-13.c @@ -261,10 +261,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "iso8859-13", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c index 046370f0b6f0..80ab77f37480 100644 --- a/fs/nls/nls_iso8859-14.c +++ b/fs/nls/nls_iso8859-14.c @@ -317,10 +317,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "iso8859-14", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_iso8859-15.c 
b/fs/nls/nls_iso8859-15.c index 7e34a841a056..5c02f93e7b20 100644 --- a/fs/nls/nls_iso8859-15.c +++ b/fs/nls/nls_iso8859-15.c @@ -283,10 +283,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "iso8859-15", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c index 7dd571181741..97afc1233da1 100644 --- a/fs/nls/nls_iso8859-2.c +++ b/fs/nls/nls_iso8859-2.c @@ -284,10 +284,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "iso8859-2", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c index 740b75ec4493..f835fcec3aae 100644 --- a/fs/nls/nls_iso8859-3.c +++ b/fs/nls/nls_iso8859-3.c @@ -284,10 +284,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "iso8859-3", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c index 8826021e32f5..14acb68fb013 100644 --- a/fs/nls/nls_iso8859-4.c +++ b/fs/nls/nls_iso8859-4.c @@ -284,10 +284,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + 
.uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "iso8859-4", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c index 7c04057a1ad8..f559bbb25045 100644 --- a/fs/nls/nls_iso8859-5.c +++ b/fs/nls/nls_iso8859-5.c @@ -248,10 +248,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "iso8859-5", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c index d4a881400d74..e3d7e28363b8 100644 --- a/fs/nls/nls_iso8859-6.c +++ b/fs/nls/nls_iso8859-6.c @@ -239,10 +239,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "iso8859-6", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c index 37b75d825a75..49fd2b24e492 100644 --- a/fs/nls/nls_iso8859-7.c +++ b/fs/nls/nls_iso8859-7.c @@ -293,10 +293,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "iso8859-7", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_iso8859-9.c 
b/fs/nls/nls_iso8859-9.c index 557b98250d37..876696f89626 100644 --- a/fs/nls/nls_iso8859-9.c +++ b/fs/nls/nls_iso8859-9.c @@ -248,10 +248,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "iso8859-9", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c index 811f232fccfb..6a85211402a8 100644 --- a/fs/nls/nls_koi8-r.c +++ b/fs/nls/nls_koi8-r.c @@ -299,10 +299,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "koi8-r", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c index 32781252110d..c4e382fd0f13 100644 --- a/fs/nls/nls_koi8-ru.c +++ b/fs/nls/nls_koi8-ru.c @@ -51,10 +51,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return n; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "koi8-ru", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, }; static int __init init_nls_koi8_ru(void) diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c index 7e029e4c188a..5f91e9cdb165 100644 --- a/fs/nls/nls_koi8-u.c +++ b/fs/nls/nls_koi8-u.c @@ -306,10 +306,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { 
.charset = "koi8-u", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index afcfbc4a14db..6988fffd5cf6 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -40,10 +40,14 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return n; } +static const struct nls_ops charset_ops = { + .uni2char = uni2char, + .char2uni = char2uni, +}; + static struct nls_table table = { .charset = "utf8", - .uni2char = uni2char, - .char2uni = char2uni, + .ops = &charset_ops, .charset2lower = identity, /* no conversion */ .charset2upper = identity, }; diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 45234791fec2..f1a9625ade43 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -178,7 +178,7 @@ static int udf_name_from_CS0(struct super_block *sb, } if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) - conv_f = UDF_SB(sb)->s_nls_map->uni2char; + conv_f = UDF_SB(sb)->s_nls_map->ops->uni2char; else conv_f = NULL; @@ -286,7 +286,7 @@ static int udf_name_to_CS0(struct super_block *sb, return 0; if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) - conv_f = UDF_SB(sb)->s_nls_map->char2uni; + conv_f = UDF_SB(sb)->s_nls_map->ops->char2uni; else conv_f = NULL; diff --git a/include/linux/nls.h b/include/linux/nls.h index cacbcd7d63e6..5d63fe6aa55e 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -22,12 +22,16 @@ typedef u16 wchar_t; /* Arbitrary Unicode character */ typedef u32 unicode_t; -struct nls_table { - const char *charset; - const char *alias; +struct nls_ops { int (*uni2char) (wchar_t uni, unsigned char *out, int boundlen); int (*char2uni) (const unsigned char *rawstring, int boundlen, wchar_t *uni); +}; + +struct nls_table { + const char *charset; + const char *alias; + const struct nls_ops *ops; const unsigned char *charset2lower; const unsigned char *charset2upper; struct module *owner; @@ -62,14 +66,14 @@ 
extern int utf16s_to_utf8s(const wchar_t *pwcs, int len, static inline int nls_uni2char(const struct nls_table *table, wchar_t uni, unsigned char *out, int boundlen) { - return table->uni2char(uni, out, boundlen); + return table->ops->uni2char(uni, out, boundlen); } static inline int nls_char2uni(const struct nls_table *table, const unsigned char *rawstring, int boundlen, wchar_t *uni) { - return table->char2uni(rawstring, boundlen, uni); + return table->ops->char2uni(rawstring, boundlen, uni); } static inline const char *nls_charset_name(const struct nls_table *table) @@ -116,7 +120,7 @@ nls_nullsize(const struct nls_table *codepage) int charlen; char tmp[NLS_MAX_CHARSET_SIZE]; - charlen = codepage->uni2char(0, tmp, NLS_MAX_CHARSET_SIZE); + charlen = codepage->ops->uni2char(0, tmp, NLS_MAX_CHARSET_SIZE); return charlen > 0 ? charlen : 1; } From patchwork Tue Jul 3 17:06:44 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938816 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCR3mJTz9s3R for ; Wed, 4 Jul 2018 03:07:31 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934074AbeGCRH1 (ORCPT ); Tue, 3 Jul 2018 13:07:27 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33356 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933565AbeGCRHZ (ORCPT ); Tue, 3 Jul 2018 13:07:25 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) 
(Authenticated sender: krisman) with ESMTPSA id A77CD2605C7 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 04/20] nls: Split default charset from NLS core Date: Tue, 3 Jul 2018 13:06:44 -0400 Message-Id: <20180703170700.9306-5-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org Changes since v1: - Fix build as a module (kbuild test robot) Signed-off-by: Gabriel Krisman Bertazi --- fs/nls/Makefile | 1 + fs/nls/nls_core.c | 94 ++++++++++++++++++++++++++++ fs/nls/{nls_base.c => nls_default.c} | 93 +++------------------------ 3 files changed, 102 insertions(+), 86 deletions(-) create mode 100644 fs/nls/nls_core.c rename fs/nls/{nls_base.c => nls_default.c} (90%) diff --git a/fs/nls/Makefile b/fs/nls/Makefile index ac54db297128..5f42ceff9d15 100644 --- a/fs/nls/Makefile +++ b/fs/nls/Makefile @@ -3,6 +3,7 @@ # Makefile for native language support # +nls_base-y := nls_core.o nls_default.o obj-$(CONFIG_NLS) += nls_base.o obj-$(CONFIG_NLS_CODEPAGE_437) += nls_cp437.o diff --git a/fs/nls/nls_core.c b/fs/nls/nls_core.c new file mode 100644 index 000000000000..3f7de8f4c5b2 --- /dev/null +++ b/fs/nls/nls_core.c @@ -0,0 +1,94 @@ +/* + * linux/fs/nls/nls_core.c + * + * Native language support--charsets and unicode translations. 
+ * By Gordon Chaffee 1996, 1997 + * + * Unicode based case conversion 1999 by Wolfram Pienkoss + * + */ + +#include +#include +#include +#include +#include +#include +#include + +static struct nls_table default_table; +static struct nls_table *tables = &default_table; +static DEFINE_SPINLOCK(nls_lock); + +int __register_nls(struct nls_table *nls, struct module *owner) +{ + struct nls_table ** tmp = &tables; + + if (nls->next) + return -EBUSY; + + nls->owner = owner; + spin_lock(&nls_lock); + while (*tmp) { + if (nls == *tmp) { + spin_unlock(&nls_lock); + return -EBUSY; + } + tmp = &(*tmp)->next; + } + nls->next = tables; + tables = nls; + spin_unlock(&nls_lock); + return 0; +} +EXPORT_SYMBOL(__register_nls); + +int unregister_nls(struct nls_table * nls) +{ + struct nls_table ** tmp = &tables; + + spin_lock(&nls_lock); + while (*tmp) { + if (nls == *tmp) { + *tmp = nls->next; + spin_unlock(&nls_lock); + return 0; + } + tmp = &(*tmp)->next; + } + spin_unlock(&nls_lock); + return -EINVAL; +} + +static struct nls_table *find_nls(char *charset) +{ + struct nls_table *nls; + spin_lock(&nls_lock); + for (nls = tables; nls; nls = nls->next) { + if (!strcmp(nls_charset_name(nls), charset)) + break; + if (nls->alias && !strcmp(nls->alias, charset)) + break; + } + if (nls && !try_module_get(nls->owner)) + nls = NULL; + spin_unlock(&nls_lock); + return nls; +} + +struct nls_table *load_nls(char *charset) +{ + return try_then_request_module(find_nls(charset), "nls_%s", charset); +} + +void unload_nls(struct nls_table *nls) +{ + if (nls) + module_put(nls->owner); +} + +EXPORT_SYMBOL(unregister_nls); +EXPORT_SYMBOL(unload_nls); +EXPORT_SYMBOL(load_nls); + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_base.c b/fs/nls/nls_default.c similarity index 90% rename from fs/nls/nls_base.c rename to fs/nls/nls_default.c index 0bb0acf6893f..c5d7e8391b22 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_default.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls/nls_base.c + * 
linux/fs/nls/nls_default.c * * Native language support--charsets and unicode translations. * By Gordon Chaffee 1996, 1997 @@ -8,23 +8,17 @@ * */ +/* + * Sample implementation from Unicode home page. + * http://www.stonehand.com/unicode/standard/fss-utf.html + */ + #include -#include -#include -#include -#include -#include -#include #include +#include static struct nls_table default_table; -static struct nls_table *tables = &default_table; -static DEFINE_SPINLOCK(nls_lock); -/* - * Sample implementation from Unicode home page. - * http://www.stonehand.com/unicode/standard/fss-utf.html - */ struct utf8_table { int cmask; int cval; @@ -232,73 +226,6 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, } EXPORT_SYMBOL(utf16s_to_utf8s); -int __register_nls(struct nls_table *nls, struct module *owner) -{ - struct nls_table ** tmp = &tables; - - if (nls->next) - return -EBUSY; - - nls->owner = owner; - spin_lock(&nls_lock); - while (*tmp) { - if (nls == *tmp) { - spin_unlock(&nls_lock); - return -EBUSY; - } - tmp = &(*tmp)->next; - } - nls->next = tables; - tables = nls; - spin_unlock(&nls_lock); - return 0; -} -EXPORT_SYMBOL(__register_nls); - -int unregister_nls(struct nls_table * nls) -{ - struct nls_table ** tmp = &tables; - - spin_lock(&nls_lock); - while (*tmp) { - if (nls == *tmp) { - *tmp = nls->next; - spin_unlock(&nls_lock); - return 0; - } - tmp = &(*tmp)->next; - } - spin_unlock(&nls_lock); - return -EINVAL; -} - -static struct nls_table *find_nls(char *charset) -{ - struct nls_table *nls; - spin_lock(&nls_lock); - for (nls = tables; nls; nls = nls->next) { - if (!strcmp(nls_charset_name(nls), charset)) - break; - if (nls->alias && !strcmp(nls->alias, charset)) - break; - } - if (nls && !try_module_get(nls->owner)) - nls = NULL; - spin_unlock(&nls_lock); - return nls; -} - -struct nls_table *load_nls(char *charset) -{ - return try_then_request_module(find_nls(charset), "nls_%s", charset); -} - -void unload_nls(struct nls_table *nls) 
-{ - if (nls) - module_put(nls->owner); -} - static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, @@ -543,10 +470,4 @@ struct nls_table *load_nls_default(void) else return &default_table; } - -EXPORT_SYMBOL(unregister_nls); -EXPORT_SYMBOL(unload_nls); -EXPORT_SYMBOL(load_nls); EXPORT_SYMBOL(load_nls_default); - -MODULE_LICENSE("Dual BSD/GPL"); From patchwork Tue Jul 3 17:06:45 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938818 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCT2TN8z9s3Z for ; Wed, 4 Jul 2018 03:07:33 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934141AbeGCRHc (ORCPT ); Tue, 3 Jul 2018 13:07:32 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33362 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933565AbeGCRH3 (ORCPT ); Tue, 3 Jul 2018 13:07:29 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 62058287CA8 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 05/20] nls: Split struct nls_charset from struct nls_table Date: Tue, 3 Jul 2018 13:06:45 -0400 Message-Id: <20180703170700.9306-6-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> 
References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org struct nls_charset carries only the information required to register the charset with the NLS core and generate the nls_table on demand. nls_table is what is given to users to manipulate the encoding. With the exception of the following files (the NLS core, its header, and the default charset), which were edited by hand, the other files were generated with the following Coccinelle patch: Files edited by hand: - fs/nls/nls_core.c - include/linux/nls.h - fs/nls/nls_default.c @nlstable@ identifier p; expression charset_str; @@ static struct nls_table p = { - .charset = charset_str, }; @createops@ identifier nlstable.p; expression nlstable.charset_str; @@ +static struct nls_charset nls_charset; static struct nls_table p = { + .charset = &nls_charset, }; + +static struct nls_charset nls_charset = { + .charset = charset_str, + .tables = &p, +}; @@ expression A; @@ ? 
return - register_nls(A); + register_nls(&nls_charset); @@ expression A; @@ - unregister_nls(A); + unregister_nls(&nls_charset); @mvalias@ identifier p; expression alias_str; @@ static struct nls_table p = { - .alias = alias_str, }; @@ expression mvalias.alias_str; @@ static struct nls_charset nls_charset = { + .alias = alias_str, }; Signed-off-by: Gabriel Krisman Bertazi --- fs/nls/mac-celtic.c | 12 ++++++++--- fs/nls/mac-centeuro.c | 12 ++++++++--- fs/nls/mac-croatian.c | 12 ++++++++--- fs/nls/mac-cyrillic.c | 12 ++++++++--- fs/nls/mac-gaelic.c | 12 ++++++++--- fs/nls/mac-greek.c | 12 ++++++++--- fs/nls/mac-iceland.c | 12 ++++++++--- fs/nls/mac-inuit.c | 12 ++++++++--- fs/nls/mac-roman.c | 12 ++++++++--- fs/nls/mac-romanian.c | 12 ++++++++--- fs/nls/mac-turkish.c | 12 ++++++++--- fs/nls/nls_ascii.c | 12 ++++++++--- fs/nls/nls_core.c | 48 +++++++++++++++++++++++++++-------------- fs/nls/nls_cp1250.c | 12 ++++++++--- fs/nls/nls_cp1251.c | 12 ++++++++--- fs/nls/nls_cp1255.c | 14 ++++++++---- fs/nls/nls_cp437.c | 12 ++++++++--- fs/nls/nls_cp737.c | 12 ++++++++--- fs/nls/nls_cp775.c | 12 ++++++++--- fs/nls/nls_cp850.c | 12 ++++++++--- fs/nls/nls_cp852.c | 12 ++++++++--- fs/nls/nls_cp855.c | 12 ++++++++--- fs/nls/nls_cp857.c | 12 ++++++++--- fs/nls/nls_cp860.c | 12 ++++++++--- fs/nls/nls_cp861.c | 12 ++++++++--- fs/nls/nls_cp862.c | 12 ++++++++--- fs/nls/nls_cp863.c | 12 ++++++++--- fs/nls/nls_cp864.c | 12 ++++++++--- fs/nls/nls_cp865.c | 12 ++++++++--- fs/nls/nls_cp866.c | 12 ++++++++--- fs/nls/nls_cp869.c | 12 ++++++++--- fs/nls/nls_cp874.c | 14 ++++++++---- fs/nls/nls_cp932.c | 14 ++++++++---- fs/nls/nls_cp936.c | 14 ++++++++---- fs/nls/nls_cp949.c | 14 ++++++++---- fs/nls/nls_cp950.c | 14 ++++++++---- fs/nls/nls_default.c | 9 ++++++-- fs/nls/nls_euc-jp.c | 12 ++++++++--- fs/nls/nls_iso8859-1.c | 12 ++++++++--- fs/nls/nls_iso8859-13.c | 12 ++++++++--- fs/nls/nls_iso8859-14.c | 12 ++++++++--- fs/nls/nls_iso8859-15.c | 12 ++++++++--- fs/nls/nls_iso8859-2.c | 12 
++++++++--- fs/nls/nls_iso8859-3.c | 12 ++++++++--- fs/nls/nls_iso8859-4.c | 12 ++++++++--- fs/nls/nls_iso8859-5.c | 12 ++++++++--- fs/nls/nls_iso8859-6.c | 12 ++++++++--- fs/nls/nls_iso8859-7.c | 12 ++++++++--- fs/nls/nls_iso8859-9.c | 12 ++++++++--- fs/nls/nls_koi8-r.c | 12 ++++++++--- fs/nls/nls_koi8-ru.c | 12 ++++++++--- fs/nls/nls_koi8-u.c | 12 ++++++++--- fs/nls/nls_utf8.c | 12 ++++++++--- include/linux/nls.h | 18 ++++++++++------ 54 files changed, 516 insertions(+), 183 deletions(-) diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c index 1b59b04f26f2..4fe7347c55d6 100644 --- a/fs/nls/mac-celtic.c +++ b/fs/nls/mac-celtic.c @@ -582,21 +582,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "macceltic", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "macceltic", + .tables = &table, +}; + static int __init init_nls_macceltic(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_macceltic(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_macceltic) diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c index d5b8f38f97b6..2d115aae4240 100644 --- a/fs/nls/mac-centeuro.c +++ b/fs/nls/mac-centeuro.c @@ -512,21 +512,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "maccenteuro", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "maccenteuro", + .tables = &table, +}; + static int __init init_nls_maccenteuro(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit 
exit_nls_maccenteuro(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_maccenteuro) diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c index 32de6accd526..b496b85fcde1 100644 --- a/fs/nls/mac-croatian.c +++ b/fs/nls/mac-croatian.c @@ -582,21 +582,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "maccroatian", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "maccroatian", + .tables = &table, +}; + static int __init init_nls_maccroatian(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_maccroatian(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_maccroatian) diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c index 34d5c1c05ff1..18c9e0eb8e58 100644 --- a/fs/nls/mac-cyrillic.c +++ b/fs/nls/mac-cyrillic.c @@ -477,21 +477,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "maccyrillic", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "maccyrillic", + .tables = &table, +}; + static int __init init_nls_maccyrillic(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_maccyrillic(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_maccyrillic) diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c index 2aabf5213176..8f8d6ae20f02 100644 --- a/fs/nls/mac-gaelic.c +++ b/fs/nls/mac-gaelic.c @@ -547,21 +547,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; 
+static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "macgaelic", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "macgaelic", + .tables = &table, +}; + static int __init init_nls_macgaelic(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_macgaelic(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_macgaelic) diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c index df62909ef57e..0e2c12fe3447 100644 --- a/fs/nls/mac-greek.c +++ b/fs/nls/mac-greek.c @@ -477,21 +477,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "macgreek", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "macgreek", + .tables = &table, +}; + static int __init init_nls_macgreek(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_macgreek(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_macgreek) diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c index 8daa68b995bc..414767fa47a4 100644 --- a/fs/nls/mac-iceland.c +++ b/fs/nls/mac-iceland.c @@ -582,21 +582,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "maciceland", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "maciceland", + .tables = &table, +}; + static int __init init_nls_maciceland(void) { - return register_nls(&table); + return 
register_nls(&nls_charset); } static void __exit exit_nls_maciceland(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_maciceland) diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c index b0799693502a..0e06fd3a0c8f 100644 --- a/fs/nls/mac-inuit.c +++ b/fs/nls/mac-inuit.c @@ -512,21 +512,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "macinuit", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "macinuit", + .tables = &table, +}; + static int __init init_nls_macinuit(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_macinuit(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_macinuit) diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c index ba358b864b05..fcfd387cfaa8 100644 --- a/fs/nls/mac-roman.c +++ b/fs/nls/mac-roman.c @@ -617,21 +617,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "macroman", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "macroman", + .tables = &table, +}; + static int __init init_nls_macroman(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_macroman(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_macroman) diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c index 7a8a7f9a0bbc..74027022a135 100644 --- a/fs/nls/mac-romanian.c +++ b/fs/nls/mac-romanian.c @@ -582,21 +582,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; 
+static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "macromanian", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "macromanian", + .tables = &table, +}; + static int __init init_nls_macromanian(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_macromanian(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_macromanian) diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c index eb3c5e53ec88..0edc0f8b1f4d 100644 --- a/fs/nls/mac-turkish.c +++ b/fs/nls/mac-turkish.c @@ -582,21 +582,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "macturkish", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "macturkish", + .tables = &table, +}; + static int __init init_nls_macturkish(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_macturkish(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_macturkish) diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c index 6bad3e779284..3c3ee908d1ed 100644 --- a/fs/nls/nls_ascii.c +++ b/fs/nls/nls_ascii.c @@ -147,21 +147,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "ascii", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "ascii", + .tables = &table, +}; + static int __init init_nls_ascii(void) { - return register_nls(&table); + return 
register_nls(&nls_charset); } static void __exit exit_nls_ascii(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_ascii) diff --git a/fs/nls/nls_core.c b/fs/nls/nls_core.c index 3f7de8f4c5b2..200a7f8165e6 100644 --- a/fs/nls/nls_core.c +++ b/fs/nls/nls_core.c @@ -16,13 +16,18 @@ #include #include -static struct nls_table default_table; -static struct nls_table *tables = &default_table; +extern struct nls_charset default_charset; +static struct nls_charset *charsets = &default_charset; static DEFINE_SPINLOCK(nls_lock); +static struct nls_table *nls_load_table(struct nls_charset *charset) +{ + /* For now, return the default table, which is the first one found. */ + return charset->tables; +} -int __register_nls(struct nls_table *nls, struct module *owner) +int __register_nls(struct nls_charset *nls, struct module *owner) { - struct nls_table ** tmp = &tables; + struct nls_charset **tmp = &charsets; if (nls->next) return -EBUSY; @@ -36,16 +41,16 @@ int __register_nls(struct nls_table *nls, struct module *owner) } tmp = &(*tmp)->next; } - nls->next = tables; - tables = nls; + nls->next = charsets; + charsets = nls; spin_unlock(&nls_lock); return 0; } EXPORT_SYMBOL(__register_nls); -int unregister_nls(struct nls_table * nls) +int unregister_nls(struct nls_charset * nls) { - struct nls_table ** tmp = &tables; + struct nls_charset **tmp = &charsets; spin_lock(&nls_lock); while (*tmp) { @@ -60,31 +65,42 @@ int unregister_nls(struct nls_table * nls) return -EINVAL; } -static struct nls_table *find_nls(char *charset) +static struct nls_charset *find_nls(const char *charset) { - struct nls_table *nls; + struct nls_charset *nls; spin_lock(&nls_lock); - for (nls = tables; nls; nls = nls->next) { - if (!strcmp(nls_charset_name(nls), charset)) + for (nls = charsets; nls; nls = nls->next) { + if (!strcmp(nls->charset, charset)) break; if (nls->alias && !strcmp(nls->alias, charset)) break; } - if (nls && !try_module_get(nls->owner)) - nls = 
NULL; + + if (!nls) + nls = ERR_PTR(-EINVAL); + else if (!try_module_get(nls->owner)) + nls = ERR_PTR(-EBUSY); + spin_unlock(&nls_lock); return nls; } struct nls_table *load_nls(char *charset) { - return try_then_request_module(find_nls(charset), "nls_%s", charset); + struct nls_charset *nls_charset; + + nls_charset = try_then_request_module(find_nls(charset), + "nls_%s", charset); + if (IS_ERR(nls_charset)) + return NULL; + + return nls_load_table(nls_charset); } void unload_nls(struct nls_table *nls) { if (nls) - module_put(nls->owner); + module_put(nls->charset->owner); } EXPORT_SYMBOL(unregister_nls); diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c index 08902e86fc8e..080717694405 100644 --- a/fs/nls/nls_cp1250.c +++ b/fs/nls/nls_cp1250.c @@ -328,20 +328,26 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp1250", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp1250", + .tables = &table, +}; + static int __init init_nls_cp1250(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp1250(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp1250) diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c index 2bb88c8cc5bf..2fba498ab289 100644 --- a/fs/nls/nls_cp1251.c +++ b/fs/nls/nls_cp1251.c @@ -282,21 +282,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp1251", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp1251", + .tables = &table, +}; + static int __init init_nls_cp1251(void) { - return
register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp1251(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp1251) diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c index c6bf8d575c5b..c268e8d8c038 100644 --- a/fs/nls/nls_cp1255.c +++ b/fs/nls/nls_cp1255.c @@ -363,22 +363,28 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp1255", - .alias = "iso8859-8", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .alias = "iso8859-8", + .charset = "cp1255", + .tables = &table, +}; + static int __init init_nls_cp1255(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp1255(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp1255) diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c index 0f3f8bdbb62b..f24f8691e720 100644 --- a/fs/nls/nls_cp437.c +++ b/fs/nls/nls_cp437.c @@ -368,21 +368,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp437", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp437", + .tables = &table, +}; + static int __init init_nls_cp437(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp437(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp437) diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c index 9383359ca25f..f5a8b9e88165 100644 --- a/fs/nls/nls_cp737.c +++ b/fs/nls/nls_cp737.c @@ -331,21 +331,27 @@ static const struct nls_ops 
charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp737", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp737", + .tables = &table, +}; + static int __init init_nls_cp737(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp737(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp737) diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c index 6c787b9079ed..d268bfb873e4 100644 --- a/fs/nls/nls_cp775.c +++ b/fs/nls/nls_cp775.c @@ -300,21 +300,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp775", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp775", + .tables = &table, +}; + static int __init init_nls_cp775(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp775(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp775) diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c index 50a57138a571..b698b0df65e3 100644 --- a/fs/nls/nls_cp850.c +++ b/fs/nls/nls_cp850.c @@ -296,21 +296,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp850", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp850", + .tables = &table, +}; + static int __init init_nls_cp850(void) { - return register_nls(&table); + return 
register_nls(&nls_charset); } static void __exit exit_nls_cp850(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp850) diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c index 0cbb199f1cd5..738e95346b34 100644 --- a/fs/nls/nls_cp852.c +++ b/fs/nls/nls_cp852.c @@ -318,21 +318,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp852", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp852", + .tables = &table, +}; + static int __init init_nls_cp852(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp852(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp852) diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c index 530b77c86363..9a1c4e307cb1 100644 --- a/fs/nls/nls_cp855.c +++ b/fs/nls/nls_cp855.c @@ -280,21 +280,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp855", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp855", + .tables = &table, +}; + static int __init init_nls_cp855(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp855(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp855) diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c index 0db642ec6f45..782e31cb9f5a 100644 --- a/fs/nls/nls_cp857.c +++ b/fs/nls/nls_cp857.c @@ -282,21 +282,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static 
struct nls_table table = { - .charset = "cp857", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp857", + .tables = &table, +}; + static int __init init_nls_cp857(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp857(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp857) diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c index 44a40dac26bd..2ad1954b84e6 100644 --- a/fs/nls/nls_cp860.c +++ b/fs/nls/nls_cp860.c @@ -345,21 +345,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp860", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp860", + .tables = &table, +}; + static int __init init_nls_cp860(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp860(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp860) diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c index 50e08174fc48..5930b0e6e8f1 100644 --- a/fs/nls/nls_cp861.c +++ b/fs/nls/nls_cp861.c @@ -368,21 +368,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp861", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp861", + .tables = &table, +}; + static int __init init_nls_cp861(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp861(void) { - unregister_nls(&table); + 
unregister_nls(&nls_charset); } module_init(init_nls_cp861) diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c index 3505f3437972..63c27b24a011 100644 --- a/fs/nls/nls_cp862.c +++ b/fs/nls/nls_cp862.c @@ -402,21 +402,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp862", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp862", + .tables = &table, +}; + static int __init init_nls_cp862(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp862(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp862) diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c index e3489cdc0c04..aa815cdc7481 100644 --- a/fs/nls/nls_cp863.c +++ b/fs/nls/nls_cp863.c @@ -362,21 +362,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp863", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp863", + .tables = &table, +}; + static int __init init_nls_cp863(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp863(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp863) diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c index d4185bc7f1bf..a20725f661e9 100644 --- a/fs/nls/nls_cp864.c +++ b/fs/nls/nls_cp864.c @@ -388,21 +388,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp864", + .charset = &nls_charset, .ops = &charset_ops, 
.charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp864", + .tables = &table, +}; + static int __init init_nls_cp864(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp864(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp864) diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c index 9f468944e577..3d22ec2bd7af 100644 --- a/fs/nls/nls_cp865.c +++ b/fs/nls/nls_cp865.c @@ -368,21 +368,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp865", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp865", + .tables = &table, +}; + static int __init init_nls_cp865(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp865(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp865) diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c index ee46fd5a76b1..35dc7b2f023a 100644 --- a/fs/nls/nls_cp866.c +++ b/fs/nls/nls_cp866.c @@ -286,21 +286,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp866", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp866", + .tables = &table, +}; + static int __init init_nls_cp866(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp866(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp866) diff --git a/fs/nls/nls_cp869.c 
b/fs/nls/nls_cp869.c index da29a4a53e1d..56504ab0f405 100644 --- a/fs/nls/nls_cp869.c +++ b/fs/nls/nls_cp869.c @@ -296,21 +296,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp869", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "cp869", + .tables = &table, +}; + static int __init init_nls_cp869(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp869(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp869) diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c index 642659b9ed89..41394620d000 100644 --- a/fs/nls/nls_cp874.c +++ b/fs/nls/nls_cp874.c @@ -254,22 +254,28 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp874", - .alias = "tis-620", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .alias = "tis-620", + .charset = "cp874", + .tables = &table, +}; + static int __init init_nls_cp874(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp874(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp874) diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c index 3e7bdefdca90..25fe26fb2603 100644 --- a/fs/nls/nls_cp932.c +++ b/fs/nls/nls_cp932.c @@ -7912,22 +7912,28 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp932", - .alias = "sjis", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, 
.charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .alias = "sjis", + .charset = "cp932", + .tables = &table, +}; + static int __init init_nls_cp932(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp932(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp932) diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c index b1fa2918992b..766f86b53a7b 100644 --- a/fs/nls/nls_cp936.c +++ b/fs/nls/nls_cp936.c @@ -11090,22 +11090,28 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp936", - .alias = "gb2312", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .alias = "gb2312", + .charset = "cp936", + .tables = &table, +}; + static int __init init_nls_cp936(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp936(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp936) diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c index 1d334095d86c..138eec74bb3f 100644 --- a/fs/nls/nls_cp949.c +++ b/fs/nls/nls_cp949.c @@ -13925,22 +13925,28 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp949", - .alias = "euc-kr", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .alias = "euc-kr", + .charset = "cp949", + .tables = &table, +}; + static int __init init_nls_cp949(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp949(void) { - unregister_nls(&table); + 
unregister_nls(&nls_charset); } module_init(init_nls_cp949) diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c index d936160a48f9..899da09fe0d7 100644 --- a/fs/nls/nls_cp950.c +++ b/fs/nls/nls_cp950.c @@ -9461,22 +9461,28 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "cp950", - .alias = "big5", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .alias = "big5", + .charset = "cp950", + .tables = &table, +}; + static int __init init_nls_cp950(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_cp950(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_cp950) diff --git a/fs/nls/nls_default.c b/fs/nls/nls_default.c index c5d7e8391b22..ef8c0efb8a3c 100644 --- a/fs/nls/nls_default.c +++ b/fs/nls/nls_default.c @@ -17,7 +17,7 @@ #include #include -static struct nls_table default_table; +struct nls_charset default_charset; struct utf8_table { int cmask; @@ -453,12 +453,17 @@ static const struct nls_ops charset_ops = { }; static struct nls_table default_table = { - .charset = "default", + .charset = &default_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +struct nls_charset default_charset = { + .charset = "default", + .tables = &default_table, +}; + /* Returns a simple default translation table */ struct nls_table *load_nls_default(void) { diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c index 0af73982738b..8bc5d9991452 100644 --- a/fs/nls/nls_euc-jp.c +++ b/fs/nls/nls_euc-jp.c @@ -554,11 +554,17 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "euc-jp", + .charset = &nls_charset, .ops = &charset_ops, 
}; +static struct nls_charset nls_charset = { + .charset = "euc-jp", + .tables = &table, +}; + static int __init init_nls_euc_jp(void) { p_nls = load_nls("cp932"); @@ -566,7 +572,7 @@ static int __init init_nls_euc_jp(void) if (p_nls) { table.charset2upper = p_nls->charset2upper; table.charset2lower = p_nls->charset2lower; - return register_nls(&table); + return register_nls(&nls_charset); } return -EINVAL; @@ -574,7 +580,7 @@ static int __init init_nls_euc_jp(void) static void __exit exit_nls_euc_jp(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); unload_nls(p_nls); } diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c index 6212b2925fa0..78e9c0169f69 100644 --- a/fs/nls/nls_iso8859-1.c +++ b/fs/nls/nls_iso8859-1.c @@ -238,21 +238,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "iso8859-1", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "iso8859-1", + .tables = &table, +}; + static int __init init_nls_iso8859_1(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_iso8859_1(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_iso8859_1) diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c index 8f0a23109207..eb8665629e0f 100644 --- a/fs/nls/nls_iso8859-13.c +++ b/fs/nls/nls_iso8859-13.c @@ -266,21 +266,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "iso8859-13", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "iso8859-13", + .tables = &table, +}; + static int __init 
init_nls_iso8859_13(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_iso8859_13(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_iso8859_13) diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c index 80ab77f37480..c8d5a48f869c 100644 --- a/fs/nls/nls_iso8859-14.c +++ b/fs/nls/nls_iso8859-14.c @@ -322,21 +322,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "iso8859-14", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "iso8859-14", + .tables = &table, +}; + static int __init init_nls_iso8859_14(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_iso8859_14(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_iso8859_14) diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c index 5c02f93e7b20..0611c6cb56b4 100644 --- a/fs/nls/nls_iso8859-15.c +++ b/fs/nls/nls_iso8859-15.c @@ -288,21 +288,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "iso8859-15", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "iso8859-15", + .tables = &table, +}; + static int __init init_nls_iso8859_15(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_iso8859_15(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_iso8859_15) diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c index 97afc1233da1..5255d92a25eb 100644 --- 
a/fs/nls/nls_iso8859-2.c +++ b/fs/nls/nls_iso8859-2.c @@ -289,21 +289,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "iso8859-2", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "iso8859-2", + .tables = &table, +}; + static int __init init_nls_iso8859_2(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_iso8859_2(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_iso8859_2) diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c index f835fcec3aae..ad1b84f3e102 100644 --- a/fs/nls/nls_iso8859-3.c +++ b/fs/nls/nls_iso8859-3.c @@ -289,21 +289,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "iso8859-3", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "iso8859-3", + .tables = &table, +}; + static int __init init_nls_iso8859_3(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_iso8859_3(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_iso8859_3) diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c index 14acb68fb013..82469deee0ba 100644 --- a/fs/nls/nls_iso8859-4.c +++ b/fs/nls/nls_iso8859-4.c @@ -289,21 +289,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "iso8859-4", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static 
struct nls_charset nls_charset = { + .charset = "iso8859-4", + .tables = &table, +}; + static int __init init_nls_iso8859_4(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_iso8859_4(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_iso8859_4) diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c index f559bbb25045..3f3cd0c28797 100644 --- a/fs/nls/nls_iso8859-5.c +++ b/fs/nls/nls_iso8859-5.c @@ -253,21 +253,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "iso8859-5", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "iso8859-5", + .tables = &table, +}; + static int __init init_nls_iso8859_5(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_iso8859_5(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_iso8859_5) diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c index e3d7e28363b8..43e6675998bc 100644 --- a/fs/nls/nls_iso8859-6.c +++ b/fs/nls/nls_iso8859-6.c @@ -244,21 +244,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "iso8859-6", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "iso8859-6", + .tables = &table, +}; + static int __init init_nls_iso8859_6(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_iso8859_6(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_iso8859_6) diff --git 
a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c index 49fd2b24e492..83893e487f82 100644 --- a/fs/nls/nls_iso8859-7.c +++ b/fs/nls/nls_iso8859-7.c @@ -298,21 +298,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "iso8859-7", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "iso8859-7", + .tables = &table, +}; + static int __init init_nls_iso8859_7(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_iso8859_7(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_iso8859_7) diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c index 876696f89626..df03f97cd9d1 100644 --- a/fs/nls/nls_iso8859-9.c +++ b/fs/nls/nls_iso8859-9.c @@ -253,21 +253,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "iso8859-9", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "iso8859-9", + .tables = &table, +}; + static int __init init_nls_iso8859_9(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_iso8859_9(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_iso8859_9) diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c index 6a85211402a8..22918e154dbe 100644 --- a/fs/nls/nls_koi8-r.c +++ b/fs/nls/nls_koi8-r.c @@ -304,21 +304,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "koi8-r", + .charset = &nls_charset, .ops = &charset_ops, 
.charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "koi8-r", + .tables = &table, +}; + static int __init init_nls_koi8_r(void) { - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_koi8_r(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_koi8_r) diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c index c4e382fd0f13..f4edbc313706 100644 --- a/fs/nls/nls_koi8-ru.c +++ b/fs/nls/nls_koi8-ru.c @@ -56,11 +56,17 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "koi8-ru", + .charset = &nls_charset, .ops = &charset_ops, }; +static struct nls_charset nls_charset = { + .charset = "koi8-ru", + .tables = &table, +}; + static int __init init_nls_koi8_ru(void) { p_nls = load_nls("koi8-u"); @@ -68,7 +74,7 @@ static int __init init_nls_koi8_ru(void) if (p_nls) { table.charset2upper = p_nls->charset2upper; table.charset2lower = p_nls->charset2lower; - return register_nls(&table); + return register_nls(&nls_charset); } return -EINVAL; @@ -76,7 +82,7 @@ static int __init init_nls_koi8_ru(void) static void __exit exit_nls_koi8_ru(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); unload_nls(p_nls); } diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c index 5f91e9cdb165..b2421625e98b 100644 --- a/fs/nls/nls_koi8-u.c +++ b/fs/nls/nls_koi8-u.c @@ -311,21 +311,27 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "koi8-u", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = charset2lower, .charset2upper = charset2upper, }; +static struct nls_charset nls_charset = { + .charset = "koi8-u", + .tables = &table, +}; + static int __init init_nls_koi8_u(void) { - return register_nls(&table); 
+ return register_nls(&nls_charset); } static void __exit exit_nls_koi8_u(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_koi8_u) diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index 6988fffd5cf6..aecf460827ac 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -45,25 +45,31 @@ static const struct nls_ops charset_ops = { .char2uni = char2uni, }; +static struct nls_charset nls_charset; static struct nls_table table = { - .charset = "utf8", + .charset = &nls_charset, .ops = &charset_ops, .charset2lower = identity, /* no conversion */ .charset2upper = identity, }; +static struct nls_charset nls_charset = { + .charset = "utf8", + .tables = &table, +}; + static int __init init_nls_utf8(void) { int i; for (i=0; i<256; i++) identity[i] = i; - return register_nls(&table); + return register_nls(&nls_charset); } static void __exit exit_nls_utf8(void) { - unregister_nls(&table); + unregister_nls(&nls_charset); } module_init(init_nls_utf8) diff --git a/include/linux/nls.h b/include/linux/nls.h index 5d63fe6aa55e..cdc95cd9e5d4 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -29,15 +29,21 @@ struct nls_ops { }; struct nls_table { - const char *charset; - const char *alias; + const struct nls_charset *charset; const struct nls_ops *ops; const unsigned char *charset2lower; const unsigned char *charset2upper; - struct module *owner; struct nls_table *next; }; +struct nls_charset { + const char *charset; + const char *alias; + struct module *owner; + struct nls_table *tables; + struct nls_charset *next; +}; + /* this value hold the maximum octet of charset */ #define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */ @@ -49,8 +55,8 @@ enum utf16_endian { }; /* nls_base.c */ -extern int __register_nls(struct nls_table *, struct module *); -extern int unregister_nls(struct nls_table *); +extern int __register_nls(struct nls_charset *, struct module *); +extern int unregister_nls(struct nls_charset *); extern struct nls_table 
*load_nls(char *); extern void unload_nls(struct nls_table *); extern struct nls_table *load_nls_default(void); @@ -78,7 +84,7 @@ static inline int nls_char2uni(const struct nls_table *table, static inline const char *nls_charset_name(const struct nls_table *table) { - return table->charset; + return table->charset->charset; } static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c) From patchwork Tue Jul 3 17:06:46 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938819 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCW6RjNz9s3Z for ; Wed, 4 Jul 2018 03:07:35 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934177AbeGCRHf (ORCPT ); Tue, 3 Jul 2018 13:07:35 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33368 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S934143AbeGCRHc (ORCPT ); Tue, 3 Jul 2018 13:07:32 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 5CFEA2605C7 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 06/20] nls: Add support for multiple versions of an encoding Date: Tue, 3 Jul 2018 13:06:46 -0400 Message-Id: <20180703170700.9306-7-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: 
<20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org NLS charsets wanting to use this feature can implement the load_table() hook, which creates a nls_table for that specific encoding version. The charset code is responsible for freeing the table memory when the module is removed. Signed-off-by: Gabriel Krisman Bertazi --- fs/nls/nls_core.c | 45 ++++++++++++++++++++++++++++++++++++++------- include/linux/nls.h | 5 +++++ 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/fs/nls/nls_core.c b/fs/nls/nls_core.c index 200a7f8165e6..19a5a66fe423 100644 --- a/fs/nls/nls_core.c +++ b/fs/nls/nls_core.c @@ -19,10 +19,27 @@ extern struct nls_charset default_charset; static struct nls_charset *charsets = &default_charset; static DEFINE_SPINLOCK(nls_lock); -static struct nls_table *nls_load_table(struct nls_charset *charset) + +static struct nls_table *nls_load_table(struct nls_charset *charset, + const char *version) { - /* For now, return the default table, which is the first one found. */ - return charset->tables; + struct nls_table *tbl; + + if (!charset->load_table) { + /* If there is no load_table hook, only 1 table is + * supported and it must have been loaded + * statically.
+ */ + return charset->tables; + } + + tbl = charset->load_table(version); + if (!tbl) { + /* Invalid version */ + return ERR_PTR(-EINVAL); + } + + return tbl; } int __register_nls(struct nls_charset *nls, struct module *owner) @@ -85,21 +102,35 @@ static struct nls_charset *find_nls(const char *charset) return nls; } -struct nls_table *load_nls(char *charset) +struct nls_table *load_nls_version(const char *charset, const char *version) { struct nls_charset *nls_charset; nls_charset = try_then_request_module(find_nls(charset), "nls_%s", charset); - if (!IS_ERR(nls_charset)) + if (IS_ERR(nls_charset)) + return ERR_PTR(-EINVAL); + + return nls_load_table(nls_charset, version); +} +EXPORT_SYMBOL(load_nls_version); + +struct nls_table *load_nls(char *charset) +{ + struct nls_table *table = load_nls_version(charset, NULL); + + /* Pre-versioned load_nls() didn't return error pointers. Let's + * keep the abi for now to prevent breakage. + */ + if (IS_ERR(table)) return NULL; - return nls_load_table(nls_charset); + return table; } void unload_nls(struct nls_table *nls) { - if (nls) + if (!IS_ERR_OR_NULL(nls)) module_put(nls->charset->owner); } diff --git a/include/linux/nls.h b/include/linux/nls.h index cdc95cd9e5d4..e422bd52afbb 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -30,6 +30,8 @@ struct nls_ops { struct nls_table { const struct nls_charset *charset; + unsigned int version; + const struct nls_ops *ops; const unsigned char *charset2lower; const unsigned char *charset2upper; @@ -42,6 +44,7 @@ struct nls_charset { struct module *owner; struct nls_table *tables; struct nls_charset *next; + struct nls_table *(*load_table)(const char *version); }; /* this value hold the maximum octet of charset */ @@ -58,6 +61,8 @@ enum utf16_endian { extern int __register_nls(struct nls_charset *, struct module *); extern int unregister_nls(struct nls_charset *); extern struct nls_table *load_nls(char *); +extern struct nls_table *load_nls_version(const char *charset, + 
const char *version); extern void unload_nls(struct nls_table *); extern struct nls_table *load_nls_default(void); #define register_nls(nls) __register_nls((nls), THIS_MODULE) From patchwork Tue Jul 3 17:06:47 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938820 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCZ0DmNz9s3x for ; Wed, 4 Jul 2018 03:07:38 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934191AbeGCRHg (ORCPT ); Tue, 3 Jul 2018 13:07:36 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33374 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S934187AbeGCRHf (ORCPT ); Tue, 3 Jul 2018 13:07:35 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 6D2F72605C7 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 07/20] nls: Add new interface for string comparisons Date: Tue, 3 Jul 2018 13:06:47 -0400 Message-Id: <20180703170700.9306-8-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org The existing stricmp() interface is 
limited by not accepting separate length parameters for each string being compared. This is a problem for charsets doing normalization or full casefold comparison, since different sized strings can still be matched. To resolve this problem, this patch implements a new interface, allowing charsets to do the comparison, if needed. The original stricmp is left in the code until all callers are converted, but it was rewritten on top of the new interface. Signed-off-by: Gabriel Krisman Bertazi --- include/linux/nls.h | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/include/linux/nls.h b/include/linux/nls.h index e422bd52afbb..d8b49f53c123 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -3,6 +3,7 @@ #define _LINUX_NLS_H #include +#include /* Unicode has changed over the years. Unicode code points no longer * fit into 16 bits; as of Unicode 5 valid code points range from 0 @@ -21,11 +22,18 @@ typedef u16 wchar_t; /* Arbitrary Unicode character */ typedef u32 unicode_t; +struct nls_table; struct nls_ops { int (*uni2char) (wchar_t uni, unsigned char *out, int boundlen); int (*char2uni) (const unsigned char *rawstring, int boundlen, wchar_t *uni); + int (*strncmp)(const struct nls_table *charset, + const unsigned char *str1, size_t len1, + const unsigned char *str2, size_t len2); + int (*strncasecmp)(const struct nls_table *charset, + const unsigned char *str1, size_t len1, + const unsigned char *str2, size_t len2); }; struct nls_table { @@ -106,10 +114,17 @@ static inline unsigned char nls_toupper(struct nls_table *t, unsigned char c) return nc ?
nc : c; } -static inline int nls_strnicmp(struct nls_table *t, const unsigned char *s1, - const unsigned char *s2, int len) +static inline int nls_strncasecmp(struct nls_table *t, + const unsigned char *s1, size_t len1, + const unsigned char *s2, size_t len2) { - while (len--) { + if (t->ops->strncasecmp) + return t->ops->strncasecmp(t, s1, len1, s2, len2); + + if (len1 != len2) + return 1; + + while (len1--) { if (nls_tolower(t, *s1++) != nls_tolower(t, *s2++)) return 1; } @@ -117,6 +132,27 @@ static inline int nls_strnicmp(struct nls_table *t, const unsigned char *s1, return 0; } +static inline int nls_strncmp(struct nls_table *t, + const unsigned char *s1, size_t len1, + const unsigned char *s2, size_t len2) +{ + if (t->ops->strncmp) + return t->ops->strncmp(t, s1, len1, s2, len2); + + if (len1 != len2) + return 1; + + /* strnicmp did not return negative values. So let's keep the + * abi for now */ + return !!memcmp(s1, s2, len1); +} + +static inline int nls_strnicmp(struct nls_table *t, const unsigned char *s1, + const unsigned char *s2, int len) +{ + return nls_strncasecmp(t, s1, len, s2, len); +} + /* * nls_nullsize - return length of null character for codepage * @codepage - codepage for which to return length of NULL terminator From patchwork Tue Jul 3 17:06:48 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938821 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCg42NWz9s2g for ; Wed, 4 Jul 2018 03:07:43 
+1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934187AbeGCRHm (ORCPT ); Tue, 3 Jul 2018 13:07:42 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33380 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S934183AbeGCRHk (ORCPT ); Tue, 3 Jul 2018 13:07:40 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id DF8662605C7 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 08/20] nls: Let charsets define the behavior of tolower/toupper Date: Tue, 3 Jul 2018 13:06:48 -0400 Message-Id: <20180703170700.9306-9-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org Instead of always reading from a table, give the charset a chance to implement tolower and toupper algorithmically. This patch was created using the semantic patch below, with the exception of the header files (hook definitions) and a fix to files that didn't have the tables statically allocated (koi8-u and cp932).
@tbl@ identifier p; expression lower_tbl; expression upper_tbl; @@ static struct nls_table p = { - .charset2lower = lower_tbl, - .charset2upper = upper_tbl, }; @@ identifier charset_ops; expression tbl.lower_tbl; expression tbl.upper_tbl; @@ + static unsigned char charset_tolower(const struct nls_table *table, unsigned int c) + { + return lower_tbl[c]; + } + + static unsigned char charset_toupper(const struct nls_table *table, unsigned int c) + { + return upper_tbl[c]; + } static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, }; @@ struct nls_table *t; expression A; expression nc; @@ ( - nc = t->charset2lower[A] + nc = nls_tolower(t, A) | - nc = t->charset2upper[A] + nc = nls_toupper(t, A) ) <... - if(!nc) - nc = A; ...> Signed-off-by: Gabriel Krisman Bertazi --- fs/fat/dir.c | 5 +---- fs/nls/mac-celtic.c | 14 ++++++++++++-- fs/nls/mac-centeuro.c | 14 ++++++++++++-- fs/nls/mac-croatian.c | 14 ++++++++++++-- fs/nls/mac-cyrillic.c | 14 ++++++++++++-- fs/nls/mac-gaelic.c | 14 ++++++++++++-- fs/nls/mac-greek.c | 14 ++++++++++++-- fs/nls/mac-iceland.c | 14 ++++++++++++-- fs/nls/mac-inuit.c | 14 ++++++++++++-- fs/nls/mac-roman.c | 14 ++++++++++++-- fs/nls/mac-romanian.c | 14 ++++++++++++-- fs/nls/mac-turkish.c | 14 ++++++++++++-- fs/nls/nls_ascii.c | 14 ++++++++++++-- fs/nls/nls_cp1250.c | 14 ++++++++++++-- fs/nls/nls_cp1251.c | 14 ++++++++++++-- fs/nls/nls_cp1255.c | 14 ++++++++++++-- fs/nls/nls_cp437.c | 14 ++++++++++++-- fs/nls/nls_cp737.c | 14 ++++++++++++-- fs/nls/nls_cp775.c | 14 ++++++++++++-- fs/nls/nls_cp850.c | 14 ++++++++++++-- fs/nls/nls_cp852.c | 14 ++++++++++++-- fs/nls/nls_cp855.c | 14 ++++++++++++-- fs/nls/nls_cp857.c | 14 ++++++++++++-- fs/nls/nls_cp860.c | 14 ++++++++++++-- fs/nls/nls_cp861.c | 14 ++++++++++++-- fs/nls/nls_cp862.c | 14 ++++++++++++-- fs/nls/nls_cp863.c | 14 ++++++++++++-- fs/nls/nls_cp864.c | 14 ++++++++++++-- fs/nls/nls_cp865.c | 14 ++++++++++++-- fs/nls/nls_cp866.c | 14
++++++++++++-- fs/nls/nls_cp869.c | 14 ++++++++++++-- fs/nls/nls_cp874.c | 14 ++++++++++++-- fs/nls/nls_cp932.c | 14 ++++++++++++-- fs/nls/nls_cp936.c | 14 ++++++++++++-- fs/nls/nls_cp949.c | 14 ++++++++++++-- fs/nls/nls_cp950.c | 14 ++++++++++++-- fs/nls/nls_default.c | 14 ++++++++++++-- fs/nls/nls_euc-jp.c | 7 ++++--- fs/nls/nls_iso8859-1.c | 14 ++++++++++++-- fs/nls/nls_iso8859-13.c | 14 ++++++++++++-- fs/nls/nls_iso8859-14.c | 14 ++++++++++++-- fs/nls/nls_iso8859-15.c | 14 ++++++++++++-- fs/nls/nls_iso8859-2.c | 14 ++++++++++++-- fs/nls/nls_iso8859-3.c | 14 ++++++++++++-- fs/nls/nls_iso8859-4.c | 14 ++++++++++++-- fs/nls/nls_iso8859-5.c | 14 ++++++++++++-- fs/nls/nls_iso8859-6.c | 14 ++++++++++++-- fs/nls/nls_iso8859-7.c | 14 ++++++++++++-- fs/nls/nls_iso8859-9.c | 14 ++++++++++++-- fs/nls/nls_koi8-r.c | 14 ++++++++++++-- fs/nls/nls_koi8-ru.c | 6 +++--- fs/nls/nls_koi8-u.c | 14 ++++++++++++-- fs/nls/nls_utf8.c | 14 ++++++++++++-- include/linux/nls.h | 12 ++++++++---- 54 files changed, 616 insertions(+), 114 deletions(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 6dd8d386d0ef..897ada46568e 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -215,10 +215,7 @@ fat_short2lower_uni(struct nls_table *t, unsigned char *c, *uni = 0x003f; /* a question mark */ charlen = 1; } else if (charlen <= 1) { - unsigned char nc = t->charset2lower[*c]; - - if (!nc) - nc = *c; + unsigned char nc = nls_tolower(t, *c); charlen = nls_char2uni(t, &nc, 1, uni); if (charlen < 0) { diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c index 4fe7347c55d6..7207f9a14342 100644 --- a/fs/nls/mac-celtic.c +++ b/fs/nls/mac-celtic.c @@ -577,7 +577,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const 
struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -586,8 +598,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c index 2d115aae4240..0664408e4451 100644 --- a/fs/nls/mac-centeuro.c +++ b/fs/nls/mac-centeuro.c @@ -507,7 +507,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -516,8 +528,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c index b496b85fcde1..a4b7992ef8ec 100644 --- a/fs/nls/mac-croatian.c +++ b/fs/nls/mac-croatian.c @@ -577,7 +577,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -586,8 +598,6 @@
static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c index 18c9e0eb8e58..cb60563911ea 100644 --- a/fs/nls/mac-cyrillic.c +++ b/fs/nls/mac-cyrillic.c @@ -472,7 +472,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -481,8 +493,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c index 8f8d6ae20f02..e683881f4a13 100644 --- a/fs/nls/mac-gaelic.c +++ b/fs/nls/mac-gaelic.c @@ -542,7 +542,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -551,8 +563,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper =
charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c index 0e2c12fe3447..bd2245238512 100644 --- a/fs/nls/mac-greek.c +++ b/fs/nls/mac-greek.c @@ -472,7 +472,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -481,8 +493,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c index 414767fa47a4..3ce3e27b3660 100644 --- a/fs/nls/mac-iceland.c +++ b/fs/nls/mac-iceland.c @@ -577,7 +577,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -586,8 +598,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c index 0e06fd3a0c8f..6f12cccccb37 100644 --- a/fs/nls/mac-inuit.c +++
b/fs/nls/mac-inuit.c @@ -507,7 +507,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -516,8 +528,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c index fcfd387cfaa8..d8e411c82c69 100644 --- a/fs/nls/mac-roman.c +++ b/fs/nls/mac-roman.c @@ -612,7 +612,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -621,8 +633,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c index 74027022a135..cd638dfe9d7c 100644 --- a/fs/nls/mac-romanian.c +++ b/fs/nls/mac-romanian.c @@ -577,7 +577,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const
struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -586,8 +598,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c index 0edc0f8b1f4d..82ba6f6b4c24 100644 --- a/fs/nls/mac-turkish.c +++ b/fs/nls/mac-turkish.c @@ -577,7 +577,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -586,8 +598,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c index 3c3ee908d1ed..2f4826478d3d 100644 --- a/fs/nls/nls_ascii.c +++ b/fs/nls/nls_ascii.c @@ -142,7 +142,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return
charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -151,8 +163,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c index 080717694405..1cfe65851185 100644 --- a/fs/nls/nls_cp1250.c +++ b/fs/nls/nls_cp1250.c @@ -323,7 +323,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -332,8 +344,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c index 2fba498ab289..061eb23892f1 100644 --- a/fs/nls/nls_cp1251.c +++ b/fs/nls/nls_cp1251.c @@ -277,7 +277,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@
-286,8 +298,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c index c268e8d8c038..2a71dc175c9b 100644 --- a/fs/nls/nls_cp1255.c +++ b/fs/nls/nls_cp1255.c @@ -358,7 +358,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -367,8 +379,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c index f24f8691e720..4f763761b699 100644 --- a/fs/nls/nls_cp437.c +++ b/fs/nls/nls_cp437.c @@ -363,7 +363,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -372,8 +384,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper
= charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c index f5a8b9e88165..2f2ab91340e7 100644 --- a/fs/nls/nls_cp737.c +++ b/fs/nls/nls_cp737.c @@ -326,7 +326,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -335,8 +347,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c index d268bfb873e4..92f311e620f3 100644 --- a/fs/nls/nls_cp775.c +++ b/fs/nls/nls_cp775.c @@ -295,7 +295,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -304,8 +316,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c index b698b0df65e3..77cdce20ced6 100644 --- a/fs/nls/nls_cp850.c +++
b/fs/nls/nls_cp850.c @@ -291,7 +291,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -300,8 +312,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c index 738e95346b34..47722904e9f1 100644 --- a/fs/nls/nls_cp852.c +++ b/fs/nls/nls_cp852.c @@ -313,7 +313,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -322,8 +334,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c index 9a1c4e307cb1..b52709886900 100644 --- a/fs/nls/nls_cp855.c +++ b/fs/nls/nls_cp855.c @@ -275,7 +275,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct
nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -284,8 +296,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c index 782e31cb9f5a..fcdf30a540f8 100644 --- a/fs/nls/nls_cp857.c +++ b/fs/nls/nls_cp857.c @@ -277,7 +277,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -286,8 +298,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c index 2ad1954b84e6..a1504424e923 100644 --- a/fs/nls/nls_cp860.c +++ b/fs/nls/nls_cp860.c @@ -340,7 +340,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} +
static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -349,8 +361,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c index 5930b0e6e8f1..9fa1f54cee0d 100644 --- a/fs/nls/nls_cp861.c +++ b/fs/nls/nls_cp861.c @@ -363,7 +363,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -372,8 +384,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c index 63c27b24a011..00474e2b2102 100644 --- a/fs/nls/nls_cp862.c +++ b/fs/nls/nls_cp862.c @@ -397,7 +397,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -406,8 +418,6 @@ static struct
nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c index aa815cdc7481..908e573c1c42 100644 --- a/fs/nls/nls_cp863.c +++ b/fs/nls/nls_cp863.c @@ -357,7 +357,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -366,8 +378,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c index a20725f661e9..6cae9e9c73aa 100644 --- a/fs/nls/nls_cp864.c +++ b/fs/nls/nls_cp864.c @@ -383,7 +383,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -392,8 +404,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct
nls_charset nls_charset = { diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c index 3d22ec2bd7af..5aa6415ec357 100644 --- a/fs/nls/nls_cp865.c +++ b/fs/nls/nls_cp865.c @@ -363,7 +363,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -372,8 +384,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c index 35dc7b2f023a..f24b73839680 100644 --- a/fs/nls/nls_cp866.c +++ b/fs/nls/nls_cp866.c @@ -281,7 +281,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_tolower, + .uppercase = charset_toupper, .uni2char = uni2char, .char2uni = char2uni, }; @@ -290,8 +302,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c index 56504ab0f405..c2ba80140906 100644 --- a/fs/nls/nls_cp869.c +++ b/fs/nls/nls_cp869.c @@ -291,7 +291,19 @@
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -300,8 +312,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c index 41394620d000..844bb205deee 100644 --- a/fs/nls/nls_cp874.c +++ b/fs/nls/nls_cp874.c @@ -249,7 +249,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -258,8 +270,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c index 25fe26fb2603..0a5db2a0a6b3 100644 --- a/fs/nls/nls_cp932.c +++ b/fs/nls/nls_cp932.c @@ -7907,7 +7907,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return -EINVAL; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return 
charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -7916,8 +7928,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c index 766f86b53a7b..6b0d725cdfab 100644 --- a/fs/nls/nls_cp936.c +++ b/fs/nls/nls_cp936.c @@ -11085,7 +11085,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return n; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -11094,8 +11106,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c index 138eec74bb3f..292c2d02d2c2 100644 --- a/fs/nls/nls_cp949.c +++ b/fs/nls/nls_cp949.c @@ -13920,7 +13920,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return n; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = 
charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -13929,8 +13941,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c index 899da09fe0d7..d4e35bfd8dbd 100644 --- a/fs/nls/nls_cp950.c +++ b/fs/nls/nls_cp950.c @@ -9456,7 +9456,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return n; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -9465,8 +9477,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_default.c b/fs/nls/nls_default.c index ef8c0efb8a3c..602eeec24b3d 100644 --- a/fs/nls/nls_default.c +++ b/fs/nls/nls_default.c @@ -447,7 +447,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -455,8 +467,6 @@ static const struct nls_ops charset_ops = { static struct nls_table 
default_table = { .charset = &default_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; struct nls_charset default_charset = { diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c index 8bc5d9991452..b3a81350cbea 100644 --- a/fs/nls/nls_euc-jp.c +++ b/fs/nls/nls_euc-jp.c @@ -549,7 +549,7 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return euc_offset; } -static const struct nls_ops charset_ops = { +static struct nls_ops charset_ops = { .uni2char = uni2char, .char2uni = char2uni, }; @@ -570,8 +570,9 @@ static int __init init_nls_euc_jp(void) p_nls = load_nls("cp932"); if (p_nls) { - table.charset2upper = p_nls->charset2upper; - table.charset2lower = p_nls->charset2lower; + + charset_ops.uppercase = p_nls->ops->uppercase; + charset_ops.lowercase = p_nls->ops->lowercase; return register_nls(&nls_charset); } diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c index 78e9c0169f69..a98298bd5de5 100644 --- a/fs/nls/nls_iso8859-1.c +++ b/fs/nls/nls_iso8859-1.c @@ -233,7 +233,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -242,8 +254,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c index eb8665629e0f..811f4cf1d1a3 100644 --- a/fs/nls/nls_iso8859-13.c +++ b/fs/nls/nls_iso8859-13.c @@ -261,7 +261,19 @@ 
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -270,8 +282,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c index c8d5a48f869c..d8dafca31d26 100644 --- a/fs/nls/nls_iso8859-14.c +++ b/fs/nls/nls_iso8859-14.c @@ -317,7 +317,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -326,8 +338,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c index 0611c6cb56b4..9de12c9e25a3 100644 --- a/fs/nls/nls_iso8859-15.c +++ b/fs/nls/nls_iso8859-15.c @@ -283,7 +283,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct 
nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -292,8 +304,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c index 5255d92a25eb..c59e2424f2b5 100644 --- a/fs/nls/nls_iso8859-2.c +++ b/fs/nls/nls_iso8859-2.c @@ -284,7 +284,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -293,8 +305,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c index ad1b84f3e102..4bab1b607059 100644 --- a/fs/nls/nls_iso8859-3.c +++ b/fs/nls/nls_iso8859-3.c @@ -284,7 +284,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + 
return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -293,8 +305,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c index 82469deee0ba..1a3cf5f507f6 100644 --- a/fs/nls/nls_iso8859-4.c +++ b/fs/nls/nls_iso8859-4.c @@ -284,7 +284,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -293,8 +305,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c index 3f3cd0c28797..0a26cea9d578 100644 --- a/fs/nls/nls_iso8859-5.c +++ b/fs/nls/nls_iso8859-5.c @@ -248,7 +248,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, 
.char2uni = char2uni, }; @@ -257,8 +269,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c index 43e6675998bc..d5a230888eed 100644 --- a/fs/nls/nls_iso8859-6.c +++ b/fs/nls/nls_iso8859-6.c @@ -239,7 +239,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -248,8 +260,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c index 83893e487f82..a5a171849ae4 100644 --- a/fs/nls/nls_iso8859-7.c +++ b/fs/nls/nls_iso8859-7.c @@ -293,7 +293,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -302,8 +314,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = 
&charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c index df03f97cd9d1..795093547cd6 100644 --- a/fs/nls/nls_iso8859-9.c +++ b/fs/nls/nls_iso8859-9.c @@ -248,7 +248,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -257,8 +269,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c index 22918e154dbe..bbce9a608419 100644 --- a/fs/nls/nls_koi8-r.c +++ b/fs/nls/nls_koi8-r.c @@ -299,7 +299,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -308,8 +320,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_koi8-ru.c 
b/fs/nls/nls_koi8-ru.c index f4edbc313706..d3e946652bf6 100644 --- a/fs/nls/nls_koi8-ru.c +++ b/fs/nls/nls_koi8-ru.c @@ -51,7 +51,7 @@ static int char2uni(const unsigned char *rawstring, int boundlen, return n; } -static const struct nls_ops charset_ops = { +static struct nls_ops charset_ops = { .uni2char = uni2char, .char2uni = char2uni, }; @@ -72,8 +72,8 @@ static int __init init_nls_koi8_ru(void) p_nls = load_nls("koi8-u"); if (p_nls) { - table.charset2upper = p_nls->charset2upper; - table.charset2lower = p_nls->charset2lower; + charset_ops.uppercase = p_nls->ops->uppercase; + charset_ops.lowercase = p_nls->ops->lowercase; return register_nls(&nls_charset); } diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c index b2421625e98b..5de52a74f0b3 100644 --- a/fs/nls/nls_koi8-u.c +++ b/fs/nls/nls_koi8-u.c @@ -306,7 +306,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return 1; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return charset2lower[c]; +} + +static unsigned char charset_toupper(const struct nls_table *table, + unsigned int c) { + return charset2upper[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -315,8 +327,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = charset2lower, - .charset2upper = charset2upper, }; static struct nls_charset nls_charset = { diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index aecf460827ac..fe1ac5efaa37 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -40,7 +40,19 @@ static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) return n; } +static unsigned char charset_tolower(const struct nls_table *table, + unsigned int c){ + return identity[c]; +} + +static unsigned char charset_toupper(const struct 
nls_table *table, + unsigned int c) { + return identity[c]; +} + static const struct nls_ops charset_ops = { + .lowercase = charset_toupper, + .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, }; @@ -49,8 +61,6 @@ static struct nls_charset nls_charset; static struct nls_table table = { .charset = &nls_charset, .ops = &charset_ops, - .charset2lower = identity, /* no conversion */ - .charset2upper = identity, }; static struct nls_charset nls_charset = { diff --git a/include/linux/nls.h b/include/linux/nls.h index d8b49f53c123..deca7367feb8 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -34,6 +34,11 @@ struct nls_ops { int (*strncasecmp)(const struct nls_table *charset, const unsigned char *str1, size_t len1, const unsigned char *str2, size_t len2); + unsigned char (*lowercase)(const struct nls_table *charset, + unsigned int c); + unsigned char (*uppercase)(const struct nls_table *charset, + unsigned int c); + }; struct nls_table { @@ -41,9 +46,8 @@ struct nls_table { unsigned int version; const struct nls_ops *ops; - const unsigned char *charset2lower; - const unsigned char *charset2upper; struct nls_table *next; + }; struct nls_charset { @@ -102,14 +106,14 @@ static inline const char *nls_charset_name(const struct nls_table *table) static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c) { - unsigned char nc = t->charset2lower[c]; + unsigned char nc = t->ops->lowercase(t, c); return nc ? nc : c; } static inline unsigned char nls_toupper(struct nls_table *t, unsigned char c) { - unsigned char nc = t->charset2upper[c]; + unsigned char nc = t->ops->uppercase(t, c); return nc ? 
nc : c; } From patchwork Tue Jul 3 17:06:49 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938822 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCj4Ll4z9s1B for ; Wed, 4 Jul 2018 03:07:45 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934156AbeGCRHo (ORCPT ); Tue, 3 Jul 2018 13:07:44 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33386 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S934178AbeGCRHn (ORCPT ); Tue, 3 Jul 2018 13:07:43 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 5097A2605C7 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 09/20] nls: Add optional normalization and casefold hooks Date: Tue, 3 Jul 2018 13:06:49 -0400 Message-Id: <20180703170700.9306-10-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org Signed-off-by: Gabriel Krisman Bertazi --- include/linux/nls.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/include/linux/nls.h b/include/linux/nls.h 
index deca7367feb8..7f95a1bd8e1f 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -4,6 +4,7 @@ #include #include +#include /* Unicode has changed over the years. Unicode code points no longer * fit into 16 bits; as of Unicode 5 valid code points range from 0 @@ -38,7 +39,12 @@ struct nls_ops { unsigned int c); unsigned char (*uppercase)(const struct nls_table *charset, unsigned int c); - + int (*casefold)(const struct nls_table *charset, + const unsigned char *str, size_t len, + unsigned char *dest, size_t dlen); + int (*normalize)(const struct nls_table *charset, + const unsigned char *str, size_t len, + unsigned char *dest, size_t dlen); }; struct nls_table { @@ -157,6 +163,26 @@ static inline int nls_strnicmp(struct nls_table *t, const unsigned char *s1, return nls_strncasecmp(t, s1, len, s2, len); } +static inline int nls_casefold(const struct nls_table *charset, + const unsigned char *str, size_t len, + unsigned char *dest, size_t dlen) +{ + if (charset->ops->casefold) + return charset->ops->casefold(charset, str, len, dest, dlen); + + return -ENOTSUPP; +} + +static inline int nls_normalize(const struct nls_table *charset, + const unsigned char *str, size_t len, + unsigned char *dest, size_t dlen) +{ + if (charset->ops->normalize) + return charset->ops->normalize(charset, str, len, dest, dlen); + + return -ENOTSUPP; +} + /* * nls_nullsize - return length of null character for codepage * @codepage - codepage for which to return length of NULL terminator From patchwork Tue Jul 3 17:06:50 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938823 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) 
Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCm2x0cz9s1B for ; Wed, 4 Jul 2018 03:07:48 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934178AbeGCRHr (ORCPT ); Tue, 3 Jul 2018 13:07:47 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33392 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933677AbeGCRHr (ORCPT ); Tue, 3 Jul 2018 13:07:47 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 38710287CA8 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Olaf Weber , Gabriel Krisman Bertazi Subject: [PATCH 10/20] nls: utf8norm: Add unicode character database files Date: Tue, 3 Jul 2018 13:06:50 -0400 Message-Id: <20180703170700.9306-11-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org From: Olaf Weber Add files from the Unicode Character Database, version 10.0.0, to the source. A helper program that generates a trie used for normalization from these files is part of a separate commit. - Notes on the update from 8.0.0 to 10.0.0: The structure of ucd files and special cases have not experienced any changes between versions 8.0.0 and 10.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to the mkutf8data script were required for the update. 
The actual files are not part of the commit submitted to the list because they are too big and would bounce. Still, they can be obtained by the following script: FILES="CaseFolding.txt DerivedAge.txt extracted/DerivedCombiningClass.txt DerivedCoreProperties.txt NormalizationCorrections.txt NormalizationTest.txt UnicodeData.txt" VERSION=10.0.0 BASE=http://www.unicode.org/Public/${VERSION}/ucd for i in ${FILES} ; do wget "${BASE}/$i" -O fs/nls/ucd/$(basename ${i} .txt)-${VERSION}.txt done Signed-off-by: Olaf Weber Signed-off-by: Gabriel Krisman Bertazi [Move ucd directory to fs/nls/] [Update to ucd-10.0.0] --- fs/nls/ucd/README | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 fs/nls/ucd/README diff --git a/fs/nls/ucd/README b/fs/nls/ucd/README new file mode 100644 index 000000000000..67f2075d1fca --- /dev/null +++ b/fs/nls/ucd/README @@ -0,0 +1,33 @@ +The files in this directory are part of the Unicode Character Database +for version 10.0.0 of the Unicode standard. + +The full set of files can be found here: + + http://www.unicode.org/Public/10.0.0/ucd/ + +The latest released version of the UCD can be found here: + + http://www.unicode.org/Public/UCD/latest/ + +The files in this directory are identical, except that they have been +renamed with a suffix indicating the unicode version. 
+ +Individual source links: + + http://www.unicode.org/Public/10.0.0/ucd/CaseFolding.txt + http://www.unicode.org/Public/10.0.0/ucd/DerivedAge.txt + http://www.unicode.org/Public/10.0.0/ucd/extracted/DerivedCombiningClass.txt + http://www.unicode.org/Public/10.0.0/ucd/DerivedCoreProperties.txt + http://www.unicode.org/Public/10.0.0/ucd/NormalizationCorrections.txt + http://www.unicode.org/Public/10.0.0/ucd/NormalizationTest.txt + http://www.unicode.org/Public/10.0.0/ucd/UnicodeData.txt + +md5sums + + 7893b6e005c5a521319a0d12062ae122 CaseFolding-10.0.0.txt + a602e4b44de3350087e40f2eb2184898 DerivedAge-10.0.0.txt + 5abdeb21af4edcc5d1e4c0b5802fc7a7 DerivedCombiningClass-10.0.0.txt + eda11c2c2e3c308d9d3b90e2b3282024 DerivedCoreProperties-10.0.0.txt + 425ece5ffbecd0140d98c13ce05724aa NormalizationCorrections-10.0.0.txt + 7296fe7aa07d7d288e65d559af2ad49b NormalizationTest-10.0.0.txt + 2a52f30695dcc821f0f224650552beaf UnicodeData-10.0.0.txt From patchwork Tue Jul 3 17:06:51 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938824 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCt04JRz9s1B for ; Wed, 4 Jul 2018 03:07:54 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932926AbeGCRHx (ORCPT ); Tue, 3 Jul 2018 13:07:53 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33400 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933677AbeGCRHw 
(ORCPT ); Tue, 3 Jul 2018 13:07:52 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 9737B287CA8 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Olaf Weber , Gabriel Krisman Bertazi Subject: [PATCH 11/20] scripts: add trie generator for UTF-8 Date: Tue, 3 Jul 2018 13:06:51 -0400 Message-Id: <20180703170700.9306-12-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org From: Olaf Weber mkutf8data.c is the source for a program that generates utf8data.h, which contains the trie that utf8norm.c uses. The trie is generated from the Unicode 10.0.0 data files. The format of the utf8data[] table is described in utf8norm.c, which is added in the next patch. Signed-off-by: Olaf Weber Signed-off-by: Gabriel Krisman Bertazi [Rebase to mainline] [Fix out-of-tree-build] [Fix checkpatch warnings] [Merge back robustness fixes from original patch. Requested by Dave Chinner] [Update makefile to build 10.0.0 ucd files] --- fs/nls/Kconfig | 8 + fs/nls/Makefile | 13 + scripts/Makefile | 1 + scripts/mkutf8data.c | 3239 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 3261 insertions(+) create mode 100644 scripts/mkutf8data.c diff --git a/fs/nls/Kconfig b/fs/nls/Kconfig index e2ce79ef48c4..39bf0ebd0643 100644 --- a/fs/nls/Kconfig +++ b/fs/nls/Kconfig @@ -616,4 +616,12 @@ config NLS_UTF8 input/output character sets. Say Y here for the UTF-8 encoding of the Unicode/ISO9646 universal character set. +# +# utf8 normalization module +# +config NLS_UTF8_NORMALIZATION + tristate "UTF-8 normalization support" + help + Say Y here to enable utf8 normalization support. 
+ endif # NLS diff --git a/fs/nls/Makefile b/fs/nls/Makefile index 5f42ceff9d15..9eff2f058c7a 100644 --- a/fs/nls/Makefile +++ b/fs/nls/Makefile @@ -55,3 +55,16 @@ obj-$(CONFIG_NLS_MAC_INUIT) += mac-inuit.o obj-$(CONFIG_NLS_MAC_ROMANIAN) += mac-romanian.o obj-$(CONFIG_NLS_MAC_ROMAN) += mac-roman.o obj-$(CONFIG_NLS_MAC_TURKISH) += mac-turkish.o + +$(obj)/utf8data.h: $(srctree)/$(src)/ucd/*.txt $(objtree)/scripts/mkutf8data FORCE + $(call cmd,mkutf8data) +quiet_cmd_mkutf8data = MKUTF8DATA $@ + cmd_mkutf8data = $(objtree)/scripts/mkutf8data \ + -a $(srctree)/$(src)/ucd/DerivedAge-10.0.0.txt \ + -c $(srctree)/$(src)/ucd/DerivedCombiningClass-10.0.0.txt \ + -p $(srctree)/$(src)/ucd/DerivedCoreProperties-10.0.0.txt \ + -d $(srctree)/$(src)/ucd/UnicodeData-10.0.0.txt \ + -f $(srctree)/$(src)/ucd/CaseFolding-10.0.0.txt \ + -n $(srctree)/$(src)/ucd/NormalizationCorrections-10.0.0.txt \ + -t $(srctree)/$(src)/ucd/NormalizationTest-10.0.0.txt \ + -o $@ diff --git a/scripts/Makefile b/scripts/Makefile index 25ab143cbe14..6972bb4815d8 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -19,6 +19,7 @@ hostprogs-$(CONFIG_ASN1) += asn1_compiler hostprogs-$(CONFIG_MODULE_SIG) += sign-file hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert hostprogs-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert +hostprogs-$(CONFIG_NLS_UTF8_NORMALIZATION) += mkutf8data HOSTCFLAGS_sortextable.o = -I$(srctree)/tools/include HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include diff --git a/scripts/mkutf8data.c b/scripts/mkutf8data.c new file mode 100644 index 000000000000..700b41c0cb66 --- /dev/null +++ b/scripts/mkutf8data.c @@ -0,0 +1,3239 @@ +/* + * Copyright (c) 2014 SGI. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Generator for a compact trie for unicode normalization */ + +#include <sys/types.h> +#include <stddef.h> +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> + +/* Default names of the in- and output files. */ + +#define AGE_NAME "DerivedAge.txt" +#define CCC_NAME "DerivedCombiningClass.txt" +#define PROP_NAME "DerivedCoreProperties.txt" +#define DATA_NAME "UnicodeData.txt" +#define FOLD_NAME "CaseFolding.txt" +#define NORM_NAME "NormalizationCorrections.txt" +#define TEST_NAME "NormalizationTest.txt" +#define UTF8_NAME "utf8data.h" + +const char *age_name = AGE_NAME; +const char *ccc_name = CCC_NAME; +const char *prop_name = PROP_NAME; +const char *data_name = DATA_NAME; +const char *fold_name = FOLD_NAME; +const char *norm_name = NORM_NAME; +const char *test_name = TEST_NAME; +const char *utf8_name = UTF8_NAME; + +int verbose = 0; + +/* An arbitrary line size limit on input lines. */ + +#define LINESIZE 1024 +char line[LINESIZE]; +char buf0[LINESIZE]; +char buf1[LINESIZE]; +char buf2[LINESIZE]; +char buf3[LINESIZE]; + +const char *argv0; + +/* ------------------------------------------------------------------ */ + +/* + * Unicode version numbers consist of three parts: major, minor, and a + * revision. These numbers are packed into an unsigned int to obtain + * a single version number.
+ * + * To save space in the generated trie, the unicode version is not + * stored directly, instead we calculate a generation number from the + * unicode versions seen in the DerivedAge file, and use that as an + * index into a table of unicode versions. + */ +#define UNICODE_MAJ_SHIFT (16) +#define UNICODE_MIN_SHIFT (8) + +#define UNICODE_MAJ_MAX ((unsigned short)-1) +#define UNICODE_MIN_MAX ((unsigned char)-1) +#define UNICODE_REV_MAX ((unsigned char)-1) + +#define UNICODE_AGE(MAJ,MIN,REV) \ + (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ + ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ + ((unsigned int)(REV))) + +unsigned int *ages; +int ages_count; + +unsigned int unicode_maxage; + +static int +age_valid(unsigned int major, unsigned int minor, unsigned int revision) +{ + if (major > UNICODE_MAJ_MAX) + return 0; + if (minor > UNICODE_MIN_MAX) + return 0; + if (revision > UNICODE_REV_MAX) + return 0; + return 1; +} + +/* ------------------------------------------------------------------ */ + +/* + * utf8trie_t + * + * A compact binary tree, used to decode UTF-8 characters. + * + * Internal nodes are one byte for the node itself, and up to three + * bytes for an offset into the tree. 
The first byte contains the + * following information: + * NEXTBYTE - flag - advance to next byte if set + * BITNUM - 3 bit field - the bit number to be tested + * OFFLEN - 2 bit field - number of bytes in the offset + * if offlen == 0 (non-branching node) + * RIGHTPATH - 1 bit field - set if the following node is for the + * right-hand path (tested bit is set) + * TRIENODE - 1 bit field - set if the following node is an internal + * node, otherwise it is a leaf node + * if offlen != 0 (branching node) + * LEFTNODE - 1 bit field - set if the left-hand node is internal + * RIGHTNODE - 1 bit field - set if the right-hand node is internal + * + * Due to the way utf8 works, there cannot be branching nodes with + * NEXTBYTE set, and moreover those nodes always have a righthand + * descendant. + */ +typedef unsigned char utf8trie_t; +#define BITNUM 0x07 +#define NEXTBYTE 0x08 +#define OFFLEN 0x30 +#define OFFLEN_SHIFT 4 +#define RIGHTPATH 0x40 +#define TRIENODE 0x80 +#define RIGHTNODE 0x40 +#define LEFTNODE 0x80 + +/* + * utf8leaf_t + * + * The leaves of the trie are embedded in the trie, and so the same + * underlying datatype, unsigned char. + * + * leaf[0]: The unicode version, stored as a generation number that is + * an index into utf8agetab[]. With this we can filter code + * points based on the unicode version in which they were + * defined. The CCC of a non-defined code point is 0. + * leaf[1]: Canonical Combining Class. During normalization, we need + * to do a stable sort into ascending order of all characters + * with a non-zero CCC that occur between two characters with + * a CCC of 0, or at the beginning or end of a string. + * The unicode standard guarantees that all CCC values are + * between 0 and 254 inclusive, which leaves 255 available as + * a special value. + * Code points with CCC 0 are known as stoppers. + * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the + * start of a NUL-terminated string that is the decomposition + * of the character.
+ * The CCC of a decomposable character is the same as the CCC + * of the first character of its decomposition. + * Some characters decompose as the empty string: these are + * characters with the Default_Ignorable_Code_Point property. + * These do affect normalization, as they all have CCC 0. + * + * The decompositions in the trie have been fully expanded. + * + * Casefolding, if applicable, is also done using decompositions. + */ +typedef unsigned char utf8leaf_t; + +#define LEAF_GEN(LEAF) ((LEAF)[0]) +#define LEAF_CCC(LEAF) ((LEAF)[1]) +#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2)) + +#define MAXGEN (255) + +#define MINCCC (0) +#define MAXCCC (254) +#define STOPPER (0) +#define DECOMPOSE (255) + +struct tree; +static utf8leaf_t *utf8nlookup(struct tree *, const char *, size_t); +static utf8leaf_t *utf8lookup(struct tree *, const char *); + +unsigned char *utf8data; +size_t utf8data_size; + +utf8trie_t *nfkdi; +utf8trie_t *nfkdicf; + +/* ------------------------------------------------------------------ */ + +/* + * UTF8 valid ranges. + * + * The UTF-8 encoding spreads the bits of a 32bit word over several + * bytes. This table gives the ranges that can be held and how they'd + * be represented. + * + * 0x00000000 0x0000007F: 0xxxxxxx + * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx + * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * There is an additional requirement on UTF-8, in that only the + * shortest representation of a 32bit value is to be used. A decoder + * must not decode sequences that do not satisfy this requirement. + * Thus the allowed ranges have a lower bound. 
+ * + * 0x00000000 0x0000007F: 0xxxxxxx + * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx + * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx + * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, + * 17 planes of 65536 values. This limits the sequences actually seen + * even more, to just the following. + * + * 0 - 0x7f: 0 0x7f + * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf + * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf + * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf + * + * Even within those ranges not all values are allowed: the surrogates + * 0xd800 - 0xdfff should never be seen. + * + * Note that the longest sequence seen with valid usage is 4 bytes, + * the same as a single UTF-32 character. This makes the UTF-8 + * representation of Unicode strictly smaller than UTF-32.
+ * + * The shortest sequence requirement was introduced by: + * Corrigendum #1: UTF-8 Shortest Form + * It can be found here: + * http://www.unicode.org/versions/corrigendum1.html + * + */ + +#define UTF8_2_BITS 0xC0 +#define UTF8_3_BITS 0xE0 +#define UTF8_4_BITS 0xF0 +#define UTF8_N_BITS 0x80 +#define UTF8_2_MASK 0xE0 +#define UTF8_3_MASK 0xF0 +#define UTF8_4_MASK 0xF8 +#define UTF8_N_MASK 0xC0 +#define UTF8_V_MASK 0x3F +#define UTF8_V_SHIFT 6 + +static int +utf8encode(char *str, unsigned int val) +{ + int len; + + if (val < 0x80) { + str[0] = val; + len = 1; + } else if (val < 0x800) { + str[1] = val & UTF8_V_MASK; + str[1] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[0] = val; + str[0] |= UTF8_2_BITS; + len = 2; + } else if (val < 0x10000) { + str[2] = val & UTF8_V_MASK; + str[2] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[1] = val & UTF8_V_MASK; + str[1] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[0] = val; + str[0] |= UTF8_3_BITS; + len = 3; + } else if (val < 0x110000) { + str[3] = val & UTF8_V_MASK; + str[3] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[2] = val & UTF8_V_MASK; + str[2] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[1] = val & UTF8_V_MASK; + str[1] |= UTF8_N_BITS; + val >>= UTF8_V_SHIFT; + str[0] = val; + str[0] |= UTF8_4_BITS; + len = 4; + } else { + printf("%#x: illegal val\n", val); + len = 0; + } + return len; +} + +static unsigned int +utf8decode(const char *str) +{ + const unsigned char *s = (const unsigned char*)str; + unsigned int unichar = 0; + + if (*s < 0x80) { + unichar = *s; + } else if (*s < UTF8_3_BITS) { + unichar = *s++ & 0x1F; + unichar <<= UTF8_V_SHIFT; + unichar |= *s & 0x3F; + } else if (*s < UTF8_4_BITS) { + unichar = *s++ & 0x0F; + unichar <<= UTF8_V_SHIFT; + unichar |= *s++ & 0x3F; + unichar <<= UTF8_V_SHIFT; + unichar |= *s & 0x3F; + } else { + unichar = *s++ & 0x0F; + unichar <<= UTF8_V_SHIFT; + unichar |= *s++ & 0x3F; + unichar <<= UTF8_V_SHIFT; + unichar |= *s++ & 0x3F; + unichar <<= UTF8_V_SHIFT; + unichar |= 
*s & 0x3F; + } + return unichar; +} + +static int +utf32valid(unsigned int unichar) +{ + return unichar < 0x110000; +} + +#define NODE 1 +#define LEAF 0 + +struct tree { + void *root; + int childnode; + const char *type; + unsigned int maxage; + struct tree *next; + int (*leaf_equal)(void *, void *); + void (*leaf_print)(void *, int); + int (*leaf_mark)(void *); + int (*leaf_size)(void *); + int *(*leaf_index)(struct tree *, void *); + unsigned char *(*leaf_emit)(void *, unsigned char *); + int leafindex[0x110000]; + int index; +}; + +struct node { + int index; + int offset; + int mark; + int size; + struct node *parent; + void *left; + void *right; + unsigned char bitnum; + unsigned char nextbyte; + unsigned char leftnode; + unsigned char rightnode; + unsigned int keybits; + unsigned int keymask; +}; + +/* + * Example lookup function for a tree. + */ +static void * +lookup(struct tree *tree, const char *key) +{ + struct node *node; + void *leaf = NULL; + + node = tree->root; + while (!leaf && node) { + if (node->nextbyte) + key++; + if (*key & (1 << (node->bitnum & 7))) { + /* Right leg */ + if (node->rightnode == NODE) { + node = node->right; + } else if (node->rightnode == LEAF) { + leaf = node->right; + } else { + node = NULL; + } + } else { + /* Left leg */ + if (node->leftnode == NODE) { + node = node->left; + } else if (node->leftnode == LEAF) { + leaf = node->left; + } else { + node = NULL; + } + } + } + + return leaf; +} + +/* + * A simple non-recursive tree walker: keep track of visits to the + * left and right branches in the leftmask and rightmask. 
+ */ +static void +tree_walk(struct tree *tree) +{ + struct node *node; + unsigned int leftmask; + unsigned int rightmask; + unsigned int bitmask; + int indent = 1; + int nodes, singletons, leaves; + + nodes = singletons = leaves = 0; + + printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); + if (tree->childnode == LEAF) { + assert(tree->root); + tree->leaf_print(tree->root, indent); + leaves = 1; + } else { + assert(tree->childnode == NODE); + node = tree->root; + leftmask = rightmask = 0; + while (node) { + printf("%*snode @ %p bitnum %d nextbyte %d" + " left %p right %p mask %x bits %x\n", + indent, "", node, + node->bitnum, node->nextbyte, + node->left, node->right, + node->keymask, node->keybits); + nodes += 1; + if (!(node->left && node->right)) + singletons += 1; + + while (node) { + bitmask = 1 << node->bitnum; + if ((leftmask & bitmask) == 0) { + leftmask |= bitmask; + if (node->leftnode == LEAF) { + assert(node->left); + tree->leaf_print(node->left, + indent+1); + leaves += 1; + } else if (node->left) { + assert(node->leftnode == NODE); + indent += 1; + node = node->left; + break; + } + } + if ((rightmask & bitmask) == 0) { + rightmask |= bitmask; + if (node->rightnode == LEAF) { + assert(node->right); + tree->leaf_print(node->right, + indent+1); + leaves += 1; + } else if (node->right) { + assert(node->rightnode==NODE); + indent += 1; + node = node->right; + break; + } + } + leftmask &= ~bitmask; + rightmask &= ~bitmask; + node = node->parent; + indent -= 1; + } + } + } + printf("nodes %d leaves %d singletons %d\n", + nodes, leaves, singletons); +} + +/* + * Allocate and initialize a new internal node.
+ */ +static struct node * +alloc_node(struct node *parent) +{ + struct node *node; + int bitnum; + + node = malloc(sizeof(*node)); + node->left = node->right = NULL; + node->parent = parent; + node->leftnode = NODE; + node->rightnode = NODE; + node->keybits = 0; + node->keymask = 0; + node->mark = 0; + node->index = 0; + node->offset = -1; + node->size = 4; + + if (node->parent) { + bitnum = parent->bitnum; + if ((bitnum & 7) == 0) { + node->bitnum = bitnum + 7 + 8; + node->nextbyte = 1; + } else { + node->bitnum = bitnum - 1; + node->nextbyte = 0; + } + } else { + node->bitnum = 7; + node->nextbyte = 0; + } + + return node; +} + +/* + * Insert a new leaf into the tree, and collapse any subtrees that are + * fully populated and end in identical leaves. A nextbyte tagged + * internal node will not be removed to preserve the tree's integrity. + * Note that due to the structure of utf8, no nextbyte tagged node + * will be a candidate for removal. + */ +static int +insert(struct tree *tree, char *key, int keylen, void *leaf) +{ + struct node *node; + struct node *parent; + void **cursor; + int keybits; + + assert(keylen >= 1 && keylen <= 4); + + node = NULL; + cursor = &tree->root; + keybits = 8 * keylen; + + /* Insert, creating path along the way. */ + while (keybits) { + if (!*cursor) + *cursor = alloc_node(node); + node = *cursor; + if (node->nextbyte) + key++; + if (*key & (1 << (node->bitnum & 7))) + cursor = &node->right; + else + cursor = &node->left; + keybits--; + } + *cursor = leaf; + + /* Merge subtrees if possible. */ + while (node) { + if (*key & (1 << (node->bitnum & 7))) + node->rightnode = LEAF; + else + node->leftnode = LEAF; + if (node->nextbyte) + break; + if (node->leftnode == NODE || node->rightnode == NODE) + break; + assert(node->left); + assert(node->right); + /* Compare */ + if (! tree->leaf_equal(node->left, node->right)) + break; + /* Keep left, drop right leaf. 
*/ + leaf = node->left; + /* Check in parent */ + parent = node->parent; + if (!parent) { + /* root of tree! */ + tree->root = leaf; + tree->childnode = LEAF; + } else if (parent->left == node) { + parent->left = leaf; + parent->leftnode = LEAF; + if (parent->right) { + parent->keymask = 0; + parent->keybits = 0; + } else { + parent->keymask |= (1 << node->bitnum); + } + } else if (parent->right == node) { + parent->right = leaf; + parent->rightnode = LEAF; + if (parent->left) { + parent->keymask = 0; + parent->keybits = 0; + } else { + parent->keymask |= (1 << node->bitnum); + parent->keybits |= (1 << node->bitnum); + } + } else { + /* internal tree error */ + assert(0); + } + free(node); + node = parent; + } + + /* Propagate keymasks up along singleton chains. */ + while (node) { + parent = node->parent; + if (!parent) + break; + /* Nix the mask for parents with two children. */ + if (node->keymask == 0) { + parent->keymask = 0; + parent->keybits = 0; + } else if (parent->left && parent->right) { + parent->keymask = 0; + parent->keybits = 0; + } else { + assert((parent->keymask & node->keymask) == 0); + parent->keymask |= node->keymask; + parent->keymask |= (1 << parent->bitnum); + parent->keybits |= node->keybits; + if (parent->right) + parent->keybits |= (1 << parent->bitnum); + } + node = parent; + } + + return 0; +} + +/* + * Prune internal nodes. + * + * Fully populated subtrees that end at the same leaf have already + * been collapsed. There are still internal nodes that have for both + * their left and right branches a sequence of singletons that make + * identical choices and end in identical leaves. The keymask and + * keybits collected in the nodes describe the choices made in these + * singleton chains. When they are identical for the left and right + * branch of a node, and the two leaves compare identical, the node in + * question can be removed.
+ * + * Note that nodes with the nextbyte tag set will not be removed by + * this to ensure tree integrity. Note as well that the structure of + * utf8 ensures that these nodes would not have been candidates for + * removal in any case. + */ +static void +prune(struct tree *tree) +{ + struct node *node; + struct node *left; + struct node *right; + struct node *parent; + void *leftleaf; + void *rightleaf; + unsigned int leftmask; + unsigned int rightmask; + unsigned int bitmask; + int count; + + if (verbose > 0) + printf("Pruning %s_%x\n", tree->type, tree->maxage); + + count = 0; + if (tree->childnode == LEAF) + return; + if (!tree->root) + return; + + leftmask = rightmask = 0; + node = tree->root; + while (node) { + if (node->nextbyte) + goto advance; + if (node->leftnode == LEAF) + goto advance; + if (node->rightnode == LEAF) + goto advance; + if (!node->left) + goto advance; + if (!node->right) + goto advance; + left = node->left; + right = node->right; + if (left->keymask == 0) + goto advance; + if (right->keymask == 0) + goto advance; + if (left->keymask != right->keymask) + goto advance; + if (left->keybits != right->keybits) + goto advance; + leftleaf = NULL; + while (!leftleaf) { + assert(left->left || left->right); + if (left->leftnode == LEAF) + leftleaf = left->left; + else if (left->rightnode == LEAF) + leftleaf = left->right; + else if (left->left) + left = left->left; + else if (left->right) + left = left->right; + else + assert(0); + } + rightleaf = NULL; + while (!rightleaf) { + assert(right->left || right->right); + if (right->leftnode == LEAF) + rightleaf = right->left; + else if (right->rightnode == LEAF) + rightleaf = right->right; + else if (right->left) + right = right->left; + else if (right->right) + right = right->right; + else + assert(0); + } + if (! tree->leaf_equal(leftleaf, rightleaf)) + goto advance; + /* + * This node has identical singleton-only subtrees. + * Remove it. 
+ */ + parent = node->parent; + left = node->left; + right = node->right; + if (parent->left == node) + parent->left = left; + else if (parent->right == node) + parent->right = left; + else + assert(0); + left->parent = parent; + left->keymask |= (1 << node->bitnum); + node->left = NULL; + while (node) { + bitmask = 1 << node->bitnum; + leftmask &= ~bitmask; + rightmask &= ~bitmask; + if (node->leftnode == NODE && node->left) { + left = node->left; + free(node); + count++; + node = left; + } else if (node->rightnode == NODE && node->right) { + right = node->right; + free(node); + count++; + node = right; + } else { + node = NULL; + } + } + /* Propagate keymasks up along singleton chains. */ + node = parent; + /* Force re-check */ + bitmask = 1 << node->bitnum; + leftmask &= ~bitmask; + rightmask &= ~bitmask; + for (;;) { + if (node->left && node->right) + break; + if (node->left) { + left = node->left; + node->keymask |= left->keymask; + node->keybits |= left->keybits; + } + if (node->right) { + right = node->right; + node->keymask |= right->keymask; + node->keybits |= right->keybits; + } + node->keymask |= (1 << node->bitnum); + node = node->parent; + /* Force re-check */ + bitmask = 1 << node->bitnum; + leftmask &= ~bitmask; + rightmask &= ~bitmask; + } + advance: + bitmask = 1 << node->bitnum; + if ((leftmask & bitmask) == 0 && + node->leftnode == NODE && + node->left) { + leftmask |= bitmask; + node = node->left; + } else if ((rightmask & bitmask) == 0 && + node->rightnode == NODE && + node->right) { + rightmask |= bitmask; + node = node->right; + } else { + leftmask &= ~bitmask; + rightmask &= ~bitmask; + node = node->parent; + } + } + if (verbose > 0) + printf("Pruned %d nodes\n", count); +} + +/* + * Mark the nodes in the tree that lead to leaves that must be + * emitted. 
+ */ +static void +mark_nodes(struct tree *tree) +{ + struct node *node; + struct node *n; + unsigned int leftmask; + unsigned int rightmask; + unsigned int bitmask; + int marked; + + marked = 0; + if (verbose > 0) + printf("Marking %s_%x\n", tree->type, tree->maxage); + if (tree->childnode == LEAF) + goto done; + + assert(tree->childnode == NODE); + node = tree->root; + leftmask = rightmask = 0; + while (node) { + bitmask = 1 << node->bitnum; + if ((leftmask & bitmask) == 0) { + leftmask |= bitmask; + if (node->leftnode == LEAF) { + assert(node->left); + if (tree->leaf_mark(node->left)) { + n = node; + while (n && !n->mark) { + marked++; + n->mark = 1; + n = n->parent; + } + } + } else if (node->left) { + assert(node->leftnode == NODE); + node = node->left; + continue; + } + } + if ((rightmask & bitmask) == 0) { + rightmask |= bitmask; + if (node->rightnode == LEAF) { + assert(node->right); + if (tree->leaf_mark(node->right)) { + n = node; + while (n && !n->mark) { + marked++; + n->mark = 1; + n = n->parent; + } + } + } else if (node->right) { + assert(node->rightnode==NODE); + node = node->right; + continue; + } + } + leftmask &= ~bitmask; + rightmask &= ~bitmask; + node = node->parent; + } + + /* second pass: left siblings and singletons */ + + assert(tree->childnode == NODE); + node = tree->root; + leftmask = rightmask = 0; + while (node) { + bitmask = 1 << node->bitnum; + if ((leftmask & bitmask) == 0) { + leftmask |= bitmask; + if (node->leftnode == LEAF) { + assert(node->left); + if (tree->leaf_mark(node->left)) { + n = node; + while (n && !n->mark) { + marked++; + n->mark = 1; + n = n->parent; + } + } + } else if (node->left) { + assert(node->leftnode == NODE); + node = node->left; + if (!node->mark && node->parent->mark) { + marked++; + node->mark = 1; + } + continue; + } + } + if ((rightmask & bitmask) == 0) { + rightmask |= bitmask; + if (node->rightnode == LEAF) { + assert(node->right); + if (tree->leaf_mark(node->right)) { + n = node; + while (n && 
!n->mark) { + marked++; + n->mark = 1; + n = n->parent; + } + } + } else if (node->right) { + assert(node->rightnode==NODE); + node = node->right; + if (!node->mark && node->parent->mark && + !node->parent->left) { + marked++; + node->mark = 1; + } + continue; + } + } + leftmask &= ~bitmask; + rightmask &= ~bitmask; + node = node->parent; + } +done: + if (verbose > 0) + printf("Marked %d nodes\n", marked); +} + +/* + * Compute the index of each node and leaf, which is the offset in the + * emitted trie. These values must be pre-computed because relative + * offsets between nodes are used to navigate the tree. + */ +static int +index_nodes(struct tree *tree, int index) +{ + struct node *node; + unsigned int leftmask; + unsigned int rightmask; + unsigned int bitmask; + int count; + int indent; + + /* Align to a cache line (or half a cache line?). */ + while (index % 64) + index++; + tree->index = index; + indent = 1; + count = 0; + + if (verbose > 0) + printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); + if (tree->childnode == LEAF) { + index += tree->leaf_size(tree->root); + goto done; + } + + assert(tree->childnode == NODE); + node = tree->root; + leftmask = rightmask = 0; + while (node) { + if (!node->mark) + goto skip; + count++; + if (node->index != index) + node->index = index; + index += node->size; +skip: + while (node) { + bitmask = 1 << node->bitnum; + if (node->mark && (leftmask & bitmask) == 0) { + leftmask |= bitmask; + if (node->leftnode == LEAF) { + assert(node->left); + *tree->leaf_index(tree, node->left) = + index; + index += tree->leaf_size(node->left); + count++; + } else if (node->left) { + assert(node->leftnode == NODE); + indent += 1; + node = node->left; + break; + } + } + if (node->mark && (rightmask & bitmask) == 0) { + rightmask |= bitmask; + if (node->rightnode == LEAF) { + assert(node->right); + *tree->leaf_index(tree, node->right) = index; + index += tree->leaf_size(node->right); + count++; + } else if (node->right) { + 
assert(node->rightnode==NODE); + indent += 1; + node = node->right; + break; + } + } + leftmask &= ~bitmask; + rightmask &= ~bitmask; + node = node->parent; + indent -= 1; + } + } +done: + /* Round up to a multiple of 16 */ + while (index % 16) + index++; + if (verbose > 0) + printf("Final index %d\n", index); + return index; +} + +/* + * Compute the size of nodes and leaves. We start by assuming that + * each node needs to store a three-byte offset. The indexes of the + * nodes are calculated based on that, and then this function is + * called to see if the sizes of some nodes can be reduced. This is + * repeated until no more changes are seen. + */ +static int +size_nodes(struct tree *tree) +{ + struct tree *next; + struct node *node; + struct node *right; + struct node *n; + unsigned int leftmask; + unsigned int rightmask; + unsigned int bitmask; + unsigned int pathbits; + unsigned int pathmask; + int changed; + int offset; + int size; + int indent; + + indent = 1; + changed = 0; + size = 0; + + if (verbose > 0) + printf("Sizing %s_%x\n", tree->type, tree->maxage); + if (tree->childnode == LEAF) + goto done; + + assert(tree->childnode == NODE); + pathbits = 0; + pathmask = 0; + node = tree->root; + leftmask = rightmask = 0; + while (node) { + if (!node->mark) + goto skip; + offset = 0; + if (!node->left || !node->right) { + size = 1; + } else { + if (node->rightnode == NODE) { + right = node->right; + next = tree->next; + while (!right->mark) { + assert(next); + n = next->root; + while (n->bitnum != node->bitnum) { + if (pathbits & (1<<n->bitnum)) + n = n->right; + else + n = n->left; + } + n = n->right; + assert(right->bitnum == n->bitnum); + right = n; + next = next->next; + } + offset = right->index - node->index; + } else { + offset = *tree->leaf_index(tree, node->right); + offset -= node->index; + } + assert(offset >= 0); + assert(offset <= 0xffffff); + if (offset <= 0xff) { + size = 2; + } else if (offset <= 0xffff) { + size = 3; + } else { /* offset <=
0xffffff */ + size = 4; + } + } + if (node->size != size || node->offset != offset) { + node->size = size; + node->offset = offset; + changed++; + } +skip: + while (node) { + bitmask = 1 << node->bitnum; + pathmask |= bitmask; + if (node->mark && (leftmask & bitmask) == 0) { + leftmask |= bitmask; + if (node->leftnode == LEAF) { + assert(node->left); + } else if (node->left) { + assert(node->leftnode == NODE); + indent += 1; + node = node->left; + break; + } + } + if (node->mark && (rightmask & bitmask) == 0) { + rightmask |= bitmask; + pathbits |= bitmask; + if (node->rightnode == LEAF) { + assert(node->right); + } else if (node->right) { + assert(node->rightnode==NODE); + indent += 1; + node = node->right; + break; + } + } + leftmask &= ~bitmask; + rightmask &= ~bitmask; + pathmask &= ~bitmask; + pathbits &= ~bitmask; + node = node->parent; + indent -= 1; + } + } +done: + if (verbose > 0) + printf("Found %d changes\n", changed); + return changed; +} + +/* + * Emit a trie for the given tree into the data array. 
+ */ +static void +emit(struct tree *tree, unsigned char *data) +{ + struct node *node; + unsigned int leftmask; + unsigned int rightmask; + unsigned int bitmask; + int offlen; + int offset; + int index; + int indent; + unsigned char byte; + + index = tree->index; + data += index; + indent = 1; + if (verbose > 0) + printf("Emitting %s_%x\n", tree->type, tree->maxage); + if (tree->childnode == LEAF) { + assert(tree->root); + tree->leaf_emit(tree->root, data); + return; + } + + assert(tree->childnode == NODE); + node = tree->root; + leftmask = rightmask = 0; + while (node) { + if (!node->mark) + goto skip; + assert(node->offset != -1); + assert(node->index == index); + + byte = 0; + if (node->nextbyte) + byte |= NEXTBYTE; + byte |= (node->bitnum & BITNUM); + if (node->left && node->right) { + if (node->leftnode == NODE) + byte |= LEFTNODE; + if (node->rightnode == NODE) + byte |= RIGHTNODE; + if (node->offset <= 0xff) + offlen = 1; + else if (node->offset <= 0xffff) + offlen = 2; + else + offlen = 3; + offset = node->offset; + byte |= offlen << OFFLEN_SHIFT; + *data++ = byte; + index++; + while (offlen--) { + *data++ = offset & 0xff; + index++; + offset >>= 8; + } + } else if (node->left) { + if (node->leftnode == NODE) + byte |= TRIENODE; + *data++ = byte; + index++; + } else if (node->right) { + byte |= RIGHTNODE; + if (node->rightnode == NODE) + byte |= TRIENODE; + *data++ = byte; + index++; + } else { + assert(0); + } +skip: + while (node) { + bitmask = 1 << node->bitnum; + if (node->mark && (leftmask & bitmask) == 0) { + leftmask |= bitmask; + if (node->leftnode == LEAF) { + assert(node->left); + data = tree->leaf_emit(node->left, + data); + index += tree->leaf_size(node->left); + } else if (node->left) { + assert(node->leftnode == NODE); + indent += 1; + node = node->left; + break; + } + } + if (node->mark && (rightmask & bitmask) == 0) { + rightmask |= bitmask; + if (node->rightnode == LEAF) { + assert(node->right); + data = tree->leaf_emit(node->right, + 
data); + index += tree->leaf_size(node->right); + } else if (node->right) { + assert(node->rightnode==NODE); + indent += 1; + node = node->right; + break; + } + } + leftmask &= ~bitmask; + rightmask &= ~bitmask; + node = node->parent; + indent -= 1; + } + } +} + +/* ------------------------------------------------------------------ */ + +/* + * Unicode data. + * + * We need to keep track of the Canonical Combining Class, the Age, + * and decompositions for a code point. + * + * For the Age, we store the index into the ages table. Effectively + * this is a generation number that the table maps to a unicode + * version. + * + * The correction field is used to indicate that this entry is in the + * corrections array, which contains decompositions that were + * corrected in later revisions. The value of the correction field is + * the Unicode version in which the mapping was corrected. + */ +struct unicode_data { + unsigned int code; + int ccc; + int gen; + int correction; + unsigned int *utf32nfkdi; + unsigned int *utf32nfkdicf; + char *utf8nfkdi; + char *utf8nfkdicf; +}; + +struct unicode_data unicode_data[0x110000]; +struct unicode_data *corrections; +int corrections_count; + +struct tree *nfkdi_tree; +struct tree *nfkdicf_tree; + +struct tree *trees; +int trees_count; + +/* + * Check the corrections array to see if this entry was corrected at + * some point. 
+ */
+static struct unicode_data *
+corrections_lookup(struct unicode_data *u)
+{
+	int i;
+
+	/* Linear scan; return the corrected entry with the same code, if any. */
+	for (i = 0; i != corrections_count; i++)
+		if (u->code == corrections[i].code)
+			return &corrections[i];
+	return u;
+}
+
+/*
+ * Compare two optional strings.  Two NULL pointers compare equal, a
+ * NULL and a non-NULL pointer compare unequal, and two non-NULL
+ * pointers compare by content.  Returns 1 if equal, 0 otherwise.
+ */
+static int
+leaf_str_equal(const char *l, const char *r)
+{
+	if (l && r)
+		return strcmp(l, r) == 0;
+	return !l && !r;
+}
+
+/*
+ * Leaf comparison for the nfkdi trees.  Two leaves are equal when
+ * their generation, combining class and nfkdi string all match.
+ * Returns 1 if equal, 0 otherwise.
+ */
+static int
+nfkdi_equal(void *l, void *r)
+{
+	struct unicode_data *left = l;
+	struct unicode_data *right = r;
+
+	if (left->gen != right->gen)
+		return 0;
+	if (left->ccc != right->ccc)
+		return 0;
+	return leaf_str_equal(left->utf8nfkdi, right->utf8nfkdi);
+}
+
+/*
+ * Leaf comparison for the nfkdicf trees.  If either leaf has an
+ * nfkdicf string, only the nfkdicf strings are compared; otherwise
+ * the comparison falls back to the nfkdi strings.
+ * Returns 1 if equal, 0 otherwise.
+ */
+static int
+nfkdicf_equal(void *l, void *r)
+{
+	struct unicode_data *left = l;
+	struct unicode_data *right = r;
+
+	if (left->gen != right->gen)
+		return 0;
+	if (left->ccc != right->ccc)
+		return 0;
+	if (left->utf8nfkdicf || right->utf8nfkdicf)
+		return leaf_str_equal(left->utf8nfkdicf, right->utf8nfkdicf);
+	return leaf_str_equal(left->utf8nfkdi, right->utf8nfkdi);
+}
+
+/*
+ * Print a one-line description of an nfkdi leaf, indented by indent
+ * columns.
+ */
+static void
+nfkdi_print(void *l, int indent)
+{
+	struct unicode_data *leaf = l;
+
+	/* %p requires a void pointer argument. */
+	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", (void *)leaf,
+		leaf->code, leaf->ccc, leaf->gen);
+	if (leaf->utf8nfkdi)
+		printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
+	printf("\n");
+}
+
+/*
+ * Print a one-line description of an nfkdicf leaf, indented by indent
+ * columns.  The nfkdicf string is shown if present, else the nfkdi
+ * string.
+ */
+static void
+nfkdicf_print(void *l, int indent)
+{
+	struct unicode_data *leaf = l;
+
+	/* %p requires a void pointer argument. */
+	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", (void *)leaf,
+		leaf->code, leaf->ccc, leaf->gen);
+	if (leaf->utf8nfkdicf)
+		printf(" nfkdicf \"%s\"", (const char*)leaf->utf8nfkdicf);
+	else if (leaf->utf8nfkdi)
+		printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
+	printf("\n");
+}
+
+/*
+ * leaf_mark callback that marks every leaf.
+ */
+static int
+nfkdi_mark(void *l)
+{
+	return 1;
+}
+
+/*
+ * leaf_mark callback that marks only leaves with an nfkdicf string.
+ */
+static int
+nfkdicf_mark(void *l)
+{
+	struct unicode_data *leaf = l;
+
+	if (leaf->utf8nfkdicf)
+		return 1;
+	return 0;
+}
+
+/*
+ * leaf_mark callback that marks leaves taken from the corrections
+ * array: returns the leaf's correction field, which is non-zero only
+ * for such entries.
+ */
+static int
+correction_mark(void *l)
+{
+	struct unicode_data *leaf = l;
+
+	return leaf->correction;
+}
+
+/*
+ * Number of bytes nfkdi_emit() writes for this leaf: two header
+ * bytes, plus the NUL-terminated nfkdi string if there is one.
+ */
+static int
+nfkdi_size(void *l)
+{
+	struct unicode_data *leaf = l;
+
+	int size = 2;
+	if (leaf->utf8nfkdi)
+		size += strlen(leaf->utf8nfkdi) + 1;
+	return size;
+}
+
+/*
+ * Number of bytes nfkdicf_emit() writes for this leaf: two header
+ * bytes, plus the NUL-terminated nfkdicf string, or failing that the
+ * nfkdi string, if there is one.
+ */
+static int
+nfkdicf_size(void *l)
+{
+	struct unicode_data *leaf = l;
+
+	int size = 2;
+	if (leaf->utf8nfkdicf)
+		size += strlen(leaf->utf8nfkdicf) + 1;
+	else if (leaf->utf8nfkdi)
+		size += strlen(leaf->utf8nfkdi) + 1;
+	return size;
+}
+
+/*
+ * Return the address of the tree's leafindex slot for this leaf,
+ * selected by the leaf's code point.
+ */
+static int *
+nfkdi_index(struct tree *tree, void *l)
+{
+	struct unicode_data *leaf = l;
+
+	return &tree->leafindex[leaf->code];
+}
+
+/*
+ * Return the address of the tree's leafindex slot for this leaf,
+ * selected by the leaf's code point.
+ */
+static int *
+nfkdicf_index(struct tree *tree, void *l)
+{
+	struct unicode_data *leaf = l;
+
+	return &tree->leafindex[leaf->code];
+}
+
+/*
+ * Write an nfkdi leaf at data: the generation byte, then either
+ * DECOMPOSE followed by the NUL-terminated nfkdi string, or the
+ * combining class.  Returns the first byte past the leaf.
+ */
+static unsigned char *
+nfkdi_emit(void *l, unsigned char *data)
+{
+	struct unicode_data *leaf = l;
+	unsigned char *s;
+
+	*data++ = leaf->gen;
+	if (leaf->utf8nfkdi) {
+		*data++ = DECOMPOSE;
+		s = (unsigned char*)leaf->utf8nfkdi;
+		/* Copy the string including its terminating NUL. */
+		while ((*data++ = *s++) != 0)
+			;
+	} else {
+		*data++ = leaf->ccc;
+	}
+	return data;
+}
+
+/*
+ * Write an nfkdicf leaf at data: the generation byte, then either
+ * DECOMPOSE followed by the NUL-terminated nfkdicf string (or the
+ * nfkdi string when there is no nfkdicf string), or the combining
+ * class.  Returns the first byte past the leaf.
+ */
+static unsigned char *
+nfkdicf_emit(void *l, unsigned char *data)
+{
+	struct unicode_data *leaf = l;
+	unsigned char *s;
+
+	*data++ = leaf->gen;
+	if (leaf->utf8nfkdicf) {
+		*data++ = DECOMPOSE;
+		s = (unsigned char*)leaf->utf8nfkdicf;
+		while ((*data++ = *s++) != 0)
+			;
+	} else if (leaf->utf8nfkdi) {
+		*data++ = DECOMPOSE;
+		s = (unsigned char*)leaf->utf8nfkdi;
+		while ((*data++ = *s++) != 0)
+			;
+	} else {
+		*data++ = leaf->ccc;
+	}
+	return data;
+}
+
+static void
+utf8_create(struct unicode_data *data)
+{
+	char utf[18*4+1];
+	char *u;
+	unsigned int *um;
+	int i;
+
+	u = utf;
+	um = data->utf32nfkdi;
+	if (um) {
+		for (i = 0; um[i]; i++)
+			u += utf8encode(u, um[i]);
+		*u = '\0';
+		data->utf8nfkdi = strdup(utf);
+	}
+	u = utf;
+	um = data->utf32nfkdicf;
+	if (um) {
+		for (i = 0; um[i]; i++)
+			u
+= utf8encode(u, um[i]);
+		*u = '\0';
+		if (!data->utf8nfkdi || strcmp(data->utf8nfkdi, utf))
+			data->utf8nfkdicf = strdup(utf);
+	}
+}
+
+/*
+ * Create the UTF-8 strings for every code point and for every entry
+ * in the corrections array.
+ */
+static void
+utf8_init(void)
+{
+	unsigned int unichar;
+	int i;
+
+	for (unichar = 0; unichar != 0x110000; unichar++)
+		utf8_create(&unicode_data[unichar]);
+
+	for (i = 0; i != corrections_count; i++)
+		utf8_create(&corrections[i]);
+}
+
+/*
+ * Return the largest correction age in the corrections array that is
+ * strictly below maxage, or 0 if there is none.
+ *
+ * Only the corrections_count valid entries are examined; scanning up
+ * to and including corrections_count would read past the end of the
+ * array.
+ */
+static unsigned int
+corrections_next_age(unsigned int maxage)
+{
+	unsigned int nextage = 0;
+	unsigned int age;
+	int i;
+
+	for (i = 0; i != corrections_count; i++) {
+		age = corrections[i].correction;
+		if (nextage < age && age < maxage)
+			nextage = age;
+	}
+	return nextage;
+}
+
+/*
+ * Allocate the trees array and set up each tree's age, forwarding
+ * pointer and callbacks.
+ *
+ * There are two trees (nfkdicf at the even index, nfkdi at the odd
+ * index) for each distinct correction age, plus a final pair for the
+ * current data.
+ */
+static void
+trees_init(void)
+{
+	unsigned int maxage;
+	unsigned int nextage;
+	int count;
+	int i;
+	int j;
+
+	/* Count the number of different ages. */
+	count = 0;
+	nextage = (unsigned int)-1;
+	do {
+		maxage = nextage;
+		nextage = corrections_next_age(maxage);
+		count++;
+	} while (nextage);
+
+	/* Two trees per age: nfkdi and nfkdicf */
+	trees_count = count * 2;
+	trees = calloc(trees_count, sizeof(struct tree));
+
+	/* Assign ages to the trees. */
+	count = trees_count;
+	nextage = (unsigned int)-1;
+	do {
+		maxage = nextage;
+		trees[--count].maxage = maxage;
+		trees[--count].maxage = maxage;
+		nextage = corrections_next_age(maxage);
+	} while (nextage);
+
+	/* The ages assigned above are off by one. */
+	for (i = 0; i != trees_count; i++) {
+		j = 0;
+		while (ages[j] < trees[i].maxage)
+			j++;
+		trees[i].maxage = ages[j-1];
+	}
+
+	/* Set up the forwarding between trees. */
+	trees[trees_count-2].next = &trees[trees_count-1];
+	trees[trees_count-1].leaf_mark = nfkdi_mark;
+	trees[trees_count-2].leaf_mark = nfkdicf_mark;
+	for (i = 0; i != trees_count-2; i += 2) {
+		trees[i].next = &trees[trees_count-2];
+		trees[i].leaf_mark = correction_mark;
+		trees[i+1].next = &trees[trees_count-1];
+		trees[i+1].leaf_mark = correction_mark;
+	}
+
+	/* Assign the callouts. */
+	for (i = 0; i != trees_count; i += 2) {
+		trees[i].type = "nfkdicf";
+		trees[i].leaf_equal = nfkdicf_equal;
+		trees[i].leaf_print = nfkdicf_print;
+		trees[i].leaf_size = nfkdicf_size;
+		trees[i].leaf_index = nfkdicf_index;
+		trees[i].leaf_emit = nfkdicf_emit;
+
+		trees[i+1].type = "nfkdi";
+		trees[i+1].leaf_equal = nfkdi_equal;
+		trees[i+1].leaf_print = nfkdi_print;
+		trees[i+1].leaf_size = nfkdi_size;
+		trees[i+1].leaf_index = nfkdi_index;
+		trees[i+1].leaf_emit = nfkdi_emit;
+	}
+
+	/* Finish init. */
+	for (i = 0; i != trees_count; i++)
+		trees[i].childnode = NODE;
+}
+
+/*
+ * Insert every code point with a non-negative generation into every
+ * tree, keyed by its UTF-8 encoding.
+ *
+ * The corrections entry for a code point is used only when its
+ * correction age is newer than the tree's maxage; otherwise the
+ * current unicode_data entry is inserted.
+ */
+static void
+trees_populate(void)
+{
+	struct unicode_data *data;
+	unsigned int unichar;
+	char keyval[4];
+	int keylen;
+	int i;
+
+	for (i = 0; i != trees_count; i++) {
+		if (verbose > 0) {
+			printf("Populating %s_%x\n",
+				trees[i].type, trees[i].maxage);
+		}
+		for (unichar = 0; unichar != 0x110000; unichar++) {
+			if (unicode_data[unichar].gen < 0)
+				continue;
+			keylen = utf8encode(keyval, unichar);
+			data = corrections_lookup(&unicode_data[unichar]);
+			if (data->correction <= trees[i].maxage)
+				data = &unicode_data[unichar];
+			insert(&trees[i], keyval, keylen, data);
+		}
+	}
+}
+
+/*
+ * Prune and mark all trees, lay them out until the node sizes stop
+ * changing, then allocate utf8data and emit every tree into it.
+ */
+static void
+trees_reduce(void)
+{
+	int i;
+	int size;
+	int changed;
+
+	for (i = 0; i != trees_count; i++)
+		prune(&trees[i]);
+	for (i = 0; i != trees_count; i++)
+		mark_nodes(&trees[i]);
+	/* Repeat indexing and sizing until size_nodes() reports no change. */
+	do {
+		size = 0;
+		for (i = 0; i != trees_count; i++)
+			size = index_nodes(&trees[i], size);
+		changed = 0;
+		for (i = 0; i != trees_count; i++)
+			changed += size_nodes(&trees[i]);
+	} while (changed);
+
+	utf8data = calloc(size, 1);
+	utf8data_size = size;
+	for (i = 0; i != trees_count; i++)
+		emit(&trees[i], utf8data);
+
+	if (verbose > 0) {
+		for (i = 0; i != trees_count; i++) {
+			printf("%s_%x idx %d\n",
+				trees[i].type, trees[i].maxage, trees[i].index);
+		}
+	}
+
+	nfkdi = utf8data + trees[trees_count-1].index;
+	nfkdicf = utf8data + trees[trees_count-2].index;
+
+	nfkdi_tree = &trees[trees_count-1];
+
nfkdicf_tree = &trees[trees_count-2];
+}
+
+/*
+ * Check one emitted tree against the source data.
+ *
+ * Every code point is looked up in the tree and the leaf found (or
+ * its absence) is compared with the entry that trees_populate() would
+ * have inserted.  Each mismatch is printed; nothing is returned.
+ */
+static void
+verify(struct tree *tree)
+{
+	struct unicode_data *data;
+	utf8leaf_t *leaf;
+	unsigned int unichar;
+	char key[4];
+	int report;
+	int nocf;
+
+	if (verbose > 0)
+		printf("Verifying %s_%x\n", tree->type, tree->maxage);
+	/* Non-zero for any tree type other than "nfkdicf". */
+	nocf = strcmp(tree->type, "nfkdicf");
+
+	for (unichar = 0; unichar != 0x110000; unichar++) {
+		report = 0;
+		data = corrections_lookup(&unicode_data[unichar]);
+		if (data->correction <= tree->maxage)
+			data = &unicode_data[unichar];
+		utf8encode(key,unichar);
+		leaf = utf8lookup(tree, key);
+		if (!leaf) {
+			if (data->gen != -1)
+				report++;
+			if (unichar < 0xd800 || unichar > 0xdfff)
+				report++;
+		} else {
+			if (unichar >= 0xd800 && unichar <= 0xdfff)
+				report++;
+			if (data->gen == -1)
+				report++;
+			if (data->gen != LEAF_GEN(leaf))
+				report++;
+			if (LEAF_CCC(leaf) == DECOMPOSE) {
+				if (nocf) {
+					if (!data->utf8nfkdi) {
+						report++;
+					} else if (strcmp(data->utf8nfkdi,
+							  LEAF_STR(leaf))) {
+						report++;
+					}
+				} else {
+					if (!data->utf8nfkdicf &&
+					    !data->utf8nfkdi) {
+						report++;
+					} else if (data->utf8nfkdicf) {
+						if (strcmp(data->utf8nfkdicf,
+							   LEAF_STR(leaf)))
+							report++;
+					} else if (strcmp(data->utf8nfkdi,
+							  LEAF_STR(leaf))) {
+						report++;
+					}
+				}
+			} else if (data->ccc != LEAF_CCC(leaf)) {
+				report++;
+			}
+		}
+		if (report) {
+			/*
+			 * utf8nfkdi is NULL for code points without a
+			 * decomposition, and passing NULL for %s is
+			 * undefined, so substitute a placeholder.
+			 */
+			printf("%X code %X gen %d ccc %d"
+				" nfkdi -> \"%s\"",
+				unichar, data->code, data->gen,
+				data->ccc,
+				data->utf8nfkdi ? data->utf8nfkdi : "(null)");
+			if (leaf) {
+				printf(" gen %d ccc %d"
+					" nfkdi -> \"%s\"",
+					LEAF_GEN(leaf),
+					LEAF_CCC(leaf),
+					LEAF_CCC(leaf) == DECOMPOSE ?
+						LEAF_STR(leaf) : "");
+			}
+			printf("\n");
+		}
+	}
+}
+
+/*
+ * Run verify() on every tree.
+ */
+static void
+trees_verify(void)
+{
+	int i;
+
+	for (i = 0; i != trees_count; i++)
+		verify(&trees[i]);
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Print the usage text, including the default file names, to stdout.
+ */
+static void
+help(void)
+{
+	printf("Usage: %s [options]\n", argv0);
+	printf("\n");
+	printf("This program creates a data trie used for parsing and\n");
+	printf("normalization of UTF-8 strings. The trie is derived from\n");
+	printf("a set of input files from the Unicode character database\n");
+	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
+	printf("\n");
+	printf("The generated tree supports two normalization forms:\n");
+	printf("\n");
+	printf("\tnfkdi:\n");
+	printf("\t- Apply unicode normalization form NFKD.\n");
+	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
+	printf("\n");
+	printf("\tnfkdicf:\n");
+	printf("\t- Apply unicode normalization form NFKD.\n");
+	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
+	printf("\t- Apply a full casefold (C + F).\n");
+	printf("\n");
+	printf("These forms were chosen as being most useful when dealing\n");
+	printf("with file names: NFKD catches most cases where characters\n");
+	printf("should be considered equivalent. The ignorables are mostly\n");
+	printf("invisible, making names hard to type.\n");
+	printf("\n");
+	printf("The options to specify the files to be used are listed\n");
+	printf("below with their default values, which are the names used\n");
+	printf("by version 7.0.0 of the Unicode Character Database.\n");
+	printf("\n");
+	printf("The input files:\n");
+	printf("\t-a %s\n", AGE_NAME);
+	printf("\t-c %s\n", CCC_NAME);
+	printf("\t-p %s\n", PROP_NAME);
+	printf("\t-d %s\n", DATA_NAME);
+	printf("\t-f %s\n", FOLD_NAME);
+	printf("\t-n %s\n", NORM_NAME);
+	printf("\n");
+	printf("Additionally, the generated tables are tested using:\n");
+	printf("\t-t %s\n", TEST_NAME);
+	printf("\n");
+	printf("Finally, the output file:\n");
+	printf("\t-o %s\n", UTF8_NAME);
+	printf("\n");
+}
+
+/*
+ * Print the usage text and exit with status 1.
+ */
+static void
+usage(void)
+{
+	help();
+	exit(1);
+}
+
+/*
+ * Report a failure to open name, with the given errno value, and
+ * exit with status 1.
+ */
+static void
+open_fail(const char *name, int error)
+{
+	printf("Error %d opening %s: %s\n", error, name, strerror(error));
+	exit(1);
+}
+
+/*
+ * Report that filename could not be parsed and exit with status 1.
+ */
+static void
+file_fail(const char *filename)
+{
+	printf("Error parsing %s\n", filename);
+	exit(1);
+}
+
+/*
+ * Report that the given line of filename could not be parsed and
+ * exit with status 1.
+ */
+static void
+line_fail(const char *filename, const char *line)
+{
+	printf("Error parsing %s:%s\n", filename, line);
+	exit(1);
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Print each element of a 0-terminated UTF-32 string in hex.
+ */
+static void
+print_utf32(unsigned int *utf32str)
+{
+	int i;
+
+	for (i = 0; utf32str[i]; i++)
+		printf(" %X", utf32str[i]);
+}
+
+/*
+ * Print the nfkdi mapping of unichar.  The entry must have one.
+ */
+static void
+print_utf32nfkdi(unsigned int unichar)
+{
+	printf(" %X ->", unichar);
+	print_utf32(unicode_data[unichar].utf32nfkdi);
+	printf("\n");
+}
+
+/*
+ * Print the nfkdicf mapping of unichar.  The entry must have one.
+ */
+static void
+print_utf32nfkdicf(unsigned int unichar)
+{
+	printf(" %X ->", unichar);
+	print_utf32(unicode_data[unichar].utf32nfkdicf);
+	printf("\n");
+}
+
+/* ------------------------------------------------------------------ */
+
+static void
+age_init(void)
+{
+	FILE *file;
+	unsigned int first;
+	unsigned int last;
+	unsigned int unichar;
+	unsigned int major;
+	unsigned int minor;
+	unsigned int
revision; + int gen; + int count; + int ret; + + if (verbose > 0) + printf("Parsing %s\n", age_name); + + file = fopen(age_name, "r"); + if (!file) + open_fail(age_name, errno); + count = 0; + + gen = 0; + while (fgets(line, LINESIZE, file)) { + ret = sscanf(line, "# Age=V%d_%d_%d", + &major, &minor, &revision); + if (ret == 3) { + ages_count++; + if (verbose > 1) + printf(" Age V%d_%d_%d\n", + major, minor, revision); + if (!age_valid(major, minor, revision)) + line_fail(age_name, line); + continue; + } + ret = sscanf(line, "# Age=V%d_%d", &major, &minor); + if (ret == 2) { + ages_count++; + if (verbose > 1) + printf(" Age V%d_%d\n", major, minor); + if (!age_valid(major, minor, 0)) + line_fail(age_name, line); + continue; + } + } + + /* We must have found something above. */ + if (verbose > 1) + printf("%d age entries\n", ages_count); + if (ages_count == 0 || ages_count > MAXGEN) + file_fail(age_name); + + /* There is a 0 entry. */ + ages_count++; + ages = calloc(ages_count + 1, sizeof(*ages)); + /* And a guard entry. 
*/ + ages[ages_count] = (unsigned int)-1; + + rewind(file); + count = 0; + gen = 0; + while (fgets(line, LINESIZE, file)) { + ret = sscanf(line, "# Age=V%d_%d_%d", + &major, &minor, &revision); + if (ret == 3) { + ages[++gen] = + UNICODE_AGE(major, minor, revision); + if (verbose > 1) + printf(" Age V%d_%d_%d = gen %d\n", + major, minor, revision, gen); + if (!age_valid(major, minor, revision)) + line_fail(age_name, line); + continue; + } + ret = sscanf(line, "# Age=V%d_%d", &major, &minor); + if (ret == 2) { + ages[++gen] = UNICODE_AGE(major, minor, 0); + if (verbose > 1) + printf(" Age V%d_%d = %d\n", + major, minor, gen); + if (!age_valid(major, minor, 0)) + line_fail(age_name, line); + continue; + } + ret = sscanf(line, "%X..%X ; %d.%d #", + &first, &last, &major, &minor); + if (ret == 4) { + for (unichar = first; unichar <= last; unichar++) + unicode_data[unichar].gen = gen; + count += 1 + last - first; + if (verbose > 1) + printf(" %X..%X gen %d\n", first, last, gen); + if (!utf32valid(first) || !utf32valid(last)) + line_fail(age_name, line); + continue; + } + ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor); + if (ret == 3) { + unicode_data[unichar].gen = gen; + count++; + if (verbose > 1) + printf(" %X gen %d\n", unichar, gen); + if (!utf32valid(unichar)) + line_fail(age_name, line); + continue; + } + } + unicode_maxage = ages[gen]; + fclose(file); + + /* Nix surrogate block */ + if (verbose > 1) + printf(" Removing surrogate block D800..DFFF\n"); + for (unichar = 0xd800; unichar <= 0xdfff; unichar++) + unicode_data[unichar].gen = -1; + + if (verbose > 0) + printf("Found %d entries\n", count); + if (count == 0) + file_fail(age_name); +} + +static void +ccc_init(void) +{ + FILE *file; + unsigned int first; + unsigned int last; + unsigned int unichar; + unsigned int value; + int count; + int ret; + + if (verbose > 0) + printf("Parsing %s\n", ccc_name); + + file = fopen(ccc_name, "r"); + if (!file) + open_fail(ccc_name, errno); + + count = 0; + 
while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
+		if (ret == 3) {
+			/*
+			 * Validate before the values are used to index
+			 * unicode_data[], so that a malformed range
+			 * cannot write outside the array.
+			 */
+			if (!utf32valid(first) || !utf32valid(last))
+				line_fail(ccc_name, line);
+			for (unichar = first; unichar <= last; unichar++) {
+				unicode_data[unichar].ccc = value;
+				count++;
+			}
+			if (verbose > 1)
+				printf(" %X..%X ccc %d\n", first, last, value);
+			continue;
+		}
+		ret = sscanf(line, "%X ; %d #", &unichar, &value);
+		if (ret == 2) {
+			/* Validate before indexing, as above. */
+			if (!utf32valid(unichar))
+				line_fail(ccc_name, line);
+			unicode_data[unichar].ccc = value;
+			count++;
+			if (verbose > 1)
+				printf(" %X ccc %d\n", unichar, value);
+			continue;
+		}
+	}
+	fclose(file);
+
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(ccc_name);
+}
+
+/*
+ * Read the decomposition mappings from data_name.
+ *
+ * For every line with a non-empty sixth field, the code points in
+ * that field are stored as a 0-terminated array in the utf32nfkdi
+ * field of the line's code point.  A leading <tag> is skipped.
+ * Exits through line_fail() on a malformed line and through
+ * file_fail() if no mapping was found.
+ */
+static void
+nfkdi_init(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	char *s;
+	unsigned int *um;
+	int count;
+	int i;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", data_name);
+	file = fopen(data_name, "r");
+	if (!file)
+		open_fail(data_name, errno);
+
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
+			     &unichar, buf0);
+		if (ret != 2)
+			continue;
+		if (!utf32valid(unichar))
+			line_fail(data_name, line);
+
+		s = buf0;
+		/* skip over <tag>, stopping at the terminator if no space follows */
+		if (*s == '<')
+			while (*s && *s++ != ' ')
+				;
+		/* decode the decomposition into UTF-32 */
+		i = 0;
+		while (*s) {
+			char *end;
+
+			/* Keep the last slot free for the terminator. */
+			if (i == (int)(sizeof(mapping) / sizeof(mapping[0])) - 1)
+				line_fail(data_name, line);
+			mapping[i] = strtoul(s, &end, 16);
+			/* No progress means the field is not hex digits. */
+			if (end == s || !utf32valid(mapping[i]))
+				line_fail(data_name, line);
+			s = end;
+			i++;
+		}
+		mapping[i++] = 0;
+
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		unicode_data[unichar].utf32nfkdi = um;
+
+		if (verbose > 1)
+			print_utf32nfkdi(unichar);
+		count++;
+	}
+	fclose(file);
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(data_name);
+}
+
+/*
+ * Read the case folding mappings from fold_name.
+ *
+ * For every line with status 'C' or 'F', the code points in the
+ * mapping field are stored as a 0-terminated array in the
+ * utf32nfkdicf field of the line's code point.
+ * Exits through line_fail() on a malformed line and through
+ * file_fail() if no mapping was found.
+ */
+static void
+nfkdicf_init(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	char status;
+	char *s;
+	unsigned int *um;
+	int i;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", fold_name);
+	file = fopen(fold_name, "r");
+	if (!file)
+		open_fail(fold_name, errno);
+
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
+		if (ret != 3)
+			continue;
+		if (!utf32valid(unichar))
+			line_fail(fold_name, line);
+		/* Use the C+F casefold. */
+		if (status != 'C' && status != 'F')
+			continue;
+		s = buf0;
+		/* skip over <tag>, stopping at the terminator if no space follows */
+		if (*s == '<')
+			while (*s && *s++ != ' ')
+				;
+		i = 0;
+		while (*s) {
+			char *end;
+
+			/* Keep the last slot free for the terminator. */
+			if (i == (int)(sizeof(mapping) / sizeof(mapping[0])) - 1)
+				line_fail(fold_name, line);
+			mapping[i] = strtoul(s, &end, 16);
+			/* No progress means the field is not hex digits. */
+			if (end == s || !utf32valid(mapping[i]))
+				line_fail(fold_name, line);
+			s = end;
+			i++;
+		}
+		mapping[i++] = 0;
+
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		unicode_data[unichar].utf32nfkdicf = um;
+
+		if (verbose > 1)
+			print_utf32nfkdicf(unichar);
+		count++;
+	}
+	fclose(file);
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(fold_name);
+}
+
+static void
+ignore_init(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	unsigned int first;
+	unsigned int last;
+	unsigned int *um;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", prop_name);
+	file = fopen(prop_name, "r");
+	if (!file)
+		open_fail(prop_name, errno);
+	assert(file);
+	count = 0;
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
+		if (ret == 3) {
+			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
+				continue;
+			if (!utf32valid(first) || !utf32valid(last))
+				line_fail(prop_name, line);
+			for (unichar = first; unichar <= last; unichar++) {
+				free(unicode_data[unichar].utf32nfkdi);
+				um = malloc(sizeof(unsigned int));
+				*um = 0;
+				unicode_data[unichar].utf32nfkdi = um;
+				free(unicode_data[unichar].utf32nfkdicf);
+				um = malloc(sizeof(unsigned int));
+				*um = 0;
+				unicode_data[unichar].utf32nfkdicf =
um;
+				count++;
+			}
+			if (verbose > 1)
+				printf(" %X..%X Default_Ignorable_Code_Point\n",
+					first, last);
+			continue;
+		}
+		ret = sscanf(line, "%X ; %s # ", &unichar, buf0);	/* single code point entry */
+		if (ret == 2) {
+			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
+				continue;
+			if (!utf32valid(unichar))
+				line_fail(prop_name, line);
+			free(unicode_data[unichar].utf32nfkdi);	/* replace any mapping with an empty one */
+			um = malloc(sizeof(unsigned int));
+			*um = 0;
+			unicode_data[unichar].utf32nfkdi = um;
+			free(unicode_data[unichar].utf32nfkdicf);	/* likewise for the casefolded form */
+			um = malloc(sizeof(unsigned int));
+			*um = 0;
+			unicode_data[unichar].utf32nfkdicf = um;
+			if (verbose > 1)
+				printf(" %X Default_Ignorable_Code_Point\n",
+					unichar);
+			count++;
+			continue;
+		}
+	}
+	fclose(file);
+
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(prop_name);
+}
+/*
+ * Build the corrections array from norm_name.
+ *
+ * The file is read twice: the first pass counts and validates the
+ * matching lines so the array can be allocated, the second pass fills
+ * it.  Each entry starts as a copy of the current unicode_data entry
+ * for the code point, gets its correction field set to the age given
+ * on the line, and gets utf32nfkdi replaced by the code points parsed
+ * from the line's second field.
+ */
+static void
+corrections_init(void)
+{
+	FILE *file;
+	unsigned int unichar;
+	unsigned int major;
+	unsigned int minor;
+	unsigned int revision;
+	unsigned int age;
+	unsigned int *um;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	char *s;
+	int i;
+	int count;
+	int ret;
+
+	if (verbose > 0)
+		printf("Parsing %s\n", norm_name);
+	file = fopen(norm_name, "r");
+	if (!file)
+		open_fail(norm_name, errno);
+
+	count = 0;	/* first pass: count the matching lines */
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
+				&unichar, buf0, buf1,
+				&major, &minor, &revision);
+		if (ret != 6)
+			continue;
+		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
+			line_fail(norm_name, line);
+		count++;
+	}
+	corrections = calloc(count, sizeof(struct unicode_data));
+	corrections_count = count;
+	rewind(file);
+
+	count = 0;	/* second pass: fill in the entries */
+	while (fgets(line, LINESIZE, file)) {
+		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
+				&unichar, buf0, buf1,
+				&major, &minor, &revision);
+		if (ret != 6)
+			continue;
+		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
+			line_fail(norm_name, line);
+		corrections[count] = unicode_data[unichar];	/* start from the current data */
+		assert(corrections[count].code == unichar);
+		age = UNICODE_AGE(major, minor, revision);
+		corrections[count].correction = age;
+
+		i = 0;
+		s = buf0;	/* buf0 holds the line's second field */
+		while (*s) {
+			mapping[i] = strtoul(s, &s, 16);
+			if (!utf32valid(mapping[i]))
+				line_fail(norm_name, line);
+			i++;
+		}
+		mapping[i++] = 0;
+
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		corrections[count].utf32nfkdi = um;
+
+		if (verbose > 1)
+			printf(" %X -> %s -> %s V%d_%d_%d\n",
+				unichar, buf0, buf1, major, minor, revision);
+		count++;
+	}
+	fclose(file);
+
+	if (verbose > 0)
+		printf("Found %d entries\n", count);
+	if (count == 0)
+		file_fail(norm_name);
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
+ *
+ * AC00;;Lo;0;L;;;;;N;;;;;
+ * D7A3;;Lo;0;L;;;;;N;;;;;
+ *
+ * SBase = 0xAC00
+ * LBase = 0x1100
+ * VBase = 0x1161
+ * TBase = 0x11A7
+ * LCount = 19
+ * VCount = 21
+ * TCount = 28
+ * NCount = 588 (VCount * TCount)
+ * SCount = 11172 (LCount * NCount)
+
*
+ * Decomposition:
+ *   SIndex = s - SBase
+ *
+ * LV (Canonical/Full)
+ *   LIndex = SIndex / NCount
+ *   VIndex = (Sindex % NCount) / TCount
+ *   LPart = LBase + LIndex
+ *   VPart = VBase + VIndex
+ *
+ * LVT (Canonical)
+ *   LVIndex = (SIndex / TCount) * TCount
+ *   TIndex = (Sindex % TCount)
+ *   LVPart = SBase + LVIndex
+ *   TPart = TBase + TIndex
+ *
+ * LVT (Full)
+ *   LIndex = SIndex / NCount
+ *   VIndex = (Sindex % NCount) / TCount
+ *   TIndex = (Sindex % TCount)
+ *   LPart = LBase + LIndex
+ *   VPart = VBase + VIndex
+ *   if (TIndex == 0) {
+ *          d =
+ *   } else {
+ *          TPart = TBase + TIndex
+ *          d =
+ *   }
+ *
+ */
+/*
+ * Create the full decomposition of every code point in AC00..D7A3.
+ *
+ * The mapping is the L part, the V part and, when the T index is
+ * non-zero, the T part, followed by a 0 terminator.  Separate copies
+ * are stored in utf32nfkdi and utf32nfkdicf, both of which must be
+ * unset on entry.
+ */
+static void
+hangul_decompose(void)
+{
+	unsigned int sb = 0xAC00;
+	unsigned int lb = 0x1100;
+	unsigned int vb = 0x1161;
+	unsigned int tb = 0x11a7;
+	/* unsigned int lc = 19; */
+	unsigned int vc = 21;
+	unsigned int tc = 28;
+	unsigned int nc = (vc * tc);
+	/* unsigned int sc = (lc * nc); */
+	unsigned int unichar;
+	unsigned int mapping[4];	/* at most L, V, T and the terminator */
+	unsigned int *um;
+	int count;
+	int i;
+
+	if (verbose > 0)
+		printf("Decomposing hangul\n");
+	/* Hangul */
+	count = 0;
+	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
+		unsigned int si = unichar - sb;
+		unsigned int li = si / nc;
+		unsigned int vi = (si % nc) / tc;
+		unsigned int ti = si % tc;
+
+		i = 0;
+		mapping[i++] = lb + li;
+		mapping[i++] = vb + vi;
+		if (ti)		/* no T part when the T index is zero */
+			mapping[i++] = tb + ti;
+		mapping[i++] = 0;
+
+		assert(!unicode_data[unichar].utf32nfkdi);
+		um = malloc(i * sizeof(unsigned int));
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		unicode_data[unichar].utf32nfkdi = um;
+
+		assert(!unicode_data[unichar].utf32nfkdicf);
+		um = malloc(i * sizeof(unsigned int));	/* separate copy for the casefolded form */
+		memcpy(um, mapping, i * sizeof(unsigned int));
+		unicode_data[unichar].utf32nfkdicf = um;
+
+		if (verbose > 1)
+			print_utf32nfkdi(unichar);
+
+		count++;
+	}
+	if (verbose > 0)
+		printf("Created %d entries\n", count);
+}
+/*
+ * Expand every nfkdi mapping until none of its elements has an nfkdi
+ * mapping of its own, replacing the stored array after each pass.
+ * The result is also copied to utf32nfkdicf for code points that do
+ * not have an nfkdicf mapping yet.
+ */
+static void
+nfkdi_decompose(void)
+{
+	unsigned int unichar;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	unsigned int *um;
+	unsigned int *dc;
+	int count;
+	int i;
+	int j;
+	int ret;	/* stays 1 when a pass expanded nothing */
+
+	if (verbose > 0)
+		printf("Decomposing nfkdi\n");
+
+	count = 0;
+	for (unichar = 0; unichar != 0x110000; unichar++) {
+		if (!unicode_data[unichar].utf32nfkdi)
+			continue;
+		for (;;) {
+			ret = 1;
+			i = 0;
+			um = unicode_data[unichar].utf32nfkdi;
+			while (*um) {
+				dc = unicode_data[*um].utf32nfkdi;
+				if (dc) {	/* element has a mapping: splice it in */
+					for (j = 0; dc[j]; j++)
+						mapping[i++] = dc[j];
+					ret = 0;
+				} else {
+					mapping[i++] = *um;
+				}
+				um++;
+			}
+			mapping[i++] = 0;
+			if (ret)
+				break;
+			free(unicode_data[unichar].utf32nfkdi);
+			um = malloc(i * sizeof(unsigned int));
+			memcpy(um, mapping, i * sizeof(unsigned int));
+			unicode_data[unichar].utf32nfkdi = um;
+		}
+		/* Add this decomposition to nfkdicf if there is no entry. */
+		if (!unicode_data[unichar].utf32nfkdicf) {
+			um = malloc(i * sizeof(unsigned int));
+			memcpy(um, mapping, i * sizeof(unsigned int));
+			unicode_data[unichar].utf32nfkdicf = um;
+		}
+		if (verbose > 1)
+			print_utf32nfkdi(unichar);
+		count++;
+	}
+	if (verbose > 0)
+		printf("Processed %d entries\n", count);
+}
+/*
+ * Expand every nfkdicf mapping until none of its elements has an
+ * nfkdicf mapping of its own, replacing the stored array after each
+ * pass.
+ */
+static void
+nfkdicf_decompose(void)
+{
+	unsigned int unichar;
+	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+	unsigned int *um;
+	unsigned int *dc;
+	int count;
+	int i;
+	int j;
+	int ret;	/* stays 1 when a pass expanded nothing */
+
+	if (verbose > 0)
+		printf("Decomposing nfkdicf\n");
+	count = 0;
+	for (unichar = 0; unichar != 0x110000; unichar++) {
+		if (!unicode_data[unichar].utf32nfkdicf)
+			continue;
+		for (;;) {
+			ret = 1;
+			i = 0;
+			um = unicode_data[unichar].utf32nfkdicf;
+			while (*um) {
+				dc = unicode_data[*um].utf32nfkdicf;
+				if (dc) {	/* element has a mapping: splice it in */
+					for (j = 0; dc[j]; j++)
+						mapping[i++] = dc[j];
+					ret = 0;
+				} else {
+					mapping[i++] = *um;
+				}
+				um++;
+			}
+			mapping[i++] = 0;
+			if (ret)
+				break;
+			free(unicode_data[unichar].utf32nfkdicf);
+			um = malloc(i * sizeof(unsigned int));
+			memcpy(um, mapping, i * sizeof(unsigned int));
+			unicode_data[unichar].utf32nfkdicf = um;
+		}
+		if (verbose > 1)
+			print_utf32nfkdicf(unichar);
+		count++;
+	}
+	if (verbose > 0)
+		printf("Processed %d entries\n", count);
+}
+
+/* ------------------------------------------------------------------ */
+
+int utf8agemax(struct tree *, const char *);
+int utf8nagemax(struct tree *, const char *, size_t);
+int utf8agemin(struct tree *, const char *);
+int utf8nagemin(struct tree *, const char *, size_t);
+ssize_t utf8len(struct tree *, const char *);
+ssize_t utf8nlen(struct tree *, const char *, size_t);
+struct utf8cursor;
+int utf8cursor(struct utf8cursor *, struct tree *, const char *);
+int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
+int utf8byte(struct utf8cursor *);
+
+/*
+ * Use trie to scan s, touching at most len bytes.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * A non-NULL return guarantees that the UTF-8 sequence starting at s
+ * is well-formed and corresponds to a known unicode code point.  The
+ * shorthand for this will be "is valid UTF-8 unicode".
+ */ +static utf8leaf_t * +utf8nlookup(struct tree *tree, const char *s, size_t len) +{ + utf8trie_t *trie = utf8data + tree->index; + int offlen; + int offset; + int mask; + int node; + + if (!tree) + return NULL; + if (len == 0) + return NULL; + node = 1; + while (node) { + offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; + if (*trie & NEXTBYTE) { + if (--len == 0) + return NULL; + s++; + } + mask = 1 << (*trie & BITNUM); + if (*s & mask) { + /* Right leg */ + if (offlen) { + /* Right node at offset of trie */ + node = (*trie & RIGHTNODE); + offset = trie[offlen]; + while (--offlen) { + offset <<= 8; + offset |= trie[offlen]; + } + trie += offset; + } else if (*trie & RIGHTPATH) { + /* Right node after this node */ + node = (*trie & TRIENODE); + trie++; + } else { + /* No right node. */ + return NULL; + } + } else { + /* Left leg */ + if (offlen) { + /* Left node after this node. */ + node = (*trie & LEFTNODE); + trie += offlen + 1; + } else if (*trie & RIGHTPATH) { + /* No left node. */ + return NULL; + } else { + /* Left node after this node */ + node = (*trie & TRIENODE); + trie++; + } + } + } + return trie; +} + +/* + * Use trie to scan s. + * Returns the leaf if one exists, NULL otherwise. + * + * Forwards to trie_nlookup(). + */ +static utf8leaf_t * +utf8lookup(struct tree *tree, const char *s) +{ + return utf8nlookup(tree, s, (size_t)-1); +} + +/* + * Return the number of bytes used by the current UTF-8 sequence. + * Assumes the input points to the first byte of a valid UTF-8 + * sequence. + */ +static inline int +utf8clen(const char *s) +{ + unsigned char c = *s; + return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); +} + +/* + * Maximum age of any character in s. + * Return -1 if s is not valid UTF-8 unicode. + * Return 0 if only non-assigned code points are used. 
+ */ +int +utf8agemax(struct tree *tree, const char *s) +{ + utf8leaf_t *leaf; + int age = 0; + int leaf_age; + + if (!tree) + return -1; + while (*s) { + if (!(leaf = utf8lookup(tree, s))) + return -1; + leaf_age = ages[LEAF_GEN(leaf)]; + if (leaf_age <= tree->maxage && leaf_age > age) + age = leaf_age; + s += utf8clen(s); + } + return age; +} + +/* + * Minimum age of any character in s. + * Return -1 if s is not valid UTF-8 unicode. + * Return 0 if non-assigned code points are used. + */ +int +utf8agemin(struct tree *tree, const char *s) +{ + utf8leaf_t *leaf; + int age; + int leaf_age; + + if (!tree) + return -1; + age = tree->maxage; + while (*s) { + if (!(leaf = utf8lookup(tree, s))) + return -1; + leaf_age = ages[LEAF_GEN(leaf)]; + if (leaf_age <= tree->maxage && leaf_age < age) + age = leaf_age; + s += utf8clen(s); + } + return age; +} + +/* + * Maximum age of any character in s, touch at most len bytes. + * Return -1 if s is not valid UTF-8 unicode. + */ +int +utf8nagemax(struct tree *tree, const char *s, size_t len) +{ + utf8leaf_t *leaf; + int age = 0; + int leaf_age; + + if (!tree) + return -1; + while (len && *s) { + if (!(leaf = utf8nlookup(tree, s, len))) + return -1; + leaf_age = ages[LEAF_GEN(leaf)]; + if (leaf_age <= tree->maxage && leaf_age > age) + age = leaf_age; + len -= utf8clen(s); + s += utf8clen(s); + } + return age; +} + +/* + * Maximum age of any character in s, touch at most len bytes. + * Return -1 if s is not valid UTF-8 unicode. + */ +int +utf8nagemin(struct tree *tree, const char *s, size_t len) +{ + utf8leaf_t *leaf; + int leaf_age; + int age; + + if (!tree) + return -1; + age = tree->maxage; + while (len && *s) { + if (!(leaf = utf8nlookup(tree, s, len))) + return -1; + leaf_age = ages[LEAF_GEN(leaf)]; + if (leaf_age <= tree->maxage && leaf_age < age) + age = leaf_age; + len -= utf8clen(s); + s += utf8clen(s); + } + return age; +} + +/* + * Length of the normalization of s. + * Return -1 if s is not valid UTF-8 unicode. 
+ * + * A string of Default_Ignorable_Code_Point has length 0. + */ +ssize_t +utf8len(struct tree *tree, const char *s) +{ + utf8leaf_t *leaf; + size_t ret = 0; + + if (!tree) + return -1; + while (*s) { + if (!(leaf = utf8lookup(tree, s))) + return -1; + if (ages[LEAF_GEN(leaf)] > tree->maxage) + ret += utf8clen(s); + else if (LEAF_CCC(leaf) == DECOMPOSE) + ret += strlen(LEAF_STR(leaf)); + else + ret += utf8clen(s); + s += utf8clen(s); + } + return ret; +} + +/* + * Length of the normalization of s, touch at most len bytes. + * Return -1 if s is not valid UTF-8 unicode. + */ +ssize_t +utf8nlen(struct tree *tree, const char *s, size_t len) +{ + utf8leaf_t *leaf; + size_t ret = 0; + + if (!tree) + return -1; + while (len && *s) { + if (!(leaf = utf8nlookup(tree, s, len))) + return -1; + if (ages[LEAF_GEN(leaf)] > tree->maxage) + ret += utf8clen(s); + else if (LEAF_CCC(leaf) == DECOMPOSE) + ret += strlen(LEAF_STR(leaf)); + else + ret += utf8clen(s); + len -= utf8clen(s); + s += utf8clen(s); + } + return ret; +} + +/* + * Cursor structure used by the normalizer. + */ +struct utf8cursor { + struct tree *tree; + const char *s; + const char *p; + const char *ss; + const char *sp; + unsigned int len; + unsigned int slen; + short int ccc; + short int nccc; + unsigned int unichar; +}; + +/* + * Set up an utf8cursor for use by utf8byte(). + * + * s : string. + * len : length of s. + * u8c : pointer to cursor. + * trie : utf8trie_t to use for normalization. + * + * Returns -1 on error, 0 on success. + */ +int +utf8ncursor( + struct utf8cursor *u8c, + struct tree *tree, + const char *s, + size_t len) +{ + if (!tree) + return -1; + if (!s) + return -1; + u8c->tree = tree; + u8c->s = s; + u8c->p = NULL; + u8c->ss = NULL; + u8c->sp = NULL; + u8c->len = len; + u8c->slen = 0; + u8c->ccc = STOPPER; + u8c->nccc = STOPPER; + u8c->unichar = 0; + /* Check we didn't clobber the maximum length. 
*/ + if (u8c->len != len) + return -1; + /* The first byte of s may not be an utf8 continuation. */ + if (len > 0 && (*s & 0xC0) == 0x80) + return -1; + return 0; +} + +/* + * Set up an utf8cursor for use by utf8byte(). + * + * s : NUL-terminated string. + * u8c : pointer to cursor. + * tree : struct tree to use for normalization. + * + * Returns -1 on error, 0 on success. + */ +int +utf8cursor( + struct utf8cursor *u8c, + struct tree *tree, + const char *s) +{ + return utf8ncursor(u8c, tree, s, (unsigned int)-1); +} + +/* + * Get one byte from the normalized form of the string described by u8c. + * + * Returns the byte cast to an unsigned char on success, and -1 on failure. + * + * The cursor keeps track of the location in the string in u8c->s. + * When a character is decomposed, the current location is stored in + * u8c->p, and u8c->s is set to the start of the decomposition. Note + * that bytes from a decomposition do not count against u8c->len. + * + * Characters are emitted if they match the current CCC in u8c->ccc. + * Hitting end-of-string while u8c->ccc == STOPPER means we're done, + * and the function returns 0 in that case. + * + * Sorting by CCC is done by repeatedly scanning the string. The + * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at + * the start of the scan. The first pass finds the lowest CCC to be + * emitted and stores it in u8c->nccc, the second pass emits the + * characters with this CCC and finds the next lowest CCC. This limits + * the number of passes to 1 + the number of different CCCs in the + * sequence being scanned. + * + * Therefore: + * u8c->p != NULL -> a decomposition is being scanned. + * u8c->ss != NULL -> this is a repeating scan. + * u8c->ccc == -1 -> this is the first scan of a repeating scan. + */ +int +utf8byte(struct utf8cursor *u8c) +{ + utf8leaf_t *leaf; + int ccc; + + for (;;) { + /* Check for the end of a decomposed character.
*/ + if (u8c->p && *u8c->s == '\0') { + u8c->s = u8c->p; + u8c->p = NULL; + } + + /* Check for end-of-string. */ + if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { + /* There is no next byte. */ + if (u8c->ccc == STOPPER) + return 0; + /* End-of-string during a scan counts as a stopper. */ + ccc = STOPPER; + goto ccc_mismatch; + } else if ((*u8c->s & 0xC0) == 0x80) { + /* This is a continuation of the current character. */ + if (!u8c->p) + u8c->len--; + return (unsigned char)*u8c->s++; + } + + /* Look up the data for the current character. */ + if (u8c->p) + leaf = utf8lookup(u8c->tree, u8c->s); + else + leaf = utf8nlookup(u8c->tree, u8c->s, u8c->len); + + /* No leaf found implies that the input is a binary blob. */ + if (!leaf) + return -1; + + /* Characters that are too new have CCC 0. */ + if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) { + ccc = STOPPER; + } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) { + u8c->len -= utf8clen(u8c->s); + u8c->p = u8c->s + utf8clen(u8c->s); + u8c->s = LEAF_STR(leaf); + /* Empty decomposition implies CCC 0. */ + if (*u8c->s == '\0') { + if (u8c->ccc == STOPPER) + continue; + ccc = STOPPER; + goto ccc_mismatch; + } + leaf = utf8lookup(u8c->tree, u8c->s); + ccc = LEAF_CCC(leaf); + } + u8c->unichar = utf8decode(u8c->s); + + /* + * If this is not a stopper, then see if it updates + * the next canonical class to be emitted. + */ + if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) + u8c->nccc = ccc; + + /* + * Return the current byte if this is the current + * combining class. + */ + if (ccc == u8c->ccc) { + if (!u8c->p) + u8c->len--; + return (unsigned char)*u8c->s++; + } + + /* Current combining class mismatch. */ + ccc_mismatch: + if (u8c->nccc == STOPPER) { + /* + * Scan forward for the first canonical class + * to be emitted. Save the position from + * which to restart. 
+ */ + assert(u8c->ccc == STOPPER); + u8c->ccc = MINCCC - 1; + u8c->nccc = ccc; + u8c->sp = u8c->p; + u8c->ss = u8c->s; + u8c->slen = u8c->len; + if (!u8c->p) + u8c->len -= utf8clen(u8c->s); + u8c->s += utf8clen(u8c->s); + } else if (ccc != STOPPER) { + /* Not a stopper, and not the ccc we're emitting. */ + if (!u8c->p) + u8c->len -= utf8clen(u8c->s); + u8c->s += utf8clen(u8c->s); + } else if (u8c->nccc != MAXCCC + 1) { + /* At a stopper, restart for next ccc. */ + u8c->ccc = u8c->nccc; + u8c->nccc = MAXCCC + 1; + u8c->s = u8c->ss; + u8c->p = u8c->sp; + u8c->len = u8c->slen; + } else { + /* All done, proceed from here. */ + u8c->ccc = STOPPER; + u8c->nccc = STOPPER; + u8c->sp = NULL; + u8c->ss = NULL; + u8c->slen = 0; + } + } +} + +/* ------------------------------------------------------------------ */ + +static int +normalize_line(struct tree *tree) +{ + char *s; + char *t; + int c; + struct utf8cursor u8c; + + /* First test: null-terminated string. */ + s = buf2; + t = buf3; + if (utf8cursor(&u8c, tree, s)) + return -1; + while ((c = utf8byte(&u8c)) > 0) + if (c != (unsigned char)*t++) + return -1; + if (c < 0) + return -1; + if (*t != 0) + return -1; + + /* Second test: length-limited string. */ + s = buf2; + /* Replace NUL with a value that will cause an error if seen. */ + s[strlen(s) + 1] = -1; + t = buf3; + if (utf8cursor(&u8c, tree, s)) + return -1; + while ((c = utf8byte(&u8c)) > 0) + if (c != (unsigned char)*t++) + return -1; + if (c < 0) + return -1; + if (*t != 0) + return -1; + + return 0; +} + +static void +normalization_test(void) +{ + FILE *file; + unsigned int unichar; + struct unicode_data *data; + char *s; + char *t; + int ret; + int ignorables; + int tests = 0; + int failures = 0; + + if (verbose > 0) + printf("Parsing %s\n", test_name); + /* Step one, read data from file. 
*/ + file = fopen(test_name, "r"); + if (!file) + open_fail(test_name, errno); + + while (fgets(line, LINESIZE, file)) { + ret = sscanf(line, "%[^;];%*[^;];%*[^;];%*[^;];%[^;];", + buf0, buf1); + if (ret != 2 || *line == '#') + continue; + s = buf0; + t = buf2; + while (*s) { + unichar = strtoul(s, &s, 16); + t += utf8encode(t, unichar); + } + *t = '\0'; + + ignorables = 0; + s = buf1; + t = buf3; + while (*s) { + unichar = strtoul(s, &s, 16); + data = &unicode_data[unichar]; + if (data->utf8nfkdi && !*data->utf8nfkdi) + ignorables = 1; + else + t += utf8encode(t, unichar); + } + *t = '\0'; + + tests++; + if (normalize_line(nfkdi_tree) < 0) { + printf("Line %s -> %s", buf0, buf1); + if (ignorables) + printf(" (ignorables removed)"); + printf(" failure\n"); + failures++; + } + } + fclose(file); + if (verbose > 0) + printf("Ran %d tests with %d failures\n", tests, failures); + if (failures) + file_fail(test_name); +} + +/* ------------------------------------------------------------------ */ + +static void +write_file(void) +{ + FILE *file; + int i; + int j; + int t; + int gen; + + if (verbose > 0) + printf("Writing %s\n", utf8_name); + file = fopen(utf8_name, "w"); + if (!file) + open_fail(utf8_name, errno); + + fprintf(file, "/* This file is generated code, do not edit. */\n"); + fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n"); + fprintf(file, "#error Only xfs_utf8.c may include this file.\n"); + fprintf(file, "#endif\n"); + fprintf(file, "\n"); + fprintf(file, "static const unsigned int utf8vers = %#x;\n", + unicode_maxage); + fprintf(file, "\n"); + fprintf(file, "static const unsigned int utf8agetab[] = {\n"); + for (i = 0; i != ages_count; i++) + fprintf(file, "\t%#x%s\n", ages[i], + ages[i] == unicode_maxage ? 
"" : ","); + fprintf(file, "};\n"); + fprintf(file, "\n"); + fprintf(file, "static const struct utf8data utf8nfkdicfdata[] = {\n"); + t = 0; + for (gen = 0; gen < ages_count; gen++) { + fprintf(file, "\t{ %#x, %d }%s\n", + ages[gen], trees[t].index, + ages[gen] == unicode_maxage ? "" : ","); + if (trees[t].maxage == ages[gen]) + t += 2; + } + fprintf(file, "};\n"); + fprintf(file, "\n"); + fprintf(file, "static const struct utf8data utf8nfkdidata[] = {\n"); + t = 1; + for (gen = 0; gen < ages_count; gen++) { + fprintf(file, "\t{ %#x, %d }%s\n", + ages[gen], trees[t].index, + ages[gen] == unicode_maxage ? "" : ","); + if (trees[t].maxage == ages[gen]) + t += 2; + } + fprintf(file, "};\n"); + fprintf(file, "\n"); + fprintf(file, "static const unsigned char utf8data[%zd] = {\n", + utf8data_size); + t = 0; + for (i = 0; i != utf8data_size; i += 16) { + if (i == trees[t].index) { + fprintf(file, "\t/* %s_%x */\n", + trees[t].type, trees[t].maxage); + if (t < trees_count-1) + t++; + } + fprintf(file, "\t"); + for (j = i; j != i + 16; j++) + fprintf(file, "0x%.2x%s", utf8data[j], + (j < utf8data_size -1 ? 
"," : "")); + fprintf(file, "\n"); + } + fprintf(file, "};\n"); + fclose(file); +} + +/* ------------------------------------------------------------------ */ + +int +main(int argc, char *argv[]) +{ + unsigned int unichar; + int opt; + + argv0 = argv[0]; + + while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) { + switch (opt) { + case 'a': + age_name = optarg; + break; + case 'c': + ccc_name = optarg; + break; + case 'd': + data_name = optarg; + break; + case 'f': + fold_name = optarg; + break; + case 'n': + norm_name = optarg; + break; + case 'o': + utf8_name = optarg; + break; + case 'p': + prop_name = optarg; + break; + case 't': + test_name = optarg; + break; + case 'v': + verbose++; + break; + case 'h': + help(); + exit(0); + default: + usage(); + } + } + + if (verbose > 1) + help(); + for (unichar = 0; unichar != 0x110000; unichar++) + unicode_data[unichar].code = unichar; + age_init(); + ccc_init(); + nfkdi_init(); + nfkdicf_init(); + ignore_init(); + corrections_init(); + hangul_decompose(); + nfkdi_decompose(); + nfkdicf_decompose(); + utf8_init(); + trees_init(); + trees_populate(); + trees_reduce(); + trees_verify(); + /* Prevent "unused function" warning. 
*/ + (void)lookup(nfkdi_tree, " "); + if (verbose > 2) + tree_walk(nfkdi_tree); + if (verbose > 2) + tree_walk(nfkdicf_tree); + normalization_test(); + write_file(); + + return 0; +} From patchwork Tue Jul 3 17:06:52 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938825 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrCy39F4z9s2g for ; Wed, 4 Jul 2018 03:07:58 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934037AbeGCRH5 (ORCPT ); Tue, 3 Jul 2018 13:07:57 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33414 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933677AbeGCRHz (ORCPT ); Tue, 3 Jul 2018 13:07:55 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id B5132289313 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Olaf Weber , Gabriel Krisman Bertazi Subject: [PATCH 12/20] nls: utf8norm: Introduce code for UTF-8 normalization Date: Tue, 3 Jul 2018 13:06:52 -0400 Message-Id: <20180703170700.9306-13-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org From: Olaf 
Weber Supporting functions for UTF-8 normalization are in nls_utf8n-norm.c with the header utf8n.h. Two normalization forms are supported: nfkdi and nfkdicf. nfkdi: - Apply unicode normalization form NFKD. - Remove any Default_Ignorable_Code_Point. nfkdicf: - Apply unicode normalization form NFKD. - Remove any Default_Ignorable_Code_Point. - Apply a full casefold (C + F). For the purposes of the code, a string is valid UTF-8 if: - The values encoded are 0x1..0x10FFFF. - The surrogate codepoints 0xD800..0xDFFF are not encoded. - The shortest possible encoding is used for all values. The supporting functions work on null-terminated strings (utf8 prefix) and on length-limited strings (utf8n prefix). From the original SGI patch and for conformity with coding standards, the utf8data_t typedef was dropped, since it was just masking the struct keyword. On other occasions, namely utf8leaf_t and utf8trie_t, I decided to keep it, since they are simple pointers to memory buffers, and using uchars here wouldn't provide any more meaningful information. Changes since RFC v2: - Merge to NLS system Changes since RFC v1: - utf8version_is_supported receives maj, min and rev as separate arguments.
(Olaf Weber) Signed-off-by: Olaf Weber Signed-off-by: Gabriel Krisman Bertazi [Rebase to Mainline] [Fix up checkpatch.pl warnings] [Drop typedefs] [Merge with NLS subsystem] --- fs/nls/Makefile | 4 + fs/nls/nls_utf8n-norm.c | 640 ++++++++++++++++++++++++++++++++++++++++ fs/nls/utf8n.h | 112 +++++++ 3 files changed, 756 insertions(+) create mode 100644 fs/nls/nls_utf8n-norm.c create mode 100644 fs/nls/utf8n.h diff --git a/fs/nls/Makefile b/fs/nls/Makefile index 9eff2f058c7a..6ff62c0fe436 100644 --- a/fs/nls/Makefile +++ b/fs/nls/Makefile @@ -56,6 +56,10 @@ obj-$(CONFIG_NLS_MAC_ROMANIAN) += mac-romanian.o obj-$(CONFIG_NLS_MAC_ROMAN) += mac-roman.o obj-$(CONFIG_NLS_MAC_TURKISH) += mac-turkish.o +nls_utf8n-y += nls_utf8n-norm.o +obj-$(CONFIG_NLS_UTF8_NORMALIZATION) += nls_utf8n.o + +$(obj)/nls_utf8n-norm.o: $(obj)/utf8data.h $(obj)/utf8data.h: $(srctree)/$(src)/ucd/*.txt $(objtree)/scripts/mkutf8data FORCE $(call cmd,mkutf8data) quiet_cmd_mkutf8data = MKUTF8DATA $@ diff --git a/fs/nls/nls_utf8n-norm.c b/fs/nls/nls_utf8n-norm.c new file mode 100644 index 000000000000..ca0bbf644b49 --- /dev/null +++ b/fs/nls/nls_utf8n-norm.c @@ -0,0 +1,640 @@ +/* + * Copyright (c) 2014 SGI. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include "utf8n.h" + +struct utf8data { + unsigned int maxage; + unsigned int offset; +}; + +#define __INCLUDED_FROM_UTF8NORM_C__ +#include "utf8data.h" +#undef __INCLUDED_FROM_UTF8NORM_C__ + +int utf8version_is_supported(u8 maj, u8 min, u8 rev) +{ + int i = ARRAY_SIZE(utf8agetab) - 1; + unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev); + + while (i >= 0 && utf8agetab[i] != 0) { + if (sb_utf8version == utf8agetab[i]) + return 1; + i--; + } + return 0; +} +EXPORT_SYMBOL(utf8version_is_supported); + +/* + * UTF-8 valid ranges. + * + * The UTF-8 encoding spreads the bits of a 32bit word over several + * bytes. This table gives the ranges that can be held and how they'd + * be represented. + * + * 0x00000000 0x0000007F: 0xxxxxxx + * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx + * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * There is an additional requirement on UTF-8, in that only the + * shortest representation of a 32bit value is to be used. A decoder + * must not decode sequences that do not satisfy this requirement. + * Thus the allowed ranges have a lower bound. + * + * 0x00000000 0x0000007F: 0xxxxxxx + * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx + * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx + * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, + * 17 planes of 65536 values. This limits the sequences actually seen + * even more, to just the following. 
+ * + * 0 - 0x7F: 0 - 0x7F + * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF + * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF + * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF + * + * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed. + * + * Note that the longest sequence seen with valid usage is 4 bytes, + * the same as a single UTF-32 character. This makes the UTF-8 + * representation of Unicode strictly smaller than UTF-32. + * + * The shortest sequence requirement was introduced by: + * Corrigendum #1: UTF-8 Shortest Form + * It can be found here: + * http://www.unicode.org/versions/corrigendum1.html + * + */ + +/* + * Return the number of bytes used by the current UTF-8 sequence. + * Assumes the input points to the first byte of a valid UTF-8 + * sequence. + */ +static inline int utf8clen(const char *s) +{ + unsigned char c = *s; + + return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); +} + +/* + * utf8trie_t + * + * A compact binary tree, used to decode UTF-8 characters. + * + * Internal nodes are one byte for the node itself, and up to three + * bytes for an offset into the tree. The first byte contains the + * following information: + * NEXTBYTE - flag - advance to next byte if set + * BITNUM - 3 bit field - the bit number to be tested + * OFFLEN - 2 bit field - number of bytes in the offset + * if offlen == 0 (non-branching node) + * RIGHTPATH - 1 bit field - set if the following node is for the + * right-hand path (tested bit is set) + * TRIENODE - 1 bit field - set if the following node is an internal + * node, otherwise it is a leaf node + * if offlen != 0 (branching node) + * LEFTNODE - 1 bit field - set if the left-hand node is internal + * RIGHTNODE - 1 bit field - set if the right-hand node is internal + * + * Due to the way utf8 works, there cannot be branching nodes with + * NEXTBYTE set, and moreover those nodes always have a righthand + * descendant.
+ */ +typedef const unsigned char utf8trie_t; +#define BITNUM 0x07 +#define NEXTBYTE 0x08 +#define OFFLEN 0x30 +#define OFFLEN_SHIFT 4 +#define RIGHTPATH 0x40 +#define TRIENODE 0x80 +#define RIGHTNODE 0x40 +#define LEFTNODE 0x80 + +/* + * utf8leaf_t + * + * The leaves of the trie are embedded in the trie, and so the same + * underlying datatype: unsigned char. + * + * leaf[0]: The unicode version, stored as a generation number that is + * an index into utf8agetab[]. With this we can filter code + * points based on the unicode version in which they were + * defined. The CCC of a non-defined code point is 0. + * leaf[1]: Canonical Combining Class. During normalization, we need + * to do a stable sort into ascending order of all characters + * with a non-zero CCC that occur between two characters with + * a CCC of 0, or at the begin or end of a string. + * The unicode standard guarantees that all CCC values are + * between 0 and 254 inclusive, which leaves 255 available as + * a special value. + * Code points with CCC 0 are known as stoppers. + * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the + * start of a NUL-terminated string that is the decomposition + * of the character. + * The CCC of a decomposable character is the same as the CCC + * of the first character of its decomposition. + * Some characters decompose as the empty string: these are + * characters with the Default_Ignorable_Code_Point property. + * These do affect normalization, as they all have CCC 0. + * + * The decompositions in the trie have been fully expanded. + * + * Casefolding, if applicable, is also done using decompositions. + * + * The trie is constructed in such a way that leaves exist for all + * UTF-8 sequences that match the criteria from the "UTF-8 valid + * ranges" comment above, and only for those sequences. Therefore a + * lookup in the trie can be used to validate the UTF-8 input. 
+ */ +typedef const unsigned char utf8leaf_t; + +#define LEAF_GEN(LEAF) ((LEAF)[0]) +#define LEAF_CCC(LEAF) ((LEAF)[1]) +#define LEAF_STR(LEAF) ((const char *)((LEAF) + 2)) + +#define MINCCC (0) +#define MAXCCC (254) +#define STOPPER (0) +#define DECOMPOSE (255) + +/* + * Use trie to scan s, touching at most len bytes. + * Returns the leaf if one exists, NULL otherwise. + * + * A non-NULL return guarantees that the UTF-8 sequence starting at s + * is well-formed and corresponds to a known unicode code point. The + * shorthand for this will be "is valid UTF-8 unicode". + */ +static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s, + size_t len) +{ + utf8trie_t *trie = utf8data + data->offset; + int offlen; + int offset; + int mask; + int node; + + if (!data) + return NULL; + if (len == 0) + return NULL; + node = 1; + while (node) { + offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; + if (*trie & NEXTBYTE) { + if (--len == 0) + return NULL; + s++; + } + mask = 1 << (*trie & BITNUM); + if (*s & mask) { + /* Right leg */ + if (offlen) { + /* Right node at offset of trie */ + node = (*trie & RIGHTNODE); + offset = trie[offlen]; + while (--offlen) { + offset <<= 8; + offset |= trie[offlen]; + } + trie += offset; + } else if (*trie & RIGHTPATH) { + /* Right node after this node */ + node = (*trie & TRIENODE); + trie++; + } else { + /* No right node. */ + node = 0; + trie = NULL; + } + } else { + /* Left leg */ + if (offlen) { + /* Left node after this node. */ + node = (*trie & LEFTNODE); + trie += offlen + 1; + } else if (*trie & RIGHTPATH) { + /* No left node. */ + node = 0; + trie = NULL; + } else { + /* Left node after this node */ + node = (*trie & TRIENODE); + trie++; + } + } + } + return trie; +} + +/* + * Use trie to scan s. + * Returns the leaf if one exists, NULL otherwise. + * + * Forwards to utf8nlookup(). 
+ */ +static utf8leaf_t *utf8lookup(const struct utf8data *data, const char *s) +{ + return utf8nlookup(data, s, (size_t)-1); +} + +/* + * Maximum age of any character in s. + * Return -1 if s is not valid UTF-8 unicode. + * Return 0 if only non-assigned code points are used. + */ +int utf8agemax(const struct utf8data *data, const char *s) +{ + utf8leaf_t *leaf; + int age = 0; + int leaf_age; + + if (!data) + return -1; + while (*s) { + leaf = utf8lookup(data, s); + if (!leaf) + return -1; + + leaf_age = utf8agetab[LEAF_GEN(leaf)]; + if (leaf_age <= data->maxage && leaf_age > age) + age = leaf_age; + s += utf8clen(s); + } + return age; +} +EXPORT_SYMBOL(utf8agemax); + +/* + * Minimum age of any character in s. + * Return -1 if s is not valid UTF-8 unicode. + * Return 0 if non-assigned code points are used. + */ +int utf8agemin(const struct utf8data *data, const char *s) +{ + utf8leaf_t *leaf; + int age; + int leaf_age; + + if (!data) + return -1; + age = data->maxage; + while (*s) { + leaf = utf8lookup(data, s); + if (!leaf) + return -1; + leaf_age = utf8agetab[LEAF_GEN(leaf)]; + if (leaf_age <= data->maxage && leaf_age < age) + age = leaf_age; + s += utf8clen(s); + } + return age; +} +EXPORT_SYMBOL(utf8agemin); + +/* + * Maximum age of any character in s, touch at most len bytes. + * Return -1 if s is not valid UTF-8 unicode. + */ +int utf8nagemax(const struct utf8data *data, const char *s, size_t len) +{ + utf8leaf_t *leaf; + int age = 0; + int leaf_age; + + if (!data) + return -1; + while (len && *s) { + leaf = utf8nlookup(data, s, len); + if (!leaf) + return -1; + leaf_age = utf8agetab[LEAF_GEN(leaf)]; + if (leaf_age <= data->maxage && leaf_age > age) + age = leaf_age; + len -= utf8clen(s); + s += utf8clen(s); + } + return age; +} +EXPORT_SYMBOL(utf8nagemax); + +/* + * Minimum age of any character in s, touch at most len bytes. + * Return -1 if s is not valid UTF-8 unicode.
+ */ +int utf8nagemin(const struct utf8data *data, const char *s, size_t len) +{ + utf8leaf_t *leaf; + int leaf_age; + int age; + + if (!data) + return -1; + age = data->maxage; + while (len && *s) { + leaf = utf8nlookup(data, s, len); + if (!leaf) + return -1; + leaf_age = utf8agetab[LEAF_GEN(leaf)]; + if (leaf_age <= data->maxage && leaf_age < age) + age = leaf_age; + len -= utf8clen(s); + s += utf8clen(s); + } + return age; +} +EXPORT_SYMBOL(utf8nagemin); + +/* + * Length of the normalization of s. + * Return -1 if s is not valid UTF-8 unicode. + * + * A string of Default_Ignorable_Code_Point has length 0. + */ +ssize_t utf8len(const struct utf8data *data, const char *s) +{ + utf8leaf_t *leaf; + size_t ret = 0; + + if (!data) + return -1; + while (*s) { + leaf = utf8lookup(data, s); + if (!leaf) + return -1; + if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) + ret += utf8clen(s); + else if (LEAF_CCC(leaf) == DECOMPOSE) + ret += strlen(LEAF_STR(leaf)); + else + ret += utf8clen(s); + s += utf8clen(s); + } + return ret; +} +EXPORT_SYMBOL(utf8len); + +/* + * Length of the normalization of s, touch at most len bytes. + * Return -1 if s is not valid UTF-8 unicode. + */ +ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) +{ + utf8leaf_t *leaf; + size_t ret = 0; + + if (!data) + return -1; + while (len && *s) { + leaf = utf8nlookup(data, s, len); + if (!leaf) + return -1; + if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) + ret += utf8clen(s); + else if (LEAF_CCC(leaf) == DECOMPOSE) + ret += strlen(LEAF_STR(leaf)); + else + ret += utf8clen(s); + len -= utf8clen(s); + s += utf8clen(s); + } + return ret; +} +EXPORT_SYMBOL(utf8nlen); + +/* + * Set up an utf8cursor for use by utf8byte(). + * + * u8c : pointer to cursor. + * data : const struct utf8data to use for normalization. + * s : string. + * len : length of s. + * + * Returns -1 on error, 0 on success. 
+ */ +int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, + const char *s, size_t len) +{ + if (!data) + return -1; + if (!s) + return -1; + u8c->data = data; + u8c->s = s; + u8c->p = NULL; + u8c->ss = NULL; + u8c->sp = NULL; + u8c->len = len; + u8c->slen = 0; + u8c->ccc = STOPPER; + u8c->nccc = STOPPER; + /* Check we didn't clobber the maximum length. */ + if (u8c->len != len) + return -1; + /* The first byte of s may not be an utf8 continuation. */ + if (len > 0 && (*s & 0xC0) == 0x80) + return -1; + return 0; +} +EXPORT_SYMBOL(utf8ncursor); + +/* + * Set up an utf8cursor for use by utf8byte(). + * + * u8c : pointer to cursor. + * data : const struct utf8data to use for normalization. + * s : NUL-terminated string. + * + * Returns -1 on error, 0 on success. + */ +int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, + const char *s) +{ + return utf8ncursor(u8c, data, s, (unsigned int)-1); +} +EXPORT_SYMBOL(utf8cursor); + +/* + * Get one byte from the normalized form of the string described by u8c. + * + * Returns the byte cast to an unsigned char on success, and -1 on failure. + * + * The cursor keeps track of the location in the string in u8c->s. + * When a character is decomposed, the current location is stored in + * u8c->p, and u8c->s is set to the start of the decomposition. Note + * that bytes from a decomposition do not count against u8c->len. + * + * Characters are emitted if they match the current CCC in u8c->ccc. + * Hitting end-of-string while u8c->ccc == STOPPER means we're done, + * and the function returns 0 in that case. + * + * Sorting by CCC is done by repeatedly scanning the string. The + * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at + * the start of the scan. The first pass finds the lowest CCC to be + * emitted and stores it in u8c->nccc, the second pass emits the + * characters with this CCC and finds the next lowest CCC.
This limits + * the number of passes to 1 + the number of different CCCs in the + * sequence being scanned. + * + * Therefore: + * u8c->p != NULL -> a decomposition is being scanned. + * u8c->ss != NULL -> this is a repeating scan. + * u8c->ccc == -1 -> this is the first scan of a repeating scan. + */ +int utf8byte(struct utf8cursor *u8c) +{ + utf8leaf_t *leaf; + int ccc; + + for (;;) { + /* Check for the end of a decomposed character. */ + if (u8c->p && *u8c->s == '\0') { + u8c->s = u8c->p; + u8c->p = NULL; + } + + /* Check for end-of-string. */ + if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { + /* There is no next byte. */ + if (u8c->ccc == STOPPER) + return 0; + /* End-of-string during a scan counts as a stopper. */ + ccc = STOPPER; + goto ccc_mismatch; + } else if ((*u8c->s & 0xC0) == 0x80) { + /* This is a continuation of the current character. */ + if (!u8c->p) + u8c->len--; + return (unsigned char)*u8c->s++; + } + + /* Look up the data for the current character. */ + if (u8c->p) + leaf = utf8lookup(u8c->data, u8c->s); + else + leaf = utf8nlookup(u8c->data, u8c->s, u8c->len); + + /* No leaf found implies that the input is a binary blob. */ + if (!leaf) + return -1; + + ccc = LEAF_CCC(leaf); + /* Characters that are too new have CCC 0. */ + if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) { + ccc = STOPPER; + } else if (ccc == DECOMPOSE) { + u8c->len -= utf8clen(u8c->s); + u8c->p = u8c->s + utf8clen(u8c->s); + u8c->s = LEAF_STR(leaf); + /* Empty decomposition implies CCC 0. */ + if (*u8c->s == '\0') { + if (u8c->ccc == STOPPER) + continue; + ccc = STOPPER; + goto ccc_mismatch; + } + leaf = utf8lookup(u8c->data, u8c->s); + } + + /* + * If this is not a stopper, then see if it updates + * the next canonical class to be emitted. + */ + if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) + u8c->nccc = ccc; + + /* + * Return the current byte if this is the current + * combining class. 
+ */ + if (ccc == u8c->ccc) { + if (!u8c->p) + u8c->len--; + return (unsigned char)*u8c->s++; + } + + /* Current combining class mismatch. */ +ccc_mismatch: + if (u8c->nccc == STOPPER) { + /* + * Scan forward for the first canonical class + * to be emitted. Save the position from + * which to restart. + */ + u8c->ccc = MINCCC - 1; + u8c->nccc = ccc; + u8c->sp = u8c->p; + u8c->ss = u8c->s; + u8c->slen = u8c->len; + if (!u8c->p) + u8c->len -= utf8clen(u8c->s); + u8c->s += utf8clen(u8c->s); + } else if (ccc != STOPPER) { + /* Not a stopper, and not the ccc we're emitting. */ + if (!u8c->p) + u8c->len -= utf8clen(u8c->s); + u8c->s += utf8clen(u8c->s); + } else if (u8c->nccc != MAXCCC + 1) { + /* At a stopper, restart for next ccc. */ + u8c->ccc = u8c->nccc; + u8c->nccc = MAXCCC + 1; + u8c->s = u8c->ss; + u8c->p = u8c->sp; + u8c->len = u8c->slen; + } else { + /* All done, proceed from here. */ + u8c->ccc = STOPPER; + u8c->nccc = STOPPER; + u8c->sp = NULL; + u8c->ss = NULL; + u8c->slen = 0; + } + } +} +EXPORT_SYMBOL(utf8byte); + +const struct utf8data *utf8nfkdi(unsigned int maxage) +{ + int i = ARRAY_SIZE(utf8nfkdidata) - 1; + + while (maxage < utf8nfkdidata[i].maxage) + i--; + if (maxage > utf8nfkdidata[i].maxage) + return NULL; + return &utf8nfkdidata[i]; +} +EXPORT_SYMBOL(utf8nfkdi); + +const struct utf8data *utf8nfkdicf(unsigned int maxage) +{ + int i = ARRAY_SIZE(utf8nfkdicfdata) - 1; + + while (maxage < utf8nfkdicfdata[i].maxage) + i--; + if (maxage > utf8nfkdicfdata[i].maxage) + return NULL; + return &utf8nfkdicfdata[i]; +} +EXPORT_SYMBOL(utf8nfkdicf); diff --git a/fs/nls/utf8n.h b/fs/nls/utf8n.h new file mode 100644 index 000000000000..0f5fc14d4fd2 --- /dev/null +++ b/fs/nls/utf8n.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2014 SGI. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef UTF8NORM_H +#define UTF8NORM_H + +#include +#include +#include +#include + +/* Encoding a unicode version number as a single unsigned int. */ +#define UNICODE_MAJ_SHIFT (16) +#define UNICODE_MIN_SHIFT (8) + +#define UNICODE_AGE(MAJ, MIN, REV) \ + (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ + ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ + ((unsigned int)(REV))) + +/* Highest unicode version supported by the data tables. */ +extern int utf8version_is_supported(u8 maj, u8 min, u8 rev); + +/* + * Look for the correct const struct utf8data for a unicode version. + * Returns NULL if the version requested is too new. + * + * Two normalization forms are supported: nfkdi and nfkdicf. + * + * nfkdi: + * - Apply unicode normalization form NFKD. + * - Remove any Default_Ignorable_Code_Point. + * + * nfkdicf: + * - Apply unicode normalization form NFKD. + * - Remove any Default_Ignorable_Code_Point. + * - Apply a full casefold (C + F). + */ +extern const struct utf8data *utf8nfkdi(unsigned int maxage); +extern const struct utf8data *utf8nfkdicf(unsigned int maxage); + +/* + * Determine the maximum age of any unicode character in the string. + * Returns 0 if only unassigned code points are present. + * Returns -1 if the input is not valid UTF-8. + */ +extern int utf8agemax(const struct utf8data *data, const char *s); +extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len); + +/* + * Determine the minimum age of any unicode character in the string. + * Returns 0 if any unassigned code points are present. + * Returns -1 if the input is not valid UTF-8. 
+ */ +extern int utf8agemin(const struct utf8data *data, const char *s); +extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len); + +/* + * Determine the length of the normalized form of the string, + * excluding any terminating NUL byte. + * Returns 0 if only ignorable code points are present. + * Returns -1 if the input is not valid UTF-8. + */ +extern ssize_t utf8len(const struct utf8data *data, const char *s); +extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len); + +/* + * Cursor structure used by the normalizer. + */ +struct utf8cursor { + const struct utf8data *data; + const char *s; + const char *p; + const char *ss; + const char *sp; + unsigned int len; + unsigned int slen; + short int ccc; + short int nccc; +}; + +/* + * Initialize a utf8cursor to normalize a string. + * Returns 0 on success. + * Returns -1 on failure. + */ +extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, + const char *s); +extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, + const char *s, size_t len); + +/* + * Get the next byte in the normalization. + * Returns a value > 0 && < 256 on success. + * Returns 0 when the end of the normalization is reached. + * Returns -1 if the string being normalized is not valid UTF-8. 
+ */ +extern int utf8byte(struct utf8cursor *u8c); + +#endif /* UTF8NORM_H */ From patchwork Tue Jul 3 17:06:53 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938826 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrD05LGMz9s1B for ; Wed, 4 Jul 2018 03:08:00 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934171AbeGCRIA (ORCPT ); Tue, 3 Jul 2018 13:08:00 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33416 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933677AbeGCRH7 (ORCPT ); Tue, 3 Jul 2018 13:07:59 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id BAE40289313 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Olaf Weber , Gabriel Krisman Bertazi Subject: [PATCH 13/20] nls: utf8norm: reduce the size of utf8data[] Date: Tue, 3 Jul 2018 13:06:53 -0400 Message-Id: <20180703170700.9306-14-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org From: Olaf Weber Remove the Hangul decompositions from the utf8data trie, and do algorithmic decomposition to calculate them 
on the fly. To store the decomposition the caller of utf8lookup()/utf8nlookup() must provide a 12-byte buffer, which is used to synthesize a leaf with the decomposition. Trie size is reduced from 245kB to 90kB. Signed-off-by: Olaf Weber Signed-off-by: Gabriel Krisman Bertazi [Rebase to mainline] [Fix checkpatch errors] [Extract robustness fixes and merge back to original mkutf8data.c patch] --- fs/nls/nls_utf8n-norm.c | 191 +++++++++++++++++++++++--- fs/nls/utf8n.h | 4 + scripts/mkutf8data.c | 295 +++++++++++++++++++++++++++++++++++----- 3 files changed, 435 insertions(+), 55 deletions(-) diff --git a/fs/nls/nls_utf8n-norm.c b/fs/nls/nls_utf8n-norm.c index ca0bbf644b49..64c3cc74a2ca 100644 --- a/fs/nls/nls_utf8n-norm.c +++ b/fs/nls/nls_utf8n-norm.c @@ -98,6 +98,38 @@ static inline int utf8clen(const char *s) return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); } +/* + * Decode a 3-byte UTF-8 sequence. + */ +static unsigned int +utf8decode3(const char *str) +{ + unsigned int uc; + + uc = *str++ & 0x0F; + uc <<= 6; + uc |= *str++ & 0x3F; + uc <<= 6; + uc |= *str++ & 0x3F; + + return uc; +} + +/* + * Encode a 3-byte UTF-8 sequence. + */ +static int +utf8encode3(char *str, unsigned int val) +{ + str[2] = (val & 0x3F) | 0x80; + val >>= 6; + str[1] = (val & 0x3F) | 0x80; + val >>= 6; + str[0] = val | 0xE0; + + return 3; +} + /* * utf8trie_t * @@ -159,7 +191,8 @@ typedef const unsigned char utf8trie_t; * characters with the Default_Ignorable_Code_Point property. * These do affect normalization, as they all have CCC 0. * - * The decompositions in the trie have been fully expanded. + * The decompositions in the trie have been fully expanded, with the + * exception of Hangul syllables, which are decomposed algorithmically. * * Casefolding, if applicable, is also done using decompositions. * @@ -179,6 +212,105 @@ typedef const unsigned char utf8leaf_t; #define STOPPER (0) #define DECOMPOSE (255) +/* Marker for hangul syllable decomposition. 
*/ +#define HANGUL ((char)(255)) +/* Size of the synthesized leaf used for Hangul syllable decomposition. */ +#define UTF8HANGULLEAF (12) + +/* + * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) + * + * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; + * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; + * + * SBase = 0xAC00 + * LBase = 0x1100 + * VBase = 0x1161 + * TBase = 0x11A7 + * LCount = 19 + * VCount = 21 + * TCount = 28 + * NCount = 588 (VCount * TCount) + * SCount = 11172 (LCount * NCount) + * + * Decomposition: + * SIndex = s - SBase + * + * LV (Canonical/Full) + * LIndex = SIndex / NCount + * VIndex = (SIndex % NCount) / TCount + * LPart = LBase + LIndex + * VPart = VBase + VIndex + * + * LVT (Canonical) + * LVIndex = (SIndex / TCount) * TCount + * TIndex = (SIndex % TCount) + * LVPart = SBase + LVIndex + * TPart = TBase + TIndex + * + * LVT (Full) + * LIndex = SIndex / NCount + * VIndex = (SIndex % NCount) / TCount + * TIndex = (SIndex % TCount) + * LPart = LBase + LIndex + * VPart = VBase + VIndex + * if (TIndex == 0) { + * d = <LPart, VPart> + * } else { + * TPart = TBase + TIndex + * d = <LPart, VPart, TPart> + * } + */ + +/* Constants */ +#define SB (0xAC00) +#define LB (0x1100) +#define VB (0x1161) +#define TB (0x11A7) +#define LC (19) +#define VC (21) +#define TC (28) +#define NC (VC * TC) +#define SC (LC * NC) + +/* Algorithmic decomposition of hangul syllable. */ +static utf8leaf_t * +utf8hangul(const char *str, unsigned char *hangul) +{ + unsigned int si; + unsigned int li; + unsigned int vi; + unsigned int ti; + unsigned char *h; + + /* Calculate the SI, LI, VI, and TI values. */ + si = utf8decode3(str) - SB; + li = si / NC; + vi = (si % NC) / TC; + ti = si % TC; + + /* Fill in base of leaf. */ + h = hangul; + LEAF_GEN(h) = 2; + LEAF_CCC(h) = DECOMPOSE; + h += 2; + + /* Add LPart, a 3-byte UTF-8 sequence. */ + h += utf8encode3((char *)h, li + LB); + + /* Add VPart, a 3-byte UTF-8 sequence. */ + h += utf8encode3((char *)h, vi + VB); + + /* Add TPart if required, also a 3-byte UTF-8 sequence. 
*/ + if (ti) + h += utf8encode3((char *)h, ti + TB); + + /* Terminate string. */ + h[0] = '\0'; + + return hangul; +} + /* * Use trie to scan s, touching at most len bytes. * Returns the leaf if one exists, NULL otherwise. @@ -187,8 +319,8 @@ typedef const unsigned char utf8leaf_t; * is well-formed and corresponds to a known unicode code point. The * shorthand for this will be "is valid UTF-8 unicode". */ -static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s, - size_t len) +static utf8leaf_t *utf8nlookup(const struct utf8data *data, + unsigned char *hangul, const char *s, size_t len) { utf8trie_t *trie = utf8data + data->offset; int offlen; @@ -226,8 +358,7 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s, trie++; } else { /* No right node. */ - node = 0; - trie = NULL; + return NULL; } } else { /* Left leg */ @@ -237,8 +368,7 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s, trie += offlen + 1; } else if (*trie & RIGHTPATH) { /* No left node. */ - node = 0; - trie = NULL; + return NULL; } else { /* Left node after this node */ node = (*trie & TRIENODE); @@ -246,6 +376,14 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s, } } } + /* + * Hangul decomposition is done algorithmically. These are the + * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is + * always 3 bytes long, so s has been advanced twice, and the + * start of the sequence is at s-2. + */ + if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) + trie = utf8hangul(s - 2, hangul); return trie; } @@ -255,9 +393,10 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s, * * Forwards to utf8nlookup(). 
*/ -static utf8leaf_t *utf8lookup(const struct utf8data *data, const char *s) +static utf8leaf_t *utf8lookup(const struct utf8data *data, + unsigned char *hangul, const char *s) { - return utf8nlookup(data, s, (size_t)-1); + return utf8nlookup(data, hangul, s, (size_t)-1); } /* @@ -270,11 +409,13 @@ int utf8agemax(const struct utf8data *data, const char *s) utf8leaf_t *leaf; int age = 0; int leaf_age; + unsigned char hangul[UTF8HANGULLEAF]; if (!data) return -1; + while (*s) { - leaf = utf8lookup(data, s); + leaf = utf8lookup(data, hangul, s); if (!leaf) return -1; @@ -297,12 +438,13 @@ int utf8agemin(const struct utf8data *data, const char *s) utf8leaf_t *leaf; int age; int leaf_age; + unsigned char hangul[UTF8HANGULLEAF]; if (!data) return -1; age = data->maxage; while (*s) { - leaf = utf8lookup(data, s); + leaf = utf8lookup(data, hangul, s); if (!leaf) return -1; leaf_age = utf8agetab[LEAF_GEN(leaf)]; @@ -323,11 +465,13 @@ int utf8nagemax(const struct utf8data *data, const char *s, size_t len) utf8leaf_t *leaf; int age = 0; int leaf_age; + unsigned char hangul[UTF8HANGULLEAF]; if (!data) return -1; + while (len && *s) { - leaf = utf8nlookup(data, s, len); + leaf = utf8nlookup(data, hangul, s, len); if (!leaf) return -1; leaf_age = utf8agetab[LEAF_GEN(leaf)]; @@ -349,12 +493,13 @@ int utf8nagemin(const struct utf8data *data, const char *s, size_t len) utf8leaf_t *leaf; int leaf_age; int age; + unsigned char hangul[UTF8HANGULLEAF]; if (!data) return -1; age = data->maxage; while (len && *s) { - leaf = utf8nlookup(data, s, len); + leaf = utf8nlookup(data, hangul, s, len); if (!leaf) return -1; leaf_age = utf8agetab[LEAF_GEN(leaf)]; @@ -377,11 +522,12 @@ ssize_t utf8len(const struct utf8data *data, const char *s) { utf8leaf_t *leaf; size_t ret = 0; + unsigned char hangul[UTF8HANGULLEAF]; if (!data) return -1; while (*s) { - leaf = utf8lookup(data, s); + leaf = utf8lookup(data, hangul, s); if (!leaf) return -1; if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) @@ 
-404,11 +550,12 @@ ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) { utf8leaf_t *leaf; size_t ret = 0; + unsigned char hangul[UTF8HANGULLEAF]; if (!data) return -1; while (len && *s) { - leaf = utf8nlookup(data, s, len); + leaf = utf8nlookup(data, hangul, s, len); if (!leaf) return -1; if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) @@ -531,10 +678,12 @@ int utf8byte(struct utf8cursor *u8c) } /* Look up the data for the current character. */ - if (u8c->p) - leaf = utf8lookup(u8c->data, u8c->s); - else - leaf = utf8nlookup(u8c->data, u8c->s, u8c->len); + if (u8c->p) { + leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); + } else { + leaf = utf8nlookup(u8c->data, u8c->hangul, + u8c->s, u8c->len); + } /* No leaf found implies that the input is a binary blob. */ if (!leaf) @@ -555,7 +704,9 @@ int utf8byte(struct utf8cursor *u8c) ccc = STOPPER; goto ccc_mismatch; } - leaf = utf8lookup(u8c->data, u8c->s); + + leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); + ccc = LEAF_CCC(leaf); } /* diff --git a/fs/nls/utf8n.h b/fs/nls/utf8n.h index 0f5fc14d4fd2..f60827663503 100644 --- a/fs/nls/utf8n.h +++ b/fs/nls/utf8n.h @@ -76,6 +76,9 @@ extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len); extern ssize_t utf8len(const struct utf8data *data, const char *s); extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len); +/* Needed in struct utf8cursor below. */ +#define UTF8HANGULLEAF (12) + /* * Cursor structure used by the normalizer. 
*/ @@ -89,6 +92,7 @@ struct utf8cursor { unsigned int slen; short int ccc; short int nccc; + unsigned char hangul[UTF8HANGULLEAF]; }; /* diff --git a/scripts/mkutf8data.c b/scripts/mkutf8data.c index 700b41c0cb66..69f3be92ba71 100644 --- a/scripts/mkutf8data.c +++ b/scripts/mkutf8data.c @@ -180,10 +180,14 @@ typedef unsigned char utf8leaf_t; #define MAXCCC (254) #define STOPPER (0) #define DECOMPOSE (255) +#define HANGUL ((char)(255)) + +#define UTF8HANGULLEAF (12) struct tree; -static utf8leaf_t *utf8nlookup(struct tree *, const char *, size_t); -static utf8leaf_t *utf8lookup(struct tree *, const char *); +static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *, + const char *, size_t); +static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *); unsigned char *utf8data; size_t utf8data_size; @@ -334,6 +338,8 @@ utf32valid(unsigned int unichar) return unichar < 0x110000; } +#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3) + #define NODE 1 #define LEAF 0 @@ -466,7 +472,7 @@ tree_walk(struct tree *tree) indent+1); leaves += 1; } else if (node->right) { - assert(node->rightnode==NODE); + assert(node->rightnode == NODE); indent += 1; node = node->right; break; @@ -864,7 +870,7 @@ mark_nodes(struct tree *tree) } } } else if (node->right) { - assert(node->rightnode==NODE); + assert(node->rightnode == NODE); node = node->right; continue; } @@ -916,7 +922,7 @@ mark_nodes(struct tree *tree) } } } else if (node->right) { - assert(node->rightnode==NODE); + assert(node->rightnode == NODE); node = node->right; if (!node->mark && node->parent->mark && !node->parent->left) { @@ -1000,7 +1006,7 @@ index_nodes(struct tree *tree, int index) index += tree->leaf_size(node->right); count++; } else if (node->right) { - assert(node->rightnode==NODE); + assert(node->rightnode == NODE); indent += 1; node = node->right; break; @@ -1021,6 +1027,26 @@ index_nodes(struct tree *tree, int index) return index; } +/* + * Mark the nodes in a subtree, helper for 
size_nodes(). + */ +static int +mark_subtree(struct node *node) +{ + int changed; + + if (!node || node->mark) + return 0; + node->mark = 1; + node->index = node->parent->index; + changed = 1; + if (node->leftnode == NODE) + changed += mark_subtree(node->left); + if (node->rightnode == NODE) + changed += mark_subtree(node->right); + return changed; +} + /* * Compute the size of nodes and leaves. We start by assuming that * each node needs to store a three-byte offset. The indexes of the @@ -1040,6 +1066,7 @@ size_nodes(struct tree *tree) unsigned int bitmask; unsigned int pathbits; unsigned int pathmask; + unsigned int nbit; int changed; int offset; int size; @@ -1067,22 +1094,40 @@ size_nodes(struct tree *tree) size = 1; } else { if (node->rightnode == NODE) { + /* + * If the right node is not marked, + * look for a corresponding node in + * the next tree. Such a node need + * not exist. + */ right = node->right; next = tree->next; while (!right->mark) { assert(next); n = next->root; while (n->bitnum != node->bitnum) { - if (pathbits & (1<bitnum)) + nbit = 1 << n->bitnum; + if (!(pathmask & nbit)) + break; + if (pathbits & nbit) { + if (n->rightnode == LEAF) + break; n = n->right; - else + } else { + if (n->leftnode == LEAF) + break; n = n->left; + } } + if (n->bitnum != node->bitnum) + break; n = n->right; - assert(right->bitnum == n->bitnum); right = n; next = next->next; } + /* Make sure the right node is marked. 
*/ + if (!right->mark) + changed += mark_subtree(right); offset = right->index - node->index; } else { offset = *tree->leaf_index(tree, node->right); @@ -1124,7 +1169,7 @@ size_nodes(struct tree *tree) if (node->rightnode == LEAF) { assert(node->right); } else if (node->right) { - assert(node->rightnode==NODE); + assert(node->rightnode == NODE); indent += 1; node = node->right; break; @@ -1158,8 +1203,15 @@ emit(struct tree *tree, unsigned char *data) int offset; int index; int indent; + int size; + int bytes; + int leaves; + int nodes[4]; unsigned char byte; + nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0; + leaves = 0; + bytes = 0; index = tree->index; data += index; indent = 1; @@ -1168,7 +1220,10 @@ emit(struct tree *tree, unsigned char *data) if (tree->childnode == LEAF) { assert(tree->root); tree->leaf_emit(tree->root, data); - return; + size = tree->leaf_size(tree->root); + index += size; + leaves++; + goto done; } assert(tree->childnode == NODE); @@ -1195,6 +1250,7 @@ emit(struct tree *tree, unsigned char *data) offlen = 2; else offlen = 3; + nodes[offlen]++; offset = node->offset; byte |= offlen << OFFLEN_SHIFT; *data++ = byte; @@ -1207,12 +1263,14 @@ emit(struct tree *tree, unsigned char *data) } else if (node->left) { if (node->leftnode == NODE) byte |= TRIENODE; + nodes[0]++; *data++ = byte; index++; } else if (node->right) { byte |= RIGHTNODE; if (node->rightnode == NODE) byte |= TRIENODE; + nodes[0]++; *data++ = byte; index++; } else { @@ -1227,7 +1285,10 @@ emit(struct tree *tree, unsigned char *data) assert(node->left); data = tree->leaf_emit(node->left, data); - index += tree->leaf_size(node->left); + size = tree->leaf_size(node->left); + index += size; + bytes += size; + leaves++; } else if (node->left) { assert(node->leftnode == NODE); indent += 1; @@ -1241,9 +1302,12 @@ emit(struct tree *tree, unsigned char *data) assert(node->right); data = tree->leaf_emit(node->right, data); - index += tree->leaf_size(node->right); + size = 
tree->leaf_size(node->right); + index += size; + bytes += size; + leaves++; } else if (node->right) { - assert(node->rightnode==NODE); + assert(node->rightnode == NODE); indent += 1; node = node->right; break; @@ -1255,6 +1319,15 @@ emit(struct tree *tree, unsigned char *data) indent -= 1; } } +done: + if (verbose > 0) { + printf("Emitted %d (%d) leaves", + leaves, bytes); + printf(" %d (%d+%d+%d+%d) nodes", + nodes[0] + nodes[1] + nodes[2] + nodes[3], + nodes[0], nodes[1], nodes[2], nodes[3]); + printf(" %d total\n", index - tree->index); + } } /* ------------------------------------------------------------------ */ @@ -1360,7 +1433,9 @@ nfkdi_print(void *l, int indent) printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, leaf->code, leaf->ccc, leaf->gen); - if (leaf->utf8nfkdi) + if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL) + printf(" nfkdi \"%s\"", "HANGUL SYLLABLE"); + else if (leaf->utf8nfkdi) printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi); printf("\n"); } @@ -1374,6 +1449,8 @@ nfkdicf_print(void *l, int indent) leaf->code, leaf->ccc, leaf->gen); if (leaf->utf8nfkdicf) printf(" nfkdicf \"%s\"", (const char*)leaf->utf8nfkdicf); + else if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL) + printf(" nfkdi \"%s\"", "HANGUL SYLLABLE"); else if (leaf->utf8nfkdi) printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi); printf("\n"); @@ -1409,7 +1486,9 @@ nfkdi_size(void *l) struct unicode_data *leaf = l; int size = 2; - if (leaf->utf8nfkdi) + if (HANGUL_SYLLABLE(leaf->code)) + size += 1; + else if (leaf->utf8nfkdi) size += strlen(leaf->utf8nfkdi) + 1; return size; } @@ -1420,7 +1499,9 @@ nfkdicf_size(void *l) struct unicode_data *leaf = l; int size = 2; - if (leaf->utf8nfkdicf) + if (HANGUL_SYLLABLE(leaf->code)) + size += 1; + else if (leaf->utf8nfkdicf) size += strlen(leaf->utf8nfkdicf) + 1; else if (leaf->utf8nfkdi) size += strlen(leaf->utf8nfkdi) + 1; @@ -1450,7 +1531,10 @@ nfkdi_emit(void *l, unsigned char *data) unsigned char *s; *data++ = 
leaf->gen; - if (leaf->utf8nfkdi) { + if (HANGUL_SYLLABLE(leaf->code)) { + *data++ = DECOMPOSE; + *data++ = HANGUL; + } else if (leaf->utf8nfkdi) { *data++ = DECOMPOSE; s = (unsigned char*)leaf->utf8nfkdi; while ((*data++ = *s++) != 0) @@ -1468,7 +1552,10 @@ nfkdicf_emit(void *l, unsigned char *data) unsigned char *s; *data++ = leaf->gen; - if (leaf->utf8nfkdicf) { + if (HANGUL_SYLLABLE(leaf->code)) { + *data++ = DECOMPOSE; + *data++ = HANGUL; + } else if (leaf->utf8nfkdicf) { *data++ = DECOMPOSE; s = (unsigned char*)leaf->utf8nfkdicf; while ((*data++ = *s++) != 0) @@ -1492,6 +1579,11 @@ utf8_create(struct unicode_data *data) unsigned int *um; int i; + if (data->utf8nfkdi) { + assert(data->utf8nfkdi[0] == HANGUL); + return; + } + u = utf; um = data->utf32nfkdi; if (um) { @@ -1682,6 +1774,7 @@ verify(struct tree *tree) utf8leaf_t *leaf; unsigned int unichar; char key[4]; + unsigned char hangul[UTF8HANGULLEAF]; int report; int nocf; @@ -1695,7 +1788,8 @@ verify(struct tree *tree) if (data->correction <= tree->maxage) data = &unicode_data[unichar]; utf8encode(key,unichar); - leaf = utf8lookup(tree, key); + leaf = utf8lookup(tree, hangul, key); + if (!leaf) { if (data->gen != -1) report++; @@ -1709,7 +1803,10 @@ verify(struct tree *tree) if (data->gen != LEAF_GEN(leaf)) report++; if (LEAF_CCC(leaf) == DECOMPOSE) { - if (nocf) { + if (HANGUL_SYLLABLE(data->code)) { + if (data->utf8nfkdi[0] != HANGUL) + report++; + } else if (nocf) { if (!data->utf8nfkdi) { report++; } else if (strcmp(data->utf8nfkdi, @@ -2394,6 +2491,15 @@ hangul_decompose(void) memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfkdicf = um; + /* + * Add a cookie as a reminder that the hangul syllable + * decompositions must not be stored in the generated + * trie. 
+ */ + unicode_data[unichar].utf8nfkdi = malloc(2); + unicode_data[unichar].utf8nfkdi[0] = HANGUL; + unicode_data[unichar].utf8nfkdi[1] = '\0'; + if (verbose > 1) print_utf32nfkdi(unichar); @@ -2521,6 +2627,100 @@ int utf8cursor(struct utf8cursor *, struct tree *, const char *); int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t); int utf8byte(struct utf8cursor *); +/* + * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) + * + * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; + * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; + * + * SBase = 0xAC00 + * LBase = 0x1100 + * VBase = 0x1161 + * TBase = 0x11A7 + * LCount = 19 + * VCount = 21 + * TCount = 28 + * NCount = 588 (VCount * TCount) + * SCount = 11172 (LCount * NCount) + * + * Decomposition: + * SIndex = s - SBase + * + * LV (Canonical/Full) + * LIndex = SIndex / NCount + * VIndex = (SIndex % NCount) / TCount + * LPart = LBase + LIndex + * VPart = VBase + VIndex + * + * LVT (Canonical) + * LVIndex = (SIndex / TCount) * TCount + * TIndex = (SIndex % TCount) + * LVPart = SBase + LVIndex + * TPart = TBase + TIndex + * + * LVT (Full) + * LIndex = SIndex / NCount + * VIndex = (SIndex % NCount) / TCount + * TIndex = (SIndex % TCount) + * LPart = LBase + LIndex + * VPart = VBase + VIndex + * if (TIndex == 0) { + * d = <LPart, VPart> + * } else { + * TPart = TBase + TIndex + * d = <LPart, VPart, TPart> + * } + */ + +/* Constants */ +#define SB (0xAC00) +#define LB (0x1100) +#define VB (0x1161) +#define TB (0x11A7) +#define LC (19) +#define VC (21) +#define TC (28) +#define NC (VC * TC) +#define SC (LC * NC) + +/* Algorithmic decomposition of hangul syllable. */ +static utf8leaf_t * +utf8hangul(const char *str, unsigned char *hangul) +{ + unsigned int si; + unsigned int li; + unsigned int vi; + unsigned int ti; + unsigned char *h; + + /* Calculate the SI, LI, VI, and TI values. */ + si = utf8decode(str) - SB; + li = si / NC; + vi = (si % NC) / TC; + ti = si % TC; + + /* Fill in base of leaf. 
*/ + h = hangul; + LEAF_GEN(h) = 2; + LEAF_CCC(h) = DECOMPOSE; + h += 2; + + /* Add LPart, a 3-byte UTF-8 sequence. */ + h += utf8encode((char *)h, li + LB); + + /* Add VPart, a 3-byte UTF-8 sequence. */ + h += utf8encode((char *)h, vi + VB); + + /* Add TPart if required, also a 3-byte UTF-8 sequence. */ + if (ti) + h += utf8encode((char *)h, ti + TB); + + /* Terminate string. */ + h[0] = '\0'; + + return hangul; +} + /* * Use trie to scan s, touching at most len bytes. * Returns the leaf if one exists, NULL otherwise. @@ -2530,7 +2730,7 @@ int utf8byte(struct utf8cursor *); * shorthand for this will be "is valid UTF-8 unicode". */ static utf8leaf_t * -utf8nlookup(struct tree *tree, const char *s, size_t len) +utf8nlookup(struct tree *tree, unsigned char *hangul, const char *s, size_t len) { utf8trie_t *trie = utf8data + tree->index; int offlen; @@ -2586,6 +2786,14 @@ utf8nlookup(struct tree *tree, const char *s, size_t len) } } } + /* + * Hangul decomposition is done algorithmically. These are the + * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is + * always 3 bytes long, so s has been advanced twice, and the + * start of the sequence is at s-2. + */ + if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) + trie = utf8hangul(s - 2, hangul); return trie; } @@ -2596,9 +2804,9 @@ utf8nlookup(struct tree *tree, const char *s, size_t len) * Forwards to trie_nlookup(). 
*/ static utf8leaf_t * -utf8lookup(struct tree *tree, const char *s) +utf8lookup(struct tree *tree, unsigned char *hangul, const char *s) { - return utf8nlookup(tree, s, (size_t)-1); + return utf8nlookup(tree, hangul, s, (size_t)-1); } /* @@ -2624,11 +2832,14 @@ utf8agemax(struct tree *tree, const char *s) utf8leaf_t *leaf; int age = 0; int leaf_age; + unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; + while (*s) { - if (!(leaf = utf8lookup(tree, s))) + leaf = utf8lookup(tree, hangul, s); + if (!leaf) return -1; leaf_age = ages[LEAF_GEN(leaf)]; if (leaf_age <= tree->maxage && leaf_age > age) @@ -2649,12 +2860,14 @@ utf8agemin(struct tree *tree, const char *s) utf8leaf_t *leaf; int age; int leaf_age; + unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; age = tree->maxage; while (*s) { - if (!(leaf = utf8lookup(tree, s))) + leaf = utf8lookup(tree, hangul, s); + if (!leaf) return -1; leaf_age = ages[LEAF_GEN(leaf)]; if (leaf_age <= tree->maxage && leaf_age < age) @@ -2674,11 +2887,14 @@ utf8nagemax(struct tree *tree, const char *s, size_t len) utf8leaf_t *leaf; int age = 0; int leaf_age; + unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; + while (len && *s) { - if (!(leaf = utf8nlookup(tree, s, len))) + leaf = utf8nlookup(tree, hangul, s, len); + if (!leaf) return -1; leaf_age = ages[LEAF_GEN(leaf)]; if (leaf_age <= tree->maxage && leaf_age > age) @@ -2699,12 +2915,14 @@ utf8nagemin(struct tree *tree, const char *s, size_t len) utf8leaf_t *leaf; int leaf_age; int age; + unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; age = tree->maxage; while (len && *s) { - if (!(leaf = utf8nlookup(tree, s, len))) + leaf = utf8nlookup(tree, hangul, s, len); + if (!leaf) return -1; leaf_age = ages[LEAF_GEN(leaf)]; if (leaf_age <= tree->maxage && leaf_age < age) @@ -2726,11 +2944,13 @@ utf8len(struct tree *tree, const char *s) { utf8leaf_t *leaf; size_t ret = 0; + unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; while (*s) { - if 
(!(leaf = utf8lookup(tree, s))) + leaf = utf8lookup(tree, hangul, s); + if (!leaf) return -1; if (ages[LEAF_GEN(leaf)] > tree->maxage) ret += utf8clen(s); @@ -2752,11 +2972,13 @@ utf8nlen(struct tree *tree, const char *s, size_t len) { utf8leaf_t *leaf; size_t ret = 0; + unsigned char hangul[UTF8HANGULLEAF]; if (!tree) return -1; while (len && *s) { - if (!(leaf = utf8nlookup(tree, s, len))) + leaf = utf8nlookup(tree, hangul, s, len); + if (!leaf) return -1; if (ages[LEAF_GEN(leaf)] > tree->maxage) ret += utf8clen(s); @@ -2784,6 +3006,7 @@ struct utf8cursor { short int ccc; short int nccc; unsigned int unichar; + unsigned char hangul[UTF8HANGULLEAF]; }; /* @@ -2900,10 +3123,12 @@ utf8byte(struct utf8cursor *u8c) } /* Look up the data for the current character. */ - if (u8c->p) - leaf = utf8lookup(u8c->tree, u8c->s); - else - leaf = utf8nlookup(u8c->tree, u8c->s, u8c->len); + if (u8c->p) { + leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); + } else { + leaf = utf8nlookup(u8c->tree, u8c->hangul, + u8c->s, u8c->len); + } /* No leaf found implies that the input is a binary blob. 
*/ if (!leaf) @@ -2923,7 +3148,7 @@ utf8byte(struct utf8cursor *u8c) ccc = STOPPER; goto ccc_mismatch; } - leaf = utf8lookup(u8c->tree, u8c->s); + leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); ccc = LEAF_CCC(leaf); } u8c->unichar = utf8decode(u8c->s); From patchwork Tue Jul 3 17:06:54 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938827 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrD409KJz9s2g for ; Wed, 4 Jul 2018 03:08:04 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934192AbeGCRID (ORCPT ); Tue, 3 Jul 2018 13:08:03 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33424 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933677AbeGCRIC (ORCPT ); Tue, 3 Jul 2018 13:08:02 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 5BCF8289317 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 14/20] nls: utf8norm: Integrate utf8norm code with NLS subsystem Date: Tue, 3 Jul 2018 13:06:54 -0400 Message-Id: <20180703170700.9306-15-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk 
List-ID: X-Mailing-List: linux-ext4@vger.kernel.org Changes since RFC v2: - Integrate with NLS Changes since RFC v1: - Change error return code from EIO to EINVAL. (Olaf Weber) - Fix issues with strncmp/strcmp. (Olaf Weber) - Remove stack buffer in normalization/casefold. (Olaf Weber) - Include length parameter for second string on comparison functions. - Change length type to size_t. Signed-off-by: Gabriel Krisman Bertazi --- fs/nls/Makefile | 2 +- fs/nls/nls_utf8n-core.c | 276 ++++++++++++++++++++++++++++++++++++++++ fs/nls/nls_utf8n-norm.c | 6 + fs/nls/utf8n.h | 1 + 4 files changed, 284 insertions(+), 1 deletion(-) create mode 100644 fs/nls/nls_utf8n-core.c diff --git a/fs/nls/Makefile b/fs/nls/Makefile index 6ff62c0fe436..3650bb58534b 100644 --- a/fs/nls/Makefile +++ b/fs/nls/Makefile @@ -56,7 +56,7 @@ obj-$(CONFIG_NLS_MAC_ROMANIAN) += mac-romanian.o obj-$(CONFIG_NLS_MAC_ROMAN) += mac-roman.o obj-$(CONFIG_NLS_MAC_TURKISH) += mac-turkish.o -nls_utf8n-y += nls_utf8n-norm.o +nls_utf8n-y += nls_utf8n-norm.o nls_utf8n-core.o obj-$(CONFIG_NLS_UTF8_NORMALIZATION) += nls_utf8n.o $(obj)/nls_utf8n-norm.o: $(obj)/utf8data.h diff --git a/fs/nls/nls_utf8n-core.c b/fs/nls/nls_utf8n-core.c new file mode 100644 index 000000000000..d723e9327182 --- /dev/null +++ b/fs/nls/nls_utf8n-core.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2017 Collabora Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include + +#include "utf8n.h" + +static struct nls_charset utf8norm_info; + +static int utf8_strncmp(const struct nls_table *charset, + const unsigned char *str1, size_t len1, + const unsigned char *str2, size_t len2) +{ + const struct utf8data *data = utf8nfkdi(charset->version); + struct utf8cursor cur1, cur2; + int c1, c2; + int r; + + r = utf8ncursor(&cur1, data, str1, len1); + if (r < 0) + return -EINVAL; + r = utf8ncursor(&cur2, data, str2, len2); + if (r < 0) + return -EINVAL; + + do { + c1 = utf8byte(&cur1); + c2 = utf8byte(&cur2); + + if (c1 < 0 || c2 < 0) + return -EINVAL; + if (c1 != c2) + return 1; + } while (c1); + + return 0; +} + +static int utf8_strncasecmp(const struct nls_table *charset, + const unsigned char *str1, size_t len1, + const unsigned char *str2, size_t len2) +{ + const struct utf8data *data = utf8nfkdicf(charset->version); + struct utf8cursor cur1, cur2; + int c1, c2; + int r; + + r = utf8ncursor(&cur1, data, str1, len1); + if (r < 0) + return -EINVAL; + + r = utf8ncursor(&cur2, data, str2, len2); + if (r < 0) + return -EINVAL; + + do { + c1 = utf8byte(&cur1); + c2 = utf8byte(&cur2); + + if (c1 < 0 || c2 < 0) + return -EINVAL; + if (c1 != c2) + return 1; + } while (c1); + + return 0; +} + +static int utf8_casefold(const struct nls_table *charset, + const unsigned char *str, size_t len, + unsigned char *dest, size_t dlen) +{ + const struct utf8data *data = utf8nfkdicf(charset->version); + struct utf8cursor cur; + size_t nlen = 0; + + utf8ncursor(&cur, data, str, len); + for (nlen = 0; nlen < dlen; nlen++) { + dest[nlen] = utf8byte(&cur); + if (!dest[nlen]) + return nlen; + } + + return -EINVAL; +} + +static int utf8_normalize(const struct nls_table *charset, + const unsigned char *str, + size_t len, unsigned char *dest, size_t dlen) +{ + const struct utf8data *data = utf8nfkdi(charset->version); + struct utf8cursor cur; + ssize_t nlen = 0; + + utf8ncursor(&cur, data, str, len); + + for (nlen = 
0; nlen < dlen; nlen++) { + dest[nlen] = utf8byte(&cur); + if (!dest[nlen]) + return nlen; + } + + return -EINVAL; +} + +static int utf8_uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + n = utf32_to_utf8(uni, out, boundlen); + if (n < 0) { + *out = '?'; + return -EINVAL; + } + return n; +} + +static int utf8_char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + int n; + unicode_t u; + + n = utf8_to_utf32(rawstring, boundlen, &u); + if (n < 0 || u > MAX_WCHAR_T) { + *uni = 0x003f; /* ? */ + return -EINVAL; + } + *uni = (wchar_t) u; + return n; +} + +static unsigned char utf8_tolower(const struct nls_table *table, + unsigned int c) +{ + return c; /* Identity */ +} + +static unsigned char utf8_toupper(const struct nls_table *table, + unsigned int c) +{ + return c; /* Identity */ +} + +static const struct nls_ops utf8_ops = { + .strncmp = utf8_strncmp, + .strncasecmp = utf8_strncasecmp, + .casefold = utf8_casefold, + .normalize = utf8_normalize, + .lowercase = utf8_tolower, + .uppercase = utf8_toupper, + .uni2char = utf8_uni2char, + .char2uni = utf8_char2uni, +}; + +static int utf8_parse_version(const char *version, unsigned int *maj, + unsigned int *min, unsigned int *rev) +{ + substring_t args[3]; + char *tmp; + const struct match_token token[] = { + {1, "%d.%d.%d"}, + {0, NULL} + }; + int ret = 0; + + tmp = kstrdup(version, GFP_KERNEL); + if (match_token(tmp, token, args) != 1) { + ret = -EINVAL; + goto out; + } + + if (match_int(&args[0], maj) || match_int(&args[1], min) || + match_int(&args[2], rev)) { + ret = -EINVAL; + goto out; + } +out: + kfree(tmp); + return ret; +} + +static struct nls_table *utf8_load_charset(const char *version) +{ + struct nls_table *tbl = NULL; + unsigned int nls_version; + + if (version) { + unsigned int maj, min, rev; + + if (utf8_parse_version(version, &maj, &min, &rev) < 0) + return ERR_PTR(-EINVAL); + + if (!utf8version_is_supported(maj, min, 
rev)) + return ERR_PTR(-EINVAL); + + nls_version = UNICODE_AGE(maj, min, rev); + } else { + nls_version = utf8version_latest(); + printk(KERN_WARNING"utf8norm version not specified. " + "Assuming latest supported version (%d.%d.%d).", + (nls_version >> 16) & 0xff, (nls_version >> 8) & 0xff, + (nls_version & 0xff)); + } + + /* Try an already loaded table first. */ + for (tbl = utf8norm_info.tables; tbl; tbl = tbl->next) { + if (tbl->version == nls_version) + return tbl; + } + + tbl = kmalloc(sizeof(struct nls_table), GFP_KERNEL); + if (!tbl) + return ERR_PTR(-ENOMEM); + + tbl->charset = &utf8norm_info; + tbl->version = nls_version; + tbl->ops = &utf8_ops; + + tbl->next = utf8norm_info.tables; + utf8norm_info.tables = tbl; + + return tbl; +} + +static void utf8_cleanup_tables(void) +{ + struct nls_table *tmp, *tbl = utf8norm_info.tables; + + while (tbl) { + tmp = tbl; + tbl = tbl->next; + kfree(tmp); + } + utf8norm_info.tables = NULL; +} + +static struct nls_charset utf8norm_info = { + .charset = "utf8n", + .load_table = utf8_load_charset, +}; + +static int __init init_utf8(void) +{ + register_nls(&utf8norm_info); + return 0; +} + +static void __exit exit_utf8(void) +{ + unregister_nls(&utf8norm_info); + utf8_cleanup_tables(); +} + +module_init(init_utf8); +module_exit(exit_utf8); +MODULE_AUTHOR("SGI, Gabriel Krisman Bertazi"); +MODULE_DESCRIPTION("UTF-8 charset operations for filesystems"); +MODULE_LICENSE("GPL"); diff --git a/fs/nls/nls_utf8n-norm.c b/fs/nls/nls_utf8n-norm.c index 64c3cc74a2ca..abee8b376a87 100644 --- a/fs/nls/nls_utf8n-norm.c +++ b/fs/nls/nls_utf8n-norm.c @@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev) } EXPORT_SYMBOL(utf8version_is_supported); +int utf8version_latest() +{ + return utf8vers; +} +EXPORT_SYMBOL(utf8version_latest); + /* * UTF-8 valid ranges. 
* diff --git a/fs/nls/utf8n.h b/fs/nls/utf8n.h index f60827663503..b4697f9bfbab 100644 --- a/fs/nls/utf8n.h +++ b/fs/nls/utf8n.h @@ -32,6 +32,7 @@ /* Highest unicode version supported by the data tables. */ extern int utf8version_is_supported(u8 maj, u8 min, u8 rev); +extern int utf8version_latest(void); /* * Look for the correct const struct utf8data for a unicode version. From patchwork Tue Jul 3 17:06:55 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938834 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrFD6K1yz9s3R for ; Wed, 4 Jul 2018 03:09:04 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934125AbeGCRIH (ORCPT ); Tue, 3 Jul 2018 13:08:07 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33430 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S934195AbeGCRIG (ORCPT ); Tue, 3 Jul 2018 13:08:06 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 28702289317 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 15/20] nls: utf8norm: Introduce test module for utf8norm implementation Date: Tue, 3 Jul 2018 13:06:55 -0400 Message-Id: <20180703170700.9306-16-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: 
<20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org Changes since RFC v2: - Merge with NLS Changes since RFC v1: - Include comparison tests for matching strings with different lengths. - Include tests for characters included in unicode 8.0.0, 9.0.0 and 10.0.0. Signed-off-by: Gabriel Krisman Bertazi --- fs/nls/Kconfig | 5 + fs/nls/Makefile | 1 + fs/nls/nls_utf8n-selftest.c | 307 ++++++++++++++++++++++++++++++++++++ 3 files changed, 313 insertions(+) create mode 100644 fs/nls/nls_utf8n-selftest.c diff --git a/fs/nls/Kconfig b/fs/nls/Kconfig index 39bf0ebd0643..23a333e503a4 100644 --- a/fs/nls/Kconfig +++ b/fs/nls/Kconfig @@ -624,4 +624,9 @@ config NLS_UTF8_NORMALIZATION help Say Y here to enable utf8 normalization support. +config NLS_UTF8_NORMALIZATION_SELFTEST + tristate "Test UTF-8 normalization support" + default n + depends on NLS_UTF8_NORMALIZATION + endif # NLS diff --git a/fs/nls/Makefile b/fs/nls/Makefile index 3650bb58534b..0b86f6d7a484 100644 --- a/fs/nls/Makefile +++ b/fs/nls/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_NLS_MAC_TURKISH) += mac-turkish.o nls_utf8n-y += nls_utf8n-norm.o nls_utf8n-core.o obj-$(CONFIG_NLS_UTF8_NORMALIZATION) += nls_utf8n.o +obj-$(CONFIG_NLS_UTF8_NORMALIZATION_SELFTEST) += nls_utf8n-selftest.o $(obj)/nls_utf8n-norm.o: $(obj)/utf8data.h $(obj)/utf8data.h: $(srctree)/$(src)/ucd/*.txt $(objtree)/scripts/mkutf8data FORCE diff --git a/fs/nls/nls_utf8n-selftest.c b/fs/nls/nls_utf8n-selftest.c new file mode 100644 index 000000000000..5ee7f927f6fd --- /dev/null +++ b/fs/nls/nls_utf8n-selftest.c @@ -0,0 +1,307 @@ +/* + * Kernel module for testing utf-8 support. + * + * Copyright 2017 Collabora Ltd. 
+ * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +#include "utf8n.h" + +unsigned int failed_tests; +unsigned int total_tests; + +/* Tests will be based on this version. */ +#define latest_maj 10 +#define latest_min 0 +#define latest_rev 0 + +#define _test(cond, func, line, fmt, ...) do { \ + total_tests++; \ + if (!cond) { \ + failed_tests++; \ + pr_err("test %s:%d Failed: %s%s", \ + func, line, #cond, (fmt?":":".")); \ + if (fmt) \ + pr_err(fmt, ##__VA_ARGS__); \ + } \ + } while (0) +#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define test(cond) _test(cond, __func__, __LINE__, "") + +const static struct { + /* UTF-8 strings in this vector _must_ be NULL-terminated. 
*/ + unsigned char str[10]; + unsigned char dec[10]; +} nfkdi_test_data[] = { + /* Trivial sequence */ + { + /* "ABba" decomposes to itself */ + .str = {0x41, 0x42, 0x62, 0x61, 0x00}, + .dec = {0x41, 0x42, 0x62, 0x61, 0x00} + }, + /* Simple equivalent sequences */ + { + /* 'VULGAR FRACTION ONE QUARTER' decomposes to + 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' */ + .str = {0xc2, 0xbc, 0x00}, + .dec = {0x31, 0xe2, 0x81, 0x84, 0x34, 0x00}, + }, + { + /* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to + 'LETTER A' + 'COMBINING DIAERESIS' */ + .str = {0xc3, 0xa4, 0x00}, + .dec = {0x61, 0xcc, 0x88, 0x00}, + }, + { + /* 'LATIN SMALL LETTER LJ' decomposes to + 'LETTER L' + 'LETTER J' */ + .str = {0xC7, 0x89, 0x00}, + .dec = {0x6c, 0x6a, 0x00}, + }, + { + /* GREEK ANO TELEIA decomposes to MIDDLE DOT */ + .str = {0xCE, 0x87, 0x00}, + .dec = {0xC2, 0xB7, 0x00} + }, + /* Canonical ordering */ + { + /* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes + to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */ + .str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0}, + .dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0}, + }, + { + /* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK' + decomposes to + 'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */ + .str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00}, + + .dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00}, + }, + +}; + +const static struct { + /* UTF-8 strings in this vector _must_ be NULL-terminated. 
*/ + unsigned char str[30]; + unsigned char ncf[30]; +} nfkdicf_test_data[] = { + /* Trivial sequences */ + { + /* "ABba" folds to lowercase */ + .str = {0x41, 0x42, 0x62, 0x61, 0x00}, + .ncf = {0x61, 0x62, 0x62, 0x61, 0x00}, + }, + { + /* All ASCII folds to lower-case */ + .str = "ABCDEFGHIJKLMNOPRSTUVWXYZ0.1", + .ncf = "abcdefghijklmnoprstuvwxyz0.1", + }, + { + /* LATIN SMALL LETTER SHARP S folds to + LATIN SMALL LETTER S + LATIN SMALL LETTER S */ + .str = {0xc3, 0x9f, 0x00}, + .ncf = {0x73, 0x73, 0x00}, + }, + { + /* LATIN CAPITAL LETTER A WITH RING ABOVE folds to + LATIN SMALL LETTER A + COMBINING RING ABOVE */ + .str = {0xC3, 0x85, 0x00}, + .ncf = {0x61, 0xcc, 0x8a, 0x00}, + }, + /* Introduced by UTF-8.0.0. */ + /* Cherokee letters are interesting test-cases because they fold + to upper-case. Before 8.0.0, Cherokee lowercase were + undefined, thus, the folding from LC is not stable between + 7.0.0 -> 8.0.0, but it is from UC. */ + { + /* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */ + .str = {0xea, 0xad, 0xb0, 0x00}, + .ncf = {0xe1, 0x8e, 0xa0, 0x00}, + }, + { + /* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */ + .str = {0xe1, 0x8f, 0xb8, 0x00}, + .ncf = {0xe1, 0x8f, 0xb0, 0x00}, + }, + { + /* OLD HUNGARIAN CAPITAL LETTER AMB folds to + OLD HUNGARIAN SMALL LETTER AMB */ + .str = {0xf0, 0x90, 0xb2, 0x83, 0x00}, + .ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00}, + }, + /* Introduced by UTF-9.0.0. */ + { + /* OSAGE CAPITAL LETTER CHA folds to + OSAGE SMALL LETTER CHA */ + .str = {0xf0, 0x90, 0x92, 0xb5, 0x00}, + .ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00}, + }, + { + /* LATIN CAPITAL LETTER SMALL CAPITAL I folds to + LATIN LETTER SMALL CAPITAL I */ + .str = {0xea, 0x9e, 0xae, 0x00}, + .ncf = {0xc9, 0xaa, 0x00}, + }, +}; + +static void check_utf8_nfkdi(void) +{ + int i; + struct utf8cursor u8c; + const struct utf8data *data; + + data = utf8nfkdi(UNICODE_AGE(latest_maj, latest_min, latest_rev)); + if (!data) { + pr_err("%s: Unable to load Unicode %d.%d.%d. 
Skipping.\n", + __func__, latest_maj, latest_min, latest_rev); + return; + } + + for (i = 0; i < ARRAY_SIZE(nfkdi_test_data); i++) { + int len = strlen(nfkdi_test_data[i].str); + int nlen = strlen(nfkdi_test_data[i].dec); + int j = 0; + unsigned char c; + + test((utf8len(data, nfkdi_test_data[i].str) == nlen)); + test((utf8nlen(data, nfkdi_test_data[i].str, len) == nlen)); + + if (utf8cursor(&u8c, data, nfkdi_test_data[i].str) < 0) + pr_err("can't create cursor\n"); + + while ((c = utf8byte(&u8c)) > 0) { + test_f((c == nfkdi_test_data[i].dec[j]), + "Unexpected byte 0x%x should be 0x%x\n", + c, nfkdi_test_data[i].dec[j]); + j++; + } + + test((j == nlen)); + } +} + +static void check_utf8_nfkdicf(void) +{ + int i; + struct utf8cursor u8c; + const struct utf8data *data; + + data = utf8nfkdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev)); + if (!data) { + pr_err("%s: Unable to load Unicode %d.%d.%d. Skipping.\n", + __func__, latest_maj, latest_min, latest_rev); + return; + } + + for (i = 0; i < ARRAY_SIZE(nfkdicf_test_data); i++) { + int len = strlen(nfkdicf_test_data[i].str); + int nlen = strlen(nfkdicf_test_data[i].ncf); + int j = 0; + unsigned char c; + + test((utf8len(data, nfkdicf_test_data[i].str) == nlen)); + test((utf8nlen(data, nfkdicf_test_data[i].str, len) == nlen)); + + if (utf8cursor(&u8c, data, nfkdicf_test_data[i].str) < 0) + pr_err("can't create cursor\n"); + + while ((c = utf8byte(&u8c)) > 0) { + test_f((c == nfkdicf_test_data[i].ncf[j]), + "Unexpected byte 0x%x should be 0x%x\n", + c, nfkdicf_test_data[i].ncf[j]); + j++; + } + + test((j == nlen)); + } +} + +static void check_utf8_comparisons(void) +{ + int i; + struct nls_table *table = load_nls_version("utf8n", "10.0.0"); + + if (IS_ERR(table)) { + pr_err("%s: Unable to load utf8norm %d.%d.%d. 
Skipping.\n", + __func__, latest_maj, latest_min, latest_rev); + return; + } + + for (i = 0; i < ARRAY_SIZE(nfkdi_test_data); i++) { + const char *s1 = nfkdi_test_data[i].str; + const char *s2 = nfkdi_test_data[i].dec; + + test_f(!nls_strncmp(table, s1, strlen(s1), s2, strlen(s2)), + "%s %s comparison mismatch\n", s1, s2); + } + for (i = 0; i < ARRAY_SIZE(nfkdicf_test_data); i++) { + const char *s1 = nfkdicf_test_data[i].str; + const char *s2 = nfkdicf_test_data[i].ncf; + + test_f(!nls_strncasecmp(table, s1, strlen(s1), + s2, strlen(s2)), + "%s %s comparison mismatch\n", s1, s2); + } + + unload_nls(table); +} + +static void check_supported_versions(void) +{ + /* Unicode 7.0.0 should be supported. */ + test(utf8version_is_supported(7, 0, 0)); + + /* Unicode 9.0.0 should be supported. */ + test(utf8version_is_supported(9, 0, 0)); + + /* Unicode 10.0.0 (the latest version) should be supported. */ + test(utf8version_is_supported(latest_maj, latest_min, latest_rev)); + + /* Next versions don't exist. 
*/ + test(!utf8version_is_supported(11, 0, 0)); + test(!utf8version_is_supported(0, 0, 0)); + test(!utf8version_is_supported(-1, -1, -1)); +} + +static int __init init_test_ucd(void) +{ + failed_tests = 0; + total_tests = 0; + + check_supported_versions(); + check_utf8_nfkdi(); + check_utf8_nfkdicf(); + check_utf8_comparisons(); + + if (!failed_tests) + pr_info("All %u tests passed\n", total_tests); + else + pr_err("%u out of %u tests failed\n", failed_tests, + total_tests); + return 0; +} + +static void __exit exit_test_ucd(void) +{ +} + +module_init(init_test_ucd); +module_exit(exit_test_ucd); + +MODULE_AUTHOR("Gabriel Krisman Bertazi "); +MODULE_LICENSE("GPL"); From patchwork Tue Jul 3 17:06:56 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938833 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrF531tzz9s3C for ; Wed, 4 Jul 2018 03:08:57 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934241AbeGCRIM (ORCPT ); Tue, 3 Jul 2018 13:08:12 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33436 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933677AbeGCRIK (ORCPT ); Tue, 3 Jul 2018 13:08:10 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id D17AA289317 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, 
kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 16/20] nls: ascii: Support casefold and normalization operations Date: Tue, 3 Jul 2018 13:06:56 -0400 Message-Id: <20180703170700.9306-17-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org Normalization is identity, but casefold can be implemented with toupper or tolower, and we have no specification on that. We should be safe, as long as it is constant. Signed-off-by: Gabriel Krisman Bertazi --- fs/nls/nls_ascii.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c index 2f4826478d3d..40b8e7acfe1e 100644 --- a/fs/nls/nls_ascii.c +++ b/fs/nls/nls_ascii.c @@ -12,6 +12,7 @@ #include #include #include +#include static const wchar_t charset2uni[256] = { /* 0x00*/ @@ -152,11 +153,43 @@ static unsigned char charset_toupper(const struct nls_table *table, return charset2upper[c]; } +/* Ascii casefold can be defined as either to lower or to upper. As long + * as it is stable. */ +static int ascii_casefold(const struct nls_table *charset, + const unsigned char *str, size_t len, + unsigned char *dest, size_t dlen) +{ + unsigned int i; + + if (dlen < len) + return -EINVAL; + + for (i = 0; i < len; i++) + dest[i] = charset_tolower(charset, str[i]); + + return 0; +} + +/* Ascii normalization is identity. 
*/ +static int ascii_normalize(const struct nls_table *charset, + const unsigned char *str, size_t len, + unsigned char *dest, size_t dlen) +{ + if (dlen < len) + return -EINVAL; + + memcpy(dest, str, len); + + return 0; +} + static const struct nls_ops charset_ops = { .lowercase = charset_toupper, .uppercase = charset_tolower, .uni2char = uni2char, .char2uni = char2uni, + .casefold = ascii_casefold, + .normalize = ascii_normalize, }; static struct nls_charset nls_charset; From patchwork Tue Jul 3 17:06:57 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938832 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrDx4V3kz9s1B for ; Wed, 4 Jul 2018 03:08:49 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934289AbeGCRIQ (ORCPT ); Tue, 3 Jul 2018 13:08:16 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33442 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S934247AbeGCRIN (ORCPT ); Tue, 3 Jul 2018 13:08:13 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 74B8D289318 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 17/20] ext4: Include encoding information in the superblock Date: Tue, 3 Jul 2018 13:06:57 -0400 Message-Id: 
<20180703170700.9306-18-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org Support for encoding is considered an incompatible feature, since it has the potential to create collisions of file names in existing filesystems. If the feature flag is not enabled, the entire filesystem will operate on opaque byte sequences, respecting the original behavior. The charset data is encoded in a new field in the superblock using a magic number specific to ext4. This is the easiest way I found to avoid writing the name of the charset in the superblock. The magic number is mapped to the exact NLS table, but the mapping is specific to ext4. Since we don't have any commitment to support old encodings, the only encodings I am supporting right now are utf8n-10.0.0 and ascii, both using the NLS abstraction. A mount option that forces the use of an encoding is also provided. This allows the user to override the superblock information and force the mount using a specific encoding. There is little point in doing that, except for debugging. The current implementation prevents the user from enabling encoding and per-directory encryption on the same filesystem at the same time. The incompatibility between these features lies in how we do efficient directory searches when we cannot be sure the encryption of the user-provided fname will match the actual hash stored on disk without decrypting every directory entry, because of normalization cases. My quickest solution is to simply block the concurrent use of these features for now, and enable it later, once we have a better solution.
Signed-off-by: Gabriel Krisman Bertazi --- fs/ext4/ext4.h | 7 ++- fs/ext4/super.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0b127853c584..fb0b70d6eb68 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1298,7 +1298,8 @@ struct ext4_super_block { __le32 s_lpf_ino; /* Location of the lost+found inode */ __le32 s_prj_quota_inum; /* inode for tracking project quota */ __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ - __le32 s_reserved[98]; /* Padding to the end of the block */ + __le32 s_ioencoding; /* charset encoding */ + __le32 s_reserved[97]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1372,6 +1373,7 @@ struct ext4_sb_info { struct kobject s_kobj; struct completion s_kobj_unregister; struct super_block *s_sb; + struct nls_table *encoding; /* Journaling */ struct journal_s *s_journal; @@ -1652,6 +1654,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +#define EXT4_FEATURE_INCOMPAT_IOENCODING 0x20000 #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ @@ -1740,6 +1743,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) +EXT4_FEATURE_INCOMPAT_FUNCS(ioencoding, IOENCODING) #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1767,6 +1771,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + 
EXT4_FEATURE_INCOMPAT_IOENCODING | \ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c4c2201b3aa..53db9b6c7e33 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -985,6 +986,7 @@ static void ext4_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); + unload_nls(sbi->encoding); kfree(sbi); } @@ -1378,6 +1380,7 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, + Opt_encoding, }; static const match_table_t tokens = { @@ -1460,6 +1463,7 @@ static const match_table_t tokens = { {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_encoding, "encoding=%s"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ {Opt_removed, "check=none"}, /* mount option from ext2/3 */ @@ -1670,9 +1674,58 @@ static const struct mount_opts { {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_GTE0}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, + {Opt_encoding, 0, MOPT_EXT4_ONLY | MOPT_STRING}, {Opt_err, 0, 0} }; +static const struct ext4_sb_encodings { + char *name; + char *version; +} ext4_sb_encoding_map[] = { + /* 0x0 */ {"ascii", NULL}, + /* 0x1 */ {"utf8n", "10.0.0"}, +}; + +static const struct ext4_sb_encodings * +ext4_sb_read_encoding(struct ext4_super_block *es) +{ + unsigned int magic = le32_to_cpu(es->s_ioencoding); + + if (magic >= ARRAY_SIZE(ext4_sb_encoding_map)) + return NULL; + + return &ext4_sb_encoding_map[magic]; +} + +static const struct ext4_sb_encodings *ext4_parse_encoding_opt(const char *arg) 
+{ + int i, nlen; + const struct ext4_sb_encodings *e = NULL; + const char version_separator = '-'; + + for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++) { + e = &ext4_sb_encoding_map[i]; + nlen = strlen(e->name); + + if (strncmp(arg, e->name, nlen)) + continue; + + /* Encoding doesn't require version */ + if (!e->version && !arg[nlen]) + return e; + + if (arg[nlen] != version_separator) + continue; + + /* Eat out the separator */ + nlen += 1; + + if (!strcmp(&arg[nlen], e->version)) + return e; + } + return NULL; +} + static int handle_mount_opt(struct super_block *sb, char *opt, int token, substring_t *args, unsigned long *journal_devnum, unsigned int *journal_ioprio, int is_remount) @@ -1905,6 +1958,40 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sbi->s_mount_opt |= m->mount_opt; } else if (token == Opt_data_err_ignore) { sbi->s_mount_opt &= ~m->mount_opt; + } else if (token == Opt_encoding) { + const struct ext4_sb_encodings *encoding_info; + char *encoding = match_strdup(&args[0]); + + if (!encoding) + return -ENOMEM; + + if (ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_ERR, + "Can't mount with both encoding and encryption"); + goto encoding_fail; + } + + encoding_info = ext4_parse_encoding_opt(encoding); + if (!encoding_info) { + ext4_msg(sb, KERN_ERR, + "Encoding %s not supported by ext4", encoding); + goto encoding_fail; + } + + sbi->encoding = load_nls_version(encoding_info->name, + encoding_info->version); + if (IS_ERR(sbi->encoding)) { + ext4_msg(sb, KERN_ERR, "Cannot load encoding: %s", + encoding); + goto encoding_fail; + } + + kfree(encoding); + return 0; +encoding_fail: + sbi->encoding = NULL; + kfree(encoding); + return -1; } else { if (!args->from) arg = 1; @@ -3453,6 +3540,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int err = 0; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; + struct nls_table *encoding; + const struct 
ext4_sb_encodings *encoding_info; if ((data && !orig_data) || !sbi) goto out_free_base; @@ -3625,6 +3714,35 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) &journal_ioprio, 0)) goto failed_mount; + if (ext4_has_feature_ioencoding(sb) && !sbi->encoding) { + if (ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_ERR, + "Can't mount with both encoding and encryption"); + goto failed_mount; + } + + encoding_info = ext4_sb_read_encoding(es); + if (!encoding_info) { + ext4_msg(sb, KERN_ERR, + "Encoding requested by superblock is unknown"); + goto failed_mount; + } + + encoding = load_nls_version(encoding_info->name, + encoding_info->version); + if (IS_ERR(encoding)) { + ext4_msg(sb, KERN_ERR, "can't mount with superblock charset:" + "%s-%s not supported by the kernel", + encoding_info->name, encoding_info->version); + goto failed_mount; + } + ext4_msg(sb, KERN_INFO, + "Using encoding defined by superblock: %s %s", + encoding_info->name, encoding_info->version); + + sbi->encoding = encoding; + } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " "with data=journal disables delayed " @@ -4442,6 +4560,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) brelse(sbi->s_group_desc[i]); kvfree(sbi->s_group_desc); failed_mount: + unload_nls(sbi->encoding); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); #ifdef CONFIG_QUOTA From patchwork Tue Jul 3 17:06:58 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938829 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: 
ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrDP3JP2z9s3C for ; Wed, 4 Jul 2018 03:08:21 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934302AbeGCRIT (ORCPT ); Tue, 3 Jul 2018 13:08:19 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33448 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S934295AbeGCRIR (ORCPT ); Tue, 3 Jul 2018 13:08:17 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 6FAD9289317 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 18/20] ext4: Support encoding-aware file name lookups Date: Tue, 3 Jul 2018 13:06:58 -0400 Message-Id: <20180703170700.9306-19-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org Signed-off-by: Gabriel Krisman Bertazi --- fs/ext4/namei.c | 60 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2a4c25c4681d..e4906bac7279 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" @@ -1251,15 +1252,26 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } +static inline bool ext4_charset_match_name(struct nls_table *charset, + const struct fscrypt_name *fname, + const u8 *de_name, + u32 de_name_len) +{ + return !nls_strncmp(charset, (char *) de_name, 
de_name_len, + fname->disk_name.name, fname->disk_name.len); +} + /* * Test whether a directory entry matches the filename being searched for. * * Return: %true if the directory entry matches, otherwise %false. */ -static inline bool ext4_match(const struct ext4_filename *fname, +static inline bool ext4_match(const struct inode *parent, + const struct ext4_filename *fname, const struct ext4_dir_entry_2 *de) { struct fscrypt_name f; + const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb); if (!de->inode) return false; @@ -1269,6 +1281,11 @@ static inline bool ext4_match(const struct ext4_filename *fname, #ifdef CONFIG_EXT4_FS_ENCRYPTION f.crypto_buf = fname->crypto_buf; #endif + + if (sbi->encoding) + return ext4_charset_match_name(sbi->encoding, &f, + de->name, de->name_len); + return fscrypt_match_name(&f, de->name, de->name_len); } @@ -1289,7 +1306,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, /* this code is executed quadratically often */ /* do minimal checking `by hand' */ if ((char *) de + de->name_len <= dlimit && - ext4_match(fname, de)) { + ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, @@ -1345,6 +1362,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; ext4_lblk_t start, block; + struct ext4_sb_info *sbi = EXT4_SB(dir->i_sb); const u8 *name = d_name->name; size_t ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ @@ -1393,9 +1411,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. + * + * Even if we are doing encodings, an exact-match lookup + * could still benefit from DX, so we don't skip it + * entirely. Only if it fails to find a match, we + * fallback to linear search. 
*/ - if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) + if ((ret && (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)) + || (!ret && !sbi->encoding)) goto cleanup_and_exit; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); } @@ -1544,6 +1569,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct inode *inode; struct ext4_dir_entry_2 *de; struct buffer_head *bh; + struct ext4_sb_info *sbi = EXT4_SB(dir->i_sb); int err; err = fscrypt_prepare_lookup(dir, dentry, flags); @@ -1585,6 +1611,32 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi iput(inode); return ERR_PTR(-EPERM); } + + if (sbi->encoding) { + struct dentry *new; + struct qstr ciname; + char *name; + + name = kmalloc((sizeof(char) * de->name_len) + 1, + GFP_NOFS); + if (!name) + return ERR_PTR(-ENOMEM); + + memcpy(name, de->name, de->name_len); + name[de->name_len] = '\0'; + ciname.len = de->name_len; + ciname.name = name; + new = d_add_ci(dentry, inode, &ciname); + kfree(name); + return new; + } + } else if (sbi->encoding) { + /* Eventually we want to call d_add_ci(dentry, NULL) for + * negative dentries in the encoding case as well. For + * now, prevent the negative dentry from being + * cached. 
+ */ + return NULL; } return d_splice_alias(inode, dentry); } @@ -1796,7 +1848,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; - if (ext4_match(fname, de)) + if (ext4_match(dir, fname, de)) return -EEXIST; nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); From patchwork Tue Jul 3 17:06:59 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938830 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrDT5FtDz9s1B for ; Wed, 4 Jul 2018 03:08:25 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934309AbeGCRIY (ORCPT ); Tue, 3 Jul 2018 13:08:24 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33454 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S934295AbeGCRIU (ORCPT ); Tue, 3 Jul 2018 13:08:20 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id B6D7A289317 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 19/20] vfs: Handle case-exact lookup in d_add_ci Date: Tue, 3 Jul 2018 13:06:59 -0400 Message-Id: <20180703170700.9306-20-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: 
<20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org This prevents a soft hang when d_add_ci is called from the FS layer during a CI search whose resulting dentry is an exact match. Signed-off-by: Gabriel Krisman Bertazi --- fs/dcache.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 0e8e5de3c48a..3b55023cca7e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2059,6 +2059,20 @@ struct dentry *d_obtain_root(struct inode *inode) } EXPORT_SYMBOL(d_obtain_root); +static inline bool d_same_name(const struct dentry *dentry, + const struct dentry *parent, + const struct qstr *name) +{ + if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { + if (dentry->d_name.len != name->len) + return false; + return dentry_cmp(dentry, name->name, name->len) == 0; + } + return parent->d_op->d_compare(dentry, + dentry->d_name.len, dentry->d_name.name, + name) == 0; +} + /** * d_add_ci - lookup or allocate new dentry with case-exact name * @inode: the inode case-insensitive lookup has found @@ -2080,6 +2094,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, { struct dentry *found, *res; + /* Trivial case: CI search is exact match. */ + if (d_same_name(dentry, dentry->d_parent, name)) + return d_splice_alias(inode, dentry); + /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. 
@@ -2112,21 +2130,6 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, } EXPORT_SYMBOL(d_add_ci); - -static inline bool d_same_name(const struct dentry *dentry, - const struct dentry *parent, - const struct qstr *name) -{ - if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { - if (dentry->d_name.len != name->len) - return false; - return dentry_cmp(dentry, name->name, name->len) == 0; - } - return parent->d_op->d_compare(dentry, - dentry->d_name.len, dentry->d_name.name, - name) == 0; -} - /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry From patchwork Tue Jul 3 17:07:00 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938831 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrDY1tdxz9s1B for ; Wed, 4 Jul 2018 03:08:29 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934311AbeGCRI1 (ORCPT ); Tue, 3 Jul 2018 13:08:27 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33464 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S934303AbeGCRIX (ORCPT ); Tue, 3 Jul 2018 13:08:23 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id A4345289318 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 20/20] ext4: Implement 
encoding-aware dcache hooks Date: Tue, 3 Jul 2018 13:07:00 -0400 Message-Id: <20180703170700.9306-21-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org A d_revalidate hook to reject negative dentries is not needed, because we avoid adding those in the first place during lookup, similar to what xfs does. Signed-off-by: Gabriel Krisman Bertazi --- fs/ext4/dir.c | 30 ++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 1 + fs/ext4/super.c | 4 ++++ 3 files changed, 35 insertions(+) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index e2902d394f1b..c520b9e94778 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "ext4.h" #include "xattr.h" @@ -664,3 +665,32 @@ const struct file_operations ext4_dir_operations = { .open = ext4_dir_open, .release = ext4_release_dir, }; + +static int ext4_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +{ + struct nls_table *charset = EXT4_SB(dentry->d_sb)->encoding; + size_t nlen = strlen(name->name); + + return nls_strncmp(charset, str, len, name->name, nlen); +} + +static int ext4_d_hash(const struct dentry *dentry, struct qstr *q) +{ + const struct nls_table *charset = EXT4_SB(dentry->d_sb)->encoding; + unsigned char norm[PATH_MAX]; + int len; + + len = nls_normalize(charset, q->name, q->len, norm, PATH_MAX); + if (len < 0) + return -EINVAL; + + q->hash = full_name_hash(dentry, norm, len); + + return 0; +} + +const struct dentry_operations ext4_dentry_ops = { + .d_hash = ext4_d_hash, + .d_compare = ext4_d_compare, +}; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fb0b70d6eb68..2a5c7712967f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2953,6 +2953,7 @@ static inline void ext4_unlock_group(struct 
super_block *sb, /* dir.c */ extern const struct file_operations ext4_dir_operations; +extern const struct dentry_operations ext4_dentry_ops; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 53db9b6c7e33..f292cc5bacda 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4358,6 +4358,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) iput(root); goto failed_mount4; } + + if (sbi->encoding) + sb->s_d_op = &ext4_dentry_ops; + sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed");