Patch to support extended characters in C/C++ identifiers

Message ID	20190812220121.GA9251@ldh.local
State	New
Headers	show Return-Path: <gcc-patches-return-506735-incoming=patchwork.ozlabs.org@gcc.gnu.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:date :from:to:cc:subject:message-id:mime-version:content-type; q=dns; s=default; b=e7fsnPzhn5Gp+Z+7qim4usTu+8rNsMUGT/Yb4dj3R2c69opnyD O6IpVRGKxzNi1AgiqZpkxrgpP5Fvt7tNbBINicEe/4YxIN5hm10QXVG7l/j+Jjy6 haW0zf2KrNzZcUYs6EMgPi7QpLmbhC49VE1aT/PbqnZ1swxmTe7DSoNco= Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org Date: Mon, 12 Aug 2019 18:01:21 -0400 From: Lewis Hyatt <lhyatt@gmail.com> To: gcc-patches@gcc.gnu.org Cc: joseph@codesourcery.com Subject: Patch to support extended characters in C/C++ identifiers Message-ID: <20190812220121.GA9251@ldh.local> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="opJtzjQTFsWo+cga" Content-Disposition: inline User-Agent: Mutt/1.12.1 (2019-06-15)
Series	Patch to support extended characters in C/C++ identifiers

diff --git a/libcpp/charset.c b/libcpp/charset.c index 8a0e5cbb29b..4f1bee96cee 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -1198,6 +1198,84 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit, return from; } +/* Performs a similar task as _cpp_valid_ucn, but parses UTF-8-encoded + extended characters rather than UCNs. If the return value is TRUE, then a + character was successfully decoded and stored in *CP; *PSTR has been + updated to point one past the valid UTF-8 sequence. Diagnostics may have + been emitted if the character parsed is not allowed in the current context. + If the return value is FALSE, then *PSTR has not been modified and *CP may + equal 0, to indicate that *PSTR does not form a valid UTF-8 sequence, or it + may, when processing an identifier in C mode, equal a codepoint that was + validly encoded but is not allowed to appear in an identifier. In either + case, no diagnostic is emitted, and the return value of FALSE should cause + a new token to be formed. + + Unlike _cpp_valid_ucn, this will never be called when lexing a string; only + a potential identifier, or a CPP_OTHER token. NST is unused in the latter + case. + + As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for + the start of an identifier, or 2 otherwise. */ + +extern bool +_cpp_valid_utf8 (cpp_reader *pfile, + const uchar **pstr, + const uchar *limit, + int identifier_pos, + struct normalize_state *nst, + cppchar_t *cp) +{ + const uchar *base = *pstr; + size_t inbytesleft = limit - base; + if (one_utf8_to_cppchar (pstr, &inbytesleft, cp)) + { + /* No diagnostic here as this byte will rather become a + new token. 
*/ + *cp = 0; + return false; + } + + if (identifier_pos) + { + switch (ucn_valid_in_identifier (pfile, *cp, nst)) + { + + case 0: + /* In C++, this is an error for invalid character in an identifier + because logically, the UTF-8 was converted to a UCN during + translation phase 1 (even though we don't physically do it that + way). In C, this byte rather becomes grammatically a separate + token. */ + + if (CPP_OPTION (pfile, cplusplus)) + cpp_error (pfile, CPP_DL_ERROR, + "extended character %.*s is not valid in an identifier", + (int) (*pstr - base), base); + else + { + *pstr = base; + return false; + } + + break; + + case 2: + if (identifier_pos == 1) + { + /* This is treated the same way in C++ or C99 -- lexed as an + identifier which is then invalid because an identifier is + not allowed to start with this character. */ + cpp_error (pfile, CPP_DL_ERROR, + "extended character %.*s is not valid at the start of an identifier", + (int) (*pstr - base), base); + } + break; + } + } + + return true; +} + /* Subroutine of convert_hex and convert_oct. N is the representation in the execution character set of a numeric escape; write it into the string buffer TBUF and update the end-of-string pointer therein. WIDE @@ -1956,8 +2034,9 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token, } /* Convert an identifier denoted by ID and LEN, which might contain - UCN escapes, to the source character set, either UTF-8 or - UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */ + UCN escapes or UTF-8 multibyte chars, to the source character set, + either UTF-8 or UTF-EBCDIC. Assumes that the identifier is actually + a valid identifier. 
*/ cpp_hashnode * _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) { diff --git a/libcpp/internal.h b/libcpp/internal.h index 45167a9500e..d2158426b1f 100644 --- a/libcpp/internal.h +++ b/libcpp/internal.h @@ -777,6 +777,14 @@ extern bool _cpp_valid_ucn (cpp_reader *, const unsigned char **, cppchar_t *, source_range *char_range, cpp_string_location_reader *loc_reader); + +extern bool _cpp_valid_utf8 (cpp_reader *pfile, + const uchar **pstr, + const uchar *limit, + int identifier_pos, + struct normalize_state *nst, + cppchar_t *cp); + extern void _cpp_destroy_iconv (cpp_reader *); extern unsigned char *_cpp_convert_input (cpp_reader *, const char *, unsigned char *, size_t, size_t, diff --git a/libcpp/lex.c b/libcpp/lex.c index 16ded6e9b05..15b10cb3f01 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -1313,7 +1313,9 @@ warn_about_normalization (cpp_reader *pfile, } } -/* Returns TRUE if the sequence starting at buffer->cur is invalid in +static const cppchar_t utf8_signifier = 0xC0; + +/* Returns TRUE if the sequence starting at buffer->cur is valid in an identifier. FIRST is TRUE if this starts an identifier. */ static bool forms_identifier_p (cpp_reader *pfile, int first, @@ -1336,17 +1338,25 @@ forms_identifier_p (cpp_reader *pfile, int first, return true; } - /* Is this a syntactically valid UCN? */ - if (CPP_OPTION (pfile, extended_identifiers) - && *buffer->cur == '\\' - && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) + /* Is this a syntactically valid UCN or a valid UTF-8 char? 
*/ + if (CPP_OPTION (pfile, extended_identifiers)) { cppchar_t s; - buffer->cur += 2; - if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, - state, &s, NULL, NULL)) - return true; - buffer->cur -= 2; + if (*buffer->cur >= utf8_signifier) + { + if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first, + state, &s)) + return true; + } + else if (*buffer->cur == '\\' + && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) + { + buffer->cur += 2; + if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, + state, &s, NULL, NULL)) + return true; + buffer->cur -= 2; + } } return false; @@ -1464,7 +1474,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn, pfile->buffer->cur = cur; if (starts_ucn || forms_identifier_p (pfile, false, nst)) { - /* Slower version for identifiers containing UCNs (or $). */ + /* Slower version for identifiers containing UCNs + or extended chars (including $). */ do { while (ISIDNUM (*pfile->buffer->cur)) { @@ -3117,12 +3128,12 @@ _cpp_lex_direct (cpp_reader *pfile) /* @ is a punctuator in Objective-C. */ case '@': result->type = CPP_ATSIGN; break; - case '$': - case '\\': + default: { const uchar *base = --buffer->cur; - struct normalize_state nst = INITIAL_NORMALIZE_STATE; + /* Check for an extended identifier ($ or UCN or UTF-8). */ + struct normalize_state nst = INITIAL_NORMALIZE_STATE; if (forms_identifier_p (pfile, true, &nst)) { result->type = CPP_NAME; @@ -3131,13 +3142,21 @@ _cpp_lex_direct (cpp_reader *pfile) warn_about_normalization (pfile, result, &nst); break; } + + /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a + single token. 
*/ buffer->cur++; + if (c >= utf8_signifier) + { + const uchar *pstr = base; + cppchar_t s; + if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s)) + buffer->cur = pstr; + } + create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER); + break; } - /* FALLTHRU */ - default: - create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER); - break; } /* Potentially convert the location of the token to a range. */

Patch to support extended characters in C/C++ identifiers

Commit Message

Comments

Patch