===================================================================
@@ -459,6 +459,11 @@ package body Impunit is
"a-szuzti", -- Ada.Strings.Wide_Wide_Unbounded.Wide_Wide_Text_IO
"a-zchuni", -- Ada.Wide_Wide_Characters.Unicode
+ -- Note: strictly the next one should be an Ada 2012 unit, but it seems
+ -- harmless (and useful) to make it available in Ada 2005 mode.
+
+ "a-stuten", -- Ada.Strings.UTF_Encoding
+
---------------------------
-- GNAT Special IO Units --
---------------------------
===================================================================
@@ -0,0 +1,1032 @@
+------------------------------------------------------------------------------
+-- --
+-- GNAT RUN-TIME COMPONENTS --
+-- --
+-- A D A . S T R I N G S . U T F _ E N C O D I N G --
+-- --
+-- B o d y --
+-- --
+-- Copyright (C) 2010, Free Software Foundation, Inc. --
+-- --
+-- GNAT is free software; you can redistribute it and/or modify it under --
+-- terms of the GNU General Public License as published by the Free Soft- --
+-- ware Foundation; either version 3, or (at your option) any later ver- --
+-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE. --
+-- --
+-- As a special exception under Section 7 of GPL version 3, you are granted --
+-- additional permissions described in the GCC Runtime Library Exception, --
+-- version 3.1, as published by the Free Software Foundation. --
+-- --
+-- You should have received a copy of the GNU General Public License and --
+-- a copy of the GCC Runtime Library Exception along with this program; --
+-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
+-- <http://www.gnu.org/licenses/>. --
+-- --
+-- GNAT was originally developed by the GNAT team at New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc. --
+-- --
+-------------------------------------------------------------------------------
+with Interfaces; use Interfaces;
+with Unchecked_Conversion;
+
+package body Ada.Strings.UTF_Encoding is
+
+ function To_Unsigned_8 is new
+ Unchecked_Conversion (Character, Unsigned_8);
+
+ function To_Unsigned_16 is new
+ Unchecked_Conversion (Wide_Character, Unsigned_16);
+
+ function To_Unsigned_32 is new
+ Unchecked_Conversion (Wide_Wide_Character, Unsigned_32);
+
+ -- Local subprograms
+
+ procedure Raise_Encoding_Error;
+ -- Called if an invalid input encoding sequence is found by Decode
+
+ function Decode_UTF_8 (Item : String) return Wide_String;
+ -- Equivalent to Decode (Item, UTF_8), but smaller and faster
+
+ function Decode_UTF_8 (Item : String) return Wide_Wide_String;
+ -- Equivalent to Decode (Item, UTF_8), but smaller and faster
+
+ function Encode_UTF_8 (Item : Wide_String) return String;
+ -- Equivalent to Encode (Item, UTF_8) but smaller and faster
+
+ function Encode_UTF_8 (Item : Wide_Wide_String) return String;
+ -- Equivalent to Encode (Item, UTF_8) but smaller and faster
+
+ function Decode_UTF_16 (Item : Wide_String) return Wide_String;
+ -- Equivalent to Decode (Item, UTF_16)
+
+ function Decode_UTF_16 (Item : Wide_String) return Wide_Wide_String;
+ -- Equivalent to Decode (Item, UTF_16)
+
+ function Encode_UTF_16 (Item : Wide_String) return Wide_String;
+ -- Equivalent to Encode (Item, UTF_16)
+
+ function Encode_UTF_16 (Item : Wide_Wide_String) return Wide_String;
+ -- Equivalent to Encode (Item, UTF_16)
+
+ ------------
+ -- Decode --
+ ------------
+
+ -- String input with Wide_String output (short encodings)
+
+ function Decode
+ (Item : String;
+ Scheme : Short_Encoding := UTF_8) return Wide_String
+ is
+ begin
+ -- UTF-8 encoding case
+
+ if Scheme = UTF_8 then
+ return Decode_UTF_8 (Item);
+
+ -- Case of UTF_16LE or UTF_16BE
+
+ else
+ UTF16_XE : declare
+ Input_UTF16 : Wide_String (1 .. Item'Length / 2);
+ -- UTF_16 input string
+
+ Iptr : Natural;
+ -- Pointer to next location to store in Input_UTF16
+
+ Ptr : Natural;
+ -- Input string pointer
+
+ H, L : Natural range 0 .. 1;
+ -- Offset for high and low order bytes
+
+ begin
+ -- In both cases, the input string must be even in length, since
+ -- we have two input characters for each input code in UTF_16.
+
+ if Item'Length mod 2 /= 0 then
+ Raise_Encoding_Error;
+ end if;
+
+ -- We first assemble the UTF_16 string from the input. Set offsets
+ -- for the two bytes. For UTF_16LE we have low order/high order.
+ -- For UTF_16BE we have high order/low order.
+
+ if Scheme = UTF_16LE then
+ L := 0;
+ H := 1;
+ else
+ L := 1;
+ H := 0;
+ end if;
+
+ -- Loop to convert input to UTF_16 form
+
+ Iptr := 1;
+ Ptr := Item'First;
+ while Ptr < Item'Last loop
+ Input_UTF16 (Iptr) :=
+ Wide_Character'Val
+ (Unsigned_16 (To_Unsigned_8 (Item (Ptr + L)))
+ or
+ Shift_Left
+ (Unsigned_16 (To_Unsigned_8 (Item (Ptr + H))), 8));
+ Iptr := Iptr + 1;
+ Ptr := Ptr + 2;
+ end loop;
+
+ -- Result is obtained by converting this UTF_16 input. Note that
+ -- we rely on this nested call to Decode to skip any BOM present.
+
+ return Decode (Input_UTF16);
+ end UTF16_XE;
+ end if;
+ end Decode;
+
+ -- String input with Wide_Wide_String output (short encodings)
+
+ function Decode
+ (Item : String;
+ Scheme : Short_Encoding := UTF_8) return Wide_Wide_String
+ is
+ begin
+ -- UTF-8 encoding case
+
+ if Scheme = UTF_8 then
+ return Decode_UTF_8 (Item);
+
+ -- Case of UTF_16LE or UTF_16BE
+
+ else
+ UTF16_XE : declare
+ Input_UTF16 : Wide_String (1 .. Item'Length / 2);
+ -- UTF_16 input string
+
+ Iptr : Natural;
+ -- Pointer to next location to store in Input_UTF16
+
+ Ptr : Natural;
+ -- Input string pointer
+
+ H, L : Integer range 0 .. 1;
+ -- Offset for high and low order bytes
+
+ begin
+ -- In both cases, the input string must be even in length, since
+ -- we have two input characters for each input code in UTF_16.
+
+ if Item'Length mod 2 /= 0 then
+ Raise_Encoding_Error;
+ end if;
+
+ -- We first assemble the UTF_16 string from the input. Set offsets
+ -- for the two bytes. For UTF_16LE we have low order/high order.
+ -- For UTF_16BE we have high order/low order.
+
+ if Scheme = UTF_16LE then
+ L := 0;
+ H := 1;
+ else
+ L := 1;
+ H := 0;
+ end if;
+
+ -- Loop to convert input to UTF_16 form
+
+ Ptr := Item'First;
+ Iptr := 1;
+ while Ptr < Item'Last loop
+ Input_UTF16 (Iptr) :=
+ Wide_Character'Val
+ (Unsigned_16 (To_Unsigned_8 (Item (Ptr + L)))
+ or
+ Shift_Left
+ (Unsigned_16 (To_Unsigned_8 (Item (Ptr + H))), 8));
+ Iptr := Iptr + 1;
+ Ptr := Ptr + 2;
+ end loop;
+
+ -- Result is obtained by converting this UTF_16 input. Note that
+ -- we rely on this nested call to Decode to skip any BOM present.
+
+ return Decode_UTF_16 (Input_UTF16);
+ end UTF16_XE;
+ end if;
+ end Decode;
+
+ -- Wide_String input with Wide_Wide_String output (long encodings)
+
+ function Decode
+ (Item : Wide_String;
+ Scheme : Long_Encoding := UTF_16) return Wide_String
+ is
+ pragma Unreferenced (Scheme);
+ begin
+ return Decode_UTF_16 (Item);
+ end Decode;
+
+ -- Wide_String input with Wide_Wide_String output (long encodings)
+
+ function Decode
+ (Item : Wide_String;
+ Scheme : Long_Encoding := UTF_16) return Wide_Wide_String
+ is
+ pragma Unreferenced (Scheme);
+ begin
+ return Decode_UTF_16 (Item);
+ end Decode;
+
+ -------------------
+ -- Decode_UTF_16 --
+ -------------------
+
+ -- Version returning Wide_String result
+
+ function Decode_UTF_16 (Item : Wide_String) return Wide_String is
+ Result : Wide_String (1 .. Item'Length);
+ -- Result is same length as input (possibly minus 1 if BOM present)
+
+ Len : Natural := 0;
+ -- Length of result
+
+ Cod : Unsigned_16;
+ J : Positive;
+
+ begin
+ -- Skip UTF-16 BOM at start
+
+ J := Item'First;
+
+ if J <= Item'Last and then Item (J) = BOM_16 (1) then
+ J := J + 1;
+ end if;
+
+ -- Loop through input characters
+
+ while J <= Item'Last loop
+ Cod := To_Unsigned_16 (Item (J));
+
+ -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFF#
+ -- represent their own value.
+
+ if Cod <= 16#D7FF# or else Cod >= 16#E000# then
+ Len := Len + 1;
+ Result (Len) := Wide_Character'Val (Cod);
+
+ -- Codes in the range 16#D800#..16#DBFF# represent the first of the
+ -- two surrogates used to encode the range 16#01_000#..16#10_FFFF".
+ -- Such codes are out of range for 16-bit output.
+
+ -- The remaining case of input in the range 16#DC00#..16#DFFF# must
+ -- never occur, since it means we have a second surrogate character
+ -- with no corresponding first surrogate.
+
+ -- Thus all remaining codes are invalid
+
+ else
+ Raise_Encoding_Error;
+ end if;
+
+ J := J + 1;
+ end loop;
+
+ return Result (1 .. Len);
+ end Decode_UTF_16;
+
+ -- Version returning Wide_Wide_String result
+
+ function Decode_UTF_16 (Item : Wide_String) return Wide_Wide_String is
+ Result : Wide_Wide_String (1 .. Item'Length);
+ -- Result cannot be longer than the input string
+
+ Len : Natural := 0;
+ -- Length of result
+
+ Cod : Unsigned_16;
+ J : Positive;
+ Rcod : Unsigned_32;
+
+ begin
+ -- Skip UTF-16 BOM at start
+
+ J := Item'First;
+
+ if J <= Item'Last and then Item (J) = BOM_16 (1) then
+ J := J + 1;
+ end if;
+
+ -- Loop through input characters
+
+ while J <= Item'Last loop
+ Cod := To_Unsigned_16 (Item (J));
+
+ -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFF#
+ -- represent their own value.
+
+ if Cod <= 16#D7FF# or else Cod >= 16#E000# then
+ Len := Len + 1;
+ Result (Len) := Wide_Wide_Character'Val (Cod);
+
+ -- Codes in the range 16#D800#..16#DBFF# represent the first of the
+ -- two surrogates used to encode the range 16#01_000#..16#10_FFFF".
+
+ elsif Cod <= 16#DBFF# then
+ Rcod := (Unsigned_32 (Cod) - 16#D800#) * 2 ** 10;
+
+ -- Error if at end of string
+
+ if J = Item'Last then
+ Raise_Encoding_Error;
+
+ -- Otherwise next character must be valid low order surrogate
+
+ else
+ J := J + 1;
+ Cod := To_Unsigned_16 (Item (J));
+
+ if Cod < 16#DC00# or else Cod > 16#DFFF# then
+ Raise_Encoding_Error;
+
+ else
+ Rcod := Rcod + (Unsigned_32 (Cod) mod 2 ** 10) + 16#01_0000#;
+ Len := Len + 1;
+ Result (Len) := Wide_Wide_Character'Val (Rcod);
+ end if;
+ end if;
+
+ -- If input is in the range 16#DC00#..16#DFFF#, we have a second
+ -- surrogate character with no corresponding first surrogate.
+
+ else
+ Raise_Encoding_Error;
+ end if;
+
+ J := J + 1;
+ end loop;
+
+ return Result (1 .. Len);
+ end Decode_UTF_16;
+
+ ------------------
+ -- Decode_UTF_8 --
+ ------------------
+
+ -- Version returning Wide_String result
+
+ function Decode_UTF_8 (Item : String) return Wide_String is
+ Result : Wide_String (1 .. Item'Length);
+ -- Result string (worst case is same length as input)
+
+ Len : Natural := 0;
+ -- Length of result stored so far
+
+ Ptr : Natural;
+ -- Input string pointer
+
+ C : Unsigned_8;
+ R : Unsigned_16;
+
+ procedure Get_Continuation;
+ -- Reads a continuation byte of the form 10xxxxxx, shifts R left
+ -- by 6 bits, and or's in the xxxxxx to the low order 6 bits. On
+ -- return Ptr is incremented. Raises exceptioon if continuation
+ -- byte does not exist or is invalid.
+
+ ----------------------
+ -- Get_Continuation --
+ ----------------------
+
+ procedure Get_Continuation is
+ begin
+ if Ptr > Item'Last then
+ Raise_Encoding_Error;
+
+ else
+ C := To_Unsigned_8 (Item (Ptr));
+ Ptr := Ptr + 1;
+
+ if C < 2#10_000000# or else C > 2#10_111111# then
+ Raise_Encoding_Error;
+
+ else
+ R := Shift_Left (R, 6) or
+ Unsigned_16 (C and 2#00_111111#);
+ end if;
+ end if;
+ end Get_Continuation;
+
+ -- Start of processing for Decode_UTF_8
+
+ begin
+ Ptr := Item'First;
+
+ -- Skip BOM at start
+
+ if Ptr + 2 <= Item'Last
+ and then Item (Ptr .. Ptr + 2) = BOM_8
+ then
+ Ptr := Ptr + 3;
+ end if;
+
+ -- Loop through input characters
+
+ while Ptr <= Item'Last loop
+ C := To_Unsigned_8 (Item (Ptr));
+ Ptr := Ptr + 1;
+
+ -- Codes in the range 16#00# - 16#7F# are represented as
+ -- 0xxxxxxx
+
+ if C <= 16#7F# then
+ R := Unsigned_16 (C);
+
+ -- No initial code can be of the form 10xxxxxx. Such codes are used
+ -- only for continuations.
+
+ elsif C <= 2#10_111111# then
+ Raise_Encoding_Error;
+
+ -- Codes in the range 16#80# - 16#7FF# are represented as
+ -- 110yyyxx 10xxxxxx
+
+ elsif C <= 2#110_11111# then
+ R := Unsigned_16 (C and 2#000_11111#);
+ Get_Continuation;
+
+ -- Codes in the range 16#800# - 16#FFFF# are represented as
+ -- 1110yyyy 10yyyyxx 10xxxxxx
+
+ elsif C <= 2#1110_1111# then
+ R := Unsigned_16 (C and 2#0000_1111#);
+ Get_Continuation;
+ Get_Continuation;
+
+ -- Codes in the range 16#10000# - 16#10FFFF# are represented as
+ -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
+
+ -- Such codes are out of range for Wide_String output
+
+ else
+ Raise_Encoding_Error;
+ end if;
+
+ Len := Len + 1;
+ Result (Len) := Wide_Character'Val (R);
+ end loop;
+
+ return Result (1 .. Len);
+ end Decode_UTF_8;
+
+ -- Version returning Wide_Wide_String result
+
+ function Decode_UTF_8 (Item : String) return Wide_Wide_String is
+ Result : Wide_Wide_String (1 .. Item'Length);
+ -- Result string (worst case is same length as input)
+
+ Len : Natural := 0;
+ -- Length of result stored so far
+
+ Ptr : Natural;
+ -- Input string pointer
+
+ C : Unsigned_8;
+ R : Unsigned_32;
+
+ procedure Get_Continuation;
+ -- Reads a continuation byte of the form 10xxxxxx, shifts R left
+ -- by 6 bits, and or's in the xxxxxx to the low order 6 bits. On
+ -- return Ptr is incremented. Raises exceptioon if continuation
+ -- byte does not exist or is invalid.
+
+ ----------------------
+ -- Get_Continuation --
+ ----------------------
+
+ procedure Get_Continuation is
+ begin
+ if Ptr > Item'Last then
+ raise Encoding_Error with
+ "incomplete UTF-8 encoding sequence";
+
+ else
+ C := To_Unsigned_8 (Item (Ptr));
+ Ptr := Ptr + 1;
+
+ if C < 2#10_000000# or else C > 2#10_111111# then
+ Raise_Encoding_Error;
+
+ else
+ R := Shift_Left (R, 6) or
+ Unsigned_32 (C and 2#00_111111#);
+ end if;
+ end if;
+ end Get_Continuation;
+
+ -- Start of processing for UTF8_Decode
+
+ begin
+ Ptr := Item'First;
+
+ -- Skip BOM at start
+
+ if Ptr + 2 <= Item'Last
+ and then Item (Ptr .. Ptr + 2) = BOM_8
+ then
+ Ptr := Ptr + 3;
+ end if;
+
+ -- Loop through input characters
+
+ while Ptr <= Item'Last loop
+ C := To_Unsigned_8 (Item (Ptr));
+ Ptr := Ptr + 1;
+
+ -- Codes in the range 16#00# - 16#7F# are represented as
+ -- 0xxxxxxx
+
+ if C <= 16#7F# then
+ R := Unsigned_32 (C);
+
+ -- No initial code can be of the form 10xxxxxx. Such codes are used
+ -- only for continuations.
+
+ elsif C <= 2#10_111111# then
+ Raise_Encoding_Error;
+
+ -- Codes in the range 16#80# - 16#7FF# are represented as
+ -- 110yyyxx 10xxxxxx
+
+ elsif C <= 2#110_11111# then
+ R := Unsigned_32 (C and 2#000_11111#);
+ Get_Continuation;
+
+ -- Codes in the range 16#800# - 16#FFFF# are represented as
+ -- 1110yyyy 10yyyyxx 10xxxxxx
+
+ elsif C <= 2#1110_1111# then
+ R := Unsigned_32 (C and 2#0000_1111#);
+ Get_Continuation;
+ Get_Continuation;
+
+ -- Codes in the range 16#10000# - 16#10FFFF# are represented as
+ -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
+
+ elsif C <= 2#11110_111# then
+ R := Unsigned_32 (C and 2#00000_111#);
+ Get_Continuation;
+ Get_Continuation;
+ Get_Continuation;
+
+ -- Any other code is an error
+
+ else
+ Raise_Encoding_Error;
+ end if;
+
+ Len := Len + 1;
+ Result (Len) := Wide_Wide_Character'Val (R);
+ end loop;
+
+ return Result (1 .. Len);
+ end Decode_UTF_8;
+
+ ------------
+ -- Encode --
+ ------------
+
+ -- Version with Wide_String input returning encoded String
+
+ function Encode
+ (Item : Wide_String;
+ Scheme : Short_Encoding := UTF_8) return String
+ is
+ begin
+ -- Case of UTF_8
+
+ if Scheme = UTF_8 then
+ return Encode_UTF_8 (Item);
+
+ -- Case of UTF_16LE or UTF_16BE
+
+ else
+ UTF16XE_Encode : declare
+ UTF16_Str : constant Wide_String := Encode_UTF_16 (Item);
+ Result : String (1 .. 2 * UTF16_Str'Last);
+
+ H, L : Integer range -1 .. 0;
+ -- Offset for high and low order bytes
+
+ C : Unsigned_16;
+ -- One UTF_16 output value
+
+ begin
+ -- Set proper byte offsets
+
+ -- Set the byte order for the two bytes of each UTF_16 input code.
+ -- For UTF_16LE we have low order/high order. For UTF_16BE we have
+ -- high order/low order.
+
+ if Scheme = UTF_16LE then
+ L := -1;
+ H := 0;
+ else
+ L := 0;
+ H := -1;
+ end if;
+
+ -- Now copy the UTF_16 string to the result string
+
+ pragma Warnings (Off);
+ for J in 1 .. UTF16_Str'Last loop
+ C := To_Unsigned_16 (UTF16_Str (J));
+ Result (2 * J + L) := Character'Val (C and 16#FF#);
+ Result (2 * J + H) := Character'Val (Shift_Right (C, 8));
+ end loop;
+
+ return Result;
+ end UTF16XE_Encode;
+ end if;
+ end Encode;
+
+ -- Version with Wide_Wide_String input returning String
+
+ function Encode
+ (Item : Wide_Wide_String;
+ Scheme : Short_Encoding := UTF_8) return String
+ is
+ begin
+ -- Case of UTF_8
+
+ if Scheme = UTF_8 then
+ return Encode_UTF_8 (Item);
+
+ -- Case of UTF_16LE or UTF_16BE
+
+ else
+ UTF16XE_Encode : declare
+ UTF16_Str : constant Wide_String := Encode (Item, UTF_16);
+ Result : String (1 .. 2 * UTF16_Str'Last);
+
+ H, L : Integer range -1 .. 0;
+ -- Offset for high and low order bytes
+
+ C : Unsigned_16;
+ -- One UTF_16 output value
+
+ begin
+ -- Set proper byte offsets
+
+ -- Set the byte order for the two bytes of each UTF_16 input code.
+ -- For UTF_16LE we have low order/high order. For UTF_16BE we have
+ -- high order/low order.
+
+ if Scheme = UTF_16LE then
+ L := -1;
+ H := 0;
+ else
+ L := 0;
+ H := -1;
+ end if;
+
+ -- Now copy the UTF_16 string to the result string
+
+ for J in 1 .. UTF16_Str'Last loop
+ C := To_Unsigned_16 (UTF16_Str (J));
+ Result (2 * J + L) := Character'Val (C and 16#FF#);
+ Result (2 * J + H) := Character'Val (Shift_Right (C, 8));
+ end loop;
+
+ return Result;
+ end UTF16XE_Encode;
+ end if;
+ end Encode;
+
+ -- Wide_String input returning encoded Wide_String (long encodings)
+
+ function Encode
+ (Item : Wide_String;
+ Scheme : Long_Encoding := UTF_16) return Wide_String
+ is
+ pragma Unreferenced (Scheme);
+ begin
+ return Encode_UTF_16 (Item);
+ end Encode;
+
+ -- Wide_Wide_String input returning Wide_String (long encodings)
+
+ function Encode
+ (Item : Wide_Wide_String;
+ Scheme : Long_Encoding := UTF_16) return Wide_String
+ is
+ pragma Unreferenced (Scheme);
+ begin
+ return Encode_UTF_16 (Item);
+ end Encode;
+
+ -------------------
+ -- Encode_UTF_16 --
+ -------------------
+
+ -- Wide_String input with UTF-16 encoded Wide_String output
+
+ function Encode_UTF_16 (Item : Wide_String) return Wide_String is
+ Result : Wide_String (1 .. Item'Length);
+ -- Output is same length as input (we do not add a BOM!)
+
+ Len : Integer := 0;
+ -- Length of output string
+
+ Cod : Unsigned_16;
+
+ begin
+ -- Loop through input characters encoding them
+
+ for J in Item'Range loop
+ Cod := To_Unsigned_16 (Item (J));
+
+ -- Codes in the range 16#0000#..16#D7FF# are output unchanged
+
+ if Cod <= 16#D7FF# then
+ Len := Len + 1;
+ Result (Len) := Wide_Character'Val (Cod);
+
+ -- Codes in tne range 16#D800#..16#DFFF# should never appear in the
+ -- input, since no valid Unicode characters are in this range (which
+ -- would conflict with the UTF-16 surrogate encodings).
+
+ elsif Cod <= 16#DFFF# then
+ raise Constraint_Error with
+ "Wide_Character in range 16#D800# .. 16#DFFF#";
+
+ -- Codes in the range 16#E000#..16#FFFF# are output unchanged
+
+ else
+ Len := Len + 1;
+ Result (Len) := Wide_Character'Val (Cod);
+ end if;
+ end loop;
+
+ return Result (1 .. Len);
+ end Encode_UTF_16;
+
+ -- Wide_Wide_String input with UTF-16 encoded Wide_String output
+
+ function Encode_UTF_16 (Item : Wide_Wide_String) return Wide_String is
+ Result : Wide_String (1 .. 2 * Item'Length);
+ -- Worst case is each input character generates two output characters
+
+ Len : Integer := 0;
+ -- Length of output string
+
+ Cod : Unsigned_32;
+
+ begin
+ -- Loop through input characters encoding them
+
+ for J in Item'Range loop
+ Cod := To_Unsigned_32 (Item (J));
+
+ -- Codes in the range 16#00_0000#..16#00_D7FF# are output unchanged
+
+ if Cod <= 16#00_D7FF# then
+ Len := Len + 1;
+ Result (Len) := Wide_Character'Val (Cod);
+
+ -- Codes in tne range 16#00_D800#..16#00_DFFF# should never appear
+ -- in the input, since no valid Unicode characters are in this range
+ -- (which would conflict with the UTF-16 surrogate encodings).
+
+ elsif Cod <= 16#00_DFFF# then
+ raise Constraint_Error with
+ "Wide_Wide_Character in range 16#00_D800# .. 16#00_DFFF#";
+
+ -- Codes in the range 16#00_E000#..16#00_FFFF# are output unchanged
+
+ elsif Cod <= 16#00_FFFF# then
+ Len := Len + 1;
+ Result (Len) := Wide_Character'Val (Cod);
+
+ -- Codes in the range 16#01_0000#..16#10_FFFF# are output using two
+ -- surrogate characters. First 16#1_0000# is subtracted from the code
+ -- point to give a 20-bit value. This is then split into two separate
+ -- 10-bit values each of which is represented as a surrogate with the
+ -- most significant half placed in the first surrogate. To allow safe
+ -- use of simple word-oriented string processing, separate ranges of
+ -- values are used for the two surrogates: 16#D800#-16#DBFF# for the
+ -- first, most significant surrogate and 16#DC00#-16#DFFF# for the
+ -- second, least significant surrogate.
+
+ elsif Cod <= 16#10_FFFF# then
+ Cod := Cod - 16#1_0000#;
+
+ Len := Len + 1;
+ Result (Len) := Wide_Character'Val (16#D800# + Cod / 2 ** 10);
+
+ Len := Len + 1;
+ Result (Len) := Wide_Character'Val (16#DC00# + Cod mod 2 ** 10);
+
+ -- Codes larger than 16#10_FFFF# are invalid
+
+ else
+ raise Constraint_Error with
+ "Wide_Wide_Character exceeds maximum value of 16#10_FFFF#";
+ end if;
+ end loop;
+
+ return Result (1 .. Len);
+ end Encode_UTF_16;
+
+ ------------------
+ -- Encode_UTF_8 --
+ ------------------
+
+ -- Wide_String input with UTF_8 encoded String output
+
+ function Encode_UTF_8 (Item : Wide_String) return String is
+ Result : String (1 .. 3 * Item'Length);
+ -- Worst case is three bytes per input byte
+
+ N : Natural := 0;
+ -- Number of output codes stored in Result
+
+ C : Unsigned_16;
+ -- Single input character
+
+ procedure Store (C : Unsigned_16);
+ pragma Inline (Store);
+ -- Store one output code, C is in the range 0 .. 255
+
+ -----------
+ -- Store --
+ -----------
+
+ procedure Store (C : Unsigned_16) is
+ begin
+ N := N + 1;
+ Result (N) := Character'Val (C);
+ end Store;
+
+ -- Start of processing for UTF8_Encode
+
+ begin
+ -- Loop through characters of input
+
+ for J in Item'Range loop
+ C := To_Unsigned_16 (Item (J));
+
+ -- Codes in the range 16#00# - 16#7F# are represented as
+ -- 0xxxxxxx
+
+ if C <= 16#7F# then
+ Store (C);
+
+ -- Codes in the range 16#80# - 16#7FF# are represented as
+ -- 110yyyxx 10xxxxxx
+
+ elsif C <= 16#7FF# then
+ Store (2#110_00000# or Shift_Right (C, 6));
+ Store (2#10_000000# or (C and 2#00_111111#));
+
+ -- Codes in the range 16#800# - 16#FFFF# are represented as
+ -- 1110yyyy 10yyyyxx 10xxxxxx
+
+ else
+ Store (2#1110_0000# or Shift_Right (C, 12));
+ Store (2#10_000000# or
+ Shift_Right (C and 2#111111_000000#, 6));
+ Store (2#10_000000# or (C and 2#00_111111#));
+ end if;
+ end loop;
+
+ return Result (1 .. N);
+ end Encode_UTF_8;
+
+ -- Wide_Wide_String input with UTF_8 encoded String output
+
+ function Encode_UTF_8 (Item : Wide_Wide_String) return String is
+ Result : String (1 .. 4 * Item'Length);
+ -- Worst case is four bytes per input byte
+
+ N : Natural := 0;
+ -- Number of output codes stored in Result
+
+ C : Unsigned_32;
+ -- Single input character
+
+ procedure Store (C : Unsigned_32);
+ pragma Inline (Store);
+ -- Store one output code (input is in range 0 .. 255)
+
+ -----------
+ -- Store --
+ -----------
+
+ procedure Store (C : Unsigned_32) is
+ begin
+ N := N + 1;
+ Result (N) := Character'Val (C);
+ end Store;
+
+ -- Start of processing for UTF8_Encode
+
+ begin
+ -- Loop through characters of input
+
+ for J in Item'Range loop
+ C := To_Unsigned_32 (Item (J));
+
+ -- Codes in the range 16#00# - 16#7F# are represented as
+ -- 0xxxxxxx
+
+ if C <= 16#7F# then
+ Store (C);
+
+ -- Codes in the range 16#80# - 16#7FF# are represented as
+ -- 110yyyxx 10xxxxxx
+
+ elsif C <= 16#7FF# then
+ Store (2#110_00000# or Shift_Right (C, 6));
+ Store (2#10_000000# or (C and 2#00_111111#));
+
+ -- Codes in the range 16#800# - 16#FFFF# are represented as
+ -- 1110yyyy 10yyyyxx 10xxxxxx
+
+ elsif C <= 16#FFFF# then
+ Store (2#1110_0000# or Shift_Right (C, 12));
+ Store (2#10_000000# or
+ Shift_Right (C and 2#111111_000000#, 6));
+ Store (2#10_000000# or (C and 2#00_111111#));
+
+ -- Codes in the range 16#10000# - 16#10FFFF# are represented as
+ -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
+
+ elsif C <= 16#10_FFFF# then
+ Store (2#11110_000# or Shift_Right (C, 18));
+ Store (2#10_000000# or
+ Shift_Right (C and 2#111111_000000_000000#, 12));
+ Store (2#10_000000#
+ or Shift_Right (C and 2#111111_000000#, 6));
+ Store (2#10_000000# or (C and 2#00_111111#));
+
+ -- Codes higher than 16#10_FFFF# should not appear
+
+ else
+ raise Constraint_Error with
+ "out of range invalid value in Encode input";
+ end if;
+ end loop;
+
+ return Result (1 .. N);
+ end Encode_UTF_8;
+
+ --------------
+ -- Encoding --
+ --------------
+
+ -- Version taking String input
+
+ function Encoding (Item : String) return Encoding_Scheme is
+ begin
+ if Item'Length >= 2 then
+ if Item (Item'First .. Item'First + 1) = BOM_16BE then
+ return UTF_16BE;
+
+ elsif Item (Item'First .. Item'First + 1) = BOM_16LE then
+ return UTF_16LE;
+
+ elsif Item'Length >= 3
+ and then Item (Item'First .. Item'First + 2) = BOM_8
+ then
+ return UTF_8;
+ end if;
+ end if;
+
+ return UTF_None;
+ end Encoding;
+
+ -- Version taking Wide_String input
+
+ function Encoding (Item : Wide_String) return Encoding_Scheme is
+ begin
+ if Item'Length >= 1
+ and then Item (Item'First .. Item'First) = BOM_16
+ then
+ return UTF_16;
+ else
+ return UTF_None;
+ end if;
+ end Encoding;
+
+ ------------------------
+ -- Raise_Encoding_Error --
+ ------------------------
+
+ procedure Raise_Encoding_Error is
+ begin
+ raise Encoding_Error with "invalid input encoding sequence";
+ end Raise_Encoding_Error;
+
+end Ada.Strings.UTF_Encoding;
===================================================================
@@ -0,0 +1,117 @@
+------------------------------------------------------------------------------
+-- --
+-- GNAT RUN-TIME COMPONENTS --
+-- --
+-- A D A . S T R I N G S . U T F _ E N C O D I N G --
+-- --
+-- S p e c --
+-- --
+-- This specification is derived from the Ada Reference Manual for use with --
+-- GNAT. In accordance with the copyright of that document, you can freely --
+-- copy and modify this specification, provided that if you redistribute a --
+-- modified version, any changes that you have made are clearly indicated. --
+-- --
+------------------------------------------------------------------------------
+
+-- This is the Ada 2012 package defined in AI05-0137-1. It is used for
+-- encoding strings using UTF encodings (UTF-8, UTF-16LE, UTF-16BE, UTF-16).
+
+-- Compared with version 05 of the AI, we have added routines for UTF-16
+-- encoding and decoding of wide strings, which seems missing from the AI,
+-- added comments, and reordered the declarations.
+
+-- Note: although this is an Ada 2012 package, the earlier versions of the
+-- language permit the addition of new grandchildren of Ada, so we are able
+-- to add this package unconditionally for use in Ada 2005 mode. We cannot
+-- allow it in earlier versions, since it requires Wide_Wide_Character/String.
+
+package Ada.Strings.UTF_Encoding is
+ pragma Pure (UTF_Encoding);
+
+ type Encoding_Scheme is (UTF_None, UTF_8, UTF_16BE, UTF_16LE, UTF_16);
+
+ subtype Short_Encoding is Encoding_Scheme range UTF_8 .. UTF_16LE;
+ subtype Long_Encoding is Encoding_Scheme range UTF_16 .. UTF_16;
+
+ -- The BOM (BYTE_ORDER_MARK) values defined here are used at the start of
+ -- a string to indicate the encoding. The convention in this package is
+ -- that decoding routines ignore a BOM, and output of encoding routines
+ -- does not include a BOM. If you want to include a BOM in the output,
+ -- you simply concatenate the appropriate value at the start of the string.
+
+ BOM_8 : constant String :=
+ Character'Val (16#EF#) &
+ Character'Val (16#BB#) &
+ Character'Val (16#BF#);
+
+ BOM_16BE : constant String :=
+ Character'Val (16#FE#) &
+ Character'Val (16#FF#);
+
+ BOM_16LE : constant String :=
+ Character'Val (16#FF#) &
+ Character'Val (16#FE#);
+
+ BOM_16 : constant Wide_String :=
+ (1 => Wide_Character'Val (16#FEFF#));
+
+ -- The encoding routines take a wide string or wide wide string as input
+ -- and encode the result using the specified UTF encoding method. For
+ -- UTF-16, the output is returned as a Wide_String, this is not a normal
+ -- Wide_String, since the codes in it may represent UTF-16 surrogate
+ -- characters used to encode large values. Similarly for UTF-8, UTF-16LE,
+ -- and UTF-16BE, the output is returned in a String, and again this String
+ -- is not a standard format string, since it may include UTF-8 surrogates.
+ -- As previously noted, the returned value does NOT start with a BOM.
+
+ -- Note: invalid codes in calls to one of the Encode routines represent
+ -- invalid values in the sense that they are not defined. For example, the
+ -- code 16#DC03# is not a valid wide character value. Such values result
+ -- in undefined behavior. For GNAT, Constraint_Error is raised with an
+ -- appropriate exception message.
+
+ function Encode
+ (Item : Wide_String;
+ Scheme : Short_Encoding := UTF_8) return String;
+ function Encode
+ (Item : Wide_Wide_String;
+ Scheme : Short_Encoding := UTF_8) return String;
+
+ function Encode
+ (Item : Wide_String;
+ Scheme : Long_Encoding := UTF_16) return Wide_String;
+ function Encode
+ (Item : Wide_Wide_String;
+ Scheme : Long_Encoding := UTF_16) return Wide_String;
+
+ -- The decoding routines take a String or Wide_String input which is an
+ -- encoded string using the specified encoding. The output is a normal
+ -- Ada Wide_String or Wide_Wide_String value representing the decoded
+ -- values. Note that a BOM in the input matching the encoding is skipped.
+
+ Encoding_Error : exception;
+ -- Exception raised if an invalid encoding sequence is encountered by
+ -- one of the Decode routines.
+
+ function Decode
+ (Item : String;
+ Scheme : Short_Encoding := UTF_8) return Wide_String;
+ function Decode
+ (Item : String;
+ Scheme : Short_Encoding := UTF_8) return Wide_Wide_String;
+
+ function Decode
+ (Item : Wide_String;
+ Scheme : Long_Encoding := UTF_16) return Wide_String;
+ function Decode
+ (Item : Wide_String;
+ Scheme : Long_Encoding := UTF_16) return Wide_Wide_String;
+
+ -- The Encoding functions inspect an encoded string or wide_string and
+ -- determine if a BOM is present. If so, the appropriate Encoding_Scheme
+ -- is returned. If not, then UTF_None is returned.
+
+ function Encoding (Item : String) return Encoding_Scheme;
+ function Encoding (Item : Wide_String) return Encoding_Scheme;
+
+end Ada.Strings.UTF_Encoding;
===================================================================
@@ -211,6 +211,7 @@ GNATRTL_NONTASKING_OBJS= \
a-ststio$(objext) \
a-stunau$(objext) \
a-stunha$(objext) \
+ a-stuten$(objext) \
a-stwibo$(objext) \
a-stwifi$(objext) \
a-stwiha$(objext) \