Patchwork [Ada] Implement Ada.Strings.UTF_Encoding

login
register
mail settings
Submitter Arnaud Charlet
Date June 23, 2010, 9:54 a.m.
Message ID <20100623095419.GA2828@adacore.com>
Download mbox | patch
Permalink /patch/56623/
State New
Headers show

Comments

Arnaud Charlet - June 23, 2010, 9:54 a.m.
This patch provides the first full implementation of the new Ada
2012 package Ada.Strings.UTF_Encoding, which is also available in
Ada 2005 mode (but not Ada 95 mode, since Wide_Wide_Character is
required). The package is a full implementation of AI05-137-1, see
http://www.ada-auth.org/cgi-bin/cvsweb.cgi/ai05s/ai05-0137-1.txt,
with some additions. Full documentation is in the a-stuten.ads file.

The following is a test of all capabilities with output:

pragma Ada_05;
with Ada.Strings.UTF_Encoding;
use  Ada.Strings.UTF_Encoding;
with Ada.Text_IO; use Ada.Text_IO;

procedure UTF_Test is
   subtype WC is Wide_Character;
   subtype WS is Wide_String;
   subtype WWC is Wide_Wide_Character;
   subtype WWS is Wide_Wide_String;

   procedure Test (Test_Name : String; S1, S2 : WS);
   procedure Test (Test_Name : String; S1, S2 : WWS);
   --  S1 should equal S2 for given test name

   procedure Test (Test_Name : String; S1, S2 : WS) is

      procedure Dump (Label : String; Item : WS);
      --  Print one line per element of Item giving its index and its
      --  Wide_Character position value, each line tagged with Label.

      ----------
      -- Dump --
      ----------

      procedure Dump (Label : String; Item : WS) is
      begin
         for K in Item'Range loop
            Put_Line
              ("  " & Label & " ("
               & K'Img
               & " ) = Wide_Character'Val ("
               & Integer'Image (WC'Pos (Item (K)))
               & " )");
         end loop;
      end Dump;

   --  Start of processing for Test

   begin
      --  On a mismatch, report the failure and then dump both operands
      --  (S1 first, then S2) so that the difference can be inspected.

      if S1 /= S2 then
         Put_Line ("Test " & Test_Name & " failed");
         Dump ("S1", S1);
         Dump ("S2", S2);
      else
         Put_Line ("Test " & Test_Name & " passed");
      end if;
   end Test;

   procedure Test (Test_Name : String; S1, S2 : WWS) is

      procedure Dump (Label : String; Item : WWS);
      --  Print one line per element of Item giving its index and its
      --  Wide_Wide_Character position value, each line tagged with Label.

      ----------
      -- Dump --
      ----------

      procedure Dump (Label : String; Item : WWS) is
      begin
         for K in Item'Range loop
            Put_Line
              ("  " & Label & " ("
               & K'Img
               & " ) = Wide_Wide_Character'Val ("
               & Integer'Image (WWC'Pos (Item (K)))
               & " )");
         end loop;
      end Dump;

   --  Start of processing for Test

   begin
      --  On a mismatch, report the failure and then dump both operands
      --  (S1 first, then S2) so that the difference can be inspected.

      if S1 /= S2 then
         Put_Line ("Test " & Test_Name & " failed");
         Dump ("S1", S1);
         Dump ("S2", S2);
      else
         Put_Line ("Test " & Test_Name & " passed");
      end if;
   end Test;

begin
   --  Test series A: Wide_String in UTF_8

   declare
      T0 : WS := (
             WC'Val (16#70#),
             WC'Val (16#700#),
             WC'Val (16#7000#));
      T1 : String := Encode (T0, UTF_8);
      T2 : String := BOM_8 & T1;
      T3 : WS := Decode (T1, UTF_8);
      T4 : WS := Decode (T2, UTF_8);
      T5 : WS := Decode (T2, Encoding (T2));
   begin
      Test ("A1", T0, T3);
      Test ("A2", T0, T4);
      Test ("A3", T0, T5);
   end;

   --  Test series B: Wide_Wide_String in UTF_8

   declare
      T0 : WWS := (
             WWC'Val (16#70#),
             WWC'Val (16#700#),
             WWC'Val (16#7000#),
             WWC'Val (16#01_7000#));
      T1 : String := Encode (T0, UTF_8);
      T2 : String := BOM_8 & T1;
      T3 : WWS := Decode (T1, UTF_8);
      T4 : WWS := Decode (T2, UTF_8);
      T5 : WWS := Decode (T2, Encoding (T2));
   begin
      Test ("B1", T0, T3);
      Test ("B2", T0, T4);
      Test ("B3", T0, T5);
   end;

   --  Test series C: Wide_String in UTF_16LE

   declare
      T0 : WS := (
             WC'Val (16#8900#),
             WC'Val (16#E900#));
      T1 : String := Encode (T0, UTF_16LE);
      T2 : String := BOM_16LE & T1;
      T3 : WS := Decode (T1, UTF_16LE);
      T4 : WS := Decode (T2, UTF_16LE);
      T5 : WS := Decode (T2, Encoding (T2));
   begin
      Test ("C1", T0, T3);
      Test ("C2", T0, T4);
      Test ("C3", T0, T5);
   end;

   --  Test series D: Wide_String in UTF_16BE

   declare
      T0 : WS := (
             WC'Val (16#8900#),
             WC'Val (16#E900#));
      T1 : String := Encode (T0, UTF_16BE);
      T2 : String := BOM_16BE & T1;
      T3 : WS := Decode (T1, UTF_16BE);
      T4 : WS := Decode (T2, UTF_16BE);
      T5 : WS := Decode (T2, Encoding (T2));
   begin
      Test ("D1", T0, T3);
      Test ("D2", T0, T4);
      Test ("D3", T0, T5);
   end;

   --  Test series E: Wide_Wide_String in UTF_16BE

   declare
      T0 : WWS := (
             WWC'Val (16#00_8900#),
             WWC'Val (16#00_E900#),
             WWC'Val (16#07_0000#));
      T1 : String := Encode (T0, UTF_16BE);
      T2 : String := BOM_16BE & T1;
      T3 : WWS := Decode (T1, UTF_16BE);
      T4 : WWS := Decode (T2, UTF_16BE);
      T5 : WWS := Decode (T2, Encoding (T2));
   begin
      Test ("E1", T0, T3);
      Test ("E2", T0, T4);
      Test ("E3", T0, T5);
   end;

   --  Test series F: Wide_Wide_String in UTF_16LE

   declare
      T0 : WWS := (
             WWC'Val (16#00_8900#),
             WWC'Val (16#00_E900#),
             WWC'Val (16#07_0000#));
      T1 : String := Encode (T0, UTF_16LE);
      T2 : String := BOM_16LE & T1;
      T3 : WWS := Decode (T1, UTF_16LE);
      T4 : WWS := Decode (T2, UTF_16LE);
      T5 : WWS := Decode (T2, Encoding (T2));
   begin
      Test ("F1", T0, T3);
      Test ("F2", T0, T4);
      Test ("F3", T0, T5);
   end;

   --  Test series G: Wide_String in UTF_16

   declare
      T0 : WS := (
             WC'Val (16#00_8900#),
             WC'Val (16#00_E900#));
      T1 : WS := Encode (T0, UTF_16);
      T2 : WS := BOM_16 & T1;
      T3 : WS := Decode (T1, UTF_16);
      T4 : WS := Decode (T2, UTF_16);
      T5 : WS := Decode (T2, Encoding (T2));
   begin
      Test ("G1", T0, T3);
      Test ("G2", T0, T4);
      Test ("G3", T0, T5);
   end;

   --  Test series H:  Wide_Wide_String in UTF_16

   declare
      T0 : WWS := (
             WWC'Val (16#00_8900#),
             WWC'Val (16#00_E900#),
             WWC'Val (16#07_0000#));
      T1 : WS := Encode (T0, UTF_16);
      T2 : WS := BOM_16 & T1;
      T3 : WWS := Decode (T1, UTF_16);
      T4 : WWS := Decode (T2, UTF_16);
      T5 : WWS := Decode (T2, Encoding (T2));
   begin
      Test ("H1", T0, T3);
      Test ("H2", T0, T4);
      Test ("H3", T0, T5);
   end;

   --  Test series I: Invalid codes in Wide_String

   declare
      T0 : WS := (
             WC'Val (16#D900#),
             WC'Val (16#E900#));
   begin
      begin
         declare
            T1 : String := Encode (T0, UTF_16LE);
         begin
            null;
         end;
         Put_Line ("Test I1 failed");
      exception
         when Constraint_Error =>
            Put_Line ("Test I1 passed");
      end;

      begin
         declare
            T1 : String := Encode (T0, UTF_16BE);
         begin
            null;
         end;
         Put_Line ("Test I2 failed");
      exception
         when Constraint_Error =>
            Put_Line ("Test I2 passed");
      end;

      begin
         declare
            T1 : Wide_String := Encode (T0, UTF_16);
         begin
            null;
         end;
         Put_Line ("Test I3 failed");
      exception
         when Constraint_Error =>
            Put_Line ("Test I3 passed");
      end;
   end;

   --  Test series J: Invalid codes in Wide_Wide_String

   declare
      T0 : WWS := (
             WWC'Val (16#00_D900#),
             WWC'Val (16#00_E900#));
   begin
      begin
         declare
            T1 : String := Encode (T0, UTF_16LE);
         begin
            null;
         end;
         Put_Line ("Test J1 failed");
      exception
         when Constraint_Error =>
            Put_Line ("Test J1 passed");
      end;

      begin
         declare
            T1 : String := Encode (T0, UTF_16BE);
         begin
            null;
         end;
         Put_Line ("Test J2 failed");
      exception
         when Constraint_Error =>
            Put_Line ("Test J2 passed");
      end;

      begin
         declare
            T1 : Wide_String := Encode (T0, UTF_16);
         begin
            null;
         end;
         Put_Line ("Test J3 failed");
      exception
         when Constraint_Error =>
            Put_Line ("Test J3 passed");
      end;
   end;

   --  Test series K: Invalid UTF-8 codes

   --  Each of the two malformed inputs is decoded to both Wide_String and
   --  Wide_Wide_String. Every one of the four decodes is expected to raise
   --  Encoding_Error; a decode that returns normally is reported as failed.

   declare
      T0 : String := (
             Character'Val (16#80#),
             Character'Val (16#20#));
      --  Malformed: starts with a continuation byte (10xxxxxx)

      T1 : String := (
             Character'Val (16#C1#),
             Character'Val (16#05#));
      --  Malformed: two-byte lead (110xxxxx) followed by a byte that is
      --  not a continuation byte.

   begin
      --  K1: T0 decoded to Wide_String

      begin
         declare
            T2 : Wide_String := Decode (T0, UTF_8);
         begin
            null;
         end;
         Put_Line ("Test K1 failed");
      exception
         when Encoding_Error =>
            Put_Line ("Test K1 passed");
      end;

      --  K2: T0 decoded to Wide_Wide_String

      begin
         declare
            T2 : Wide_Wide_String := Decode (T0, UTF_8);
         begin
            null;
         end;
         Put_Line ("Test K2 failed");
      exception
         when Encoding_Error =>
            Put_Line ("Test K2 passed");
      end;

      --  K3: T1 decoded to Wide_String

      begin
         declare
            T2 : Wide_String := Decode (T1, UTF_8);
         begin
            null;
         end;
         Put_Line ("Test K3 failed");
      exception
         when Encoding_Error =>
            Put_Line ("Test K3 passed");
      end;

      --  K4: T1 decoded to Wide_Wide_String (this previously decoded T0
      --  again, which merely repeated K2 and left this case untested).

      begin
         declare
            T2 : Wide_Wide_String := Decode (T1, UTF_8);
         begin
            null;
         end;
         Put_Line ("Test K4 failed");
      exception
         when Encoding_Error =>
            Put_Line ("Test K4 passed");
      end;
   end;

   --  Test series L: Invalid UTF-16 codes

   --  Each of the two malformed inputs is decoded to both Wide_String and
   --  Wide_Wide_String. Every one of the four decodes is expected to raise
   --  Encoding_Error; a decode that returns normally is reported as failed.

   declare
      T0 : Wide_String := (
             WC'Val (16#DC00#),
             WC'Val (16#007F#));
      --  Malformed: starts with a low (second) surrogate that has no
      --  preceding high surrogate.

      T1 : Wide_String := (
             WC'Val (16#D801#),
             WC'Val (16#D801#));
      --  Malformed: high (first) surrogate followed by another high
      --  surrogate instead of a low surrogate.

   begin
      --  L1: T0 decoded to Wide_String

      begin
         declare
            T2 : Wide_String := Decode (T0, UTF_16);
         begin
            null;
         end;
         Put_Line ("Test L1 failed");
      exception
         when Encoding_Error =>
            Put_Line ("Test L1 passed");
      end;

      --  L2: T0 decoded to Wide_Wide_String

      begin
         declare
            T2 : Wide_Wide_String := Decode (T0, UTF_16);
         begin
            null;
         end;
         Put_Line ("Test L2 failed");
      exception
         when Encoding_Error =>
            Put_Line ("Test L2 passed");
      end;

      --  L3: T1 decoded to Wide_String

      begin
         declare
            T2 : Wide_String := Decode (T1, UTF_16);
         begin
            null;
         end;
         Put_Line ("Test L3 failed");
      exception
         when Encoding_Error =>
            Put_Line ("Test L3 passed");
      end;

      --  L4: T1 decoded to Wide_Wide_String (this previously decoded T0
      --  again, which merely repeated L2 and left this case untested).

      begin
         declare
            T2 : Wide_Wide_String := Decode (T1, UTF_16);
         begin
            null;
         end;
         Put_Line ("Test L4 failed");
      exception
         when Encoding_Error =>
            Put_Line ("Test L4 passed");
      end;
   end;
end UTF_Test;

And the expected output is:

Test A1 passed
Test A2 passed
Test A3 passed
Test B1 passed
Test B2 passed
Test B3 passed
Test C1 passed
Test C2 passed
Test C3 passed
Test D1 passed
Test D2 passed
Test D3 passed
Test E1 passed
Test E2 passed
Test E3 passed
Test F1 passed
Test F2 passed
Test F3 passed
Test G1 passed
Test G2 passed
Test G3 passed
Test H1 passed
Test H2 passed
Test H3 passed
Test I1 passed
Test I2 passed
Test I3 passed
Test J1 passed
Test J2 passed
Test J3 passed
Test K1 passed
Test K2 passed
Test K3 passed
Test K4 passed
Test L1 passed
Test L2 passed
Test L3 passed
Test L4 passed

Tested on x86_64-pc-linux-gnu, committed on trunk

2010-06-23  Robert Dewar  <dewar@adacore.com>

	* a-stuten.ads, a-stuten.adb: New files.
	* impunit.adb: Add entry for Ada.Strings.UTF_Encoding (a-stuten.ads)
	* Makefile.rtl: Add entry for a-stuten (Ada.Strings.UTF_Encoding)

Patch

Index: impunit.adb
===================================================================
--- impunit.adb	(revision 161191)
+++ impunit.adb	(working copy)
@@ -459,6 +459,11 @@  package body Impunit is
      "a-szuzti",    -- Ada.Strings.Wide_Wide_Unbounded.Wide_Wide_Text_IO
      "a-zchuni",    -- Ada.Wide_Wide_Characters.Unicode
 
+      --  Note: strictly the next one should be an Ada 2012 unit, but it seems
+      --  harmless (and useful) to make it available in Ada 2005 mode.
+
+     "a-stuten",    -- Ada.Strings.UTF_Encoding
+
    ---------------------------
    -- GNAT Special IO Units --
    ---------------------------
Index: a-stuten.adb
===================================================================
--- a-stuten.adb	(revision 0)
+++ a-stuten.adb	(revision 0)
@@ -0,0 +1,1032 @@ 
+------------------------------------------------------------------------------
+--                                                                          --
+--                         GNAT RUN-TIME COMPONENTS                         --
+--                                                                          --
+--              A D A . S T R I N G S . U T F _ E N C O D I N G             --
+--                                                                          --
+--                                 B o d y                                  --
+--                                                                          --
+--             Copyright (C) 2010, Free Software Foundation, Inc.           --
+--                                                                          --
+-- GNAT is free software;  you can  redistribute it  and/or modify it under --
+-- terms of the  GNU General Public License as published  by the Free Soft- --
+-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
+-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
+--                                                                          --
+-- As a special exception under Section 7 of GPL version 3, you are granted --
+-- additional permissions described in the GCC Runtime Library Exception,   --
+-- version 3.1, as published by the Free Software Foundation.               --
+--                                                                          --
+-- You should have received a copy of the GNU General Public License and    --
+-- a copy of the GCC Runtime Library Exception along with this program;     --
+-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
+-- <http://www.gnu.org/licenses/>.                                          --
+--                                                                          --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.      --
+--                                                                          --
+-------------------------------------------------------------------------------
+with Interfaces; use Interfaces;
+with Unchecked_Conversion;
+
+package body Ada.Strings.UTF_Encoding is
+
+   function To_Unsigned_8 is new
+     Unchecked_Conversion (Character, Unsigned_8);
+
+   function To_Unsigned_16 is new
+     Unchecked_Conversion (Wide_Character, Unsigned_16);
+
+   function To_Unsigned_32 is new
+     Unchecked_Conversion (Wide_Wide_Character, Unsigned_32);
+
+   --  Local subprograms
+
+   procedure Raise_Encoding_Error;
+   --  Called if an invalid input encoding sequence is found by Decode
+
+   function Decode_UTF_8 (Item : String) return Wide_String;
+   --  Equivalent to Decode (Item, UTF_8), but smaller and faster
+
+   function Decode_UTF_8 (Item : String) return Wide_Wide_String;
+   --  Equivalent to Decode (Item, UTF_8), but smaller and faster
+
+   function Encode_UTF_8 (Item : Wide_String) return String;
+   --  Equivalent to Encode (Item, UTF_8) but smaller and faster
+
+   function Encode_UTF_8 (Item : Wide_Wide_String) return String;
+   --  Equivalent to Encode (Item, UTF_8) but smaller and faster
+
+   function Decode_UTF_16 (Item : Wide_String) return Wide_String;
+   --  Equivalent to Decode (Item, UTF_16)
+
+   function Decode_UTF_16 (Item : Wide_String) return Wide_Wide_String;
+   --  Equivalent to Decode (Item, UTF_16)
+
+   function Encode_UTF_16 (Item : Wide_String) return Wide_String;
+   --  Equivalent to Encode (Item, UTF_16)
+
+   function Encode_UTF_16 (Item : Wide_Wide_String) return Wide_String;
+   --  Equivalent to Encode (Item, UTF_16)
+
+   ------------
+   -- Decode --
+   ------------
+
+   --  String input with Wide_String output (short encodings)
+
+   function Decode
+     (Item   : String;
+      Scheme : Short_Encoding := UTF_8) return Wide_String
+   is
+   begin
+      --  UTF-8 encoding case
+
+      if Scheme = UTF_8 then
+         return Decode_UTF_8 (Item);
+
+      --  Case of UTF_16LE or UTF_16BE
+
+      else
+         UTF16_XE : declare
+            Input_UTF16 : Wide_String (1 .. Item'Length / 2);
+            --  UTF_16 input string
+
+            Iptr : Natural;
+            --  Pointer to next location to store in Input_UTF16
+
+            Ptr : Natural;
+            --  Input string pointer
+
+            H, L : Natural range 0 .. 1;
+            --  Offset for high and low order bytes
+
+         begin
+            --  In both cases, the input string must be even in length, since
+            --  we have two input characters for each input code in UTF_16.
+
+            if Item'Length mod 2 /= 0 then
+               Raise_Encoding_Error;
+            end if;
+
+            --  We first assemble the UTF_16 string from the input. Set offsets
+            --  for the two bytes. For UTF_16LE we have low order/high order.
+            --  For UTF_16BE we have high order/low order.
+
+            if Scheme = UTF_16LE then
+               L := 0;
+               H := 1;
+            else
+               L := 1;
+               H := 0;
+            end if;
+
+            --  Loop to convert input to UTF_16 form
+
+            Iptr := 1;
+            Ptr := Item'First;
+            while Ptr < Item'Last loop
+               Input_UTF16 (Iptr) :=
+                 Wide_Character'Val
+                   (Unsigned_16 (To_Unsigned_8 (Item (Ptr + L)))
+                     or
+                    Shift_Left
+                      (Unsigned_16 (To_Unsigned_8 (Item (Ptr + H))), 8));
+               Iptr := Iptr + 1;
+               Ptr := Ptr + 2;
+            end loop;
+
+            --  Result is obtained by converting this UTF_16 input. Note that
+            --  we rely on this nested call to Decode to skip any BOM present.
+
+            return Decode (Input_UTF16);
+         end UTF16_XE;
+      end if;
+   end Decode;
+
+   --  String input with Wide_Wide_String output (short encodings)
+
+   function Decode
+     (Item   : String;
+      Scheme : Short_Encoding := UTF_8) return Wide_Wide_String
+   is
+   begin
+      --  UTF-8 encoding case
+
+      if Scheme = UTF_8 then
+         return Decode_UTF_8 (Item);
+
+      --  Case of UTF_16LE or UTF_16BE
+
+      else
+         UTF16_XE : declare
+            Input_UTF16 : Wide_String (1 .. Item'Length / 2);
+            --  UTF_16 input string
+
+            Iptr : Natural;
+            --  Pointer to next location to store in Input_UTF16
+
+            Ptr : Natural;
+            --  Input string pointer
+
+            H, L : Integer range 0 .. 1;
+            --  Offset for high and low order bytes
+
+         begin
+            --  In both cases, the input string must be even in length, since
+            --  we have two input characters for each input code in UTF_16.
+
+            if Item'Length mod 2 /= 0 then
+               Raise_Encoding_Error;
+            end if;
+
+            --  We first assemble the UTF_16 string from the input. Set offsets
+            --  for the two bytes. For UTF_16LE we have low order/high order.
+            --  For UTF_16BE we have high order/low order.
+
+            if Scheme = UTF_16LE then
+               L := 0;
+               H := 1;
+            else
+               L := 1;
+               H := 0;
+            end if;
+
+            --  Loop to convert input to UTF_16 form
+
+            Ptr := Item'First;
+            Iptr := 1;
+            while Ptr < Item'Last loop
+               Input_UTF16 (Iptr) :=
+                 Wide_Character'Val
+                   (Unsigned_16 (To_Unsigned_8 (Item (Ptr + L)))
+                      or
+                    Shift_Left
+                      (Unsigned_16 (To_Unsigned_8 (Item (Ptr + H))), 8));
+               Iptr := Iptr + 1;
+               Ptr := Ptr + 2;
+            end loop;
+
+            --  Result is obtained by converting this UTF_16 input. Note that
+            --  we rely on the call to Decode_UTF_16 to skip any BOM present.
+
+            return Decode_UTF_16 (Input_UTF16);
+         end UTF16_XE;
+      end if;
+   end Decode;
+
+   --  Wide_String input with Wide_String output (long encodings)
+
+   function Decode
+     (Item   : Wide_String;
+      Scheme : Long_Encoding := UTF_16) return Wide_String
+   is
+      pragma Unreferenced (Scheme);
+   begin
+      return Decode_UTF_16 (Item);
+   end Decode;
+
+   --  Wide_String input with Wide_Wide_String output (long encodings)
+
+   function Decode
+     (Item   : Wide_String;
+      Scheme : Long_Encoding := UTF_16) return Wide_Wide_String
+   is
+      pragma Unreferenced (Scheme);
+   begin
+      return Decode_UTF_16 (Item);
+   end Decode;
+
+   -------------------
+   -- Decode_UTF_16 --
+   -------------------
+
+   --  Version returning Wide_String result
+
+   function Decode_UTF_16 (Item : Wide_String) return Wide_String is
+      Result : Wide_String (1 .. Item'Length);
+      --  Result is same length as input (possibly minus 1 if BOM present)
+
+      Len : Natural := 0;
+      --  Length of result
+
+      Cod : Unsigned_16;
+      J   : Positive;
+
+   begin
+      --  Skip UTF-16 BOM at start
+
+      J := Item'First;
+
+      if J <= Item'Last and then Item (J) = BOM_16 (1) then
+         J := J + 1;
+      end if;
+
+      --  Loop through input characters
+
+      while J <= Item'Last loop
+         Cod := To_Unsigned_16 (Item (J));
+
+         --  Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFF#
+         --  represent their own value.
+
+         if Cod <= 16#D7FF# or else Cod >= 16#E000# then
+            Len := Len + 1;
+            Result (Len) := Wide_Character'Val (Cod);
+
+         --  Codes in the range 16#D800#..16#DBFF# represent the first of the
+         --  two surrogates used to encode the range 16#01_0000#..16#10_FFFF#.
+         --  Such codes are out of range for 16-bit output.
+
+         --  The remaining case of input in the range 16#DC00#..16#DFFF# must
+         --  never occur, since it means we have a second surrogate character
+         --  with no corresponding first surrogate.
+
+         --  Thus all remaining codes are invalid
+
+         else
+            Raise_Encoding_Error;
+         end if;
+
+         J := J + 1;
+      end loop;
+
+      return Result (1 .. Len);
+   end Decode_UTF_16;
+
+   --  Version returning Wide_Wide_String result
+
+   function Decode_UTF_16 (Item : Wide_String) return Wide_Wide_String is
+      Result : Wide_Wide_String (1 .. Item'Length);
+      --  Result cannot be longer than the input string
+
+      Len : Natural := 0;
+      --  Length of result
+
+      Cod  : Unsigned_16;
+      J    : Positive;
+      Rcod : Unsigned_32;
+
+   begin
+      --  Skip UTF-16 BOM at start
+
+      J := Item'First;
+
+      if J <= Item'Last and then Item (J) = BOM_16 (1) then
+         J := J + 1;
+      end if;
+
+      --  Loop through input characters
+
+      while J <= Item'Last loop
+         Cod := To_Unsigned_16 (Item (J));
+
+         --  Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFF#
+         --  represent their own value.
+
+         if Cod <= 16#D7FF# or else Cod >= 16#E000# then
+            Len := Len + 1;
+            Result (Len) := Wide_Wide_Character'Val (Cod);
+
+         --  Codes in the range 16#D800#..16#DBFF# represent the first of the
+         --  two surrogates used to encode the range 16#01_0000#..16#10_FFFF#.
+
+         elsif Cod <= 16#DBFF# then
+            Rcod := (Unsigned_32 (Cod) - 16#D800#) * 2 ** 10;
+
+            --  Error if at end of string
+
+            if J = Item'Last then
+               Raise_Encoding_Error;
+
+            --  Otherwise next character must be valid low order surrogate
+
+            else
+               J := J + 1;
+               Cod := To_Unsigned_16 (Item (J));
+
+               if Cod < 16#DC00# or else Cod > 16#DFFF# then
+                  Raise_Encoding_Error;
+
+               else
+                  Rcod := Rcod + (Unsigned_32 (Cod) mod 2 ** 10) + 16#01_0000#;
+                  Len := Len + 1;
+                  Result (Len) := Wide_Wide_Character'Val (Rcod);
+               end if;
+            end if;
+
+         --  If input is in the range 16#DC00#..16#DFFF#, we have a second
+         --  surrogate character with no corresponding first surrogate.
+
+         else
+            Raise_Encoding_Error;
+         end if;
+
+         J := J + 1;
+      end loop;
+
+      return Result (1 .. Len);
+   end Decode_UTF_16;
+
+   ------------------
+   -- Decode_UTF_8 --
+   ------------------
+
+   --  Version returning Wide_String result
+
+   function Decode_UTF_8 (Item : String) return Wide_String is
+      Result : Wide_String (1 .. Item'Length);
+      --  Result string (worst case is same length as input)
+
+      Len : Natural := 0;
+      --  Length of result stored so far
+
+      Ptr : Natural;
+      --  Input string pointer
+
+      C : Unsigned_8;
+      R : Unsigned_16;
+
+      procedure Get_Continuation;
+      --  Reads a continuation byte of the form 10xxxxxx, shifts R left
+      --  by 6 bits, and or's in the xxxxxx to the low order 6 bits. On
+      --  return Ptr is incremented. Raises exception if continuation
+      --  byte does not exist or is invalid.
+
+      ----------------------
+      -- Get_Continuation --
+      ----------------------
+
+      procedure Get_Continuation is
+      begin
+         if Ptr > Item'Last then
+            Raise_Encoding_Error;
+
+         else
+            C := To_Unsigned_8 (Item (Ptr));
+            Ptr := Ptr + 1;
+
+            if C < 2#10_000000# or else C > 2#10_111111# then
+               Raise_Encoding_Error;
+
+            else
+               R := Shift_Left (R, 6) or
+                      Unsigned_16 (C and 2#00_111111#);
+            end if;
+         end if;
+      end Get_Continuation;
+
+   --  Start of processing for Decode_UTF_8
+
+   begin
+      Ptr := Item'First;
+
+      --  Skip BOM at start
+
+      if Ptr + 2 <= Item'Last
+        and then Item (Ptr .. Ptr + 2) = BOM_8
+      then
+         Ptr := Ptr + 3;
+      end if;
+
+      --  Loop through input characters
+
+      while Ptr <= Item'Last loop
+         C := To_Unsigned_8 (Item (Ptr));
+         Ptr := Ptr + 1;
+
+         --  Codes in the range 16#00# - 16#7F# are represented as
+         --    0xxxxxxx
+
+         if C <= 16#7F# then
+            R := Unsigned_16 (C);
+
+         --  No initial code can be of the form 10xxxxxx. Such codes are used
+         --  only for continuations.
+
+         elsif C <= 2#10_111111# then
+            Raise_Encoding_Error;
+
+         --  Codes in the range 16#80# - 16#7FF# are represented as
+         --    110yyyxx 10xxxxxx
+
+         elsif C <= 2#110_11111# then
+            R := Unsigned_16 (C and 2#000_11111#);
+            Get_Continuation;
+
+         --  Codes in the range 16#800# - 16#FFFF# are represented as
+         --    1110yyyy 10yyyyxx 10xxxxxx
+
+         elsif C <= 2#1110_1111# then
+            R := Unsigned_16 (C and 2#0000_1111#);
+            Get_Continuation;
+            Get_Continuation;
+
+         --  Codes in the range 16#10000# - 16#10FFFF# are represented as
+         --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
+
+         --  Such codes are out of range for Wide_String output
+
+         else
+            Raise_Encoding_Error;
+         end if;
+
+         Len := Len + 1;
+         Result (Len) := Wide_Character'Val (R);
+      end loop;
+
+      return Result (1 .. Len);
+   end Decode_UTF_8;
+
+   --  Version returning Wide_Wide_String result
+
+   function Decode_UTF_8 (Item : String) return Wide_Wide_String is
+      Result : Wide_Wide_String (1 .. Item'Length);
+      --  Result string (worst case is same length as input)
+
+      Len : Natural := 0;
+      --  Length of result stored so far
+
+      Ptr : Natural;
+      --  Input string pointer
+
+      C : Unsigned_8;
+      R : Unsigned_32;
+
+      procedure Get_Continuation;
+      --  Reads a continuation byte of the form 10xxxxxx, shifts R left
+      --  by 6 bits, and or's in the xxxxxx to the low order 6 bits. On
+      --  return Ptr is incremented. Raises exception if continuation
+      --  byte does not exist or is invalid.
+
+      ----------------------
+      -- Get_Continuation --
+      ----------------------
+
+      procedure Get_Continuation is
+      begin
+         if Ptr > Item'Last then
+            raise Encoding_Error with
+              "incomplete UTF-8 encoding sequence";
+
+         else
+            C := To_Unsigned_8 (Item (Ptr));
+            Ptr := Ptr + 1;
+
+            if C < 2#10_000000# or else C > 2#10_111111# then
+               Raise_Encoding_Error;
+
+            else
+               R := Shift_Left (R, 6) or
+                 Unsigned_32 (C and 2#00_111111#);
+            end if;
+         end if;
+      end Get_Continuation;
+
+   --  Start of processing for Decode_UTF_8
+
+   begin
+      Ptr := Item'First;
+
+      --  Skip BOM at start
+
+      if Ptr + 2 <= Item'Last
+        and then Item (Ptr .. Ptr + 2) = BOM_8
+      then
+         Ptr := Ptr + 3;
+      end if;
+
+      --  Loop through input characters
+
+      while Ptr <= Item'Last loop
+         C := To_Unsigned_8 (Item (Ptr));
+         Ptr := Ptr + 1;
+
+         --  Codes in the range 16#00# - 16#7F# are represented as
+         --    0xxxxxxx
+
+         if C <= 16#7F# then
+            R := Unsigned_32 (C);
+
+         --  No initial code can be of the form 10xxxxxx. Such codes are used
+         --  only for continuations.
+
+         elsif C <= 2#10_111111# then
+            Raise_Encoding_Error;
+
+         --  Codes in the range 16#80# - 16#7FF# are represented as
+         --    110yyyxx 10xxxxxx
+
+         elsif C <= 2#110_11111# then
+            R := Unsigned_32 (C and 2#000_11111#);
+            Get_Continuation;
+
+         --  Codes in the range 16#800# - 16#FFFF# are represented as
+         --    1110yyyy 10yyyyxx 10xxxxxx
+
+         elsif C <= 2#1110_1111# then
+            R := Unsigned_32 (C and 2#0000_1111#);
+            Get_Continuation;
+            Get_Continuation;
+
+         --  Codes in the range 16#10000# - 16#10FFFF# are represented as
+         --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
+
+         elsif C <= 2#11110_111# then
+            R := Unsigned_32 (C and 2#00000_111#);
+            Get_Continuation;
+            Get_Continuation;
+            Get_Continuation;
+
+         --  Any other code is an error
+
+         else
+            Raise_Encoding_Error;
+         end if;
+
+         Len := Len + 1;
+         Result (Len) := Wide_Wide_Character'Val (R);
+      end loop;
+
+      return Result (1 .. Len);
+   end Decode_UTF_8;
+
+   ------------
+   -- Encode --
+   ------------
+
+   --  Version with Wide_String input returning encoded String
+
+   function Encode
+     (Item   : Wide_String;
+      Scheme : Short_Encoding := UTF_8) return String
+   is
+   begin
+      --  UTF_8 is handled entirely by the dedicated routine
+
+      if Scheme = UTF_8 then
+         return Encode_UTF_8 (Item);
+      end if;
+
+      --  The remaining schemes are UTF_16LE and UTF_16BE. The input is first
+      --  encoded as UTF-16 codes, and each code is then written out as two
+      --  bytes in the order required by the scheme.
+
+      Split_Words : declare
+         Words : constant Wide_String := Encode_UTF_16 (Item);
+         --  UTF-16 encoding of Item, indexed from 1
+
+         Bytes : String (1 .. 2 * Words'Last);
+         --  Two output bytes for each UTF-16 code
+
+         Low_First : constant Boolean := Scheme = UTF_16LE;
+         --  True if the low order byte of each code is stored first
+
+         Word : Unsigned_16;
+         --  Current UTF-16 code
+
+         Lo, Hi : Character;
+         --  Low order and high order bytes of Word
+
+      begin
+         --  NOTE(review): no matching pragma Warnings (On) is visible in this
+         --  function; confirm which warning is meant to be suppressed here.
+
+         pragma Warnings (Off);
+         for K in 1 .. Words'Last loop
+            Word := To_Unsigned_16 (Words (K));
+            Lo   := Character'Val (Word and 16#FF#);
+            Hi   := Character'Val (Shift_Right (Word, 8));
+
+            --  Code number K occupies output positions 2*K-1 and 2*K
+
+            if Low_First then
+               Bytes (2 * K - 1 .. 2 * K) := Lo & Hi;
+            else
+               Bytes (2 * K - 1 .. 2 * K) := Hi & Lo;
+            end if;
+         end loop;
+
+         return Bytes;
+      end Split_Words;
+   end Encode;
+
+   --  Version with Wide_Wide_String input returning String
+
+   function Encode
+     (Item   : Wide_Wide_String;
+      Scheme : Short_Encoding := UTF_8) return String
+   is
+   begin
+      --  UTF_8 is handled entirely by the dedicated routine
+
+      if Scheme = UTF_8 then
+         return Encode_UTF_8 (Item);
+      end if;
+
+      --  The remaining schemes are UTF_16LE and UTF_16BE. The input is first
+      --  encoded as UTF-16 codes (possibly including surrogate pairs), and
+      --  each code is then written out as two bytes in the order required
+      --  by the scheme.
+
+      Split_Words : declare
+         Words : constant Wide_String := Encode (Item, UTF_16);
+         --  UTF-16 encoding of Item, indexed from 1
+
+         Bytes : String (1 .. 2 * Words'Last);
+         --  Two output bytes for each UTF-16 code
+
+         Low_First : constant Boolean := Scheme = UTF_16LE;
+         --  True if the low order byte of each code is stored first
+
+         Word : Unsigned_16;
+         --  Current UTF-16 code
+
+         Lo, Hi : Character;
+         --  Low order and high order bytes of Word
+
+      begin
+         for K in 1 .. Words'Last loop
+            Word := To_Unsigned_16 (Words (K));
+            Lo   := Character'Val (Word and 16#FF#);
+            Hi   := Character'Val (Shift_Right (Word, 8));
+
+            --  Code number K occupies output positions 2*K-1 and 2*K
+
+            if Low_First then
+               Bytes (2 * K - 1 .. 2 * K) := Lo & Hi;
+            else
+               Bytes (2 * K - 1 .. 2 * K) := Hi & Lo;
+            end if;
+         end loop;
+
+         return Bytes;
+      end Split_Words;
+   end Encode;
+
+   --  Wide_String input returning encoded Wide_String (long encodings)
+
+   function Encode
+     (Item   : Wide_String;
+      Scheme : Long_Encoding := UTF_16) return Wide_String
+   is
+      pragma Unreferenced (Scheme);
+      --  Scheme is not consulted: the result is always produced by the
+      --  UTF-16 encoder.
+
+      Encoded : constant Wide_String := Encode_UTF_16 (Item);
+      --  UTF-16 encoding of Item
+
+   begin
+      return Encoded;
+   end Encode;
+
+   --  Wide_Wide_String input returning Wide_String (long encodings)
+
+   function Encode
+     (Item   : Wide_Wide_String;
+      Scheme : Long_Encoding := UTF_16) return Wide_String
+   is
+      pragma Unreferenced (Scheme);
+      --  Scheme is not consulted: the result is always produced by the
+      --  UTF-16 encoder.
+
+      Encoded : constant Wide_String := Encode_UTF_16 (Item);
+      --  UTF-16 encoding of Item, which may contain surrogate pairs
+
+   begin
+      return Encoded;
+   end Encode;
+
+   -------------------
+   -- Encode_UTF_16 --
+   -------------------
+
+   --  Wide_String input with UTF-16 encoded Wide_String output
+
+   function Encode_UTF_16 (Item : Wide_String) return Wide_String is
+      Result : Wide_String (1 .. Item'Length);
+      --  One output code per input character (no BOM is added)
+
+      Count : Natural := 0;
+      --  Number of codes stored in Result so far
+
+      Code : Unsigned_16;
+      --  Numeric value of the current input character
+
+   begin
+      for Index in Item'Range loop
+         Code := To_Unsigned_16 (Item (Index));
+
+         --  Codes in the range 16#D800#..16#DFFF# should never appear in the
+         --  input, since no valid Unicode characters are in this range (which
+         --  would conflict with the UTF-16 surrogate encodings).
+
+         if Code in 16#D800# .. 16#DFFF# then
+            raise Constraint_Error with
+              "Wide_Character in range 16#D800# .. 16#DFFF#";
+         end if;
+
+         --  All other codes (16#0000#..16#D7FF# and 16#E000#..16#FFFF#) are
+         --  output unchanged.
+
+         Count := Count + 1;
+         Result (Count) := Wide_Character'Val (Code);
+      end loop;
+
+      return Result (1 .. Count);
+   end Encode_UTF_16;
+
+   --  Wide_Wide_String input with UTF-16 encoded Wide_String output
+
+   function Encode_UTF_16 (Item : Wide_Wide_String) return Wide_String is
+      Result : Wide_String (1 .. 2 * Item'Length);
+      --  Sized for the worst case of two output codes per input character
+
+      Count : Natural := 0;
+      --  Number of codes stored in Result so far
+
+      Code : Unsigned_32;
+      --  Numeric value of the current input character
+
+   begin
+      for Index in Item'Range loop
+         Code := To_Unsigned_32 (Item (Index));
+
+         case Code is
+
+            --  Codes below and above the surrogate range, up to 16#FFFF#,
+            --  fit in one UTF-16 code and are output unchanged.
+
+            when 16#00_0000# .. 16#00_D7FF# | 16#00_E000# .. 16#00_FFFF# =>
+               Count := Count + 1;
+               Result (Count) := Wide_Character'Val (Code);
+
+            --  Codes in the range 16#00_D800#..16#00_DFFF# should never
+            --  appear in the input, since no valid Unicode characters are in
+            --  this range (it is reserved for the UTF-16 surrogates).
+
+            when 16#00_D800# .. 16#00_DFFF# =>
+               raise Constraint_Error with
+                 "Wide_Wide_Character in range 16#00_D800# .. 16#00_DFFF#";
+
+            --  Codes in the range 16#01_0000#..16#10_FFFF# are output as a
+            --  surrogate pair. Subtracting 16#1_0000# leaves a 20-bit value;
+            --  its upper 10 bits go into the first surrogate (based at
+            --  16#D800#) and its lower 10 bits into the second surrogate
+            --  (based at 16#DC00#).
+
+            when 16#01_0000# .. 16#10_FFFF# =>
+               Code := Code - 16#1_0000#;
+
+               Result (Count + 1) :=
+                 Wide_Character'Val (16#D800# + Shift_Right (Code, 10));
+               Result (Count + 2) :=
+                 Wide_Character'Val (16#DC00# + (Code and 16#3FF#));
+               Count := Count + 2;
+
+            --  Codes larger than 16#10_FFFF# are invalid
+
+            when others =>
+               raise Constraint_Error with
+                 "Wide_Wide_Character exceeds maximum value of 16#10_FFFF#";
+         end case;
+      end loop;
+
+      return Result (1 .. Count);
+   end Encode_UTF_16;
+
+   ------------------
+   -- Encode_UTF_8 --
+   ------------------
+
+   --  Wide_String input with UTF_8 encoded String output
+
+   function Encode_UTF_8 (Item : Wide_String) return String is
+      Result : String (1 .. 3 * Item'Length);
+      --  Worst case is three output bytes per input character
+
+      N : Natural := 0;
+      --  Number of output codes stored in Result
+
+      C : Unsigned_16;
+      --  Single input character
+
+      procedure Store (C : Unsigned_16);
+      pragma Inline (Store);
+      --  Store one output code, C is in the range 0 .. 255
+
+      -----------
+      -- Store --
+      -----------
+
+      procedure Store (C : Unsigned_16) is
+      begin
+         N := N + 1;
+         Result (N) := Character'Val (C);
+      end Store;
+
+   --  Start of processing for Encode_UTF_8
+
+   begin
+      --  Loop through characters of input
+
+      for J in Item'Range loop
+         C := To_Unsigned_16 (Item (J));
+
+         --  Codes in the range 16#00# - 16#7F# are represented as
+         --    0xxxxxxx
+
+         if C <= 16#7F# then
+            Store (C);
+
+         --  Codes in the range 16#80# - 16#7FF# are represented as
+         --    110yyyxx 10xxxxxx
+
+         elsif C <= 16#7FF# then
+            Store (2#110_00000# or Shift_Right (C, 6));
+            Store (2#10_000000# or (C and 2#00_111111#));
+
+         --  Codes in the range 16#D800# - 16#DFFF# should never appear in the
+         --  input, since no valid Unicode characters are in this range (it is
+         --  reserved for the UTF-16 surrogates). They are rejected in the
+         --  same way as in Encode_UTF_16, so that all Encode routines treat
+         --  this invalid input consistently.
+
+         elsif C in 16#D800# .. 16#DFFF# then
+            raise Constraint_Error with
+              "Wide_Character in range 16#D800# .. 16#DFFF#";
+
+         --  Remaining codes in the range 16#800# - 16#FFFF# are represented
+         --  as
+         --    1110yyyy 10yyyyxx 10xxxxxx
+
+         else
+            Store (2#1110_0000# or Shift_Right (C, 12));
+            Store (2#10_000000# or
+                     Shift_Right (C and 2#111111_000000#, 6));
+            Store (2#10_000000# or (C and 2#00_111111#));
+         end if;
+      end loop;
+
+      return Result (1 .. N);
+   end Encode_UTF_8;
+
+   --  Wide_Wide_String input with UTF_8 encoded String output
+
+   function Encode_UTF_8 (Item : Wide_Wide_String) return String is
+      Result : String (1 .. 4 * Item'Length);
+      --  Worst case is four output bytes per input character
+
+      N  : Natural := 0;
+      --  Number of output codes stored in Result
+
+      C : Unsigned_32;
+      --  Single input character
+
+      procedure Store (C : Unsigned_32);
+      pragma Inline (Store);
+      --  Store one output code (input is in range 0 .. 255)
+
+      -----------
+      -- Store --
+      -----------
+
+      procedure Store (C : Unsigned_32) is
+      begin
+         N := N + 1;
+         Result (N) := Character'Val (C);
+      end Store;
+
+   --  Start of processing for Encode_UTF_8
+
+   begin
+      --  Loop through characters of input
+
+      for J in Item'Range loop
+         C := To_Unsigned_32 (Item (J));
+
+         --  Codes in the range 16#00# - 16#7F# are represented as
+         --    0xxxxxxx
+
+         if C <= 16#7F# then
+            Store (C);
+
+         --  Codes in the range 16#80# - 16#7FF# are represented as
+         --    110yyyxx 10xxxxxx
+
+         elsif C <= 16#7FF# then
+            Store (2#110_00000# or Shift_Right (C, 6));
+            Store (2#10_000000# or (C and 2#00_111111#));
+
+         --  Codes in the range 16#D800# - 16#DFFF# should never appear in the
+         --  input, since no valid Unicode characters are in this range (it is
+         --  reserved for the UTF-16 surrogates). They are rejected in the
+         --  same way as in Encode_UTF_16, so that all Encode routines treat
+         --  this invalid input consistently.
+
+         elsif C in 16#D800# .. 16#DFFF# then
+            raise Constraint_Error with
+              "Wide_Wide_Character in range 16#00_D800# .. 16#00_DFFF#";
+
+         --  Remaining codes in the range 16#800# - 16#FFFF# are represented
+         --  as
+         --    1110yyyy 10yyyyxx 10xxxxxx
+
+         elsif C <= 16#FFFF# then
+            Store (2#1110_0000# or Shift_Right (C, 12));
+            Store (2#10_000000# or
+                     Shift_Right (C and 2#111111_000000#, 6));
+            Store (2#10_000000# or (C and 2#00_111111#));
+
+         --  Codes in the range 16#10000# - 16#10FFFF# are represented as
+         --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
+
+         elsif C <= 16#10_FFFF# then
+            Store (2#11110_000# or Shift_Right (C, 18));
+            Store (2#10_000000# or
+                     Shift_Right (C and 2#111111_000000_000000#, 12));
+            Store (2#10_000000#
+                   or Shift_Right (C and 2#111111_000000#, 6));
+            Store (2#10_000000# or (C and 2#00_111111#));
+
+         --  Codes higher than 16#10_FFFF# should not appear
+
+         else
+            raise Constraint_Error with
+              "out of range invalid value in Encode input";
+         end if;
+      end loop;
+
+      return Result (1 .. N);
+   end Encode_UTF_8;
+
+   --------------
+   -- Encoding --
+   --------------
+
+   --  Version taking String input
+
+   function Encoding (Item : String) return Encoding_Scheme is
+      First : constant Integer := Item'First;
+      --  Index of the first byte of Item
+
+   begin
+      --  Every BOM tested here is at least two bytes long, so anything
+      --  shorter cannot start with one.
+
+      if Item'Length < 2 then
+         return UTF_None;
+
+      --  The two-byte UTF-16 marks are tested before the three-byte UTF-8
+      --  mark.
+
+      elsif Item (First .. First + 1) = BOM_16BE then
+         return UTF_16BE;
+
+      elsif Item (First .. First + 1) = BOM_16LE then
+         return UTF_16LE;
+
+      elsif Item'Length >= 3
+        and then Item (First .. First + 2) = BOM_8
+      then
+         return UTF_8;
+
+      --  No recognized BOM at the start of Item
+
+      else
+         return UTF_None;
+      end if;
+   end Encoding;
+
+   --  Version taking Wide_String input
+
+   function Encoding (Item : Wide_String) return Encoding_Scheme is
+   begin
+      --  An empty string cannot start with a BOM
+
+      if Item'Length = 0 then
+         return UTF_None;
+
+      --  BOM_16 is a single code, so only the first element of Item needs
+      --  to be compared with it.
+
+      elsif Item (Item'First) = BOM_16 (BOM_16'First) then
+         return UTF_16;
+
+      else
+         return UTF_None;
+      end if;
+   end Encoding;
+
+   --------------------------
+   -- Raise_Encoding_Error --
+   --------------------------
+
+   procedure Raise_Encoding_Error is
+   begin
+      --  Unconditionally raise Encoding_Error with a fixed message, so this
+      --  procedure never returns normally.
+
+      raise Encoding_Error with "invalid input encoding sequence";
+   end Raise_Encoding_Error;
+
+end Ada.Strings.UTF_Encoding;
Index: a-stuten.ads
===================================================================
--- a-stuten.ads	(revision 0)
+++ a-stuten.ads	(revision 0)
@@ -0,0 +1,117 @@ 
+------------------------------------------------------------------------------
+--                                                                          --
+--                         GNAT RUN-TIME COMPONENTS                         --
+--                                                                          --
+--              A D A . S T R I N G S . U T F _ E N C O D I N G             --
+--                                                                          --
+--                                 S p e c                                  --
+--                                                                          --
+-- This specification is derived from the Ada Reference Manual for use with --
+-- GNAT.  In accordance with the copyright of that document, you can freely --
+-- copy and modify this specification,  provided that if you redistribute a --
+-- modified version,  any changes that you have made are clearly indicated. --
+--                                                                          --
+------------------------------------------------------------------------------
+
+--  This is the Ada 2012 package defined in AI05-0137-1. It is used for
+--  encoding strings using UTF encodings (UTF-8, UTF-16LE, UTF-16BE, UTF-16).
+
+--  Compared with version 05 of the AI, we have added routines for UTF-16
+--  encoding and decoding of wide strings, which seem to be missing from the
+--  AI, added comments, and reordered the declarations.
+
+--  Note: although this is an Ada 2012 package, the earlier versions of the
+--  language permit the addition of new grandchildren of Ada, so we are able
+--  to add this package unconditionally for use in Ada 2005 mode. We cannot
+--  allow it in earlier versions, since it requires Wide_Wide_Character/String.
+
+package Ada.Strings.UTF_Encoding is
+   pragma Pure (UTF_Encoding);
+
+   type Encoding_Scheme is (UTF_None, UTF_8, UTF_16BE, UTF_16LE, UTF_16);
+   --  The supported encoding schemes. UTF_None is the value returned by the
+   --  Encoding functions when no BOM is recognized.
+
+   subtype Short_Encoding is Encoding_Scheme range UTF_8 .. UTF_16LE;
+   --  Schemes whose encoded form is held in a String (UTF_8, UTF_16BE and
+   --  UTF_16LE).
+
+   subtype Long_Encoding  is Encoding_Scheme range UTF_16 .. UTF_16;
+   --  The scheme whose encoded form is held in a Wide_String (UTF_16 only)
+
+   --  The BOM (BYTE_ORDER_MARK) values defined here are used at the start of
+   --  a string to indicate the encoding. The convention in this package is
+   --  that decoding routines ignore a BOM, and output of encoding routines
+   --  does not include a BOM. If you want to include a BOM in the output,
+   --  you simply concatenate the appropriate value at the start of the string.
+
+   BOM_8    : constant String :=
+                Character'Val (16#EF#) &
+                Character'Val (16#BB#) &
+                Character'Val (16#BF#);
+   --  Three-byte mark for UTF_8
+
+   BOM_16BE : constant String :=
+                Character'Val (16#FE#) &
+                Character'Val (16#FF#);
+   --  Two-byte mark for UTF_16BE (high order byte first)
+
+   BOM_16LE : constant String :=
+                Character'Val (16#FF#) &
+                Character'Val (16#FE#);
+   --  Two-byte mark for UTF_16LE (low order byte first)
+
+   BOM_16   : constant Wide_String :=
+                (1 => Wide_Character'Val (16#FEFF#));
+   --  Single-code mark for UTF_16 held in a Wide_String
+
+   --  The encoding routines take a wide string or wide wide string as input
+   --  and encode the result using the specified UTF encoding method. For
+   --  UTF-16, the output is returned as a Wide_String. This is not a normal
+   --  Wide_String, since the codes in it may represent UTF-16 surrogate
+   --  characters used to encode large values. Similarly for UTF-8, UTF-16LE,
+   --  and UTF-16BE, the output is returned in a String, and again this String
+   --  is not a standard format string, since its elements are the bytes of
+   --  the encoding (multi-byte UTF-8 sequences, or the two bytes of each
+   --  UTF-16 code) rather than ordinary characters.
+   --  As previously noted, the returned value does NOT start with a BOM.
+
+   --  Note: invalid codes in calls to one of the Encode routines represent
+   --  invalid values in the sense that they are not defined. For example, the
+   --  code 16#DC03# is not a valid wide character value. Such values result
+   --  in undefined behavior. For GNAT, Constraint_Error is raised with an
+   --  appropriate exception message.
+
+   function Encode
+     (Item   : Wide_String;
+      Scheme : Short_Encoding := UTF_8) return String;
+   function Encode
+     (Item   : Wide_Wide_String;
+      Scheme : Short_Encoding := UTF_8) return String;
+   --  Encode Item into a String of bytes using Scheme. For UTF_16LE and
+   --  UTF_16BE each UTF-16 code yields two bytes, in low/high or high/low
+   --  order respectively.
+
+   function Encode
+     (Item   : Wide_String;
+      Scheme : Long_Encoding := UTF_16) return Wide_String;
+   function Encode
+     (Item   : Wide_Wide_String;
+      Scheme : Long_Encoding := UTF_16) return Wide_String;
+   --  Encode Item into a Wide_String of UTF-16 codes. For Wide_Wide_String
+   --  input, codes above 16#FFFF# are output as a pair of surrogates.
+
+   --  The decoding routines take a String or Wide_String input which is an
+   --  encoded string using the specified encoding. The output is a normal
+   --  Ada Wide_String or Wide_Wide_String value representing the decoded
+   --  values. Note that a BOM in the input matching the encoding is skipped.
+
+   Encoding_Error : exception;
+   --  Exception raised if an invalid encoding sequence is encountered by
+   --  one of the Decode routines.
+
+   function Decode
+     (Item   : String;
+      Scheme : Short_Encoding := UTF_8) return Wide_String;
+   function Decode
+     (Item   : String;
+      Scheme : Short_Encoding := UTF_8) return Wide_Wide_String;
+   --  Decode a String of bytes encoded using Scheme
+
+   function Decode
+     (Item   : Wide_String;
+      Scheme : Long_Encoding := UTF_16) return Wide_String;
+   function Decode
+     (Item   : Wide_String;
+      Scheme : Long_Encoding := UTF_16) return Wide_Wide_String;
+   --  Decode a Wide_String of UTF-16 codes
+
+   --  The Encoding functions inspect an encoded string or wide_string and
+   --  determine if a BOM is present. If so, the appropriate Encoding_Scheme
+   --  is returned. If not, then UTF_None is returned.
+
+   function Encoding (Item : String)      return Encoding_Scheme;
+   --  Returns UTF_16BE, UTF_16LE or UTF_8 if Item starts with BOM_16BE,
+   --  BOM_16LE or BOM_8 respectively, and UTF_None otherwise.
+
+   function Encoding (Item : Wide_String) return Encoding_Scheme;
+   --  Returns UTF_16 if Item starts with BOM_16, and UTF_None otherwise
+
+end Ada.Strings.UTF_Encoding;
Index: Makefile.rtl
===================================================================
--- Makefile.rtl	(revision 161191)
+++ Makefile.rtl	(working copy)
@@ -211,6 +211,7 @@  GNATRTL_NONTASKING_OBJS= \
   a-ststio$(objext) \
   a-stunau$(objext) \
   a-stunha$(objext) \
+  a-stuten$(objext) \
   a-stwibo$(objext) \
   a-stwifi$(objext) \
   a-stwiha$(objext) \