Patchwork [Ada] Minor improvements to regexps

login
register
mail settings
Submitter Arnaud Charlet
Date June 21, 2010, 1:27 p.m.
Message ID <20100621132738.GA27362@adacore.com>
Download mbox | patch
Permalink /patch/56314/
State New
Headers show

Comments

Arnaud Charlet - June 21, 2010, 1:27 p.m.
This change optimizes Compile by avoiding double compilation in some
cases. It also does a minor reduction in the size of the compiled
pattern.

Tested on x86_64-pc-linux-gnu, committed on trunk

2010-06-21  Emmanuel Briot  <briot@adacore.com>

	* s-regpat.adb (Next_Instruction, Get_Next_Offset): Removed, merged
	into Get_Next.
	(Insert_Operator_Before): New subprogram, avoids duplicated code
	(Compile): Avoid doing two compilations when the pattern matcher ends
	up being small.

Patch

Index: s-regpat.adb
===================================================================
--- s-regpat.adb	(revision 161073)
+++ s-regpat.adb	(working copy)
@@ -50,13 +50,6 @@  package body System.Regpat is
    Debug : constant Boolean := False;
    --  Set to True to activate debug traces
 
-   MAGIC : constant Character := Character'Val (10#0234#);
-   --  The first byte of the regexp internal "program" is actually
-   --  this magic number; the start node begins in the second byte.
-   --
-   --  This is used to make sure that a regular expression was correctly
-   --  compiled.
-
    ----------------------------
    -- Implementation details --
    ----------------------------
@@ -79,21 +72,19 @@  package body System.Regpat is
    --  You can see the exact byte-compiled version by using the Dump
    --  subprogram. However, here are a few examples:
 
-   --  (a|b):  1 : MAGIC
-   --          2 : BRANCH  (next at  10)
-   --          5 :    EXACT  (next at  18)   operand=a
-   --         10 : BRANCH  (next at  18)
-   --         13 :    EXACT  (next at  18)   operand=b
-   --         18 : EOP  (next at 0)
+   --  (a|b):  1 : BRANCH  (next at  9)
+   --          4 :    EXACT  (next at  17)   operand=a
+   --          9 : BRANCH  (next at  17)
+   --         12 :    EXACT  (next at  17)   operand=b
+   --         17 : EOP  (next at 0)
    --
-   --  (ab)*:  1 : MAGIC
-   --          2 : CURLYX  (next at  26)  { 0, 32767}
-   --          9 :    OPEN 1  (next at  13)
-   --         13 :       EXACT  (next at  19)   operand=ab
-   --         19 :    CLOSE 1  (next at  23)
-   --         23 :    WHILEM  (next at 0)
-   --         26 : NOTHING  (next at  29)
-   --         29 : EOP  (next at 0)
+   --  (ab)*:  1 : CURLYX  (next at  25)  { 0, 32767}
+   --          8 :    OPEN 1  (next at  12)
+   --         12 :       EXACT  (next at  18)   operand=ab
+   --         18 :    CLOSE 1  (next at  22)
+   --         22 :    WHILEM  (next at 0)
+   --         25 : NOTHING  (next at  28)
+   --         28 : EOP  (next at 0)
 
    --  The opcodes are:
 
@@ -282,11 +273,6 @@  package body System.Regpat is
       Op      : out Character_Class);
    --  Return a pointer to the string argument of the node at P
 
-   function Get_Next_Offset
-     (Program : Program_Data;
-      IP      : Pointer) return Pointer;
-   --  Get the offset field of a node. Used by Get_Next
-
    function Get_Next
      (Program : Program_Data;
       IP      : Pointer) return Pointer;
@@ -306,7 +292,6 @@  package body System.Regpat is
    pragma Inline (Is_Alnum);
    pragma Inline (Is_White_Space);
    pragma Inline (Get_Next);
-   pragma Inline (Get_Next_Offset);
    pragma Inline (Operand);
    pragma Inline (Read_Natural);
    pragma Inline (String_Length);
@@ -389,7 +374,6 @@  package body System.Regpat is
       PM        : Pattern_Matcher renames Matcher;
       Program   : Program_Data renames PM.Program;
 
-      Emit_Code : constant Boolean := PM.Size > 0;
       Emit_Ptr  : Pointer := Program_First;
 
       Parse_Pos : Natural := Expression'First; -- Input-scan pointer
@@ -456,6 +440,17 @@  package body System.Regpat is
       --  This applies to PLUS and STAR.
       --  If Minmod is True, then the operator is non-greedy.
 
+      function Insert_Operator_Before
+        (Op      : Opcode;
+         Operand : Pointer;
+         Greedy  : Boolean;
+         Opsize  : Pointer) return Pointer;
+      --  Insert an operator before Operand (and move the latter forward in the
+      --  program). Opsize is the size needed to represent the operator.
+      --  This returns the position at which the operator was
+      --  inserted, and moves Emit_Ptr after the new position of the
+      --  operand.
+
       procedure Insert_Curly_Operator
         (Op      : Opcode;
          Min     : Natural;
@@ -471,9 +466,6 @@  package body System.Regpat is
       procedure Link_Operand_Tail (P, Val : Pointer);
       --  Link_Tail on operand of first argument; noop if operand-less
 
-      function  Next_Instruction (P : Pointer) return Pointer;
-      --  Dig the "next" pointer out of a node
-
       procedure Fail (M : String);
       pragma No_Return (Fail);
       --  Fail with a diagnostic message, if possible
@@ -533,7 +525,7 @@  package body System.Regpat is
 
       procedure Emit (B : Character) is
       begin
-         if Emit_Code then
+         if Emit_Ptr <= PM.Size then
             Program (Emit_Ptr) := B;
          end if;
 
@@ -551,7 +543,7 @@  package body System.Regpat is
            (Character_Class, Program31);
 
       begin
-         if Emit_Code then
+         if Emit_Ptr + 31 <= PM.Size then
             Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap);
          end if;
 
@@ -564,7 +556,7 @@  package body System.Regpat is
 
       procedure Emit_Natural (IP : Pointer; N : Natural) is
       begin
-         if Emit_Code then
+         if IP + 1 <= PM.Size then
             Program (IP + 1) := Character'Val (N / 256);
             Program (IP) := Character'Val (N mod 256);
          end if;
@@ -578,7 +570,7 @@  package body System.Regpat is
          Result : constant Pointer := Emit_Ptr;
 
       begin
-         if Emit_Code then
+         if Emit_Ptr + 2 <= PM.Size then
             Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op));
             Program (Emit_Ptr + 1) := ASCII.NUL;
             Program (Emit_Ptr + 2) := ASCII.NUL;
@@ -659,12 +651,29 @@  package body System.Regpat is
          Operand : Pointer;
          Greedy  : Boolean := True)
       is
-         Dest   : constant Pointer := Emit_Ptr;
          Old    : Pointer;
-         Size   : Pointer := 7;
+      begin
+         Old := Insert_Operator_Before (Op, Operand, Greedy, Opsize => 7);
+         Emit_Natural (Old + 3, Min);
+         Emit_Natural (Old + 5, Max);
+      end Insert_Curly_Operator;
+
+      ----------------------------
+      -- Insert_Operator_Before --
+      ----------------------------
+
+      function Insert_Operator_Before
+        (Op      : Opcode;
+         Operand : Pointer;
+         Greedy  : Boolean;
+         Opsize  : Pointer) return Pointer
+      is
+         Dest : constant Pointer := Emit_Ptr;
+         Old  : Pointer;
+         Size : Pointer := Opsize;
 
       begin
-         --  If the operand is not greedy, insert an extra operand before it
+         --  If not greedy, we have to emit another opcode first
 
          if not Greedy then
             Size := Size + 3;
@@ -673,7 +682,7 @@  package body System.Regpat is
          --  Move the operand in the byte-compilation, so that we can insert
          --  the operator before it.
 
-         if Emit_Code then
+         if Emit_Ptr + Size <= PM.Size then
             Program (Operand + Size .. Emit_Ptr + Size) :=
               Program (Operand .. Emit_Ptr);
          end if;
@@ -689,11 +698,9 @@  package body System.Regpat is
          end if;
 
          Old := Emit_Node (Op);
-         Emit_Natural (Old + 3, Min);
-         Emit_Natural (Old + 5, Max);
-
          Emit_Ptr := Dest + Size;
-      end Insert_Curly_Operator;
+         return Old;
+      end Insert_Operator_Before;
 
       ---------------------
       -- Insert_Operator --
@@ -704,40 +711,10 @@  package body System.Regpat is
          Operand : Pointer;
          Greedy  : Boolean := True)
       is
-         Dest : constant Pointer := Emit_Ptr;
-         Old  : Pointer;
-         Size : Pointer := 3;
-
          Discard : Pointer;
          pragma Warnings (Off, Discard);
-
       begin
-         --  If not greedy, we have to emit another opcode first
-
-         if not Greedy then
-            Size := Size + 3;
-         end if;
-
-         --  Move the operand in the byte-compilation, so that we can insert
-         --  the operator before it.
-
-         if Emit_Code then
-            Program (Operand + Size .. Emit_Ptr + Size) :=
-              Program (Operand .. Emit_Ptr);
-         end if;
-
-         --  Insert the operator at the position previously occupied by the
-         --  operand.
-
-         Emit_Ptr := Operand;
-
-         if not Greedy then
-            Old := Emit_Node (MINMOD);
-            Link_Tail (Old, Old + 3);
-         end if;
-
-         Discard := Emit_Node (Op);
-         Emit_Ptr := Dest + Size;
+         Discard := Insert_Operator_Before (Op, Operand, Greedy, Opsize => 3);
       end Insert_Operator;
 
       -----------------------
@@ -804,7 +781,7 @@  package body System.Regpat is
 
       procedure Link_Operand_Tail (P, Val : Pointer) is
       begin
-         if Emit_Code and then Program (P) = BRANCH then
+         if Program (P) = BRANCH then
             Link_Tail (Operand (P), Val);
          end if;
       end Link_Operand_Tail;
@@ -819,7 +796,7 @@  package body System.Regpat is
          Offset : Pointer;
 
       begin
-         if not Emit_Code then
+         if Emit_Ptr > PM.Size then
             return;
          end if;
 
@@ -827,8 +804,8 @@  package body System.Regpat is
 
          Scan := P;
          loop
-            Temp := Next_Instruction (Scan);
-            exit when Temp = 0;
+            Temp := Get_Next (Program, Scan);
+            exit when Temp = Scan;
             Scan := Temp;
          end loop;
 
@@ -837,27 +814,6 @@  package body System.Regpat is
          Emit_Natural (Scan + 1, Natural (Offset));
       end Link_Tail;
 
-      ----------------------
-      -- Next_Instruction --
-      ----------------------
-
-      function Next_Instruction (P : Pointer) return Pointer is
-         Offset : Pointer;
-
-      begin
-         if not Emit_Code then
-            return 0;
-         end if;
-
-         Offset := Get_Next_Offset (Program, P);
-
-         if Offset = 0 then
-            return 0;
-         end if;
-
-         return P + Offset;
-      end Next_Instruction;
-
       -----------
       -- Parse --
       -----------
@@ -873,7 +829,7 @@  package body System.Regpat is
          IP             : out Pointer)
       is
          E              : String renames Expression;
-         Br             : Pointer;
+         Br, Br2        : Pointer;
          Ender          : Pointer;
          Par_No         : Natural;
          New_Flags      : Expression_Flags;
@@ -964,9 +920,10 @@  package body System.Regpat is
 
             Br := IP;
             loop
-               exit when Br = 0;
                Link_Operand_Tail (Br, Ender);
-               Br := Next_Instruction (Br);
+               Br2 := Get_Next (Program, Br);
+               exit when Br2 = Br;
+               Br := Br2;
             end loop;
          end if;
 
@@ -1665,7 +1622,7 @@  package body System.Regpat is
             Parse_Pos := Start_Pos;
          end if;
 
-         if Emit_Code then
+         if Length_Ptr <= PM.Size then
             Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2);
          end if;
 
@@ -2007,7 +1964,6 @@  package body System.Regpat is
    --  Start of processing for Compile
 
    begin
-      Emit (MAGIC);
       Parse (False, Expr_Flags, Result);
 
       if Result = 0 then
@@ -2019,7 +1975,7 @@  package body System.Regpat is
       --  Do we want to actually compile the expression, or simply get the
       --  code size ???
 
-      if Emit_Code then
+      if Emit_Ptr <= PM.Size then
          Optimize (PM);
       end if;
 
@@ -2030,19 +1986,37 @@  package body System.Regpat is
      (Expression : String;
       Flags      : Regexp_Flags := No_Flags) return Pattern_Matcher
    is
+      --  Assume the compiled regexp will fit in 1000 chars. If it does not
+      --  we will have to compile a second time once the correct size is
+      --  known. If it fits, we save a significant amount of time by avoiding
+      --  the second compilation.
+      Dummy : Pattern_Matcher (1000);
       Size  : Program_Size;
-      Dummy : Pattern_Matcher (0);
-      pragma Unreferenced (Dummy);
 
    begin
       Compile (Dummy, Expression, Size, Flags);
 
-      declare
-         Result : Pattern_Matcher (Size);
-      begin
-         Compile (Result, Expression, Size, Flags);
-         return Result;
-      end;
+      if Size <= Dummy.Size then
+         return Pattern_Matcher'
+           (Size             => Size,
+            First            => Dummy.First,
+            Anchored         => Dummy.Anchored,
+            Must_Have        => Dummy.Must_Have,
+            Must_Have_Length => Dummy.Must_Have_Length,
+            Paren_Count      => Dummy.Paren_Count,
+            Flags            => Dummy.Flags,
+            Program          => Dummy.Program
+              (Dummy.Program'First .. Dummy.Program'First + Size - 1));
+      else
+         --  We have to recompile now that we know the size
+         --  ??? Can we use Ada05's return construct ?
+         declare
+            Result : Pattern_Matcher (Size);
+         begin
+            Compile (Result, Expression, Size, Flags);
+            return Result;
+         end;
+      end if;
    end Compile;
 
    procedure Compile
@@ -2051,9 +2025,11 @@  package body System.Regpat is
       Flags      : Regexp_Flags := No_Flags)
    is
       Size : Program_Size;
-      pragma Unreferenced (Size);
    begin
       Compile (Matcher, Expression, Size, Flags);
+      if Size > Matcher.Size then
+         raise Expression_Error with "Pattern_Matcher is too small";
+      end if;
    end Compile;
 
    --------------------
@@ -2101,7 +2077,7 @@  package body System.Regpat is
    begin
       while Index < Till loop
          Op   := Opcode'Val (Character'Pos ((Program (Index))));
-         Next := Index + Get_Next_Offset (Program, Index);
+         Next := Get_Next (Program, Index);
 
          if Do_Print then
             declare
@@ -2254,14 +2230,11 @@  package body System.Regpat is
 
    procedure Dump (Self : Pattern_Matcher) is
       Program : Program_Data renames Self.Program;
-      Index   : Pointer := Program'First + 1;
+      Index   : Pointer := Program'First;
 
    --  Start of processing for Dump
 
    begin
-      pragma Assert (Self.Program (Program_First) = MAGIC,
-                     "Corrupted Pattern_Matcher");
-
       Put_Line ("Must start with (Self.First) = "
                 & Character'Image (Self.First));
 
@@ -2277,7 +2250,6 @@  package body System.Regpat is
          Put_Line ("  Multiple_Lines mode");
       end if;
 
-      Put_Line ("   1:MAGIC");
       Dump_Until (Program, Index, Self.Program'Last + 1, 0);
    end Dump;
 
@@ -2300,27 +2272,10 @@  package body System.Regpat is
    --------------
 
    function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is
-      Offset : constant Pointer := Get_Next_Offset (Program, IP);
    begin
-      if Offset = 0 then
-         return 0;
-      else
-         return IP + Offset;
-      end if;
+      return IP + Pointer (Read_Natural (Program, IP + 1));
    end Get_Next;
 
-   ---------------------
-   -- Get_Next_Offset --
-   ---------------------
-
-   function Get_Next_Offset
-     (Program : Program_Data;
-      IP      : Pointer) return Pointer
-   is
-   begin
-      return Pointer (Read_Natural (Program, IP + 1));
-   end Get_Next_Offset;
-
    --------------
    -- Is_Alnum --
    --------------
@@ -3366,7 +3321,7 @@  package body System.Regpat is
          Last_Paren := 0;
          Matches_Full := (others => No_Match);
 
-         if Match (Program_First + 1) then
+         if Match (Program_First) then
             Matches_Full (0) := (Pos, Input_Pos - 1);
             return True;
          end if;
@@ -3384,12 +3339,6 @@  package body System.Regpat is
          return;
       end if;
 
-      --  Check validity of program
-
-      pragma Assert
-        (Program (Program_First) = MAGIC,
-         "Corrupted Pattern_Matcher");
-
       --  If there is a "must appear" string, look for it
 
       if Self.Must_Have_Length > 0 then
@@ -3618,7 +3567,7 @@  package body System.Regpat is
       Self.Must_Have := Program'Last + 1;
       Self.Must_Have_Length := 0;
 
-      Scan := Program_First + 1;  --  First instruction (can be anything)
+      Scan := Program_First;  --  First instruction (can be anything)
 
       if Program (Scan) = EXACT then
          Self.First := Program (String_Operand (Scan));