From patchwork Mon Jun 21 13:27:38 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Arnaud Charlet X-Patchwork-Id: 56314 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) by ozlabs.org (Postfix) with SMTP id 28CF8B7D88 for ; Mon, 21 Jun 2010 23:27:54 +1000 (EST) Received: (qmail 9646 invoked by alias); 21 Jun 2010 13:27:51 -0000 Received: (qmail 9623 invoked by uid 22791); 21 Jun 2010 13:27:48 -0000 X-SWARE-Spam-Status: No, hits=-0.5 required=5.0 tests=AWL, BAYES_50, T_RP_MATCHES_RCVD X-Spam-Check-By: sourceware.org Received: from mel.act-europe.fr (HELO mel.act-europe.fr) (212.99.106.210) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Mon, 21 Jun 2010 13:27:39 +0000 Received: from localhost (localhost [127.0.0.1]) by filtered-smtp.eu.adacore.com (Postfix) with ESMTP id 851A8CB0225; Mon, 21 Jun 2010 15:27:39 +0200 (CEST) Received: from mel.act-europe.fr ([127.0.0.1]) by localhost (smtp.eu.adacore.com [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id 1dGxAh1WSCv9; Mon, 21 Jun 2010 15:27:39 +0200 (CEST) Received: from saumur.act-europe.fr (saumur.act-europe.fr [10.10.0.183]) by mel.act-europe.fr (Postfix) with ESMTP id 6F6D4CB01F0; Mon, 21 Jun 2010 15:27:39 +0200 (CEST) Received: by saumur.act-europe.fr (Postfix, from userid 525) id 243BAD9A01; Mon, 21 Jun 2010 15:27:38 +0200 (CEST) Date: Mon, 21 Jun 2010 15:27:38 +0200 From: Arnaud Charlet To: gcc-patches@gcc.gnu.org Cc: Emmanuel Briot Subject: [Ada] Minor improvements to regexps Message-ID: <20100621132738.GA27362@adacore.com> Mime-Version: 1.0 Content-Disposition: inline User-Agent: Mutt/1.5.9i X-IsSubscribed: yes Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org This change optimizes Compile by avoiding double compilation in some cases. It also does a minor reduction in the size of the compiled pattern. Tested on x86_64-pc-linux-gnu, committed on trunk 2010-06-21 Emmanuel Briot * s-regpat.adb (Next_Instruction, Get_Next_Offset): Removed, merged into Get_Next. (Insert_Operator_Before): New subprogram, avoids duplicated code (Compile): Avoid doing two compilations when the pattern matcher ends up being small. Index: s-regpat.adb =================================================================== --- s-regpat.adb (revision 161073) +++ s-regpat.adb (working copy) @@ -50,13 +50,6 @@ package body System.Regpat is Debug : constant Boolean := False; -- Set to True to activate debug traces - MAGIC : constant Character := Character'Val (10#0234#); - -- The first byte of the regexp internal "program" is actually - -- this magic number; the start node begins in the second byte. - -- - -- This is used to make sure that a regular expression was correctly - -- compiled. - ---------------------------- -- Implementation details -- ---------------------------- @@ -79,21 +72,19 @@ package body System.Regpat is -- You can see the exact byte-compiled version by using the Dump -- subprogram. However, here are a few examples: - -- (a|b): 1 : MAGIC - -- 2 : BRANCH (next at 10) - -- 5 : EXACT (next at 18) operand=a - -- 10 : BRANCH (next at 18) - -- 13 : EXACT (next at 18) operand=b - -- 18 : EOP (next at 0) + -- (a|b): 1 : BRANCH (next at 9) + -- 4 : EXACT (next at 17) operand=a + -- 9 : BRANCH (next at 17) + -- 12 : EXACT (next at 17) operand=b + -- 17 : EOP (next at 0) -- - -- (ab)*: 1 : MAGIC - -- 2 : CURLYX (next at 26) { 0, 32767} - -- 9 : OPEN 1 (next at 13) - -- 13 : EXACT (next at 19) operand=ab - -- 19 : CLOSE 1 (next at 23) - -- 23 : WHILEM (next at 0) - -- 26 : NOTHING (next at 29) - -- 29 : EOP (next at 0) + -- (ab)*: 1 : CURLYX (next at 25) { 0, 32767} + -- 8 : OPEN 1 (next at 12) + -- 12 : EXACT (next at 18) operand=ab + -- 18 : CLOSE 1 (next at 22) + -- 22 : WHILEM (next at 0) + -- 25 : NOTHING (next at 28) + -- 28 : EOP (next at 0) -- The opcodes are: @@ -282,11 +273,6 @@ package body System.Regpat is Op : out Character_Class); -- Return a pointer to the string argument of the node at P - function Get_Next_Offset - (Program : Program_Data; - IP : Pointer) return Pointer; - -- Get the offset field of a node. Used by Get_Next - function Get_Next (Program : Program_Data; IP : Pointer) return Pointer; @@ -306,7 +292,6 @@ package body System.Regpat is pragma Inline (Is_Alnum); pragma Inline (Is_White_Space); pragma Inline (Get_Next); - pragma Inline (Get_Next_Offset); pragma Inline (Operand); pragma Inline (Read_Natural); pragma Inline (String_Length); @@ -389,7 +374,6 @@ package body System.Regpat is PM : Pattern_Matcher renames Matcher; Program : Program_Data renames PM.Program; - Emit_Code : constant Boolean := PM.Size > 0; Emit_Ptr : Pointer := Program_First; Parse_Pos : Natural := Expression'First; -- Input-scan pointer @@ -456,6 +440,17 @@ package body System.Regpat is -- This applies to PLUS and STAR. -- If Minmod is True, then the operator is non-greedy. + function Insert_Operator_Before + (Op : Opcode; + Operand : Pointer; + Greedy : Boolean; + Opsize : Pointer) return Pointer; + -- Insert an operator before Operand (and move the latter forward in the + -- program). Opsize is the size needed to represent the operator. + -- This returns the position at which the operator was + -- inserted, and moves Emit_Ptr after the new position of the + -- operand. + procedure Insert_Curly_Operator (Op : Opcode; Min : Natural; @@ -471,9 +466,6 @@ package body System.Regpat is procedure Link_Operand_Tail (P, Val : Pointer); -- Link_Tail on operand of first argument; noop if operand-less - function Next_Instruction (P : Pointer) return Pointer; - -- Dig the "next" pointer out of a node - procedure Fail (M : String); pragma No_Return (Fail); -- Fail with a diagnostic message, if possible @@ -533,7 +525,7 @@ package body System.Regpat is procedure Emit (B : Character) is begin - if Emit_Code then + if Emit_Ptr <= PM.Size then Program (Emit_Ptr) := B; end if; @@ -551,7 +543,7 @@ package body System.Regpat is (Character_Class, Program31); begin - if Emit_Code then + if Emit_Ptr + 31 <= PM.Size then Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap); end if; @@ -564,7 +556,7 @@ package body System.Regpat is procedure Emit_Natural (IP : Pointer; N : Natural) is begin - if Emit_Code then + if IP + 1 <= PM.Size then Program (IP + 1) := Character'Val (N / 256); Program (IP) := Character'Val (N mod 256); end if; @@ -578,7 +570,7 @@ package body System.Regpat is Result : constant Pointer := Emit_Ptr; begin - if Emit_Code then + if Emit_Ptr + 2 <= PM.Size then Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op)); Program (Emit_Ptr + 1) := ASCII.NUL; Program (Emit_Ptr + 2) := ASCII.NUL; @@ -659,12 +651,29 @@ package body System.Regpat is Operand : Pointer; Greedy : Boolean := True) is - Dest : constant Pointer := Emit_Ptr; Old : Pointer; - Size : Pointer := 7; + begin + Old := Insert_Operator_Before (Op, Operand, Greedy, Opsize => 7); + Emit_Natural (Old + 3, Min); + Emit_Natural (Old + 5, Max); + end Insert_Curly_Operator; + + ---------------------------- + -- Insert_Operator_Before -- + ---------------------------- + + function Insert_Operator_Before + (Op : Opcode; + Operand : Pointer; + Greedy : Boolean; + Opsize : Pointer) return Pointer + is + Dest : constant Pointer := Emit_Ptr; + Old : Pointer; + Size : Pointer := Opsize; begin - -- If the operand is not greedy, insert an extra operand before it + -- If not greedy, we have to emit another opcode first if not Greedy then Size := Size + 3; @@ -673,7 +682,7 @@ package body System.Regpat is -- Move the operand in the byte-compilation, so that we can insert -- the operator before it. - if Emit_Code then + if Emit_Ptr + Size <= PM.Size then Program (Operand + Size .. Emit_Ptr + Size) := Program (Operand .. Emit_Ptr); end if; @@ -689,11 +698,9 @@ package body System.Regpat is end if; Old := Emit_Node (Op); - Emit_Natural (Old + 3, Min); - Emit_Natural (Old + 5, Max); - Emit_Ptr := Dest + Size; - end Insert_Curly_Operator; + return Old; + end Insert_Operator_Before; --------------------- -- Insert_Operator -- @@ -704,40 +711,10 @@ package body System.Regpat is Operand : Pointer; Greedy : Boolean := True) is - Dest : constant Pointer := Emit_Ptr; - Old : Pointer; - Size : Pointer := 3; - Discard : Pointer; pragma Warnings (Off, Discard); - begin - -- If not greedy, we have to emit another opcode first - - if not Greedy then - Size := Size + 3; - end if; - - -- Move the operand in the byte-compilation, so that we can insert - -- the operator before it. - - if Emit_Code then - Program (Operand + Size .. Emit_Ptr + Size) := - Program (Operand .. Emit_Ptr); - end if; - - -- Insert the operator at the position previously occupied by the - -- operand. - - Emit_Ptr := Operand; - - if not Greedy then - Old := Emit_Node (MINMOD); - Link_Tail (Old, Old + 3); - end if; - - Discard := Emit_Node (Op); - Emit_Ptr := Dest + Size; + Discard := Insert_Operator_Before (Op, Operand, Greedy, Opsize => 3); end Insert_Operator; ----------------------- @@ -804,7 +781,7 @@ package body System.Regpat is procedure Link_Operand_Tail (P, Val : Pointer) is begin - if Emit_Code and then Program (P) = BRANCH then + if Program (P) = BRANCH then Link_Tail (Operand (P), Val); end if; end Link_Operand_Tail; @@ -819,7 +796,7 @@ package body System.Regpat is Offset : Pointer; begin - if not Emit_Code then + if Emit_Ptr > PM.Size then return; end if; @@ -827,8 +804,8 @@ package body System.Regpat is Scan := P; loop - Temp := Next_Instruction (Scan); - exit when Temp = 0; + Temp := Get_Next (Program, Scan); + exit when Temp = Scan; Scan := Temp; end loop; @@ -837,27 +814,6 @@ package body System.Regpat is Emit_Natural (Scan + 1, Natural (Offset)); end Link_Tail; - ---------------------- - -- Next_Instruction -- - ---------------------- - - function Next_Instruction (P : Pointer) return Pointer is - Offset : Pointer; - - begin - if not Emit_Code then - return 0; - end if; - - Offset := Get_Next_Offset (Program, P); - - if Offset = 0 then - return 0; - end if; - - return P + Offset; - end Next_Instruction; - ----------- -- Parse -- ----------- @@ -873,7 +829,7 @@ package body System.Regpat is IP : out Pointer) is E : String renames Expression; - Br : Pointer; + Br, Br2 : Pointer; Ender : Pointer; Par_No : Natural; New_Flags : Expression_Flags; @@ -964,9 +920,10 @@ package body System.Regpat is Br := IP; loop - exit when Br = 0; Link_Operand_Tail (Br, Ender); - Br := Next_Instruction (Br); + Br2 := Get_Next (Program, Br); + exit when Br2 = Br; + Br := Br2; end loop; end if; @@ -1665,7 +1622,7 @@ package body System.Regpat is Parse_Pos := Start_Pos; end if; - if Emit_Code then + if Length_Ptr <= PM.Size then Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2); end if; @@ -2007,7 +1964,6 @@ package body System.Regpat is -- Start of processing for Compile begin - Emit (MAGIC); Parse (False, Expr_Flags, Result); if Result = 0 then @@ -2019,7 +1975,7 @@ package body System.Regpat is -- Do we want to actually compile the expression, or simply get the -- code size ??? - if Emit_Code then + if Emit_Ptr <= PM.Size then Optimize (PM); end if; @@ -2030,19 +1986,37 @@ package body System.Regpat is (Expression : String; Flags : Regexp_Flags := No_Flags) return Pattern_Matcher is + -- Assume the compiled regexp will fit in 1000 chars. If it does not + -- we will have to compile a second time once the correct size is + -- known. If it fits, we save a significant amount of time by avoiding + -- the second compilation. + Dummy : Pattern_Matcher (1000); Size : Program_Size; - Dummy : Pattern_Matcher (0); - pragma Unreferenced (Dummy); begin Compile (Dummy, Expression, Size, Flags); - declare - Result : Pattern_Matcher (Size); - begin - Compile (Result, Expression, Size, Flags); - return Result; - end; + if Size <= Dummy.Size then + return Pattern_Matcher' + (Size => Size, + First => Dummy.First, + Anchored => Dummy.Anchored, + Must_Have => Dummy.Must_Have, + Must_Have_Length => Dummy.Must_Have_Length, + Paren_Count => Dummy.Paren_Count, + Flags => Dummy.Flags, + Program => Dummy.Program + (Dummy.Program'First .. Dummy.Program'First + Size - 1)); + else + -- We have to recompile now that we know the size + -- ??? Can we use Ada05's return construct ? + declare + Result : Pattern_Matcher (Size); + begin + Compile (Result, Expression, Size, Flags); + return Result; + end; + end if; end Compile; procedure Compile @@ -2051,9 +2025,11 @@ package body System.Regpat is Flags : Regexp_Flags := No_Flags) is Size : Program_Size; - pragma Unreferenced (Size); begin Compile (Matcher, Expression, Size, Flags); + if Size > Matcher.Size then + raise Expression_Error with "Pattern_Matcher is too small"; + end if; end Compile; -------------------- @@ -2101,7 +2077,7 @@ package body System.Regpat is begin while Index < Till loop Op := Opcode'Val (Character'Pos ((Program (Index)))); - Next := Index + Get_Next_Offset (Program, Index); + Next := Get_Next (Program, Index); if Do_Print then declare @@ -2254,14 +2230,11 @@ package body System.Regpat is procedure Dump (Self : Pattern_Matcher) is Program : Program_Data renames Self.Program; - Index : Pointer := Program'First + 1; + Index : Pointer := Program'First; -- Start of processing for Dump begin - pragma Assert (Self.Program (Program_First) = MAGIC, - "Corrupted Pattern_Matcher"); - Put_Line ("Must start with (Self.First) = " & Character'Image (Self.First)); @@ -2277,7 +2250,6 @@ package body System.Regpat is Put_Line (" Multiple_Lines mode"); end if; - Put_Line (" 1:MAGIC"); Dump_Until (Program, Index, Self.Program'Last + 1, 0); end Dump; @@ -2300,27 +2272,10 @@ package body System.Regpat is -------------- function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is - Offset : constant Pointer := Get_Next_Offset (Program, IP); begin - if Offset = 0 then - return 0; - else - return IP + Offset; - end if; + return IP + Pointer (Read_Natural (Program, IP + 1)); end Get_Next; - --------------------- - -- Get_Next_Offset -- - --------------------- - - function Get_Next_Offset - (Program : Program_Data; - IP : Pointer) return Pointer - is - begin - return Pointer (Read_Natural (Program, IP + 1)); - end Get_Next_Offset; - -------------- -- Is_Alnum -- -------------- @@ -3366,7 +3321,7 @@ package body System.Regpat is Last_Paren := 0; Matches_Full := (others => No_Match); - if Match (Program_First + 1) then + if Match (Program_First) then Matches_Full (0) := (Pos, Input_Pos - 1); return True; end if; @@ -3384,12 +3339,6 @@ package body System.Regpat is return; end if; - -- Check validity of program - - pragma Assert - (Program (Program_First) = MAGIC, - "Corrupted Pattern_Matcher"); - -- If there is a "must appear" string, look for it if Self.Must_Have_Length > 0 then @@ -3618,7 +3567,7 @@ package body System.Regpat is Self.Must_Have := Program'Last + 1; Self.Must_Have_Length := 0; - Scan := Program_First + 1; -- First instruction (can be anything) + Scan := Program_First; -- First instruction (can be anything) if Program (Scan) = EXACT then Self.First := Program (String_Operand (Scan));