diff mbox

[Ada] Add support for non-capturing parentheses in GNAT.Regpat

Message ID 20140801101613.GA27276@adacore.com
State New
Headers show

Commit Message

Arnaud Charlet Aug. 1, 2014, 10:16 a.m. UTC
It is now possible to use the (?:...) syntax to group elements in
a regular expression without making their matched substring available
in the Match_Array.

The following test must output:
   Matched (0)= 1.. 6
   Matched (1)= 5.. 5
   Matched (2)= 0.. 0

   with Ada.Text_IO;   use Ada.Text_IO;
   with GNAT.Regpat;   use GNAT.Regpat;
   procedure Main is
      --  "(?:a*)" groups without capturing, so "(a+)" is group 1 and
      --  group 2 does not exist in this pattern.
      Matcher : constant Pattern_Matcher := Compile ("ab(?:a*)(a+)b");
      Groups  : Match_Array (0 .. 3);

      procedure Show (Label : String; Loc : Match_Location);
      --  Print Label followed by the bounds of Loc as "First..Last"

      procedure Show (Label : String; Loc : Match_Location) is
      begin
         Put_Line (Label & Loc.First'Img & ".." & Loc.Last'Img);
      end Show;

   begin
      Match (Matcher, "abaaab", Groups);

      --  Nothing to report when the overall match failed

      if Groups (0) = No_Match then
         return;
      end if;

      Show ("Matched (0)=", Groups (0));
      Show ("Matched (1)=", Groups (1));
      Show ("Matched (2)=", Groups (2));
   end Main;

Tested on x86_64-pc-linux-gnu, committed on trunk

2014-08-01  Emmanuel Briot  <briot@adacore.com>

	* s-regpat.adb (Parse): Add support for non-capturing parentheses.
	* s-regpat.ads: Document the (?:...) syntax.
diff mbox

Patch

Index: s-regpat.adb
===================================================================
--- s-regpat.adb	(revision 213263)
+++ s-regpat.adb	(working copy)
@@ -7,7 +7,7 @@ 
 --                                 B o d y                                  --
 --                                                                          --
 --               Copyright (C) 1986 by University of Toronto.               --
---                      Copyright (C) 1999-2013, AdaCore                    --
+--                      Copyright (C) 1999-2014, AdaCore                    --
 --                                                                          --
 -- GNAT is free software;  you can  redistribute it  and/or modify it under --
 -- terms of the  GNU General Public License as published  by the Free Soft- --
@@ -410,10 +410,13 @@ 
 
       procedure Parse
         (Parenthesized : Boolean;
+         Capturing     : Boolean;
          Flags         : out Expression_Flags;
          IP            : out Pointer);
       --  Parse regular expression, i.e. main body or parenthesized thing
       --  Caller must absorb opening parenthesis.
+      --  Capturing should be set to True when we have an open parenthesis
+      --  from which we want the user to extract text.
 
       procedure Parse_Branch
         (Flags         : out Expression_Flags;
@@ -831,9 +834,10 @@ 
       --  the branches to what follows makes it hard to avoid.
 
       procedure Parse
-         (Parenthesized  : Boolean;
-          Flags          : out Expression_Flags;
-          IP             : out Pointer)
+         (Parenthesized : Boolean;
+          Capturing     : Boolean;
+          Flags         : out Expression_Flags;
+          IP            : out Pointer)
       is
          E           : String renames Expression;
          Br, Br2     : Pointer;
@@ -847,7 +851,7 @@ 
 
          --  Make an OPEN node, if parenthesized
 
-         if Parenthesized then
+         if Parenthesized and then Capturing then
             if Matcher.Paren_Count > Max_Paren_Count then
                Fail ("too many ()");
             end if;
@@ -856,7 +860,6 @@ 
             Matcher.Paren_Count := Matcher.Paren_Count + 1;
             IP := Emit_Node (OPEN);
             Emit (Character'Val (Par_No));
-
          else
             IP := 0;
             Par_No := 0;
@@ -913,14 +916,19 @@ 
          --  Make a closing node, and hook it on the end
 
          if Parenthesized then
-            Ender := Emit_Node (CLOSE);
-            Emit (Character'Val (Par_No));
+            if Capturing then
+               Ender := Emit_Node (CLOSE);
+               Emit (Character'Val (Par_No));
+               Link_Tail (IP, Ender);
+            else
+               --  need to keep looking after the closing parenthesis
+               null;
+            end if;
          else
             Ender := Emit_Node (EOP);
+            Link_Tail (IP, Ender);
          end if;
 
-         Link_Tail (IP, Ender);
-
          if Have_Branch and then Emit_Ptr <= PM.Size + 1 then
 
             --  Hook the tails of the branches to the closing node
@@ -945,7 +953,7 @@ 
 
          elsif Parse_Pos <= Parse_End then
             if E (Parse_Pos) = ')'  then
-               Fail ("unmatched ()");
+               Fail ("unmatched ')'");
             else
                Fail ("junk on end");         -- "Can't happen"
             end if;
@@ -1003,16 +1011,24 @@ 
                   New_Flags : Expression_Flags;
 
                begin
-                  Parse (True, New_Flags, IP);
-
-                  if IP = 0 then
-                     return;
+                  if Parse_Pos <= Parse_End - 1
+                     and then Expression (Parse_Pos) = '?'
+                     and then Expression (Parse_Pos + 1) = ':'
+                  then
+                     Parse_Pos := Parse_Pos + 2;
+                     --  non-capturing parenthesis
+                     Parse (True, False, New_Flags, IP);
+                  else
+                     --  capturing parenthesis
+                     Parse (True, True, New_Flags, IP);
+                     Expr_Flags.Has_Width :=
+                       Expr_Flags.Has_Width or else New_Flags.Has_Width;
+                     Expr_Flags.SP_Start :=
+                       Expr_Flags.SP_Start or else New_Flags.SP_Start;
+                     if IP = 0 then
+                        return;
+                     end if;
                   end if;
-
-                  Expr_Flags.Has_Width :=
-                    Expr_Flags.Has_Width or else New_Flags.Has_Width;
-                  Expr_Flags.SP_Start :=
-                    Expr_Flags.SP_Start or else New_Flags.SP_Start;
                end;
 
             when '|' | ASCII.LF | ')' =>
@@ -1971,7 +1987,7 @@ 
    --  Start of processing for Compile
 
    begin
-      Parse (False, Expr_Flags, Result);
+      Parse (False, False, Expr_Flags, Result);
 
       if Result = 0 then
          Fail ("Couldn't compile expression");
Index: s-regpat.ads
===================================================================
--- s-regpat.ads	(revision 213263)
+++ s-regpat.ads	(working copy)
@@ -7,7 +7,7 @@ 
 --                                 S p e c                                  --
 --                                                                          --
 --               Copyright (C) 1986 by University of Toronto.               --
---                     Copyright (C) 1996-2010, AdaCore                     --
+--                     Copyright (C) 1996-2014, AdaCore                     --
 --                                                                          --
 -- GNAT is free software;  you can  redistribute it  and/or modify it under --
 -- terms of the  GNU General Public License as published  by the Free Soft- --
@@ -78,8 +78,10 @@ 
    --            ::= [^ range range ...]  -- matches any character not listed
    --            ::= .                    -- matches any single character
    --                                     -- except newlines
-   --            ::= ( expr )             -- parens used for grouping
-   --            ::= \ num                -- reference to num-th parenthesis
+   --            ::= ( expr )             -- parentheses used for grouping
+   --            ::= (?: expr )           -- non-capturing parentheses
+   --            ::= \ num                -- reference to num-th capturing
+   --                                     -- parenthesis
 
    --     range  ::= char - char          -- matches chars in given range
    --            ::= nchr
@@ -345,6 +347,9 @@ 
    --  N'th parenthesized subexpressions; Matches (0) is for the whole
    --  expression.
    --
+   --  Non-capturing parentheses (introduced with (?:...)) cannot be
+   --  retrieved and do not count in the match array index.
+   --
    --  For instance, if your regular expression is: "a((b*)c+)(d+)", then
    --                                                 12      3
    --     Matches (0) is for "a((b*)c+)(d+)" (the entire expression)