diff mbox series

[v7,2/3] posix: regexec(): fix REG_STARTEND, pmatch->rm_so != 0 w/^ anchor

Message ID 695cd581035b59c759477b806640ee0b70df05f7.1686530834.git.nabijaczleweli@nabijaczleweli.xyz
State New
Headers show
Series [v7,1/3] posix: regcomp(): clear RE_DOT_NOT_NULL | expand

Commit Message

Ahelenia Ziemiańska June 12, 2023, 12:47 a.m. UTC
re_search_internal () starts with
  /* If initial states with non-begbuf contexts have no elements,
     the regex must be anchored.  If preg->newline_anchor is set,
     we'll never use init_state_nl, so do not check it.  */
  if (dfa->init_state->nodes.nelem == 0
      && dfa->init_state_word->nodes.nelem == 0
      && (dfa->init_state_nl->nodes.nelem == 0
	  || !preg->newline_anchor))
    {
      if (start != 0 && last_start != 0)
        return REG_NOMATCH;
      start = last_start = 0;
    }
and heretofor start and last_start (for example when "abc", {1, 2},
so matching just the "b") were != 0, and the return was taken for a "^b"
regex, which is erroneous.

Fix this by giving re_search_internal (string+rm_so, start=0),
then fixing up the returned matches in an after-pass.

This brings us to compatibility with the BSD spec and implementations.

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
 posix/regexec.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

Comments

Carlos O'Donell June 12, 2023, 1:11 p.m. UTC | #1
On 6/11/23 20:47, наб wrote:
> re_search_internal () starts with
>   /* If initial states with non-begbuf contexts have no elements,
>      the regex must be anchored.  If preg->newline_anchor is set,
>      we'll never use init_state_nl, so do not check it.  */
>   if (dfa->init_state->nodes.nelem == 0
>       && dfa->init_state_word->nodes.nelem == 0
>       && (dfa->init_state_nl->nodes.nelem == 0
> 	  || !preg->newline_anchor))
>     {
>       if (start != 0 && last_start != 0)
>         return REG_NOMATCH;
>       start = last_start = 0;
>     }
> and heretofor start and last_start (for example when "abc", {1, 2},
> so matching just the "b") were != 0, and the return was taken for a "^b"
> regex, which is erroneous.
> 
> Fix this by giving re_search_internal (string+rm_so, start=0),
> then fixing up the returned matches in an after-pass.
> 
> This brings us to compatibility with the BSD spec and implementations.
> 
> Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>

These changes are being made to sources shared between gnulib and glibc.

As the files are listed in SHARED-SOURCES we cannot easily accept changes to them
via DCO since they should be shared with gnulib which still requires copyright
assignment.

Would you be willing to disclaim these changes or assign copyright?

> ---
>  posix/regexec.c | 41 ++++++++++++++++++++++++++++-------------
>  1 file changed, 28 insertions(+), 13 deletions(-)
> 
> diff --git a/posix/regexec.c b/posix/regexec.c
> index bd0cd412d0..2ef868e1f6 100644
> --- a/posix/regexec.c
> +++ b/posix/regexec.c
> @@ -187,38 +187,53 @@ static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
>     string; if REG_NOTEOL is set, then $ does not match at the end.
>  
>     Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
> -   EFLAGS is invalid.  */
> +   EFLAGS is invalid.
> +
> +   If REG_STARTEND, the bounds are
> +     [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo)
> +   instead of the usual
> +     [STRING, STRING + strlen(STRING)),
> +   but returned matches are still referenced to STRING,
> +   and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$").
> +   re_search_internal () has a built-in assumption of
> +   (start != 0) <=> (^ doesn't match), so give it a truncated view
> +   and fix up the matches afterward.  */
>  
>  int
>  regexec (const regex_t *__restrict preg, const char *__restrict string,
>  	 size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
>  {
>    reg_errcode_t err;
> -  Idx start, length;
> +  Idx startoff = 0, length;
>    re_dfa_t *dfa = preg->buffer;
> +  size_t i = 0;
>  
>    if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
>      return REG_BADPAT;
>  
>    if (eflags & REG_STARTEND)
>      {
> -      start = pmatch[0].rm_so;
> -      length = pmatch[0].rm_eo;
> +      startoff = pmatch[0].rm_so;
> +      string += startoff;
> +      length = pmatch[0].rm_eo - startoff;
>      }
>    else
> -    {
> -      start = 0;
> -      length = strlen (string);
> -    }
> +    length = strlen (string);
>  
>    lock_lock (dfa->lock);
>    if (preg->no_sub)
> -    err = re_search_internal (preg, string, length, start, length,
> -			      length, 0, NULL, eflags);
> -  else
> -    err = re_search_internal (preg, string, length, start, length,
> -			      length, nmatch, pmatch, eflags);
> +    nmatch = 0;
> +  err = re_search_internal (preg, string, length, 0, length,
> +			    length, nmatch, pmatch, eflags);
>    lock_unlock (dfa->lock);
> +
> +  if (err == REG_NOERROR && startoff)
> +    for (i = 0; i < nmatch; ++i)
> +      if (pmatch[i].rm_so != -1)
> +	{
> +	  pmatch[i].rm_so += startoff;
> +	  pmatch[i].rm_eo += startoff;
> +	}
>    return err != REG_NOERROR;
>  }
>
Ahelenia Ziemiańska June 12, 2023, 2:03 p.m. UTC | #2
On Mon, Jun 12, 2023 at 09:11:54AM -0400, Carlos O'Donell wrote:
> On 6/11/23 20:47, наб wrote:
> > re_search_internal () starts with
> >   /* If initial states with non-begbuf contexts have no elements,
> >      the regex must be anchored.  If preg->newline_anchor is set,
> >      we'll never use init_state_nl, so do not check it.  */
> >   if (dfa->init_state->nodes.nelem == 0
> >       && dfa->init_state_word->nodes.nelem == 0
> >       && (dfa->init_state_nl->nodes.nelem == 0
> > 	  || !preg->newline_anchor))
> >     {
> >       if (start != 0 && last_start != 0)
> >         return REG_NOMATCH;
> >       start = last_start = 0;
> >     }
> > and heretofor start and last_start (for example when "abc", {1, 2},
> > so matching just the "b") were != 0, and the return was taken for a "^b"
> > regex, which is erroneous.
> > 
> > Fix this by giving re_search_internal (string+rm_so, start=0),
> > then fixing up the returned matches in an after-pass.
> > 
> > This brings us to compatibility with the BSD spec and implementations.
> > 
> > Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
> 
> These changes are being made to sources shared between gnulib and glibc.
> 
> As the files are listed in SHARED-SOURCES we cannot easily accept changes to them
> via DCO since they should be shared with gnulib which still requires copyright
> assignment.
> 
> Would you be willing to disclaim these changes or assign copyright?

Quite happy to disclaim all patches here, yeah;
the process itself is unclear to me, however. 

Best,
diff mbox series

Patch

diff --git a/posix/regexec.c b/posix/regexec.c
index bd0cd412d0..2ef868e1f6 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -187,38 +187,53 @@  static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
    string; if REG_NOTEOL is set, then $ does not match at the end.
 
    Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
-   EFLAGS is invalid.  */
+   EFLAGS is invalid.
+
+   If REG_STARTEND, the bounds are
+     [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo)
+   instead of the usual
+     [STRING, STRING + strlen(STRING)),
+   but returned matches are still referenced to STRING,
+   and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$").
+   re_search_internal () has a built-in assumption of
+   (start != 0) <=> (^ doesn't match), so give it a truncated view
+   and fix up the matches afterward.  */
 
 int
 regexec (const regex_t *__restrict preg, const char *__restrict string,
 	 size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
 {
   reg_errcode_t err;
-  Idx start, length;
+  Idx startoff = 0, length;
   re_dfa_t *dfa = preg->buffer;
+  size_t i = 0;
 
   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
     return REG_BADPAT;
 
   if (eflags & REG_STARTEND)
     {
-      start = pmatch[0].rm_so;
-      length = pmatch[0].rm_eo;
+      startoff = pmatch[0].rm_so;
+      string += startoff;
+      length = pmatch[0].rm_eo - startoff;
     }
   else
-    {
-      start = 0;
-      length = strlen (string);
-    }
+    length = strlen (string);
 
   lock_lock (dfa->lock);
   if (preg->no_sub)
-    err = re_search_internal (preg, string, length, start, length,
-			      length, 0, NULL, eflags);
-  else
-    err = re_search_internal (preg, string, length, start, length,
-			      length, nmatch, pmatch, eflags);
+    nmatch = 0;
+  err = re_search_internal (preg, string, length, 0, length,
+			    length, nmatch, pmatch, eflags);
   lock_unlock (dfa->lock);
+
+  if (err == REG_NOERROR && startoff)
+    for (i = 0; i < nmatch; ++i)
+      if (pmatch[i].rm_so != -1)
+	{
+	  pmatch[i].rm_so += startoff;
+	  pmatch[i].rm_eo += startoff;
+	}
   return err != REG_NOERROR;
 }