diff --git a/src/regex-emacs.c b/src/regex-emacs.c index 55d0a6e8df8..c54f9f81890 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c @@ -3068,8 +3068,10 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) continue; case succeed_n: - /* If N == 0, it should be an on_failure_jump_loop instead. */ - DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0)); + /* If N == 0, it should be an on_failure_jump_loop instead. + `j` can be negative because `EXTRACT_NUMBER` extracts a + signed number whereas `succeed_n` treats it as unsigned. */ + DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0)); p += 4; /* We only care about one iteration of the loop, so we don't need to consider the case where this behaves like an @@ -3743,20 +3745,20 @@ mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1, } /* True if "p1 matches something" implies "p2 fails". */ -/* Avoiding inf-loops: - We're trying to follow all paths reachable from `p2`, but since some +/* We're trying to follow all paths reachable from `p2`, but since some loops can match the empty string, this can loop back to `p2`. - To avoid inf-looping, we keep track of points that have been considered - "already". Instead of keeping a list of such points, `done_beg` and - `done_end` delimit a chunk of bytecode we already considered. - To guarantee termination, a lexical ordering between `done_*` and `p2` - should be obeyed: - At each recursion, either `done_beg` gets smaller, - or `done_beg` is unchanged and `done_end` gets larger - or they're both unchanged and `p2` gets larger. */ + To avoid inf-looping, we take advantage of the fact that + the bytecode we generate is made of syntactically nested loops, more + specifically, every loop has a single entry point and single exit point. + The function takes 2 more arguments (`loop_entry` and `loop_exit`). + `loop_entry` points to the sole entry point of the current loop and + `loop_exit` points to its sole exit point (when non-NULL). + The function can assume that `loop_exit` is "mutually exclusive". + Termination of recursive calls is ensured because either `loop_entry` + is larger, or it's equal but `p2` is larger. */ static bool mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, - re_char *p2, re_char *done_beg, re_char *done_end) + re_char *p2, re_char *loop_entry, re_char *loop_exit) { re_opcode_t op2; unsigned char *pend = bufp->buffer + bufp->used; @@ -3765,8 +3767,23 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, eassert (p1 >= bufp->buffer && p1 < pend && p2 >= bufp->buffer && p2 <= pend); - eassert (done_beg <= done_end); - eassert (done_end <= p2); + if (p2 == loop_exit) + /* Presumably already checked elsewhere. */ + return true; + eassert (loop_entry && p2 >= loop_entry); + /* eassert (!loop_exit || p2 < loop_exit); */ + if (p2 < loop_entry) + return false; + if (loop_exit && p2 > loop_exit) + { + void print_compiled_pattern (struct re_pattern_buffer *bufp); + print_compiled_pattern (bufp); + fprintf (stderr, "p1==%d, p2==%d, loop-entry==%d, loop-exit==%d\n", + p1 - bufp->buffer, p2 - bufp->buffer, + loop_entry - bufp->buffer, loop_exit - bufp->buffer); + error ("WOW1!"); + } + /* return false; */ /* Skip over open/close-group commands. If what follows this loop is a ...+ construct, @@ -3860,27 +3877,30 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, EXTRACT_NUMBER_AND_INCR (mcnt, p2); re_char *p2_other = p2 + mcnt; - /* When we jump backward we bump `done_end` up to `p3` under - the assumption that any other position between `done_end` - and `p3` is either: - - checked by the other call to RECURSE. - - not reachable from here (e.g. for positions before the - `on_failure_jump`), or at least not without first - jumping before `done_beg`. - This should hold because our state machines are not arbitrary: - they consists of syntaxically nested loops with limited - control flow. - FIXME: This can fail (i.e. return true when it shouldn't) - if we start generating bytecode with a different shape, - so maybe we should bite the bullet and replace done_beg/end - with an actual list of positions we've already processed. */ -#define RECURSE(p3) \ - ((p3) < done_beg ? mutually_exclusive_aux (bufp, p1, p3, p3, p3) \ - : (p3) <= done_end ? true \ - : mutually_exclusive_aux (bufp, p1, p3, done_beg, \ - (p3) > p2_orig ? done_end : (p3))) - - return RECURSE (p2) && RECURSE (p2_other); + /* If we jump backward to the entry point of the current loop + it means it's a zero-length cycle through that loop, so + this cycle itself does not break mutual-exclusion. + If we jump backward to a new loop nested within the current one + then the `p2_other` points to the exit of that inner loop + and the other RECURSE will check what happens when we leave + the loop so we can focus on checking just the loop body, + so we pass new values for `loop-entry` and `loop_exit`. + In all other cases, we just recurse "normally": if it's before + `loop_entry` it's a bug that will be caught by checks at + the entrace of the function and otherwise it's a forward jump + which doesn't need any special treatment. + FIXME: This is failsafe (can't return true when it shouldn't) + but it could be too conservative if we start generating bytecode + with a different shape, so maybe we should bite the bullet and + replace done_beg/end with an actual list of positions we've + already processed. */ +#define RECURSE(p3, p3_other) \ + ((p3) == loop_entry ? true \ + : loop_entry < (p3) && (p3) < p2_orig ? \ + mutually_exclusive_aux (bufp, p1, p3, p3, p3_other) \ + : mutually_exclusive_aux (bufp, p1, p3, loop_entry, loop_exit)) + + return RECURSE (p2, p2_other) && RECURSE (p2_other, p2); } default: @@ -3895,7 +3915,7 @@ #define RECURSE(p3) \ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, re_char *p2) { - return mutually_exclusive_aux (bufp, p1, p2, p2, p2); + return mutually_exclusive_aux (bufp, p1, p2, bufp->buffer, NULL); } /* Matching routines. */