From: Stefan Monnier via "Bug reports for GNU Emacs, the Swiss army knife of text editors" <bug-gnu-emacs@gnu.org>
To: 66261@debbugs.gnu.org
Subject: bug#66261: Disassembling a regexp's bytecode
Date: Thu, 28 Sep 2023 22:28:16 -0400 [thread overview]
Message-ID: <jwvsf6xaf27.fsf@iro.umontreal.ca> (raw)
[-- Attachment #1: Type: text/plain, Size: 1916 bytes --]
Tags: patch
I'd like to add a function that lets us see a regexp's bytecode directly
from within Emacs (recompiling with REGEX_EMACS_DEBUG can be quite useful
in many cases, but it's much more invasive and it's often overkill).
The patch below is what I use currently, but clearly it's not ready for
`master`. Before I try and clean it, I'd like to discuss some issues to
figure out how best to solve them:
- First, in order to easily use the same code between REGEX_EMACS_DEBUG
and my new `re--describe-compiled`, I need to print sometimes to
`stderr` and sometimes to a string, which I do using `open_memstream`.
AFAIK `open_memstream` is not directly available in Windows (and
maybe under some other Unixes either, tho it's in POSIX-2008, IIUC).
Could someone help me get an `open_memstream` emulation working
(maybe via gnulib)?
- I'm thinking of always providing this function. Another option would
be to do it under the control of a compilation flag, tho it doesn't
seem worth adding a new flag just for that. I guess we could
reuse REGEX_EMACS_DEBUG (tho it's too invasive IMO), or
ENABLE_CHECKING, but I'd rather just always offer the function.
After all, it might encourage users to look more carefully at their
regexps and maybe even to help us improve our regexp engine, who knows.
Stefan
In GNU Emacs 30.0.50 (build 1, x86_64-pc-linux-gnu, X toolkit, cairo
version 1.16.0, Xaw3d scroll bars) of 2023-09-16 built on pastel
Repository revision: 0954f127b8840bf843a2acfb18d2e18e526166e1
Repository branch: work
Windowing system distributor 'The X.Org Foundation', version 11.0.12101007
System Description: Debian GNU/Linux 12 (bookworm)
Configured using:
'configure -C --enable-checking --enable-check-lisp-object-type --with-modules --with-cairo --with-tiff=ifavailable
'CFLAGS=-Wall -g3 -Og -Wno-pointer-sign'
PKG_CONFIG_PATH=/home/monnier/lib/pkgconfig'
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: regexp.patch --]
[-- Type: text/patch, Size: 7798 bytes --]
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index e42c045bb86..bc26bb02dce 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -447,7 +447,7 @@ #define CHARSET_RANGE_TABLE_END(range_table, count) \
# include "sysstdio.h"
static void
-debug_putchar (int c)
+debug_putchar (FILE *stderr, int c)
{
if (c >= 32 && c <= 126)
putc (c, stderr);
@@ -461,7 +461,7 @@ debug_putchar (int c)
/* Print the fastmap in human-readable form. */
static void
-print_fastmap (char *fastmap)
+print_fastmap (FILE *stderr, char *fastmap)
{
bool was_a_range = false;
int i = 0;
@@ -471,7 +471,7 @@ print_fastmap (char *fastmap)
if (fastmap[i++])
{
was_a_range = false;
- debug_putchar (i - 1);
+ debug_putchar (stderr, i - 1);
while (i < (1 << BYTEWIDTH) && fastmap[i])
{
was_a_range = true;
@@ -479,8 +479,8 @@ print_fastmap (char *fastmap)
}
if (was_a_range)
{
- debug_putchar ('-');
- debug_putchar (i - 1);
+ debug_putchar (stderr, '-');
+ debug_putchar (stderr, i - 1);
}
}
}
@@ -492,7 +492,7 @@ print_fastmap (char *fastmap)
the START pointer into it and ending just before the pointer END. */
static void
-print_partial_compiled_pattern (re_char *start, re_char *end)
+print_partial_compiled_pattern (FILE *stderr, re_char *start, re_char *end)
{
int mcnt, mcnt2;
re_char *p = start;
@@ -524,8 +524,8 @@ print_partial_compiled_pattern (re_char *start, re_char *end)
fprintf (stderr, "/exactn/%d", mcnt);
do
{
- debug_putchar ('/');
- debug_putchar (*p++);
+ debug_putchar (stderr, '/');
+ debug_putchar (stderr, *p++);
}
while (--mcnt);
break;
@@ -567,26 +567,26 @@ print_partial_compiled_pattern (re_char *start, re_char *end)
/* Are we starting a range? */
if (last + 1 == c && ! in_range)
{
- debug_putchar ('-');
+ debug_putchar (stderr, '-');
in_range = true;
}
/* Have we broken a range? */
else if (last + 1 != c && in_range)
{
- debug_putchar (last);
+ debug_putchar (stderr, last);
in_range = false;
}
if (! in_range)
- debug_putchar (c);
+ debug_putchar (stderr, c);
last = c;
}
if (in_range)
- debug_putchar (last);
+ debug_putchar (stderr, last);
- debug_putchar (']');
+ debug_putchar (stderr, ']');
p += 1 + length;
@@ -737,28 +737,30 @@ print_partial_compiled_pattern (re_char *start, re_char *end)
}
-static void
-print_compiled_pattern (struct re_pattern_buffer *bufp)
+void
+print_compiled_pattern (FILE *dest, struct re_pattern_buffer *bufp)
{
re_char *buffer = bufp->buffer;
- print_partial_compiled_pattern (buffer, buffer + bufp->used);
- fprintf (stderr, "%td bytes used/%td bytes allocated.\n",
+ print_partial_compiled_pattern (dest, buffer, buffer + bufp->used);
+ fprintf (dest, "%td bytes used/%td bytes allocated.\n",
bufp->used, bufp->allocated);
if (bufp->fastmap_accurate && bufp->fastmap)
{
- fputs ("fastmap: ", stderr);
- print_fastmap (bufp->fastmap);
+ fputs ("fastmap: ", dest);
+ print_fastmap (dest, bufp->fastmap);
}
- fprintf (stderr, "re_nsub: %td\t", bufp->re_nsub);
- fprintf (stderr, "regs_alloc: %d\t", bufp->regs_allocated);
- fprintf (stderr, "can_be_null: %d\n", bufp->can_be_null);
+ fprintf (dest, "re_nsub: %td\t", bufp->re_nsub);
+ fprintf (dest, "regs_alloc: %d\t", bufp->regs_allocated);
+ fprintf (dest, "can_be_null: %d\n", bufp->can_be_null);
/* Perhaps we should print the translate table? */
}
+#ifdef REGEX_EMACS_DEBUG
+
static void
print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
re_char *string2, ptrdiff_t size2)
@@ -771,17 +773,15 @@ print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
if (FIRST_STRING_P (where))
{
for (i = 0; i < string1 + size1 - where; i++)
- debug_putchar (where[i]);
+ debug_putchar (stderr, where[i]);
where = string2;
}
for (i = 0; i < string2 + size2 - where; i++)
- debug_putchar (where[i]);
+ debug_putchar (stderr, where[i]);
}
}
-#ifdef REGEX_EMACS_DEBUG
-
static int regex_emacs_debug = -10000;
# define DEBUG_STATEMENT(e) e
@@ -789,7 +789,7 @@ print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
if (regex_emacs_debug > 0) fprintf (stderr, __VA_ARGS__)
# define DEBUG_COMPILES_ARGUMENTS
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
- if (regex_emacs_debug > 0) print_partial_compiled_pattern (s, e)
+ if (regex_emacs_debug > 0) print_partial_compiled_pattern (stderr, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2)
@@ -1769,7 +1769,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
if (regex_emacs_debug > 0)
{
for (ptrdiff_t debug_count = 0; debug_count < size; debug_count++)
- debug_putchar (pattern[debug_count]);
+ debug_putchar (stderr, pattern[debug_count]);
putc ('\n', stderr);
}
#endif
@@ -2700,7 +2700,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
{
re_compile_fastmap (bufp);
DEBUG_PRINT ("\nCompiled pattern:\n");
- print_compiled_pattern (bufp);
+ print_compiled_pattern (stderr, bufp);
}
regex_emacs_debug--;
#endif
diff --git a/src/regex-emacs.h b/src/regex-emacs.h
index bc357633135..e355cd30eb0 100644
--- a/src/regex-emacs.h
+++ b/src/regex-emacs.h
@@ -195,4 +195,6 @@ #define EMACS_REGEX_H 1
extern re_wctype_t re_wctype_parse (const unsigned char **strp,
ptrdiff_t limit);
+extern void print_compiled_pattern (FILE *dest, struct re_pattern_buffer *bufp);
+
#endif /* EMACS_REGEX_H */
diff --git a/src/search.c b/src/search.c
index 3d86b24c2b5..ed8115d0c54 100644
--- a/src/search.c
+++ b/src/search.c
@@ -115,8 +115,8 @@ compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern,
else
cp->f_whitespace_regexp = Qnil;
- whitespace_regexp = STRINGP (Vsearch_spaces_regexp) ?
- SSDATA (Vsearch_spaces_regexp) : NULL;
+ whitespace_regexp = STRINGP (Vsearch_spaces_regexp)
+ ? SSDATA (Vsearch_spaces_regexp) : NULL;
val = (char *) re_compile_pattern (SSDATA (pattern), SBYTES (pattern),
posix, whitespace_regexp, &cp->buf);
@@ -3385,6 +3385,30 @@ DEFUN ("newline-cache-check", Fnewline_cache_check, Snewline_cache_check,
set_buffer_internal_1 (old);
return val;
}
+
+DEFUN ("re--describe-compiled", Fre__describe_compiled, Sre__describe_compiled,
+       1, 1, 0,
+       doc: /* Return a string describing the compiled form of REGEXP. */)
+  (Lisp_Object regexp)
+{
+  struct regexp_cache *cache_entry
+    = compile_pattern (regexp, NULL,
+                       (!NILP (BVAR (current_buffer, case_fold_search))
+                        ? BVAR (current_buffer, case_canon_table) : Qnil),
+                       false,
+                       !NILP (BVAR (current_buffer,
+                                    enable_multibyte_characters)));
+  char *buffer = NULL;
+  size_t size = 0;
+  FILE *f = open_memstream (&buffer, &size);
+  if (!f)
+    report_file_error ("open_memstream failed", regexp);
+  print_compiled_pattern (f, &cache_entry->buf);
+  fclose (f);  /* Closing the stream finalizes BUFFER and SIZE.  */
+  Lisp_Object desc = buffer ? make_unibyte_string (buffer, size) : Qnil;
+  free (buffer);  /* The stream's buffer is ours to release; free (NULL) is OK.  */
+  return desc;
+}
\f
static void syms_of_search_for_pdumper (void);
@@ -3464,6 +3488,7 @@ syms_of_search (void)
defsubr (&Smatch_data__translate);
defsubr (&Sregexp_quote);
defsubr (&Snewline_cache_check);
+ defsubr (&Sre__describe_compiled);
pdumper_do_now_and_after_load (syms_of_search_for_pdumper);
}
next reply other threads:[~2023-09-29 2:28 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-09-29 2:28 Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors [this message]
2023-09-29 15:06 ` bug#66261: Disassembling a regexp's bytecode Eli Zaretskii
2023-09-29 15:47 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-29 16:24 ` Eli Zaretskii
2023-09-29 16:33 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-29 18:56 ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=jwvsf6xaf27.fsf@iro.umontreal.ca \
--to=bug-gnu-emacs@gnu.org \
--cc=66261@debbugs.gnu.org \
--cc=monnier@iro.umontreal.ca \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this external index
https://git.savannah.gnu.org/cgit/emacs.git
https://git.savannah.gnu.org/cgit/emacs/org-mode.git
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.