unofficial mirror of bug-gnu-emacs@gnu.org 
 help / color / mirror / code / Atom feed
* bug#66261: Disassembling a regexp's bytecode
@ 2023-09-29  2:28 Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2023-09-29 15:06 ` Eli Zaretskii
  0 siblings, 1 reply; 6+ messages in thread
From: Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2023-09-29  2:28 UTC (permalink / raw)
  To: 66261

[-- Attachment #1: Type: text/plain, Size: 1916 bytes --]

Tags: patch

I'd like to add a function that lets us see a regexp's bytecode directly
from within Emacs (recompiling with REGEX_EMACS_DEBUG can be quite useful
in many cases, but it's much more invasive and it's often overkill).

The patch below is what I use currently, but clearly it's not ready for
`master`.  Before I try and clean it, I'd like to discuss some issues to
figure out how best to solve them:

- First, in order to easily use the same code between REGEX_EMACS_DEBUG
  and my new `re--describe-compiled`, I need to print sometimes to
  `stderr` and sometimes to a string, which I do using `open_memstream`.
  AFAIK `open_memstream` is not directly available in Windows (and
  maybe not under some other Unixes either, tho it's in POSIX-2008, IIUC).
  Could someone help me get an `open_memstream` emulation working
  (maybe via gnulib)?

- I'm thinking of always providing this function.  Another option would
  be to do it under the control of a compilation flag, tho it doesn't
  seem worth adding a new flag just for that.  I guess we could
  reuse REGEX_EMACS_DEBUG (tho it's too invasive IMO), or
  ENABLE_CHECKING, but I'd rather just always offer the function.
  After all, it might encourage users to look more carefully at their
  regexps and maybe even to help us improve our regexp engine, who knows.


        Stefan


 In GNU Emacs 30.0.50 (build 1, x86_64-pc-linux-gnu, X toolkit, cairo
 version 1.16.0, Xaw3d scroll bars) of 2023-09-16 built on pastel
Repository revision: 0954f127b8840bf843a2acfb18d2e18e526166e1
Repository branch: work
Windowing system distributor 'The X.Org Foundation', version 11.0.12101007
System Description: Debian GNU/Linux 12 (bookworm)

Configured using:
 'configure -C --enable-checking --enable-check-lisp-object-type --with-modules --with-cairo --with-tiff=ifavailable
 'CFLAGS=-Wall -g3 -Og -Wno-pointer-sign'
 PKG_CONFIG_PATH=/home/monnier/lib/pkgconfig'


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: regexp.patch --]
[-- Type: text/patch, Size: 7798 bytes --]

diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index e42c045bb86..bc26bb02dce 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -447,7 +447,7 @@ #define CHARSET_RANGE_TABLE_END(range_table, count)	\
 # include "sysstdio.h"
 
 static void
-debug_putchar (int c)
+debug_putchar (FILE *stderr, int c)
 {
   if (c >= 32 && c <= 126)
     putc (c, stderr);
@@ -461,7 +461,7 @@ debug_putchar (int c)
 /* Print the fastmap in human-readable form.  */
 
 static void
-print_fastmap (char *fastmap)
+print_fastmap (FILE *stderr, char *fastmap)
 {
   bool was_a_range = false;
   int i = 0;
@@ -471,7 +471,7 @@ print_fastmap (char *fastmap)
       if (fastmap[i++])
 	{
 	  was_a_range = false;
-	  debug_putchar (i - 1);
+	  debug_putchar (stderr, i - 1);
 	  while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
 	    {
 	      was_a_range = true;
@@ -479,8 +479,8 @@ print_fastmap (char *fastmap)
 	    }
 	  if (was_a_range)
 	    {
-	      debug_putchar ('-');
-	      debug_putchar (i - 1);
+	      debug_putchar (stderr, '-');
+	      debug_putchar (stderr, i - 1);
 	    }
 	}
     }
@@ -492,7 +492,7 @@ print_fastmap (char *fastmap)
    the START pointer into it and ending just before the pointer END.  */
 
 static void
-print_partial_compiled_pattern (re_char *start, re_char *end)
+print_partial_compiled_pattern (FILE *stderr, re_char *start, re_char *end)
 {
   int mcnt, mcnt2;
   re_char *p = start;
@@ -524,8 +524,8 @@ print_partial_compiled_pattern (re_char *start, re_char *end)
 	  fprintf (stderr, "/exactn/%d", mcnt);
 	  do
 	    {
-	      debug_putchar ('/');
-	      debug_putchar (*p++);
+	      debug_putchar (stderr, '/');
+	      debug_putchar (stderr, *p++);
 	    }
 	  while (--mcnt);
 	  break;
@@ -567,26 +567,26 @@ print_partial_compiled_pattern (re_char *start, re_char *end)
 		  /* Are we starting a range?  */
 		  if (last + 1 == c && ! in_range)
 		    {
-		      debug_putchar ('-');
+		      debug_putchar (stderr, '-');
 		      in_range = true;
 		    }
 		  /* Have we broken a range?  */
 		  else if (last + 1 != c && in_range)
 		    {
-		      debug_putchar (last);
+		      debug_putchar (stderr, last);
 		      in_range = false;
 		    }
 
 		  if (! in_range)
-		    debug_putchar (c);
+		    debug_putchar (stderr, c);
 
 		  last = c;
 	      }
 
 	    if (in_range)
-	      debug_putchar (last);
+	      debug_putchar (stderr, last);
 
-	    debug_putchar (']');
+	    debug_putchar (stderr, ']');
 
 	    p += 1 + length;
 
@@ -737,28 +737,30 @@ print_partial_compiled_pattern (re_char *start, re_char *end)
 }
 
 
-static void
-print_compiled_pattern (struct re_pattern_buffer *bufp)
+void
+print_compiled_pattern (FILE *dest, struct re_pattern_buffer *bufp)
 {
   re_char *buffer = bufp->buffer;
 
-  print_partial_compiled_pattern (buffer, buffer + bufp->used);
-  fprintf (stderr, "%td bytes used/%td bytes allocated.\n",
+  print_partial_compiled_pattern (dest, buffer, buffer + bufp->used);
+  fprintf (dest, "%td bytes used/%td bytes allocated.\n",
            bufp->used, bufp->allocated);
 
   if (bufp->fastmap_accurate && bufp->fastmap)
     {
-      fputs ("fastmap: ", stderr);
-      print_fastmap (bufp->fastmap);
+      fputs ("fastmap: ", dest);
+      print_fastmap (dest, bufp->fastmap);
     }
 
-  fprintf (stderr, "re_nsub: %td\t", bufp->re_nsub);
-  fprintf (stderr, "regs_alloc: %d\t", bufp->regs_allocated);
-  fprintf (stderr, "can_be_null: %d\n", bufp->can_be_null);
+  fprintf (dest, "re_nsub: %td\t", bufp->re_nsub);
+  fprintf (dest, "regs_alloc: %d\t", bufp->regs_allocated);
+  fprintf (dest, "can_be_null: %d\n", bufp->can_be_null);
   /* Perhaps we should print the translate table?  */
 }
 
 
+#ifdef REGEX_EMACS_DEBUG
+
 static void
 print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
 		     re_char *string2, ptrdiff_t size2)
@@ -771,17 +773,15 @@ print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
       if (FIRST_STRING_P (where))
 	{
 	  for (i = 0; i < string1 + size1 - where; i++)
-	    debug_putchar (where[i]);
+	    debug_putchar (stderr, where[i]);
 	  where = string2;
 	}
 
       for (i = 0; i < string2 + size2 - where; i++)
-        debug_putchar (where[i]);
+        debug_putchar (stderr, where[i]);
     }
 }
 
-#ifdef REGEX_EMACS_DEBUG
-
 static int regex_emacs_debug = -10000;
 
 # define DEBUG_STATEMENT(e) e
@@ -789,7 +789,7 @@ print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
   if (regex_emacs_debug > 0) fprintf (stderr, __VA_ARGS__)
 # define DEBUG_COMPILES_ARGUMENTS
 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)				\
-  if (regex_emacs_debug > 0) print_partial_compiled_pattern (s, e)
+  if (regex_emacs_debug > 0) print_partial_compiled_pattern (stderr, s, e)
 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
   if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2)
 
@@ -1769,7 +1769,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
   if (regex_emacs_debug > 0)
     {
       for (ptrdiff_t debug_count = 0; debug_count < size; debug_count++)
-	debug_putchar (pattern[debug_count]);
+	debug_putchar (stderr, pattern[debug_count]);
       putc ('\n', stderr);
     }
 #endif
@@ -2700,7 +2700,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
     {
       re_compile_fastmap (bufp);
       DEBUG_PRINT ("\nCompiled pattern:\n");
-      print_compiled_pattern (bufp);
+      print_compiled_pattern (stderr, bufp);
     }
   regex_emacs_debug--;
 #endif
diff --git a/src/regex-emacs.h b/src/regex-emacs.h
index bc357633135..e355cd30eb0 100644
--- a/src/regex-emacs.h
+++ b/src/regex-emacs.h
@@ -195,4 +195,6 @@ #define EMACS_REGEX_H 1
 extern re_wctype_t re_wctype_parse (const unsigned char **strp,
 				    ptrdiff_t limit);
 
+extern void print_compiled_pattern (FILE *dest, struct re_pattern_buffer *bufp);
+
 #endif /* EMACS_REGEX_H */
diff --git a/src/search.c b/src/search.c
index 3d86b24c2b5..ed8115d0c54 100644
--- a/src/search.c
+++ b/src/search.c
@@ -115,8 +115,8 @@ compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern,
   else
     cp->f_whitespace_regexp = Qnil;
 
-  whitespace_regexp = STRINGP (Vsearch_spaces_regexp) ?
-    SSDATA (Vsearch_spaces_regexp) : NULL;
+  whitespace_regexp = STRINGP (Vsearch_spaces_regexp)
+                      ? SSDATA (Vsearch_spaces_regexp) : NULL;
 
   val = (char *) re_compile_pattern (SSDATA (pattern), SBYTES (pattern),
 				     posix, whitespace_regexp, &cp->buf);
@@ -3385,6 +3385,30 @@ DEFUN ("newline-cache-check", Fnewline_cache_check, Snewline_cache_check,
     set_buffer_internal_1 (old);
   return val;
 }
+
+DEFUN ("re--describe-compiled", Fre__describe_compiled, Sre__describe_compiled,
+       1, 1, 0,
+       doc: /* Return a string describing the compiled form of REGEXP.  */)
+  (Lisp_Object regexp)
+{
+  struct regexp_cache *cache_entry
+    = compile_pattern (regexp, NULL,
+                       (!NILP (BVAR (current_buffer, case_fold_search))
+                        ? BVAR (current_buffer, case_canon_table) : Qnil),
+                       false,
+                       !NILP (BVAR (current_buffer,
+                                    enable_multibyte_characters)));
+  char *buffer = NULL;
+  size_t size = 0;
+  FILE* f = open_memstream (&buffer, &size);
+  if (!f)
+    report_file_error ("open_memstream failed", regexp);
+  print_compiled_pattern (f, &cache_entry->buf);
+  fclose (f);
+  if (!buffer)
+    return Qnil;
+  return make_unibyte_string (buffer, size);
+}
 \f
 
 static void syms_of_search_for_pdumper (void);
@@ -3464,6 +3488,7 @@ syms_of_search (void)
   defsubr (&Smatch_data__translate);
   defsubr (&Sregexp_quote);
   defsubr (&Snewline_cache_check);
+  defsubr (&Sre__describe_compiled);
 
   pdumper_do_now_and_after_load (syms_of_search_for_pdumper);
 }

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-09-29 18:56 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-29  2:28 bug#66261: Disassembling a regexp's bytecode Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-29 15:06 ` Eli Zaretskii
2023-09-29 15:47   ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-29 16:24     ` Eli Zaretskii
2023-09-29 16:33       ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors
2023-09-29 18:56         ` Stefan Monnier via Bug reports for GNU Emacs, the Swiss army knife of text editors

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).