unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed
From: "Stefan Monnier" <monnier+gnu/emacs@rum.cs.yale.edu>
Cc: emacs-devel@gnu.org
Subject: Re: regex and case-fold-search problem
Date: Fri, 23 Aug 2002 17:52:37 -0400	[thread overview]
Message-ID: <200208232152.g7NLqbe03698@rum.cs.yale.edu> (raw)
In-Reply-To: 200208231736.g7NHafW02174@rum.cs.yale.edu

"Stefan Monnier" <monnier+gnu/emacs@rum.cs.yale.edu> wrote:
> For ASCII it's pretty easy to fix.  But for other charsets, it's
> indeed more tricky.  Maybe we can simply use the smallest contiguous
> range of chars that includes all the chars we should match,
> so the behavior is indeed "implementation-defined" (in the sense
> that it's not necessarily obvious to the user what happens) but
> it's at least less confusing (in the sense that (case-fold-search t)
> matches at least as much as (case-fold-search nil)).

How about the patch below?


	Stefan


Index: regex.c
===================================================================
RCS file: /cvsroot/emacs/emacs/src/regex.c,v
retrieving revision 1.176
diff -u -u -b -r1.176 regex.c
--- regex.c	25 Mar 2002 00:45:48 -0000	1.176
+++ regex.c	23 Aug 2002 21:49:10 -0000
@@ -1914,12 +1914,13 @@
 #define BIT_UPPER	0x10
 #define BIT_MULTIBYTE	0x20
 
-/* Set a range (RANGE_START, RANGE_END) to WORK_AREA.  */
-#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)	\
+/* Append the image of the range START..END under TRANSLATE to WORK_AREA.
+   START and END must be untranslated.  A variable named `translate'
+   must be in scope wherever this macro is expanded.  */
+#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end)	\
   do {									\
     EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2);			\
-    (work_area).table[(work_area).used++] = (range_start);		\
-    (work_area).table[(work_area).used++] = (range_end);		\
+    set_image_of_range (&(work_area), (start), (end), translate);	\
   } while (0)
 
 /* Free allocated memory for WORK_AREA.	 */
@@ -2077,6 +2078,31 @@
 }
 #endif
 
+
+
+/* Append to WORK_AREA an approximation of the image of START..END
+   under TRANSLATE.  The exact image need not be
+   TRANSLATE(START)..TRANSLATE(END), nor even contiguous, so we
+   store the smallest contiguous range that covers the translation
+   of both endpoints and of every char in between.  */
+static void
+set_image_of_range (work_area, start, end, translate)
+     RE_TRANSLATE_TYPE translate;
+     struct range_table_work_area *work_area;
+     re_wchar_t start, end;
+{
+  re_wchar_t lo = TRANSLATE (start), hi = TRANSLATE (end);
+  if (RE_TRANSLATE_P (translate))
+    for (; start <= end; start++)
+      {
+	re_wchar_t image = TRANSLATE (start);
+	if (image < lo) lo = image;
+	if (image > hi) hi = image;
+      }
+  work_area->table[work_area->used++] = lo;
+  work_area->table[work_area->used++] = hi;
+}
+
 /* Explicit quit checking is only used on NTemacs.  */
 #if defined WINDOWSNT && defined emacs && defined QUIT
 extern int immediate_quit;
@@ -2525,14 +2551,18 @@
 
 		if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
 
-		PATFETCH (c);
+		/* Don't translate yet.  The range TRANSLATE(X..Y) cannot
+		   always be determined from TRANSLATE(X) and TRANSLATE(Y).
+		   So the translation is done later in a loop.  Example:
+		   (let ((case-fold-search t)) (string-match "[A-_]" "A"))  */
+		PATFETCH_RAW (c);
 
 		/* \ might escape characters inside [...] and [^...].  */
 		if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
 		  {
 		    if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
 
-		    PATFETCH (c);
+		    PATFETCH_RAW (c);
 		    escaped_char = true;
 		  }
 		else
@@ -2636,10 +2668,10 @@
 		  {
 
 		    /* Discard the `-'. */
-		    PATFETCH (c1);
+		    PATFETCH_RAW (c1);
 
 		    /* Fetch the character which ends the range. */
-		    PATFETCH (c1);
+		    PATFETCH_RAW (c1);
 
 		    if (SINGLE_BYTE_CHAR_P (c))
 		      {
@@ -2653,7 +2685,7 @@
 			       starting at the smallest character in
 			       the charset of C1 and ending at C1.  */
 			    int charset = CHAR_CHARSET (c1);
-			    int c2 = MAKE_CHAR (charset, 0, 0);
+			    re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
 			    
 			    SET_RANGE_TABLE_WORK_AREA (range_table_work,
 						       c2, c1);
@@ -2672,7 +2704,7 @@
 		  /* ... into bitmap.  */
 		  {
 		    re_wchar_t this_char;
-		    int range_start = c, range_end = c1;
+		    re_wchar_t range_start = c, range_end = c1;
 
 		    /* If the start is after the end, the range is empty.  */
 		    if (range_start > range_end)

  reply	other threads:[~2002-08-23 21:52 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2002-08-23  6:25 regex and case-fold-search problem Kenichi Handa
2002-08-23 15:56 ` Eli Zaretskii
2002-08-24  0:51   ` Kenichi Handa
2002-08-24  1:03     ` Miles Bader
2002-08-24  9:42       ` Eli Zaretskii
2002-08-24 16:16       ` Andreas Schwab
2002-08-26  1:54         ` Miles Bader
2002-08-26 16:11           ` Stefan Monnier
2002-08-26 21:51         ` Richard Stallman
2002-08-24  9:39     ` Eli Zaretskii
2002-08-26  1:29       ` Kenichi Handa
2002-08-26  2:31         ` Miles Bader
2002-08-25 22:21     ` Kim F. Storm
2002-08-23 17:36 ` Stefan Monnier
2002-08-23 21:52   ` Stefan Monnier [this message]
2002-08-24  1:16   ` Kenichi Handa
2002-08-25 18:52     ` Stefan Monnier
2002-08-26  1:56       ` Kenichi Handa
2002-08-24 10:40   ` Kai Großjohann
2002-08-26 21:51 ` Richard Stallman
2002-08-29  8:53   ` Kenichi Handa
2002-08-29 12:33     ` Kim F. Storm
2002-08-29 13:38       ` Kenichi Handa
2002-08-29 15:00         ` Kim F. Storm
2002-08-29 16:00         ` Stefan Monnier
2002-08-30  1:11           ` Kenichi Handa
2002-08-30 19:19             ` Richard Stallman
2002-08-30 19:19     ` Richard Stallman
2002-08-30 20:08       ` Stefan Monnier
2002-09-01 13:15         ` Richard Stallman
2002-09-01 16:26           ` Stefan Monnier
2002-09-02 14:54             ` Richard Stallman
2002-09-02 16:58               ` Stefan Monnier
2002-09-04 14:13                 ` Richard Stallman
2002-09-04 16:04                   ` Stefan Monnier
2002-09-05 18:02                     ` Richard Stallman
2002-09-06  1:00                       ` re-search-forward seems to be broken Miles Bader
2002-09-06 20:03                         ` Richard Stallman
2002-08-31  6:14       ` regex and case-fold-search problem Eli Zaretskii
2002-09-01 13:14         ` Richard Stallman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/emacs/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200208232152.g7NLqbe03698@rum.cs.yale.edu \
    --to=monnier+gnu/emacs@rum.cs.yale.edu \
    --cc=emacs-devel@gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).