Implement new symbol-start and symbol-end regexp operators

From: Jim Blandy <jimb@redhat.com>
Subject: Implement new symbol-start and symbol-end regexp operators
Date: 29 Apr 2004 17:57:31 -0500	[thread overview]
Message-ID: <vt27jvyo0is.fsf@zenia.home> (raw)

[-- Attachment #1: Type: text/plain, Size: 449 bytes --]

I find these operators ('\_<' and '\_>') really useful --- finally I
can isearch-regexp for identifiers in code and match exactly what I
mean.  Synthesizing them from the existing operators is a pain, partly
because "symbol constituents" are those characters whose syntax is
*either* 'word' or 'symbol'.

This also makes etags / ctags regexps behave a bit more like Emacs
regexps: init_syntax_once now gives '_' symbol syntax, so it is no
longer a word constituent.  I don't know if that's actually a feature
or not.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: Implement symbol-start and symbol-end regexp operators. --]
[-- Type: text/x-patch, Size: 8768 bytes --]

src/ChangeLog:
2004-04-29  Jim Blandy  <jimb@redhat.com>

	Add support for new '\_<' and '\_>' regexp operators, matching the
	beginning and end of a symbol, respectively.
	* regex.c (enum syntaxcode): Add Ssymbol.
	(init_syntax_once): Set the syntax for '_' to Ssymbol, not Sword.
	(symbeg, symend): New opcodes.
	(print_partial_compiled_pattern): Print the new opcodes properly.
	(regex_compile): Parse the new operators.
	(analyze_first): symbeg and symend match only the empty string.
	(mutually_exclusive_p): symend is mutually exclusive with \s_ and
	\sw; symbeg is mutually exclusive with \S_ and \Sw.
	(re_match_2_internal): Add code for symbeg and symend.
	* search.c (trivial_regexp_p): \_ is no longer a trivial regexp.

man/ChangeLog:
2004-04-29  Jim Blandy  <jimb@redhat.com>

	* search.texi (Regexps): Document the \_< and \_> regexp operators.

*** src/regex.c.~2~	2004-04-29 15:56:53.000000000 -0500
--- src/regex.c	2004-04-29 17:44:24.000000000 -0500
***************
*** 219,225 ****
  /* Define the syntax stuff for \<, \>, etc.  */

  /* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
! enum syntaxcode { Swhitespace = 0, Sword = 1 };

  # ifdef SWITCH_ENUM_BUG
  #  define SWITCH_ENUM_CAST(x) ((int)(x))
--- 219,225 ----
  /* Define the syntax stuff for \<, \>, etc.  */

  /* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
! enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };

  # ifdef SWITCH_ENUM_BUG
  #  define SWITCH_ENUM_CAST(x) ((int)(x))
***************
*** 399,405 ****
       if (ISALNUM (c))
  	re_syntax_table[c] = Sword;

!    re_syntax_table['_'] = Sword;

     done = 1;
  }
--- 399,405 ----
       if (ISALNUM (c))
  	re_syntax_table[c] = Sword;

!    re_syntax_table['_'] = Ssymbol;

     done = 1;
  }
***************
*** 656,661 ****
--- 656,664 ----
    wordbound,	/* Succeeds if at a word boundary.  */
    notwordbound,	/* Succeeds if not at a word boundary.	*/

+   symbeg,       /* Succeeds if at symbol beginning.  */
+   symend,       /* Succeeds if at symbol end.  */
+ 
  	/* Matches any character whose syntax is specified.  Followed by
  	   a byte which contains a syntax code, e.g., Sword.  */
    syntaxspec,
***************
*** 1095,1100 ****
--- 1098,1110 ----
  	case wordend:
  	  printf ("/wordend");

+ 	case symbeg:
+ 	  printf ("/symbeg");
+ 	  break;
+ 
+ 	case symend:
+ 	  printf ("/symend");
+ 
  	case syntaxspec:
  	  printf ("/syntaxspec");
  	  mcnt = *p++;
***************
*** 3135,3140 ****
--- 3145,3163 ----
  	      BUF_PUSH (wordend);
  	      break;

+ 	    case '_':
+ 	      if (syntax & RE_NO_GNU_OPS)
+ 		goto normal_char;
+               laststart = b;
+               PATFETCH (c);
+               if (c == '<')
+                 BUF_PUSH (symbeg);
+               else if (c == '>')
+                 BUF_PUSH (symend);
+               else
+                 FREE_STACK_RETURN (REG_BADPAT);
+               break;
+ 
  	    case 'b':
  	      if (syntax & RE_NO_GNU_OPS)
  		goto normal_char;
***************
*** 3629,3634 ****
--- 3652,3659 ----
  	case notwordbound:
  	case wordbeg:
  	case wordend:
+ 	case symbeg:
+ 	case symend:
  	  continue;

***************
*** 4396,4409 ****
        break;

      case wordend:
!     case notsyntaxspec:
        return ((re_opcode_t) *p1 == syntaxspec
! 	      && p1[1] == (op2 == wordend ? Sword : p2[1]));

      case wordbeg:
!     case syntaxspec:
        return ((re_opcode_t) *p1 == notsyntaxspec
! 	      && p1[1] == (op2 == wordbeg ? Sword : p2[1]));

      case wordbound:
        return (((re_opcode_t) *p1 == notsyntaxspec
--- 4421,4440 ----
        break;

      case wordend:
!       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
!     case symend:
        return ((re_opcode_t) *p1 == syntaxspec
!               && (p1[1] == Ssymbol || p1[1] == Sword));
!     case notsyntaxspec:
!       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);

      case wordbeg:
!       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
!     case symbeg:
        return ((re_opcode_t) *p1 == notsyntaxspec
!               && (p1[1] == Ssymbol || p1[1] == Sword));
!     case syntaxspec:
!       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);

      case wordbound:
        return (((re_opcode_t) *p1 == notsyntaxspec
***************
*** 5528,5533 ****
--- 5559,5650 ----
  	    }
  	  break;

+ 	case symbeg:
+ 	  DEBUG_PRINT1 ("EXECUTING symbeg.\n");
+ 
+ 	  /* We FAIL in one of the following cases: */
+ 
+ 	  /* Case 1: D is at the end of string.	 */
+ 	  if (AT_STRINGS_END (d))
+ 	    goto fail;
+ 	  else
+ 	    {
+ 	      /* C1 is the character before D, S1 is the syntax of C1, C2
+ 		 is the character at D, and S2 is the syntax of C2.  */
+ 	      re_wchar_t c1, c2;
+ 	      int s1, s2;
+ #ifdef emacs
+ 	      int offset = PTR_TO_OFFSET (d);
+ 	      int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
+ 	      UPDATE_SYNTAX_TABLE (charpos);
+ #endif
+ 	      PREFETCH ();
+ 	      c2 = RE_STRING_CHAR (d, dend - d);
+ 	      s2 = SYNTAX (c2);
+ 	
+ 	      /* Case 2: S2 is neither Sword nor Ssymbol. */
+ 	      if (s2 != Sword && s2 != Ssymbol)
+ 		goto fail;
+ 
+ 	      /* Case 3: D is not at the beginning of string ... */
+ 	      if (!AT_STRINGS_BEG (d))
+ 		{
+ 		  GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+ #ifdef emacs
+ 		  UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+ #endif
+ 		  s1 = SYNTAX (c1);
+ 
+ 		  /* ... and S1 is Sword or Ssymbol.  */
+ 		  if (s1 == Sword || s1 == Ssymbol)
+ 		    goto fail;
+ 		}
+ 	    }
+ 	  break;
+ 
+ 	case symend:
+ 	  DEBUG_PRINT1 ("EXECUTING symend.\n");
+ 
+ 	  /* We FAIL in one of the following cases: */
+ 
+ 	  /* Case 1: D is at the beginning of string.  */
+ 	  if (AT_STRINGS_BEG (d))
+ 	    goto fail;
+ 	  else
+ 	    {
+ 	      /* C1 is the character before D, S1 is the syntax of C1, C2
+ 		 is the character at D, and S2 is the syntax of C2.  */
+ 	      re_wchar_t c1, c2;
+ 	      int s1, s2;
+ #ifdef emacs
+ 	      int offset = PTR_TO_OFFSET (d) - 1;
+ 	      int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
+ 	      UPDATE_SYNTAX_TABLE (charpos);
+ #endif
+ 	      GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+ 	      s1 = SYNTAX (c1);
+ 
+ 	      /* Case 2: S1 is neither Ssymbol nor Sword.  */
+ 	      if (s1 != Sword && s1 != Ssymbol)
+ 		goto fail;
+ 
+ 	      /* Case 3: D is not at the end of string ... */
+ 	      if (!AT_STRINGS_END (d))
+ 		{
+ 		  PREFETCH_NOLIMIT ();
+ 		  c2 = RE_STRING_CHAR (d, dend - d);
+ #ifdef emacs
+ 		  UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+ #endif
+ 		  s2 = SYNTAX (c2);
+ 
+ 		  /* ... and S2 is Sword or Ssymbol.  */
+ 		  if (s2 == Sword || s2 == Ssymbol)
+                     goto fail;
+ 		}
+ 	    }
+ 	  break;
+ 
  	case syntaxspec:
  	case notsyntaxspec:
  	  not = (re_opcode_t) *(p - 1) == notsyntaxspec;
*** src/search.c.~1~	2002-05-12 19:04:16.000000000 -0500
--- src/search.c	2004-04-29 17:30:17.000000000 -0500
***************
*** 962,968 ****
  	    {
  	    case '|': case '(': case ')': case '`': case '\'': case 'b':
  	    case 'B': case '<': case '>': case 'w': case 'W': case 's':
! 	    case 'S': case '=': case '{': case '}':
  	    case 'c': case 'C':	/* for categoryspec and notcategoryspec */
  	    case '1': case '2': case '3': case '4': case '5':
  	    case '6': case '7': case '8': case '9':
--- 962,968 ----
  	    {
  	    case '|': case '(': case ')': case '`': case '\'': case 'b':
  	    case 'B': case '<': case '>': case 'w': case 'W': case 's':
! 	    case 'S': case '=': case '{': case '}': case '_':
  	    case 'c': case 'C':	/* for categoryspec and notcategoryspec */
  	    case '1': case '2': case '3': case '4': case '5':
  	    case '6': case '7': case '8': case '9':
*** man/search.texi.~1~	2002-07-06 08:44:06.000000000 -0500
--- man/search.texi	2004-04-29 17:38:41.000000000 -0500
***************
*** 672,677 ****
--- 672,689 ----
  @item \W
  matches any character that is not a word-constituent.

+ @item \_<
+ matches the empty string, but only at the beginning of a symbol.  A
+ symbol is a sequence of one or more word or symbol constituent
+ characters.  @samp{\_<} matches at the beginning of the buffer only if
+ a word or symbol constituent character follows.
+ 
+ @item \_>
+ matches the empty string, but only at the end of a symbol.  A symbol
+ is a sequence of one or more word or symbol constituent characters.
+ @samp{\_>} matches at the end of the buffer only if the contents end
+ with a word or symbol constituent character.
+ 
  @item \s@var{c}
  matches any character whose syntax is @var{c}.  Here @var{c} is a
  character that designates a particular syntax class: thus, @samp{w}

[-- Attachment #3: Type: text/plain, Size: 141 bytes --]

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel