From: Jim Blandy <jimb@redhat.com>
Subject: Implement new symbol-start and symbol-end regexp operators
Date: 29 Apr 2004 17:57:31 -0500 [thread overview]
Message-ID: <vt27jvyo0is.fsf@zenia.home> (raw)
[-- Attachment #1: Type: text/plain, Size: 449 bytes --]
I find these really useful --- finally I can isearch-regexp for
identifiers in code and match exactly what I mean. Synthesizing these
operators from the existing ones is a pain, partly because "symbol
constituents" are those characters whose syntax is *either* 'word' or
'symbol'.
This also makes etags / ctags regexps behave a bit more like Emacs
regexps: '_' is no longer a word constituent. I don't know if that's
actually a feature or not.
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: Implement symbol-start and symbol-end regexp operators. --]
[-- Type: text/x-patch, Size: 8768 bytes --]
src/ChangeLog:
2004-04-29 Jim Blandy <jimb@redhat.com>
Add support for new '\_<' and '\_>' regexp operators, matching the
beginnings and ends of symbols.
* regex.c (enum syntaxcode): Add Ssymbol.
(init_syntax_once): Set the syntax for '_' to Ssymbol, not Sword.
(symbeg, symend): New opcodes.
(print_partial_compiled_pattern): Print the new opcodes properly.
(regex_compile): Parse the new operators.
(analyze_first): symbeg and symend match only the empty string.
(mutually_exclusive_p): symend is mutually exclusive with \s_ and
\sw; symbeg is mutually exclusive with \S_ and \Sw.
(re_match_2_internal): Add code for symbeg and symend.
* search.c (trivial_regexp_p): \_ is no longer a trivial regexp.
man/ChangeLog:
2004-04-29 Jim Blandy <jimb@redhat.com>
* search.texi (Regexps): Document the \_< and \_> regexp operators.
*** src/regex.c.~2~ 2004-04-29 15:56:53.000000000 -0500
--- src/regex.c 2004-04-29 17:44:24.000000000 -0500
***************
*** 219,225 ****
/* Define the syntax stuff for \<, \>, etc. */
/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
! enum syntaxcode { Swhitespace = 0, Sword = 1 };
# ifdef SWITCH_ENUM_BUG
# define SWITCH_ENUM_CAST(x) ((int)(x))
--- 219,225 ----
/* Define the syntax stuff for \<, \>, etc. */
/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
! enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
# ifdef SWITCH_ENUM_BUG
# define SWITCH_ENUM_CAST(x) ((int)(x))
***************
*** 399,405 ****
if (ISALNUM (c))
re_syntax_table[c] = Sword;
! re_syntax_table['_'] = Sword;
done = 1;
}
--- 399,405 ----
if (ISALNUM (c))
re_syntax_table[c] = Sword;
! re_syntax_table['_'] = Ssymbol;
done = 1;
}
***************
*** 656,661 ****
--- 656,664 ----
wordbound, /* Succeeds if at a word boundary. */
notwordbound, /* Succeeds if not at a word boundary. */
+ symbeg, /* Succeeds if at symbol beginning. */
+ symend, /* Succeeds if at symbol end. */
+
/* Matches any character whose syntax is specified. Followed by
a byte which contains a syntax code, e.g., Sword. */
syntaxspec,
***************
*** 1095,1100 ****
--- 1098,1110 ----
case wordend:
printf ("/wordend");
+ case symbeg:
+ printf ("/symbeg");
+ break;
+
+ case symend:
+ printf ("/symend");
+
case syntaxspec:
printf ("/syntaxspec");
mcnt = *p++;
***************
*** 3135,3140 ****
--- 3145,3163 ----
BUF_PUSH (wordend);
break;
+ case '_':
+ if (syntax & RE_NO_GNU_OPS)
+ goto normal_char;
+ laststart = b;
+ PATFETCH (c);
+ if (c == '<')
+ BUF_PUSH (symbeg);
+ else if (c == '>')
+ BUF_PUSH (symend);
+ else
+ FREE_STACK_RETURN (REG_BADPAT);
+ break;
+
case 'b':
if (syntax & RE_NO_GNU_OPS)
goto normal_char;
***************
*** 3629,3634 ****
--- 3652,3659 ----
case notwordbound:
case wordbeg:
case wordend:
+ case symbeg:
+ case symend:
continue;
***************
*** 4396,4409 ****
break;
case wordend:
! case notsyntaxspec:
return ((re_opcode_t) *p1 == syntaxspec
! && p1[1] == (op2 == wordend ? Sword : p2[1]));
case wordbeg:
! case syntaxspec:
return ((re_opcode_t) *p1 == notsyntaxspec
! && p1[1] == (op2 == wordbeg ? Sword : p2[1]));
case wordbound:
return (((re_opcode_t) *p1 == notsyntaxspec
--- 4421,4440 ----
break;
case wordend:
! return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
! case symend:
return ((re_opcode_t) *p1 == syntaxspec
! && (p1[1] == Ssymbol || p1[1] == Sword));
! case notsyntaxspec:
! return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
case wordbeg:
! return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
! case symbeg:
return ((re_opcode_t) *p1 == notsyntaxspec
! && (p1[1] == Ssymbol || p1[1] == Sword));
! case syntaxspec:
! return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
case wordbound:
return (((re_opcode_t) *p1 == notsyntaxspec
***************
*** 5528,5533 ****
--- 5559,5650 ----
}
break;
+ case symbeg:
+ DEBUG_PRINT1 ("EXECUTING symbeg.\n");
+
+ /* We FAIL in one of the following cases: */
+
+ /* Case 1: D is at the end of string. */
+ if (AT_STRINGS_END (d))
+ goto fail;
+ else
+ {
+ /* C1 is the character before D, S1 is the syntax of C1, C2
+ is the character at D, and S2 is the syntax of C2. */
+ re_wchar_t c1, c2;
+ int s1, s2;
+ #ifdef emacs
+ int offset = PTR_TO_OFFSET (d);
+ int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
+ UPDATE_SYNTAX_TABLE (charpos);
+ #endif
+ PREFETCH ();
+ c2 = RE_STRING_CHAR (d, dend - d);
+ s2 = SYNTAX (c2);
+
+ /* Case 2: S2 is neither Sword nor Ssymbol. */
+ if (s2 != Sword && s2 != Ssymbol)
+ goto fail;
+
+ /* Case 3: D is not at the beginning of string ... */
+ if (!AT_STRINGS_BEG (d))
+ {
+ GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+ #ifdef emacs
+ UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+ #endif
+ s1 = SYNTAX (c1);
+
+ /* ... and S1 is Sword or Ssymbol. */
+ if (s1 == Sword || s1 == Ssymbol)
+ goto fail;
+ }
+ }
+ break;
+
+ case symend:
+ DEBUG_PRINT1 ("EXECUTING symend.\n");
+
+ /* We FAIL in one of the following cases: */
+
+ /* Case 1: D is at the beginning of string. */
+ if (AT_STRINGS_BEG (d))
+ goto fail;
+ else
+ {
+ /* C1 is the character before D, S1 is the syntax of C1, C2
+ is the character at D, and S2 is the syntax of C2. */
+ re_wchar_t c1, c2;
+ int s1, s2;
+ #ifdef emacs
+ int offset = PTR_TO_OFFSET (d) - 1;
+ int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
+ UPDATE_SYNTAX_TABLE (charpos);
+ #endif
+ GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+ s1 = SYNTAX (c1);
+
+ /* Case 2: S1 is neither Ssymbol nor Sword. */
+ if (s1 != Sword && s1 != Ssymbol)
+ goto fail;
+
+ /* Case 3: D is not at the end of string ... */
+ if (!AT_STRINGS_END (d))
+ {
+ PREFETCH_NOLIMIT ();
+ c2 = RE_STRING_CHAR (d, dend - d);
+ #ifdef emacs
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+ #endif
+ s2 = SYNTAX (c2);
+
+ /* ... and S2 is Sword or Ssymbol. */
+ if (s2 == Sword || s2 == Ssymbol)
+ goto fail;
+ }
+ }
+ break;
+
case syntaxspec:
case notsyntaxspec:
not = (re_opcode_t) *(p - 1) == notsyntaxspec;
*** src/search.c.~1~ 2002-05-12 19:04:16.000000000 -0500
--- src/search.c 2004-04-29 17:30:17.000000000 -0500
***************
*** 962,968 ****
{
case '|': case '(': case ')': case '`': case '\'': case 'b':
case 'B': case '<': case '>': case 'w': case 'W': case 's':
! case 'S': case '=': case '{': case '}':
case 'c': case 'C': /* for categoryspec and notcategoryspec */
case '1': case '2': case '3': case '4': case '5':
case '6': case '7': case '8': case '9':
--- 962,968 ----
{
case '|': case '(': case ')': case '`': case '\'': case 'b':
case 'B': case '<': case '>': case 'w': case 'W': case 's':
! case 'S': case '=': case '{': case '}': case '_':
case 'c': case 'C': /* for categoryspec and notcategoryspec */
case '1': case '2': case '3': case '4': case '5':
case '6': case '7': case '8': case '9':
*** man/search.texi.~1~ 2002-07-06 08:44:06.000000000 -0500
--- man/search.texi 2004-04-29 17:38:41.000000000 -0500
***************
*** 672,677 ****
--- 672,689 ----
@item \W
matches any character that is not a word-constituent.
+ @item \_<
+ matches the empty string, but only at the beginning of a symbol. A
+ symbol is a sequence of one or more word or symbol constituent
+ characters. @samp{\_<} matches at the beginning of the buffer only if
+ a word or symbol constituent character follows.
+
+ @item \_>
+ matches the empty string, but only at the end of a symbol. A symbol
+ is a sequence of one or more word or symbol constituent characters.
+ @samp{\_>} matches at the end of the buffer only if the contents end
+ with a word or symbol constituent character.
+
@item \s@var{c}
matches any character whose syntax is @var{c}. Here @var{c} is a
character that designates a particular syntax class: thus, @samp{w}
[-- Attachment #3: Type: text/plain, Size: 141 bytes --]
_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel
next reply other threads:[~2004-04-29 22:57 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2004-04-29 22:57 Jim Blandy [this message]
2004-05-01 9:44 ` Implement new symbol-start and symbol-end regexp operators Richard Stallman
2004-05-04 19:17 ` Jim Blandy
2004-05-05 6:20 ` Eli Zaretskii
2004-05-05 6:14 ` Jim Blandy
2004-05-05 20:21 ` Richard Stallman
2004-05-05 21:10 ` Stefan Monnier
2004-05-12 17:36 ` Jim Blandy
2004-05-19 16:38 ` Stefan Monnier
2004-06-10 0:32 ` Luc Teirlinck
2004-07-16 2:38 ` Luc Teirlinck
2004-07-16 2:45 ` Luc Teirlinck
2004-07-16 12:43 ` Stefan
2004-07-16 17:49 ` Luc Teirlinck
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=vt27jvyo0is.fsf@zenia.home \
--to=jimb@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this external index
https://git.savannah.gnu.org/cgit/emacs.git
https://git.savannah.gnu.org/cgit/emacs/org-mode.git
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.