unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed
blob 43e3cd45ecb46f9b5d24a5c9265675ab517de4a6 22779 bytes (raw)
name: lisp/char-fold.el 	 # note: path name is non-authoritative(*)

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
 
;;; char-fold.el --- match unicode to similar ASCII -*- lexical-binding: t; -*-

;; Copyright (C) 2015-2022 Free Software Foundation, Inc.

;; Maintainer: emacs-devel@gnu.org
;; Keywords: matching

;; This file is part of GNU Emacs.

;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.

;;; Commentary:

;;; Code:

(eval-when-compile (require 'subr-x))

(eval-and-compile
  (put 'char-fold-table 'char-table-extra-slots 1)
  (defconst char-fold--default-override nil)
  (defconst char-fold--default-include
    '((?\" """ "“" "”" "”" "„" "⹂" "〞" "‟" "‟" "❞" "❝" "❠" "“" "„" "〝" "〟" "🙷" "🙶" "🙸" "«" "»")
      (?' "❟" "❛" "❜" "‘" "’" "‚" "‛" "‚" "󠀢" "❮" "❯" "‹" "›")
      (?` "❛" "‘" "‛" "󠀢" "❮" "‹")
      ("ss") ;; de
      ("ΐ")  ;; el for (?ΐ "ΐ") decomposition
      ("ΰ")  ;; el for (?ΰ "ΰ") decomposition
      ))
  (defconst char-fold--default-exclude
    '(
      ("й")  ;; ru
      ))
  (defconst char-fold--default-symmetric nil)
  (defvar char-fold--previous
    (list char-fold--default-override
          char-fold--default-include
          char-fold--default-exclude
          char-fold--default-symmetric)))

\f
(eval-and-compile
  (defvar char-fold--no-regexp nil)
  (defun char-fold--make-table ()
    (let* ((equiv (make-char-table 'char-fold-table))
           (equiv-multi (make-char-table 'char-fold-table))
           (table (unicode-property-table-internal 'decomposition)))
      (set-char-table-extra-slot equiv 0 equiv-multi)

      ;; Ensure the table is populated.
      (let ((func (char-table-extra-slot table 1)))
        (map-char-table (lambda (char v)
                          (when (consp char)
                            (funcall func (car char) v table)))
                        table))

      ;; Compile a list of all complex characters that each simple
      ;; character should match.
      ;; In summary this loop does 3 things:
      ;; - A complex character might be allowed to match its decomp.
      ;; - The decomp is allowed to match the complex character.
      ;; - A single char of the decomp might be allowed to match the
      ;;   character.
      ;; Some examples in the comments below.
      (unless (or (bound-and-true-p char-fold-override)
                  char-fold--default-override)
        (map-char-table
         (lambda (char decomp)
           (when (consp decomp)
             ;; Skip trivial cases like ?a decomposing to (?a).
             (unless (and (not (cdr decomp))
                          (eq char (car decomp)))
               (if (symbolp (car decomp))
                   ;; Discard a possible formatting tag.
                   (setq decomp (cdr decomp))
                 ;; If there's no formatting tag, ensure that char matches
                 ;; its decomp exactly.  This is because we want 'ä' to
                 ;; match 'ä', but we don't want '¹' to match '1'.
                 (aset equiv char
                       (cons (apply #'string decomp)
                             (aref equiv char))))

               ;; Allow the entire decomp to match char.  If decomp has
               ;; multiple characters, this is done by adding an entry
               ;; to the alist of the first character in decomp.  This
               ;; allows 'ff' to match 'ff', 'ä' to match 'ä', and '1' to
               ;; match '¹'.
               (let ((make-decomp-match-char
                      (lambda (decomp char)
                        (if (cdr decomp)
                            (aset equiv-multi (car decomp)
                                  (cons (cons (apply #'string (cdr decomp))
                                              (regexp-quote (string char)))
                                        (aref equiv-multi (car decomp))))
                          (aset equiv (car decomp)
                                (cons (char-to-string char)
                                      (aref equiv (car decomp))))))))
                 (funcall make-decomp-match-char decomp char)
                 ;; Check to see if the first char of the decomposition
                 ;; has a further decomposition.  If so, add a mapping
                 ;; back from that second decomposition to the original
                 ;; character.  This allows e.g. 'ι' (GREEK SMALL LETTER
                 ;; IOTA) to match both the Basic Greek block and
                 ;; Extended Greek block variants of IOTA +
                 ;; diacritical(s).  Repeat until there are no more
                 ;; decompositions.
                 (let ((dec decomp)
                       next-decomp)
                   (while dec
                     (setq next-decomp (char-table-range table (car dec)))
                     (when (consp next-decomp)
                       (when (symbolp (car next-decomp))
                         (setq next-decomp (cdr next-decomp)))
                       (if (not (eq (car dec)
                                    (car next-decomp)))
                           (funcall make-decomp-match-char (list (car next-decomp)) char)))
                     (setq dec next-decomp)))
                 ;; Do it again, without the non-spacing characters.
                 ;; This allows 'a' to match 'ä'.
                 (let ((simpler-decomp nil)
                       (found-one nil))
                   (dolist (c decomp)
                     (if (> (get-char-code-property c 'canonical-combining-class) 0)
                         (setq found-one t)
                       (push c simpler-decomp)))
                   (when (and simpler-decomp found-one)
                     (funcall make-decomp-match-char simpler-decomp char)
                     ;; Finally, if the decomp only had one spacing
                     ;; character, we allow this character to match the
                     ;; decomp.  This is to let 'a' match 'ä'.
                     (unless (cdr simpler-decomp)
                       (aset equiv (car simpler-decomp)
                             (cons (apply #'string decomp)
                                   (aref equiv (car simpler-decomp)))))))))))
         table))

      ;; Add some entries to default decomposition
      (dolist (it (or (bound-and-true-p char-fold-include)
                      char-fold--default-include))
        (let ((idx (car it))
              (chars (cdr it)))
          (aset equiv idx (append chars (aref equiv idx)))))

      ;; Remove some entries from default decomposition
      (dolist (it (or (bound-and-true-p char-fold-exclude)
                      char-fold--default-exclude))
        (let ((idx (car it))
              (chars (cdr it)))
          (when (aref equiv idx)
            (dolist (char chars)
              (aset equiv idx (remove char (aref equiv idx)))))))

      ;; Add symmetric entries
      (when (or (bound-and-true-p char-fold-symmetric)
                char-fold--default-symmetric)
        (let ((symmetric (make-hash-table :test 'eq)))
          ;; Initialize hashes
          (map-char-table
           (lambda (char decomp-list)
             (puthash char (make-hash-table :test 'equal) symmetric)
             (dolist (decomp decomp-list)
               (puthash (string-to-char decomp) (make-hash-table :test 'equal) symmetric)))
           equiv)

          (map-char-table
           (lambda (char decomp-list)
             (dolist (decomp decomp-list)
               (if (< (length decomp) 2)
                   ;; Add single-char symmetric pairs to hash
                   (let ((decomp-list (cons (char-to-string char) decomp-list))
                         (decomp-hash (gethash (string-to-char decomp) symmetric)))
                     (dolist (decomp2 decomp-list)
                       (unless (equal decomp decomp2)
                         (puthash decomp2 t decomp-hash)
                         (puthash decomp t (gethash (string-to-char decomp2) symmetric)))))
                 ;; Add multi-char symmetric pairs to equiv-multi char-table
                 (let ((decomp-list (cons (char-to-string char) decomp-list))
                       (prefix (string-to-char decomp))
                       (suffix (substring decomp 1)))
                   (puthash decomp t (gethash char symmetric))
                   (dolist (decomp2 decomp-list)
                     (if (< (length decomp2) 2)
                         (aset equiv-multi prefix
                               (cons (cons suffix (regexp-quote decomp2))
                                     (aref equiv-multi prefix)))))))))
           equiv)

          ;; Update equiv char-table from hash
          (maphash
           (lambda (char decomp-hash)
             (let (schars)
               (maphash (lambda (schar _) (push schar schars)) decomp-hash)
               (aset equiv char schars)))
           symmetric)))

      ;; Convert the lists of characters we compiled into regexps.
      (unless char-fold--no-regexp
        ;; Non-nil `char-fold--no-regexp' unoptimized for regexp
        ;; is used by `describe-char-fold-equivalences'.
        (map-char-table
         (lambda (char decomp-list)
           (let ((re (regexp-opt (cons (char-to-string char) decomp-list))))
             (aset equiv char re)))
         equiv))
      equiv)))

(defconst char-fold-table
  (eval-when-compile
    (char-fold--make-table))
  "Used for folding characters of the same group during search.
This is a char-table with the `char-fold-table' subtype.

Let us refer to the character in question by char-x.
Each entry is either nil (meaning char-x only matches literally)
or a regexp.  This regexp should match anything that char-x can
match by itself \(including char-x).  For instance, the default
regexp for the ?+ character is \"[+⁺₊﬩﹢+]\".

This table also has one extra slot which is also a char-table.
Each entry in the extra slot is an alist used for multi-character
matching (which may be nil).  The elements of the alist should
have the form (SUFFIX . OTHER-REGEXP).  If the characters after
char-x are equal to SUFFIX, then this combination of char-x +
SUFFIX is allowed to match OTHER-REGEXP.  This is in addition to
char-x being allowed to match REGEXP.
For instance, the default alist for ?f includes:
    \((\"fl\" . \"\") (\"fi\" . \"\")
     (\"i\" . \"\") (\"f\" . \"\"))

Exceptionally for the space character (32), ALIST is ignored.")

\f
(defun char-fold-update-table ()
  "Update char-fold-table only when one of the options changes its value."
  (let ((new (list (or (bound-and-true-p char-fold-override)
                       char-fold--default-override)
                   (or (bound-and-true-p char-fold-include)
                       char-fold--default-include)
                   (or (bound-and-true-p char-fold-exclude)
                       char-fold--default-exclude)
                   (or (bound-and-true-p char-fold-symmetric)
                       char-fold--default-symmetric))))
    (unless (equal char-fold--previous new)
      (setq char-fold-table (char-fold--make-table)
            char-fold--previous new))))

(defcustom char-fold-override char-fold--default-override
  "Non-nil means to override the default definitions of equivalent characters.
When nil (the default), the table of character equivalences used
for character-folding is populated with the default set of equivalent
characters; customize `char-fold-exclude' to remove unneeded equivalences,
and `char-fold-include' to add your own.
When this variable is non-nil, the table of equivalences starts empty,
and you can add your own equivalences by customizing `char-fold-include'."
  :type 'boolean
  :initialize #'custom-initialize-default
  :set (lambda (sym val)
         (custom-set-default sym val)
         (char-fold-update-table))
  :group 'isearch
  :version "29.1")

(defcustom char-fold-include char-fold--default-include
  "Additional character foldings to include.
Each entry is a list of a character and the strings that fold into it."
  :type '(alist :key-type (character :tag "Fold to character")
                :value-type (repeat (string :tag "Fold from string")))
  :initialize #'custom-initialize-default
  :set (lambda (sym val)
         (custom-set-default sym val)
         (char-fold-update-table))
  :group 'isearch
  :version "27.1")

(defcustom char-fold-exclude char-fold--default-exclude
  "Character foldings to remove from default decompositions.
Each entry is a list of a character and the strings to remove from folding."
  :type '(alist :key-type (character :tag "Fold to character")
                :value-type (repeat (string :tag "Fold from string")))
  :initialize #'custom-initialize-default
  :set (lambda (sym val)
         (custom-set-default sym val)
         (char-fold-update-table))
  :group 'isearch
  :version "27.1")

(defcustom char-fold-symmetric char-fold--default-symmetric
  "Non-nil means char-fold searching treats equivalent chars the same.
That is, use of any of a set of char-fold equivalent chars in a search
string finds any of them in the text being searched.

If nil then only the \"base\" or \"canonical\" char of the set matches
any of them.  The others match only themselves, even when char-folding
is turned on."
  :type 'boolean
  :initialize #'custom-initialize-default
  :set (lambda (sym val)
         (custom-set-default sym val)
         (char-fold-update-table))
  :group 'isearch
  :version "27.1")

(char-fold-update-table)

\f
(defun char-fold--make-space-string (n)
  "Return a string that matches N spaces."
  (format "\\(?:%s\\|%s\\)"
          (make-string n ?\s)
          (apply #'concat
                 (make-list n (or (aref char-fold-table ?\s) " ")))))

;;;###autoload
(defun char-fold-to-regexp (string &optional lax from)
  "Return a regexp matching anything that char-folds into STRING.
Any character in STRING that has an entry in
`char-fold-table' is replaced with that entry (which is a
regexp) and other characters are `regexp-quote'd.

When LAX is non-nil, then the final character also matches ligatures
partially, for instance, the search string \"f\" will match \"\",
so when typing the search string in isearch while the cursor is on
a ligature, the search won't try to immediately advance to the next
complete match, but will stay on the partially matched ligature.

If the resulting regexp would be too long for Emacs to handle,
just return the result of calling `regexp-quote' on STRING.

FROM is for internal use.  It specifies an index in the STRING
from which to start."
  (let* ((spaces 0)
         (multi-char-table (char-table-extra-slot char-fold-table 0))
         (i (or from 0))
         (end (length string))
         (out nil))
    ;; When the user types a space, we want to match the table entry
    ;; for ?\s, which is generally a regexp like "[ ...]".  However,
    ;; the `search-spaces-regexp' variable doesn't "see" spaces inside
    ;; these regexp constructs, so we need to use "\\( \\|[ ...]\\)"
    ;; instead (to manually expose a space).  Furthermore, the lax
    ;; search engine acts on a bunch of spaces, not on individual
    ;; spaces, so if the string contains sequential spaces like "  ", we
    ;; need to keep them grouped together like this: "\\(  \\|[ ...][ ...]\\)".
    (while (< i end)
      (pcase (aref string i)
        (?\s (setq spaces (1+ spaces)))
        ((pred (lambda (c) (and char-fold-symmetric
                                (if isearch-regexp
                                    isearch-regexp-lax-whitespace
                                  isearch-lax-whitespace)
                                (stringp search-whitespace-regexp)
                                (string-match-p search-whitespace-regexp (char-to-string c)))))
         (setq spaces (1+ spaces)))
        (c (when (> spaces 0)
             (push (char-fold--make-space-string spaces) out)
             (setq spaces 0))
           (let ((regexp (or (aref char-fold-table c)
                             (regexp-quote (string c))))
                 ;; Long string.  The regexp would probably be too long.
                 (alist (unless (> end 50)
                          (aref multi-char-table c))))
             (push (if (and lax alist (= (1+ i) end))
                       (concat "\\(?:" regexp "\\|"
                               (mapconcat (lambda (entry)
                                            (cdr entry)) alist "\\|") "\\)")
                     (let ((matched-entries nil)
                           (max-length 0))
                       (dolist (entry alist)
                         (let* ((suffix (car entry))
                                (len-suf (length suffix)))
                           (when (eq (compare-strings suffix 0 nil
                                                      string (1+ i) (+ i 1 len-suf)
                                                      nil)
                                     t)
                             (push (cons len-suf (cdr entry)) matched-entries)
                             (setq max-length (max max-length len-suf)))))
                       ;; If no suffixes matched, just go on.
                       (if (not matched-entries)
                           regexp
;;; If N suffixes match, we "branch" out into N+1 executions for the
;;; length of the longest match.  This means "fix" will match "fix" but
;;; not "fⅸ", but it's necessary to keep the regexp size from scaling
;;; exponentially.  See https://lists.gnu.org/r/emacs-devel/2015-11/msg02562.html
                         (let ((subs (substring string (1+ i) (+ i 1 max-length))))
                           ;; `i' is still going to inc by 1 below.
                           (setq i (+ i max-length))
                           (concat
                            "\\(?:"
                            (mapconcat (lambda (entry)
                                         (let ((length (car entry))
                                               (suffix-regexp (cdr entry)))
                                           (concat suffix-regexp
                                                   (char-fold-to-regexp subs nil length))))
                                       `((0 . ,regexp) . ,matched-entries) "\\|")
                            "\\)")))))
                   out))))
      (setq i (1+ i)))
    (when (> spaces 0)
      (push (char-fold--make-space-string spaces) out))
    (apply #'concat (nreverse out))))

\f
;;; Commands provided for completeness.
(defun char-fold-search-forward (string &optional bound noerror count)
  "Search forward for a char-folded version of STRING.
STRING is converted to a regexp with `char-fold-to-regexp',
which is searched for with `re-search-forward'.
BOUND NOERROR COUNT are passed to `re-search-forward'."
  (interactive "sSearch: ")
  (re-search-forward (char-fold-to-regexp string) bound noerror count))

(defun char-fold-search-backward (string &optional bound noerror count)
  "Search backward for a char-folded version of STRING.
STRING is converted to a regexp with `char-fold-to-regexp',
which is searched for with `re-search-backward'.
BOUND NOERROR COUNT are passed to `re-search-backward'."
  (interactive "sSearch: ")
  (re-search-backward (char-fold-to-regexp string) bound noerror count))

\f
;;;###autoload
(defun describe-char-fold-equivalences (char &optional lax)
  "Display characters equivalent to CHAR under character-folding.
Prompt for CHAR (using `read-char-by-name', which see for how to
specify the character).  With no input, i.e. when CHAR is nil,
describe all available character equivalences of `char-fold-to-regexp'.
Optional argument LAX (interactively, the prefix argument), if
non-nil, means also include partially matching ligatures and
non-canonical equivalences."
  (interactive (list (ignore-errors
                       (read-char-by-name
                        (format-prompt "Unicode name, single char, or hex"
                                       "all")
                        t))
                     current-prefix-arg))
  (require 'help-fns)
  (let ((help-buffer-under-preparation t))
    (help-setup-xref (list #'describe-char-fold-equivalences)
                     (called-interactively-p 'interactive))
    (let* ((equivalences nil)
           (char-fold--no-regexp t)
           (table (char-fold--make-table))
           (extra (char-table-extra-slot table 0)))
      (if (not char)
          (map-char-table
           (lambda (char list)
             (when lax
               (setq list (append list (mapcar (lambda (entry)
                                                 (cdr entry))
                                               (aref extra char)))))
             (setq equivalences (cons (cons char list)
                                      equivalences)))
           table)
        (setq equivalences (aref table char))
        (when lax
          (setq equivalences (append equivalences
                                     (mapcar (lambda (entry)
                                               (cdr entry))
                                             (aref extra char)))))
        (setq equivalences (cons (char-to-string char) equivalences)))
      (with-help-window (help-buffer)
        (with-current-buffer standard-output
          (if char
              (insert
               (mapconcat
                (lambda (c)
                  (format "%s: %s\n"
                          c
                          (mapconcat
                           (lambda (ch)
                             (format "?\\N{%s}"
                                     (or (get-char-code-property ch 'name)
                                         (get-char-code-property ch 'old-name))))
                           c)))
                equivalences))
            (insert "A list of char-fold equivalences for `char-fold-to-regexp':\n\n")
            (setq-local bidi-paragraph-direction 'left-to-right)
            (dolist (equiv (nreverse equivalences))
              (insert (format "%c: %s\n" (car equiv)
                              (string-join (cdr equiv) " "))))))))))

(provide 'char-fold)

;;; char-fold.el ends here

debug log:

solving 43e3cd45ec ...
found 43e3cd45ec in https://git.savannah.gnu.org/cgit/emacs.git

(*) Git path names are given by the tree(s) the blob belongs to.
    Blobs themselves have no identifier aside from the hash of its contents.^

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).