* bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 7:27 ` Eli Zaretskii
2016-10-04 1:10 ` bug#24603: [RFC 03/18] Don’t assume character can be either upper- or lower-case when casing Michal Nazarewicz
` (15 subsequent siblings)
16 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
Use Unicode data to generate case tables instead of mostly repeating
them in lisp code. Do that in a way which maps ‘ǲ’ (and similar)
digraphs to ‘ǳ’ when down- and ‘Ǳ’ when upcasing.
* lisp/international/characters.el: Remove case-pairs defined with
explicit Lisp code and instead use Unicode character properties.
* test/src/casefiddle-tests.el (casefiddle-tests--characters,
casefiddle-tests-casing): Update test cases which are now working
as they should.
---
lisp/international/characters.el | 338 ++++++++-------------------------------
test/src/casefiddle-tests.el | 7 +-
2 files changed, 66 insertions(+), 279 deletions(-)
diff --git a/lisp/international/characters.el b/lisp/international/characters.el
index 1757d2b..67b0149 100644
--- a/lisp/international/characters.el
+++ b/lisp/international/characters.el
@@ -543,10 +543,6 @@ ?L
(set-case-syntax ?½ "_" tbl)
(set-case-syntax ?¾ "_" tbl)
(set-case-syntax ?¿ "." tbl)
- (let ((c 192))
- (while (<= c 222)
- (set-case-syntax-pair c (+ c 32) tbl)
- (setq c (1+ c))))
(set-case-syntax ?× "_" tbl)
(set-case-syntax ?ß "w" tbl)
(set-case-syntax ?÷ "_" tbl)
@@ -558,101 +554,8 @@ ?L
(modify-category-entry c ?l)
(setq c (1+ c)))
- (let ((pair-ranges '((#x0100 . #x012F)
- (#x0132 . #x0137)
- (#x0139 . #x0148)
- (#x014a . #x0177)
- (#x0179 . #x017E)
- (#x0182 . #x0185)
- (#x0187 . #x0188)
- (#x018B . #x018C)
- (#x0191 . #x0192)
- (#x0198 . #x0199)
- (#x01A0 . #x01A5)
- (#x01A7 . #x01A8)
- (#x01AC . #x01AD)
- (#x01AF . #x01B0)
- (#x01B3 . #x01B6)
- (#x01B8 . #x01B9)
- (#x01BC . #x01BD)
- (#x01CD . #x01DC)
- (#x01DE . #x01EF)
- (#x01F4 . #x01F5)
- (#x01F8 . #x021F)
- (#x0222 . #x0233)
- (#x023B . #x023C)
- (#x0241 . #x0242)
- (#x0246 . #x024F))))
- (dolist (elt pair-ranges)
- (let ((from (car elt)) (to (cdr elt)))
- (while (< from to)
- (set-case-syntax-pair from (1+ from) tbl)
- (setq from (+ from 2))))))
-
- (set-case-syntax-pair ?Ÿ ?ÿ tbl)
-
- ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
- ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
- ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
- ;; SMALL LETTER I.
-
- ;; We used to set up half of those correspondence unconditionally,
- ;; but that makes searches slow. So now we don't set up either half
- ;; of these correspondences by default.
-
- ;; (set-downcase-syntax ?İ ?i tbl)
- ;; (set-upcase-syntax ?I ?ı tbl)
-
- (set-case-syntax-pair ?Ɓ ?ɓ tbl)
- (set-case-syntax-pair ?Ɔ ?ɔ tbl)
- (set-case-syntax-pair ?Ɖ ?ɖ tbl)
- (set-case-syntax-pair ?Ɗ ?ɗ tbl)
- (set-case-syntax-pair ?Ǝ ?ǝ tbl)
- (set-case-syntax-pair ?Ə ?ə tbl)
- (set-case-syntax-pair ?Ɛ ?ɛ tbl)
- (set-case-syntax-pair ?Ɠ ?ɠ tbl)
- (set-case-syntax-pair ?Ɣ ?ɣ tbl)
- (set-case-syntax-pair ?Ɩ ?ɩ tbl)
- (set-case-syntax-pair ?Ɨ ?ɨ tbl)
- (set-case-syntax-pair ?Ɯ ?ɯ tbl)
- (set-case-syntax-pair ?Ɲ ?ɲ tbl)
- (set-case-syntax-pair ?Ɵ ?ɵ tbl)
- (set-case-syntax-pair ?Ʀ ?ʀ tbl)
- (set-case-syntax-pair ?Ʃ ?ʃ tbl)
- (set-case-syntax-pair ?Ʈ ?ʈ tbl)
- (set-case-syntax-pair ?Ʊ ?ʊ tbl)
- (set-case-syntax-pair ?Ʋ ?ʋ tbl)
- (set-case-syntax-pair ?Ʒ ?ʒ tbl)
- ;; We use set-downcase-syntax below, since we want upcase of dž
- ;; return DŽ, not Dž, and the same for the rest.
- (set-case-syntax-pair ?DŽ ?dž tbl)
- (set-downcase-syntax ?Dž ?dž tbl)
- (set-case-syntax-pair ?LJ ?lj tbl)
- (set-downcase-syntax ?Lj ?lj tbl)
- (set-case-syntax-pair ?NJ ?nj tbl)
- (set-downcase-syntax ?Nj ?nj tbl)
-
- ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
-
- (set-case-syntax-pair ?DZ ?dz tbl)
- (set-downcase-syntax ?Dz ?dz tbl)
- (set-case-syntax-pair ?Ƕ ?ƕ tbl)
- (set-case-syntax-pair ?Ƿ ?ƿ tbl)
- (set-case-syntax-pair ?Ⱥ ?ⱥ tbl)
- (set-case-syntax-pair ?Ƚ ?ƚ tbl)
- (set-case-syntax-pair ?Ⱦ ?ⱦ tbl)
- (set-case-syntax-pair ?Ƀ ?ƀ tbl)
- (set-case-syntax-pair ?Ʉ ?ʉ tbl)
- (set-case-syntax-pair ?Ʌ ?ʌ tbl)
-
;; Latin Extended Additional
(modify-category-entry '(#x1e00 . #x1ef9) ?l)
- (setq c #x1e00)
- (while (<= c #x1ef9)
- (and (zerop (% c 2))
- (or (<= c #x1e94) (>= c #x1ea0))
- (set-case-syntax-pair c (1+ c) tbl))
- (setq c (1+ c)))
;; Latin Extended-C
(setq c #x2C60)
@@ -660,57 +563,12 @@ ?L
(modify-category-entry c ?l)
(setq c (1+ c)))
- (let ((pair-ranges '((#x2C60 . #x2C61)
- (#x2C67 . #x2C6C)
- (#x2C72 . #x2C73)
- (#x2C75 . #x2C76))))
- (dolist (elt pair-ranges)
- (let ((from (car elt)) (to (cdr elt)))
- (while (< from to)
- (set-case-syntax-pair from (1+ from) tbl)
- (setq from (+ from 2))))))
-
- (set-case-syntax-pair ?Ɫ ?ɫ tbl)
- (set-case-syntax-pair ?Ᵽ ?ᵽ tbl)
- (set-case-syntax-pair ?Ɽ ?ɽ tbl)
- (set-case-syntax-pair ?Ɑ ?ɑ tbl)
- (set-case-syntax-pair ?Ɱ ?ɱ tbl)
- (set-case-syntax-pair ?Ɐ ?ɐ tbl)
- (set-case-syntax-pair ?Ɒ ?ɒ tbl)
- (set-case-syntax-pair ?Ȿ ?ȿ tbl)
- (set-case-syntax-pair ?Ɀ ?ɀ tbl)
-
;; Latin Extended-D
(setq c #xA720)
(while (<= c #xA7FF)
(modify-category-entry c ?l)
(setq c (1+ c)))
- (let ((pair-ranges '((#xA722 . #xA72F)
- (#xA732 . #xA76F)
- (#xA779 . #xA77C)
- (#xA77E . #xA787)
- (#xA78B . #xA78E)
- (#xA790 . #xA793)
- (#xA796 . #xA7A9)
- (#xA7B4 . #xA7B7))))
- (dolist (elt pair-ranges)
- (let ((from (car elt)) (to (cdr elt)))
- (while (< from to)
- (set-case-syntax-pair from (1+ from) tbl)
- (setq from (+ from 2))))))
-
- (set-case-syntax-pair ?Ᵹ ?ᵹ tbl)
- (set-case-syntax-pair ?Ɦ ?ɦ tbl)
- (set-case-syntax-pair ?Ɜ ?ɜ tbl)
- (set-case-syntax-pair ?Ɡ ?ɡ tbl)
- (set-case-syntax-pair ?Ɬ ?ɬ tbl)
- (set-case-syntax-pair ?Ɪ ?ɪ tbl)
- (set-case-syntax-pair ?Ʞ ?ʞ tbl)
- (set-case-syntax-pair ?Ʇ ?ʇ tbl)
- (set-case-syntax-pair ?Ʝ ?ʝ tbl)
- (set-case-syntax-pair ?Ꭓ ?ꭓ tbl)
-
;; Latin Extended-E
(setq c #xAB30)
(while (<= c #xAB64)
@@ -719,102 +577,19 @@ ?L
;; Greek
(modify-category-entry '(#x0370 . #x03ff) ?g)
- (setq c #x0370)
- (while (<= c #x03ff)
- (if (or (and (>= c #x0391) (<= c #x03a1))
- (and (>= c #x03a3) (<= c #x03ab)))
- (set-case-syntax-pair c (+ c 32) tbl))
- (and (>= c #x03da)
- (<= c #x03ee)
- (zerop (% c 2))
- (set-case-syntax-pair c (1+ c) tbl))
- (setq c (1+ c)))
- (set-case-syntax-pair ?Ά ?ά tbl)
- (set-case-syntax-pair ?Έ ?έ tbl)
- (set-case-syntax-pair ?Ή ?ή tbl)
- (set-case-syntax-pair ?Ί ?ί tbl)
- (set-case-syntax-pair ?Ό ?ό tbl)
- (set-case-syntax-pair ?Ύ ?ύ tbl)
- (set-case-syntax-pair ?Ώ ?ώ tbl)
;; Armenian
(setq c #x531)
- (while (<= c #x556)
- (set-case-syntax-pair c (+ c #x30) tbl)
- (setq c (1+ c)))
;; Greek Extended
(modify-category-entry '(#x1f00 . #x1fff) ?g)
- (setq c #x1f00)
- (while (<= c #x1fff)
- (and (<= (logand c #x000f) 7)
- (<= c #x1fa7)
- (not (memq c '(#x1f16 #x1f17 #x1f56 #x1f57
- #x1f50 #x1f52 #x1f54 #x1f56)))
- (/= (logand c #x00f0) #x70)
- (set-case-syntax-pair (+ c 8) c tbl))
- (setq c (1+ c)))
- (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
- (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
- (set-case-syntax-pair ?Ὰ ?ὰ tbl)
- (set-case-syntax-pair ?Ά ?ά tbl)
- (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
- (set-case-syntax-pair ?Ὲ ?ὲ tbl)
- (set-case-syntax-pair ?Έ ?έ tbl)
- (set-case-syntax-pair ?Ὴ ?ὴ tbl)
- (set-case-syntax-pair ?Ή ?ή tbl)
- (set-case-syntax-pair ?ῌ ?ῃ tbl)
- (set-case-syntax-pair ?Ῐ ?ῐ tbl)
- (set-case-syntax-pair ?Ῑ ?ῑ tbl)
- (set-case-syntax-pair ?Ὶ ?ὶ tbl)
- (set-case-syntax-pair ?Ί ?ί tbl)
- (set-case-syntax-pair ?Ῠ ?ῠ tbl)
- (set-case-syntax-pair ?Ῡ ?ῡ tbl)
- (set-case-syntax-pair ?Ὺ ?ὺ tbl)
- (set-case-syntax-pair ?Ύ ?ύ tbl)
- (set-case-syntax-pair ?Ῥ ?ῥ tbl)
- (set-case-syntax-pair ?Ὸ ?ὸ tbl)
- (set-case-syntax-pair ?Ό ?ό tbl)
- (set-case-syntax-pair ?Ὼ ?ὼ tbl)
- (set-case-syntax-pair ?Ώ ?ώ tbl)
- (set-case-syntax-pair ?ῼ ?ῳ tbl)
;; cyrillic
(modify-category-entry '(#x0400 . #x04FF) ?y)
- (setq c #x0400)
- (while (<= c #x04ff)
- (and (>= c #x0400)
- (<= c #x040f)
- (set-case-syntax-pair c (+ c 80) tbl))
- (and (>= c #x0410)
- (<= c #x042f)
- (set-case-syntax-pair c (+ c 32) tbl))
- (and (zerop (% c 2))
- (or (and (>= c #x0460) (<= c #x0480))
- (and (>= c #x048c) (<= c #x04be))
- (and (>= c #x04d0) (<= c #x052e)))
- (set-case-syntax-pair c (1+ c) tbl))
- (setq c (1+ c)))
- (set-case-syntax-pair ?Ӂ ?ӂ tbl)
- (set-case-syntax-pair ?Ӄ ?ӄ tbl)
- (set-case-syntax-pair ?Ӈ ?ӈ tbl)
- (set-case-syntax-pair ?Ӌ ?ӌ tbl)
-
(modify-category-entry '(#xA640 . #xA69F) ?y)
- (setq c #xA640)
- (while (<= c #xA66C)
- (set-case-syntax-pair c (+ c 1) tbl)
- (setq c (+ c 2)))
- (setq c #xA680)
- (while (<= c #xA69A)
- (set-case-syntax-pair c (+ c 1) tbl)
- (setq c (+ c 2)))
;; Georgian
(setq c #x10A0)
- (while (<= c #x10CD)
- (set-case-syntax-pair c (+ c #x1C60) tbl)
- (setq c (1+ c)))
;; Cyrillic Extended-C
(modify-category-entry '(#x1C80 . #x1C8F) ?y)
@@ -844,12 +619,6 @@ ?L
(set-case-syntax c "." tbl)
(setq c (1+ c)))
- ;; Roman numerals
- (setq c #x2160)
- (while (<= c #x216f)
- (set-case-syntax-pair c (+ c #x10) tbl)
- (setq c (1+ c)))
-
;; Fixme: The following blocks might be better as symbol rather than
;; punctuation.
;; Arrows
@@ -873,25 +642,11 @@ ?L
;; Circled Latin
(setq c #x24b6)
(while (<= c #x24cf)
- (set-case-syntax-pair c (+ c 26) tbl)
(modify-category-entry c ?l)
(modify-category-entry (+ c 26) ?l)
(setq c (1+ c)))
- ;; Glagolitic
- (setq c #x2C00)
- (while (<= c #x2C2E)
- (set-case-syntax-pair c (+ c 48) tbl)
- (setq c (1+ c)))
-
;; Coptic
- (let ((pair-ranges '((#x2C80 . #x2CE2)
- (#x2CEB . #x2CF2))))
- (dolist (elt pair-ranges)
- (let ((from (car elt)) (to (cdr elt)))
- (while (< from to)
- (set-case-syntax-pair from (1+ from) tbl)
- (setq from (+ from 2))))))
;; There's no Coptic category. However, Coptic letters that are
;; part of the Greek block above get the Greek category, and those
;; in this block are derived from Greek letters, so let's be
@@ -901,45 +656,78 @@ ?L
;; Fullwidth Latin
(setq c #xff21)
(while (<= c #xff3a)
- (set-case-syntax-pair c (+ c #x20) tbl)
(modify-category-entry c ?l)
(modify-category-entry (+ c #x20) ?l)
(setq c (1+ c)))
- ;; Deseret
- (setq c #x10400)
- (while (<= c #x10427)
- (set-case-syntax-pair c (+ c 28) tbl)
- (setq c (1+ c)))
+ ;; Combining diacritics
+ (modify-category-entry '(#x300 . #x362) ?^)
+ ;; Combining marks
+ (modify-category-entry '(#x20d0 . #x20ff) ?^)
- ;; Osage
- (setq c #x104B0)
- (while (<= c #x104D3)
- (set-case-syntax-pair c (+ c 40) tbl)
- (setq c (1+ c)))
+ ;; Set all Letter, uppercase; Letter, lowercase and Letter, titlecase syntax
+ ;; to word. FIXME: Should this also be done for Letter, modifier and Letter,
+ ;; other? What about other alphabetic characters?
+ (let ((syn-tab (standard-syntax-table)))
+ (map-char-table
+ (lambda (ch cat)
+ (when (memq cat '(Lu Ll Lt))
+ (modify-syntax-entry ch "w " syn-tab)))
+ (unicode-property-table-internal 'general-category)))
- ;; Old Hungarian
- (setq c #x10c80)
- (while (<= c #x10cb2)
- (set-case-syntax-pair c (+ c #x40) tbl)
- (setq c (1+ c)))
+ ;; Set downcase and upcase from Unicode properties
- ;; Warang Citi
- (setq c #x118a0)
- (while (<= c #x118bf)
- (set-case-syntax-pair c (+ c #x20) tbl)
- (setq c (1+ c)))
+ ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I and
+ ;; U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so do U+0130
+ ;; LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN SMALL LETTER I.
- ;; Adlam
- (setq c #x1e900)
- (while (<= c #x1e921)
- (set-case-syntax-pair c (+ c #x22) tbl)
- (setq c (1+ c)))
+ ;; We used to set up half of those correspondences unconditionally, but that
+ ;; makes searches slow. So now we don't set up either half of these
+ ;; correspondences by default.
- ;; Combining diacritics
- (modify-category-entry '(#x300 . #x362) ?^)
- ;; Combining marks
- (modify-category-entry '(#x20d0 . #x20ff) ?^)
+ ;; (set-downcase-syntax ?İ ?i tbl)
+ ;; (set-upcase-syntax ?I ?ı tbl)
+
+ (let ((map-unicode-property
+ (lambda (property func)
+ (map-char-table
+ (lambda (ch cased)
+ ;; ASCII characters skipped due to reasons outlined above. As of
+ ;; Unicode 9.0, this exception affects the following:
+ ;; lc(U+0130 İ) = i
+ ;; uc(U+0131 ı) = I
+ ;; uc(U+017F ſ) = S
+ ;; lc(U+212A K) = k
+ (when (> cased 127)
+ (let ((end (if (consp ch) (cdr ch) ch)))
+ (setq ch (max 128 (if (consp ch) (car ch) ch)))
+ (while (<= ch end)
+ (funcall func ch cased)
+ (setq ch (1+ ch))))))
+ (unicode-property-table-internal property))))
+ (down tbl)
+ (up (case-table-get-table tbl 'up)))
+
+ ;; This works on an assumption that if toUpper(x) != x then toLower(x) ==
+ ;; x (and the opposite for toLower/toUpper). This doesn’t hold for title
+ ;; case characters but those incorrect mappings will be overwritten later.
+ (funcall map-unicode-property 'uppercase
+ (lambda (lc uc) (aset down lc lc) (aset up uc uc)))
+ (funcall map-unicode-property 'lowercase
+ (lambda (uc lc) (aset down lc lc) (aset up uc uc)))
+
+ ;; Now deal with the actual mapping. This will correctly assign casing for
+ ;; title-case characters.
+ (funcall map-unicode-property 'uppercase
+ (lambda (lc uc) (aset up lc uc) (aset up uc uc)))
+ (funcall map-unicode-property 'lowercase
+ (lambda (uc lc) (aset down uc lc) (aset down lc lc))))
+
+ ;; Clear out the extra slots so that they will be recomputed from the main
+ ;; (downcase) table and upcase table. Since we’re side-stepping the usual
+ ;; set-case-syntax-* functions, we need to do it explicitly.
+ (set-char-table-extra-slot tbl 1 nil)
+ (set-char-table-extra-slot tbl 2 nil)
;; Fixme: syntax for symbols &c
)
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index 4b2eeaf..ca3657d 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -72,8 +72,7 @@ casefiddle-tests--characters
(?Σ ?Σ ?σ ?Σ)
(?σ ?Σ ?σ ?Σ)
- ;; FIXME: Another broken one:
- ;;(?ς ?Σ ?ς ?Σ)
+ (?ς ?Σ ?ς ?Σ)
(?Ⅷ ?Ⅷ ?ⅷ ?Ⅷ)
(?ⅷ ?Ⅷ ?ⅷ ?Ⅷ)))
@@ -151,7 +150,6 @@ casefiddle-tests--characters
;;("fish" "FIsh" "fish" "Fish" "Fish")
;;("Straße" "STRASSE" "straße" "Straße" "Straße")
;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
- ;;("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
;; And here’s what is actually happening:
("DŽUNGLA" "DŽUNGLA" "džungla" "DŽungla" "DŽUNGLA")
("Džungla" "DžUNGLA" "džungla" "Džungla" "Džungla")
@@ -160,7 +158,8 @@ casefiddle-tests--characters
("fish" "fiSH" "fish" "fish" "fish")
("Straße" "STRAßE" "straße" "Straße" "Straße")
("ΌΣΟΣ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "ΌΣΟΣ")
- ("όσος" "ΌΣΟς" "όσος" "Όσος" "Όσος"))
+
+ ("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος"))
(nreverse errors))
(let* ((input (car test))
(expected (cdr test))
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data
2016-10-04 1:10 ` bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data Michal Nazarewicz
@ 2016-10-04 7:27 ` Eli Zaretskii
2016-10-04 14:54 ` Michal Nazarewicz
0 siblings, 1 reply; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-04 7:27 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Date: Tue, 4 Oct 2016 03:10:25 +0200
>
> + ;; Set all Letter, uppercase; Letter, lowercase and Letter, titlecase syntax
> + ;; to word. FIXME: Should this also be done for Letter, modifier and Letter,
> + ;; other? What about other alphabetic characters?
> + (let ((syn-tab (standard-syntax-table)))
> + (map-char-table
> + (lambda (ch cat)
> + (when (memq cat '(Lu Ll Lt))
> + (modify-syntax-entry ch "w " syn-tab)))
> + (unicode-property-table-internal 'general-category)))
The answer to these questions is "as required by backward
compatibility", i.e. compare with the manual setup we had until now.
If that criterion doesn't provide the full answer, I would go by
Unicode guidance, i.e. support all the case conversions specified in
the Unicode character database (UCD).
Thanks.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data
2016-10-04 7:27 ` Eli Zaretskii
@ 2016-10-04 14:54 ` Michal Nazarewicz
2016-10-04 15:06 ` Eli Zaretskii
0 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 14:54 UTC (permalink / raw)
To: Eli Zaretskii; +Cc: 24603
On Tue, Oct 04 2016, Eli Zaretskii wrote:
>> From: Michal Nazarewicz <mina86@mina86.com>
>> Date: Tue, 4 Oct 2016 03:10:25 +0200
>>
>> + ;; Set all Letter, uppercase; Letter, lowercase and Letter, titlecase syntax
>> + ;; to word. FIXME: Should this also be done for Letter, modifier and Letter,
>> + ;; other? What about other alphabetic characters?
>> + (let ((syn-tab (standard-syntax-table)))
>> + (map-char-table
>> + (lambda (ch cat)
>> + (when (memq cat '(Lu Ll Lt))
>> + (modify-syntax-entry ch "w " syn-tab)))
>> + (unicode-property-table-internal 'general-category)))
>
> The answer to these questions is "as required by backward
> compatibility", i.e. compare with the manual setup we had until now.
With that in mind, I’ve applied the following fix to this patch:
---- >8 ----------------------------------------------------------------
@@ -666,14 +666,21 @@ ?L
(modify-category-entry '(#x20d0 . #x20ff) ?^)
;; Set all Letter, uppercase; Letter, lowercase and Letter, titlecase syntax
- ;; to word. FIXME: Should this also be done for Letter, modifier and Letter,
- ;; other? What about other alphabetic characters?
+ ;; to word.
(let ((syn-tab (standard-syntax-table)))
(map-char-table
(lambda (ch cat)
(when (memq cat '(Lu Ll Lt))
(modify-syntax-entry ch "w " syn-tab)))
- (unicode-property-table-internal 'general-category)))
+ (unicode-property-table-internal 'general-category))
+
+ ;; Ⅰ through Ⅻ had word syntax in the past so set it here as well.
+ ;; General category of those characters is Number, Letter.
+ (modify-syntax-entry '(#x2160 . #x216b) "w " syn-tab)
+
+ ;; ⓐ through ⓩ are symbols, other according to Unicode but Emacs set
+ ;; their syntax to word in the past so keep backwards compatibility.
+ (modify-syntax-entry '(#x24D0 . #x24E9) "w " syn-tab))
;; Set downcase and upcase from Unicode properties
---- >8 ----------------------------------------------------------------
With that, if I run
;; Interactive helper: write the contents of the standard syntax table and
;; of the standard case table's `down' and `up' tables into the buffer
;; "*Syntax and case tables dump*", one line per char-table entry.
(defun mn-dump-tables ()
(interactive)
(switch-to-buffer (get-buffer-create "*Syntax and case tables dump*"))
;; Remove any narrowing and clear whatever an earlier run left behind.
(widen)
(delete-region (point-min) (point-max))
;; Header: the version string of the Emacs that produced this dump.
(insert (emacs-version))
(insert "\n\n")
(let ((case-tab (standard-case-table)))
;; Each EL is (LABEL . CHAR-TABLE); LABEL is printed in brackets at the
;; start of every line emitted for that table.
(dolist (el `(("Syntax" . ,(standard-syntax-table))
("Lower" . ,(case-table-get-table case-tab 'down))
("Upper" . ,(case-table-get-table case-tab 'up))))
(map-char-table
(lambda (k v)
(insert (format "[%s] " (car el)))
;; K is either a cons (FROM . TO) describing a range, or a single
;; character; both are printed as zero-padded hexadecimal.
(if (consp k)
(insert (format "%06x..%06x -> " (car k) (cdr k)))
(insert (format " %06x -> " k)))
;; Numeric values are printed in hex; any other value is printed
;; with %s.
(if (numberp v)
(insert (format "%06x\n" v))
(insert (format "%s\n" v))))
(cdr el)))))
I get the following (annotated) differences:
---- >8 ----------------------------------------------------------------
--- orig-tables.txt 2016-10-04 15:30:12.501069384 +0200
+++ modified-tables.txt 2016-10-04 15:31:55.435676953 +0200
@@ -1,4 +1,4 @@
-GNU Emacs 26.0.50.2 (x86_64-unknown-linux-gnu)
+GNU Emacs 26.0.50.3 (x86_64-unknown-linux-gnu)
of 2016-10-04
[Syntax] 000000..000008 -> (1)
@@ -44,7 +44,9 @@
[Syntax] 0000a7 -> (1)
[Syntax] 0000a8..0000aa -> (3)
[Syntax] 0000ab -> (1)
-[Syntax] 0000ac..0000b6 -> (3)
+[Syntax] 0000ac..0000b4 -> (3)
+[Syntax] 0000b5 -> (2) -- micro sign (µ) is now word
+[Syntax] 0000b6 -> (3)
[Syntax] 0000b7 -> (2)
[Syntax] 0000b8..0000ba -> (3)
[Syntax] 0000bb -> (1)
@@ -54,9 +56,7 @@
[Syntax] 0000d7 -> (3)
[Syntax] 0000d8..0000f6 -> (2)
[Syntax] 0000f7 -> (3)
-[Syntax] 0000f8..000148 -> (2)
-[Syntax] 000149 -> (3) -- ʼn is now word
-[Syntax] 00014a..0002c6 -> (2)
+[Syntax] 0000f8..0002c6 -> (2)
[Syntax] 0002c7 -> (3)
[Syntax] 0002c8 -> (2)
[Syntax] 0002c9 -> (3)
@@ -136,17 +136,11 @@
[Syntax] 002103 -> (3)
[Syntax] 002104..002108 -> (2)
[Syntax] 002109 -> (3)
-[Syntax] 00210a..002112 -> (2)
-[Syntax] 002113 -> (3) -- ℓ is now word
-[Syntax] 002114..002115 -> (2)
+[Syntax] 00210a..002115 -> (2)
[Syntax] 002116 -> (1)
[Syntax] 002117..002120 -> (2)
[Syntax] 002121..002122 -> (3)
-[Syntax] 002123..002125 -> (2)
-[Syntax] 002126 -> (3) -- Ohm sign (Ω) is now word
-[Syntax] 002127..00212a -> (2)
-[Syntax] 00212b -> (3) -- Angstrom sign (Å) is now word
-[Syntax] 00212c..002152 -> (2)
+[Syntax] 002123..002152 -> (2)
[Syntax] 002153..002154 -> (3)
[Syntax] 002155..00215a -> (2)
[Syntax] 00215b..00215e -> (3)
… continued below …
---- >8 ----------------------------------------------------------------
Changes to micro, Ohm and Angstrom signs may be controversial (I
could never understand why those characters are in Unicode) but the
change brings the syntax table on par with Unicode and how other
similar characters are handled (namely, I’m thinking of U+212A:
Kelvin sign).
Changes to ʼn and ℓ seem uncontroversial to me.
Attached below are changes to down and up case tables. I haven’t
looked at every one of the changes but they appear legitimate to me:
- some additions are for ‘foo becomes foo’ mappings,
- many are for legitimate missing mappings and
- there are also some fixes to incorrect mapping such as
𐐀 (Deseret capital letter long i) being mapped to
𐐜 (Deseret capital letter thee) instead of
𐐨 (Deseret small letter long i).
---- >8 ----------------------------------------------------------------
… continuation from above …
@@ -677,6 +671,7 @@
[Lower] 00019a -> 00019a
[Lower] 00019c -> 00026f
[Lower] 00019d -> 000272
+[Lower] 00019e -> 00019e
[Lower] 00019f -> 000275
[Lower] 0001a0..0001a1 -> 0001a1
[Lower] 0001a2..0001a3 -> 0001a3
@@ -740,6 +735,7 @@
[Lower] 00021a..00021b -> 00021b
[Lower] 00021c..00021d -> 00021d
[Lower] 00021e..00021f -> 00021f
+[Lower] 000220 -> 00019e
[Lower] 000222..000223 -> 000223
[Lower] 000224..000225 -> 000225
[Lower] 000226..000227 -> 000227
@@ -777,6 +773,7 @@
[Lower] 000260 -> 000260
[Lower] 000261 -> 000261
[Lower] 000263 -> 000263
+[Lower] 000265 -> 000265
[Lower] 000266 -> 000266
[Lower] 000268 -> 000268
[Lower] 000269 -> 000269
@@ -799,6 +796,14 @@
[Lower] 000292 -> 000292
[Lower] 00029d -> 00029d
[Lower] 00029e -> 00029e
+[Lower] 000345 -> 000345
+[Lower] 000370..000371 -> 000371
+[Lower] 000372..000373 -> 000373
+[Lower] 000376..000377 -> 000377
+[Lower] 00037b -> 00037b
+[Lower] 00037c -> 00037c
+[Lower] 00037d -> 00037d
+[Lower] 00037f -> 0003f3
[Lower] 000386 -> 0003ac
[Lower] 000388 -> 0003ad
[Lower] 000389 -> 0003ae
@@ -853,6 +858,7 @@
[Lower] 0003bf -> 0003bf
[Lower] 0003c0 -> 0003c0
[Lower] 0003c1 -> 0003c1
+[Lower] 0003c2 -> 0003c2
[Lower] 0003c3 -> 0003c3
[Lower] 0003c4 -> 0003c4
[Lower] 0003c5 -> 0003c5
@@ -865,6 +871,13 @@
[Lower] 0003cc -> 0003cc
[Lower] 0003cd -> 0003cd
[Lower] 0003ce -> 0003ce
+[Lower] 0003cf -> 0003d7
+[Lower] 0003d0 -> 0003d0
+[Lower] 0003d1 -> 0003d1
+[Lower] 0003d5 -> 0003d5
+[Lower] 0003d6 -> 0003d6
+[Lower] 0003d7 -> 0003d7
+[Lower] 0003d8..0003d9 -> 0003d9
[Lower] 0003da..0003db -> 0003db
[Lower] 0003dc..0003dd -> 0003dd
[Lower] 0003de..0003df -> 0003df
@@ -876,6 +889,18 @@
[Lower] 0003ea..0003eb -> 0003eb
[Lower] 0003ec..0003ed -> 0003ed
[Lower] 0003ee..0003ef -> 0003ef
+[Lower] 0003f0 -> 0003f0
+[Lower] 0003f1 -> 0003f1
+[Lower] 0003f2 -> 0003f2
+[Lower] 0003f3 -> 0003f3
+[Lower] 0003f4 -> 0003b8
+[Lower] 0003f5 -> 0003f5
+[Lower] 0003f7..0003f8 -> 0003f8
+[Lower] 0003f9 -> 0003f2
+[Lower] 0003fa..0003fb -> 0003fb
+[Lower] 0003fd -> 00037b
+[Lower] 0003fe -> 00037c
+[Lower] 0003ff -> 00037d
[Lower] 000400 -> 000450
[Lower] 000401 -> 000451
[Lower] 000402 -> 000452
@@ -989,6 +1014,7 @@
[Lower] 00047c..00047d -> 00047d
[Lower] 00047e..00047f -> 00047f
[Lower] 000480..000481 -> 000481
+[Lower] 00048a..00048b -> 00048b
[Lower] 00048c..00048d -> 00048d
[Lower] 00048e..00048f -> 00048f
[Lower] 000490..000491 -> 000491
@@ -1015,10 +1041,15 @@
[Lower] 0004ba..0004bb -> 0004bb
[Lower] 0004bc..0004bd -> 0004bd
[Lower] 0004be..0004bf -> 0004bf
+[Lower] 0004c0 -> 0004cf
[Lower] 0004c1..0004c2 -> 0004c2
[Lower] 0004c3..0004c4 -> 0004c4
+[Lower] 0004c5..0004c6 -> 0004c6
[Lower] 0004c7..0004c8 -> 0004c8
+[Lower] 0004c9..0004ca -> 0004ca
[Lower] 0004cb..0004cc -> 0004cc
+[Lower] 0004cd..0004ce -> 0004ce
+[Lower] 0004cf -> 0004cf
[Lower] 0004d0..0004d1 -> 0004d1
[Lower] 0004d2..0004d3 -> 0004d3
[Lower] 0004d4..0004d5 -> 0004d5
@@ -1043,6 +1074,30 @@
[Lower] 0004fa..0004fb -> 0004fb
[Lower] 0004fc..0004fd -> 0004fd
[Lower] 0004fe..0004ff -> 0004ff
+[Lower] 000500..000501 -> 000501
+[Lower] 000502..000503 -> 000503
+[Lower] 000504..000505 -> 000505
+[Lower] 000506..000507 -> 000507
+[Lower] 000508..000509 -> 000509
+[Lower] 00050a..00050b -> 00050b
+[Lower] 00050c..00050d -> 00050d
+[Lower] 00050e..00050f -> 00050f
+[Lower] 000510..000511 -> 000511
+[Lower] 000512..000513 -> 000513
+[Lower] 000514..000515 -> 000515
+[Lower] 000516..000517 -> 000517
+[Lower] 000518..000519 -> 000519
+[Lower] 00051a..00051b -> 00051b
+[Lower] 00051c..00051d -> 00051d
+[Lower] 00051e..00051f -> 00051f
+[Lower] 000520..000521 -> 000521
+[Lower] 000522..000523 -> 000523
+[Lower] 000524..000525 -> 000525
+[Lower] 000526..000527 -> 000527
+[Lower] 000528..000529 -> 000529
+[Lower] 00052a..00052b -> 00052b
+[Lower] 00052c..00052d -> 00052d
+[Lower] 00052e..00052f -> 00052f
[Lower] 000531 -> 000561
[Lower] 000532 -> 000562
[Lower] 000533 -> 000563
@@ -1157,14 +1212,109 @@
[Lower] 0010c3 -> 002d23
[Lower] 0010c4 -> 002d24
[Lower] 0010c5 -> 002d25
-[Lower] 0010c6 -> 002d26
[Lower] 0010c7 -> 002d27
-[Lower] 0010c8 -> 002d28
-[Lower] 0010c9 -> 002d29
-[Lower] 0010ca -> 002d2a
-[Lower] 0010cb -> 002d2b
-[Lower] 0010cc -> 002d2c
[Lower] 0010cd -> 002d2d
+[Lower] 0013a0 -> 00ab70
+[Lower] 0013a1 -> 00ab71
+[Lower] 0013a2 -> 00ab72
+[Lower] 0013a3 -> 00ab73
+[Lower] 0013a4 -> 00ab74
+[Lower] 0013a5 -> 00ab75
+[Lower] 0013a6 -> 00ab76
+[Lower] 0013a7 -> 00ab77
+[Lower] 0013a8 -> 00ab78
+[Lower] 0013a9 -> 00ab79
+[Lower] 0013aa -> 00ab7a
+[Lower] 0013ab -> 00ab7b
+[Lower] 0013ac -> 00ab7c
+[Lower] 0013ad -> 00ab7d
+[Lower] 0013ae -> 00ab7e
+[Lower] 0013af -> 00ab7f
+[Lower] 0013b0 -> 00ab80
+[Lower] 0013b1 -> 00ab81
+[Lower] 0013b2 -> 00ab82
+[Lower] 0013b3 -> 00ab83
+[Lower] 0013b4 -> 00ab84
+[Lower] 0013b5 -> 00ab85
+[Lower] 0013b6 -> 00ab86
+[Lower] 0013b7 -> 00ab87
+[Lower] 0013b8 -> 00ab88
+[Lower] 0013b9 -> 00ab89
+[Lower] 0013ba -> 00ab8a
+[Lower] 0013bb -> 00ab8b
+[Lower] 0013bc -> 00ab8c
+[Lower] 0013bd -> 00ab8d
+[Lower] 0013be -> 00ab8e
+[Lower] 0013bf -> 00ab8f
+[Lower] 0013c0 -> 00ab90
+[Lower] 0013c1 -> 00ab91
+[Lower] 0013c2 -> 00ab92
+[Lower] 0013c3 -> 00ab93
+[Lower] 0013c4 -> 00ab94
+[Lower] 0013c5 -> 00ab95
+[Lower] 0013c6 -> 00ab96
+[Lower] 0013c7 -> 00ab97
+[Lower] 0013c8 -> 00ab98
+[Lower] 0013c9 -> 00ab99
+[Lower] 0013ca -> 00ab9a
+[Lower] 0013cb -> 00ab9b
+[Lower] 0013cc -> 00ab9c
+[Lower] 0013cd -> 00ab9d
+[Lower] 0013ce -> 00ab9e
+[Lower] 0013cf -> 00ab9f
+[Lower] 0013d0 -> 00aba0
+[Lower] 0013d1 -> 00aba1
+[Lower] 0013d2 -> 00aba2
+[Lower] 0013d3 -> 00aba3
+[Lower] 0013d4 -> 00aba4
+[Lower] 0013d5 -> 00aba5
+[Lower] 0013d6 -> 00aba6
+[Lower] 0013d7 -> 00aba7
+[Lower] 0013d8 -> 00aba8
+[Lower] 0013d9 -> 00aba9
+[Lower] 0013da -> 00abaa
+[Lower] 0013db -> 00abab
+[Lower] 0013dc -> 00abac
+[Lower] 0013dd -> 00abad
+[Lower] 0013de -> 00abae
+[Lower] 0013df -> 00abaf
+[Lower] 0013e0 -> 00abb0
+[Lower] 0013e1 -> 00abb1
+[Lower] 0013e2 -> 00abb2
+[Lower] 0013e3 -> 00abb3
+[Lower] 0013e4 -> 00abb4
+[Lower] 0013e5 -> 00abb5
+[Lower] 0013e6 -> 00abb6
+[Lower] 0013e7 -> 00abb7
+[Lower] 0013e8 -> 00abb8
+[Lower] 0013e9 -> 00abb9
+[Lower] 0013ea -> 00abba
+[Lower] 0013eb -> 00abbb
+[Lower] 0013ec -> 00abbc
+[Lower] 0013ed -> 00abbd
+[Lower] 0013ee -> 00abbe
+[Lower] 0013ef -> 00abbf
+[Lower] 0013f0 -> 0013f8
+[Lower] 0013f1 -> 0013f9
+[Lower] 0013f2 -> 0013fa
+[Lower] 0013f3 -> 0013fb
+[Lower] 0013f4 -> 0013fc
+[Lower] 0013f5 -> 0013fd
+[Lower] 0013f8 -> 0013f8
+[Lower] 0013f9 -> 0013f9
+[Lower] 0013fa -> 0013fa
+[Lower] 0013fb -> 0013fb
+[Lower] 0013fc -> 0013fc
+[Lower] 0013fd -> 0013fd
+[Lower] 001c80 -> 001c80
+[Lower] 001c81 -> 001c81
+[Lower] 001c82 -> 001c82
+[Lower] 001c83 -> 001c83
+[Lower] 001c84 -> 001c84
+[Lower] 001c85 -> 001c85
+[Lower] 001c86 -> 001c86
+[Lower] 001c87 -> 001c87
+[Lower] 001c88 -> 001c88
[Lower] 001d79 -> 001d79
[Lower] 001d7d -> 001d7d
[Lower] 001e00..001e01 -> 001e01
@@ -1242,6 +1392,8 @@
[Lower] 001e90..001e91 -> 001e91
[Lower] 001e92..001e93 -> 001e93
[Lower] 001e94..001e95 -> 001e95
+[Lower] 001e9b -> 001e9b
+[Lower] 001e9e -> 0000df
[Lower] 001ea0..001ea1 -> 001ea1
[Lower] 001ea2..001ea3 -> 001ea3
[Lower] 001ea4..001ea5 -> 001ea5
@@ -1287,6 +1439,9 @@
[Lower] 001ef4..001ef5 -> 001ef5
[Lower] 001ef6..001ef7 -> 001ef7
[Lower] 001ef8..001ef9 -> 001ef9
+[Lower] 001efa..001efb -> 001efb
+[Lower] 001efc..001efd -> 001efd
+[Lower] 001efe..001eff -> 001eff
[Lower] 001f00 -> 001f00
[Lower] 001f01 -> 001f01
[Lower] 001f02 -> 001f02
@@ -1353,22 +1508,20 @@
[Lower] 001f43 -> 001f43
[Lower] 001f44 -> 001f44
[Lower] 001f45 -> 001f45
-[Lower] 001f46 -> 001f46
-[Lower] 001f47 -> 001f47
[Lower] 001f48 -> 001f40
[Lower] 001f49 -> 001f41
[Lower] 001f4a -> 001f42
[Lower] 001f4b -> 001f43
[Lower] 001f4c -> 001f44
[Lower] 001f4d -> 001f45
-[Lower] 001f4e -> 001f46
-[Lower] 001f4f -> 001f47
[Lower] 001f51 -> 001f51
[Lower] 001f53 -> 001f53
[Lower] 001f55 -> 001f55
+[Lower] 001f57 -> 001f57
[Lower] 001f59 -> 001f51
[Lower] 001f5b -> 001f53
[Lower] 001f5d -> 001f55
+[Lower] 001f5f -> 001f57
[Lower] 001f60 -> 001f60
[Lower] 001f61 -> 001f61
[Lower] 001f62 -> 001f62
@@ -1455,6 +1608,7 @@
[Lower] 001fba -> 001f70
[Lower] 001fbb -> 001f71
[Lower] 001fbc -> 001fb3
+[Lower] 001fbe -> 001fbe
[Lower] 001fc3 -> 001fc3
[Lower] 001fc8 -> 001f72
[Lower] 001fc9 -> 001f73
@@ -1593,6 +1747,10 @@
[Lower] 00206d -> 00206d
[Lower] 00206e -> 00206e
[Lower] 00206f -> 00206f
+[Lower] 002126 -> 0003c9
+[Lower] 00212b -> 0000e5
+[Lower] 002132 -> 00214e
+[Lower] 00214e -> 00214e
[Lower] 002160 -> 002170
[Lower] 002161 -> 002171
[Lower] 002162 -> 002172
@@ -1625,6 +1783,7 @@
[Lower] 00217d -> 00217d
[Lower] 00217e -> 00217e
[Lower] 00217f -> 00217f
+[Lower] 002183..002184 -> 002184
[Lower] 002190 -> 002190
[Lower] 002191 -> 002191
[Lower] 002192 -> 002192
@@ -2525,10 +2684,10 @@
[Lower] 002cdc..002cdd -> 002cdd
[Lower] 002cde..002cdf -> 002cdf
[Lower] 002ce0..002ce1 -> 002ce1
+[Lower] 002ce2..002ce3 -> 002ce3
[Lower] 002ceb..002cec -> 002cec
[Lower] 002ced..002cee -> 002cee
-[Lower] 002cef..002cf0 -> 002cf0
-[Lower] 002cf1..002cf2 -> 002cf2
+[Lower] 002cf2..002cf3 -> 002cf3
[Lower] 002d00 -> 002d00
[Lower] 002d01 -> 002d01
[Lower] 002d02 -> 002d02
@@ -2567,13 +2726,7 @@
[Lower] 002d23 -> 002d23
[Lower] 002d24 -> 002d24
[Lower] 002d25 -> 002d25
-[Lower] 002d26 -> 002d26
[Lower] 002d27 -> 002d27
-[Lower] 002d28 -> 002d28
-[Lower] 002d29 -> 002d29
-[Lower] 002d2a -> 002d2a
-[Lower] 002d2b -> 002d2b
-[Lower] 002d2c -> 002d2c
[Lower] 002d2d -> 002d2d
[Lower] 00a640..00a641 -> 00a641
[Lower] 00a642..00a643 -> 00a643
@@ -2659,7 +2812,7 @@
[Lower] 00a784..00a785 -> 00a785
[Lower] 00a786..00a787 -> 00a787
[Lower] 00a78b..00a78c -> 00a78c
-[Lower] 00a78d..00a78e -> 00a78e
+[Lower] 00a78d -> 000265
[Lower] 00a790..00a791 -> 00a791
[Lower] 00a792..00a793 -> 00a793
[Lower] 00a796..00a797 -> 00a797
@@ -2684,6 +2837,86 @@
[Lower] 00a7b4..00a7b5 -> 00a7b5
[Lower] 00a7b6..00a7b7 -> 00a7b7
[Lower] 00ab53 -> 00ab53
+[Lower] 00ab70 -> 00ab70
+[Lower] 00ab71 -> 00ab71
+[Lower] 00ab72 -> 00ab72
+[Lower] 00ab73 -> 00ab73
+[Lower] 00ab74 -> 00ab74
+[Lower] 00ab75 -> 00ab75
+[Lower] 00ab76 -> 00ab76
+[Lower] 00ab77 -> 00ab77
+[Lower] 00ab78 -> 00ab78
+[Lower] 00ab79 -> 00ab79
+[Lower] 00ab7a -> 00ab7a
+[Lower] 00ab7b -> 00ab7b
+[Lower] 00ab7c -> 00ab7c
+[Lower] 00ab7d -> 00ab7d
+[Lower] 00ab7e -> 00ab7e
+[Lower] 00ab7f -> 00ab7f
+[Lower] 00ab80 -> 00ab80
+[Lower] 00ab81 -> 00ab81
+[Lower] 00ab82 -> 00ab82
+[Lower] 00ab83 -> 00ab83
+[Lower] 00ab84 -> 00ab84
+[Lower] 00ab85 -> 00ab85
+[Lower] 00ab86 -> 00ab86
+[Lower] 00ab87 -> 00ab87
+[Lower] 00ab88 -> 00ab88
+[Lower] 00ab89 -> 00ab89
+[Lower] 00ab8a -> 00ab8a
+[Lower] 00ab8b -> 00ab8b
+[Lower] 00ab8c -> 00ab8c
+[Lower] 00ab8d -> 00ab8d
+[Lower] 00ab8e -> 00ab8e
+[Lower] 00ab8f -> 00ab8f
+[Lower] 00ab90 -> 00ab90
+[Lower] 00ab91 -> 00ab91
+[Lower] 00ab92 -> 00ab92
+[Lower] 00ab93 -> 00ab93
+[Lower] 00ab94 -> 00ab94
+[Lower] 00ab95 -> 00ab95
+[Lower] 00ab96 -> 00ab96
+[Lower] 00ab97 -> 00ab97
+[Lower] 00ab98 -> 00ab98
+[Lower] 00ab99 -> 00ab99
+[Lower] 00ab9a -> 00ab9a
+[Lower] 00ab9b -> 00ab9b
+[Lower] 00ab9c -> 00ab9c
+[Lower] 00ab9d -> 00ab9d
+[Lower] 00ab9e -> 00ab9e
+[Lower] 00ab9f -> 00ab9f
+[Lower] 00aba0 -> 00aba0
+[Lower] 00aba1 -> 00aba1
+[Lower] 00aba2 -> 00aba2
+[Lower] 00aba3 -> 00aba3
+[Lower] 00aba4 -> 00aba4
+[Lower] 00aba5 -> 00aba5
+[Lower] 00aba6 -> 00aba6
+[Lower] 00aba7 -> 00aba7
+[Lower] 00aba8 -> 00aba8
+[Lower] 00aba9 -> 00aba9
+[Lower] 00abaa -> 00abaa
+[Lower] 00abab -> 00abab
+[Lower] 00abac -> 00abac
+[Lower] 00abad -> 00abad
+[Lower] 00abae -> 00abae
+[Lower] 00abaf -> 00abaf
+[Lower] 00abb0 -> 00abb0
+[Lower] 00abb1 -> 00abb1
+[Lower] 00abb2 -> 00abb2
+[Lower] 00abb3 -> 00abb3
+[Lower] 00abb4 -> 00abb4
+[Lower] 00abb5 -> 00abb5
+[Lower] 00abb6 -> 00abb6
+[Lower] 00abb7 -> 00abb7
+[Lower] 00abb8 -> 00abb8
+[Lower] 00abb9 -> 00abb9
+[Lower] 00abba -> 00abba
+[Lower] 00abbb -> 00abbb
+[Lower] 00abbc -> 00abbc
+[Lower] 00abbd -> 00abbd
+[Lower] 00abbe -> 00abbe
+[Lower] 00abbf -> 00abbf
[Lower] 00ff21 -> 00ff41
[Lower] 00ff22 -> 00ff42
[Lower] 00ff23 -> 00ff43
@@ -2736,46 +2969,46 @@
[Lower] 00ff58 -> 00ff58
[Lower] 00ff59 -> 00ff59
[Lower] 00ff5a -> 00ff5a
-[Lower] 010400 -> 01041c
-[Lower] 010401 -> 01041d
-[Lower] 010402 -> 01041e
-[Lower] 010403 -> 01041f
-[Lower] 010404 -> 010420
-[Lower] 010405 -> 010421
-[Lower] 010406 -> 010422
-[Lower] 010407 -> 010423
-[Lower] 010408 -> 010424
-[Lower] 010409 -> 010425
-[Lower] 01040a -> 010426
-[Lower] 01040b -> 010427
-[Lower] 01040c -> 010428
-[Lower] 01040d -> 010429
-[Lower] 01040e -> 01042a
-[Lower] 01040f -> 01042b
-[Lower] 010410 -> 01042c
-[Lower] 010411 -> 01042d
-[Lower] 010412 -> 01042e
-[Lower] 010413 -> 01042f
-[Lower] 010414 -> 010430
-[Lower] 010415 -> 010431
-[Lower] 010416 -> 010432
-[Lower] 010417 -> 010433
-[Lower] 010418 -> 010434
-[Lower] 010419 -> 010435
-[Lower] 01041a -> 010436
-[Lower] 01041b -> 010437
-[Lower] 01041c -> 010438
-[Lower] 01041d -> 010439
-[Lower] 01041e -> 01043a
-[Lower] 01041f -> 01043b
-[Lower] 010420 -> 01043c
-[Lower] 010421 -> 01043d
-[Lower] 010422 -> 01043e
-[Lower] 010423 -> 01043f
-[Lower] 010424 -> 010440
-[Lower] 010425 -> 010441
-[Lower] 010426 -> 010442
-[Lower] 010427 -> 010443
+[Lower] 010400 -> 010428
+[Lower] 010401 -> 010429
+[Lower] 010402 -> 01042a
+[Lower] 010403 -> 01042b
+[Lower] 010404 -> 01042c
+[Lower] 010405 -> 01042d
+[Lower] 010406 -> 01042e
+[Lower] 010407 -> 01042f
+[Lower] 010408 -> 010430
+[Lower] 010409 -> 010431
+[Lower] 01040a -> 010432
+[Lower] 01040b -> 010433
+[Lower] 01040c -> 010434
+[Lower] 01040d -> 010435
+[Lower] 01040e -> 010436
+[Lower] 01040f -> 010437
+[Lower] 010410 -> 010438
+[Lower] 010411 -> 010439
+[Lower] 010412 -> 01043a
+[Lower] 010413 -> 01043b
+[Lower] 010414 -> 01043c
+[Lower] 010415 -> 01043d
+[Lower] 010416 -> 01043e
+[Lower] 010417 -> 01043f
+[Lower] 010418 -> 010440
+[Lower] 010419 -> 010441
+[Lower] 01041a -> 010442
+[Lower] 01041b -> 010443
+[Lower] 01041c -> 010444
+[Lower] 01041d -> 010445
+[Lower] 01041e -> 010446
+[Lower] 01041f -> 010447
+[Lower] 010420 -> 010448
+[Lower] 010421 -> 010449
+[Lower] 010422 -> 01044a
+[Lower] 010423 -> 01044b
+[Lower] 010424 -> 01044c
+[Lower] 010425 -> 01044d
+[Lower] 010426 -> 01044e
+[Lower] 010427 -> 01044f
[Lower] 010428 -> 010428
[Lower] 010429 -> 010429
[Lower] 01042a -> 01042a
@@ -2804,6 +3037,18 @@
[Lower] 010441 -> 010441
[Lower] 010442 -> 010442
[Lower] 010443 -> 010443
+[Lower] 010444 -> 010444
+[Lower] 010445 -> 010445
+[Lower] 010446 -> 010446
+[Lower] 010447 -> 010447
+[Lower] 010448 -> 010448
+[Lower] 010449 -> 010449
+[Lower] 01044a -> 01044a
+[Lower] 01044b -> 01044b
+[Lower] 01044c -> 01044c
+[Lower] 01044d -> 01044d
+[Lower] 01044e -> 01044e
+[Lower] 01044f -> 01044f
[Lower] 0104b0 -> 0104d8
[Lower] 0104b1 -> 0104d9
[Lower] 0104b2 -> 0104da
@@ -3307,7 +3552,7 @@
[Upper] 0000ae -> 0000ae
[Upper] 0000b0 -> 0000b0
[Upper] 0000b1 -> 0000b1
-[Upper] 0000b5 -> 0000b5
+[Upper] 0000b5 -> 0003bc
[Upper] 0000b7 -> 0000b7
[Upper] 0000bb -> 0000bb
[Upper] 0000bc -> 0000bc
@@ -3462,6 +3707,7 @@
[Upper] 00019a -> 00019a
[Upper] 00019c -> 00026f
[Upper] 00019d -> 000272
+[Upper] 00019e -> 00019e
[Upper] 00019f -> 000275
[Upper] 0001a0..0001a1 -> 0001a1
[Upper] 0001a2..0001a3 -> 0001a3
@@ -3525,6 +3771,7 @@
[Upper] 00021a..00021b -> 00021b
[Upper] 00021c..00021d -> 00021d
[Upper] 00021e..00021f -> 00021f
+[Upper] 000220 -> 00019e
[Upper] 000222..000223 -> 000223
[Upper] 000224..000225 -> 000225
[Upper] 000226..000227 -> 000227
@@ -3562,6 +3809,7 @@
[Upper] 000260 -> 000260
[Upper] 000261 -> 000261
[Upper] 000263 -> 000263
+[Upper] 000265 -> 000265
[Upper] 000266 -> 000266
[Upper] 000268 -> 000268
[Upper] 000269 -> 000269
@@ -3584,6 +3832,14 @@
[Upper] 000292 -> 000292
[Upper] 00029d -> 00029d
[Upper] 00029e -> 00029e
+[Upper] 000345 -> 0003b9
+[Upper] 000370..000371 -> 000371
+[Upper] 000372..000373 -> 000373
+[Upper] 000376..000377 -> 000377
+[Upper] 00037b -> 00037b
+[Upper] 00037c -> 00037c
+[Upper] 00037d -> 00037d
+[Upper] 00037f -> 0003f3
[Upper] 000386 -> 0003ac
[Upper] 000388 -> 0003ad
[Upper] 000389 -> 0003ae
@@ -3638,7 +3894,7 @@
[Upper] 0003bf -> 0003bf
[Upper] 0003c0 -> 0003c0
[Upper] 0003c1 -> 0003c1
-[Upper] 0003c3 -> 0003c3
+[Upper] 0003c2..0003c3 -> 0003c3
[Upper] 0003c4 -> 0003c4
[Upper] 0003c5 -> 0003c5
[Upper] 0003c6 -> 0003c6
@@ -3650,6 +3906,13 @@
[Upper] 0003cc -> 0003cc
[Upper] 0003cd -> 0003cd
[Upper] 0003ce -> 0003ce
+[Upper] 0003cf -> 0003d7
+[Upper] 0003d0 -> 0003b2
+[Upper] 0003d1 -> 0003b8
+[Upper] 0003d5 -> 0003c6
+[Upper] 0003d6 -> 0003c0
+[Upper] 0003d7 -> 0003d7
+[Upper] 0003d8..0003d9 -> 0003d9
[Upper] 0003da..0003db -> 0003db
[Upper] 0003dc..0003dd -> 0003dd
[Upper] 0003de..0003df -> 0003df
@@ -3661,6 +3924,18 @@
[Upper] 0003ea..0003eb -> 0003eb
[Upper] 0003ec..0003ed -> 0003ed
[Upper] 0003ee..0003ef -> 0003ef
+[Upper] 0003f0 -> 0003ba
+[Upper] 0003f1 -> 0003c1
+[Upper] 0003f2 -> 0003f2
+[Upper] 0003f3 -> 0003f3
+[Upper] 0003f4 -> 0003b8
+[Upper] 0003f5 -> 0003b5
+[Upper] 0003f7..0003f8 -> 0003f8
+[Upper] 0003f9 -> 0003f2
+[Upper] 0003fa..0003fb -> 0003fb
+[Upper] 0003fd -> 00037b
+[Upper] 0003fe -> 00037c
+[Upper] 0003ff -> 00037d
[Upper] 000400 -> 000450
[Upper] 000401 -> 000451
[Upper] 000402 -> 000452
@@ -3774,6 +4049,7 @@
[Upper] 00047c..00047d -> 00047d
[Upper] 00047e..00047f -> 00047f
[Upper] 000480..000481 -> 000481
+[Upper] 00048a..00048b -> 00048b
[Upper] 00048c..00048d -> 00048d
[Upper] 00048e..00048f -> 00048f
[Upper] 000490..000491 -> 000491
@@ -3800,10 +4076,15 @@
[Upper] 0004ba..0004bb -> 0004bb
[Upper] 0004bc..0004bd -> 0004bd
[Upper] 0004be..0004bf -> 0004bf
+[Upper] 0004c0 -> 0004cf
[Upper] 0004c1..0004c2 -> 0004c2
[Upper] 0004c3..0004c4 -> 0004c4
+[Upper] 0004c5..0004c6 -> 0004c6
[Upper] 0004c7..0004c8 -> 0004c8
+[Upper] 0004c9..0004ca -> 0004ca
[Upper] 0004cb..0004cc -> 0004cc
+[Upper] 0004cd..0004ce -> 0004ce
+[Upper] 0004cf -> 0004cf
[Upper] 0004d0..0004d1 -> 0004d1
[Upper] 0004d2..0004d3 -> 0004d3
[Upper] 0004d4..0004d5 -> 0004d5
@@ -3828,6 +4109,30 @@
[Upper] 0004fa..0004fb -> 0004fb
[Upper] 0004fc..0004fd -> 0004fd
[Upper] 0004fe..0004ff -> 0004ff
+[Upper] 000500..000501 -> 000501
+[Upper] 000502..000503 -> 000503
+[Upper] 000504..000505 -> 000505
+[Upper] 000506..000507 -> 000507
+[Upper] 000508..000509 -> 000509
+[Upper] 00050a..00050b -> 00050b
+[Upper] 00050c..00050d -> 00050d
+[Upper] 00050e..00050f -> 00050f
+[Upper] 000510..000511 -> 000511
+[Upper] 000512..000513 -> 000513
+[Upper] 000514..000515 -> 000515
+[Upper] 000516..000517 -> 000517
+[Upper] 000518..000519 -> 000519
+[Upper] 00051a..00051b -> 00051b
+[Upper] 00051c..00051d -> 00051d
+[Upper] 00051e..00051f -> 00051f
+[Upper] 000520..000521 -> 000521
+[Upper] 000522..000523 -> 000523
+[Upper] 000524..000525 -> 000525
+[Upper] 000526..000527 -> 000527
+[Upper] 000528..000529 -> 000529
+[Upper] 00052a..00052b -> 00052b
+[Upper] 00052c..00052d -> 00052d
+[Upper] 00052e..00052f -> 00052f
[Upper] 000531 -> 000561
[Upper] 000532 -> 000562
[Upper] 000533 -> 000563
@@ -3942,14 +4247,108 @@
[Upper] 0010c3 -> 002d23
[Upper] 0010c4 -> 002d24
[Upper] 0010c5 -> 002d25
-[Upper] 0010c6 -> 002d26
[Upper] 0010c7 -> 002d27
-[Upper] 0010c8 -> 002d28
-[Upper] 0010c9 -> 002d29
-[Upper] 0010ca -> 002d2a
-[Upper] 0010cb -> 002d2b
-[Upper] 0010cc -> 002d2c
[Upper] 0010cd -> 002d2d
+[Upper] 0013a0 -> 00ab70
+[Upper] 0013a1 -> 00ab71
+[Upper] 0013a2 -> 00ab72
+[Upper] 0013a3 -> 00ab73
+[Upper] 0013a4 -> 00ab74
+[Upper] 0013a5 -> 00ab75
+[Upper] 0013a6 -> 00ab76
+[Upper] 0013a7 -> 00ab77
+[Upper] 0013a8 -> 00ab78
+[Upper] 0013a9 -> 00ab79
+[Upper] 0013aa -> 00ab7a
+[Upper] 0013ab -> 00ab7b
+[Upper] 0013ac -> 00ab7c
+[Upper] 0013ad -> 00ab7d
+[Upper] 0013ae -> 00ab7e
+[Upper] 0013af -> 00ab7f
+[Upper] 0013b0 -> 00ab80
+[Upper] 0013b1 -> 00ab81
+[Upper] 0013b2 -> 00ab82
+[Upper] 0013b3 -> 00ab83
+[Upper] 0013b4 -> 00ab84
+[Upper] 0013b5 -> 00ab85
+[Upper] 0013b6 -> 00ab86
+[Upper] 0013b7 -> 00ab87
+[Upper] 0013b8 -> 00ab88
+[Upper] 0013b9 -> 00ab89
+[Upper] 0013ba -> 00ab8a
+[Upper] 0013bb -> 00ab8b
+[Upper] 0013bc -> 00ab8c
+[Upper] 0013bd -> 00ab8d
+[Upper] 0013be -> 00ab8e
+[Upper] 0013bf -> 00ab8f
+[Upper] 0013c0 -> 00ab90
+[Upper] 0013c1 -> 00ab91
+[Upper] 0013c2 -> 00ab92
+[Upper] 0013c3 -> 00ab93
+[Upper] 0013c4 -> 00ab94
+[Upper] 0013c5 -> 00ab95
+[Upper] 0013c6 -> 00ab96
+[Upper] 0013c7 -> 00ab97
+[Upper] 0013c8 -> 00ab98
+[Upper] 0013c9 -> 00ab99
+[Upper] 0013ca -> 00ab9a
+[Upper] 0013cb -> 00ab9b
+[Upper] 0013cc -> 00ab9c
+[Upper] 0013cd -> 00ab9d
+[Upper] 0013ce -> 00ab9e
+[Upper] 0013cf -> 00ab9f
+[Upper] 0013d0 -> 00aba0
+[Upper] 0013d1 -> 00aba1
+[Upper] 0013d2 -> 00aba2
+[Upper] 0013d3 -> 00aba3
+[Upper] 0013d4 -> 00aba4
+[Upper] 0013d5 -> 00aba5
+[Upper] 0013d6 -> 00aba6
+[Upper] 0013d7 -> 00aba7
+[Upper] 0013d8 -> 00aba8
+[Upper] 0013d9 -> 00aba9
+[Upper] 0013da -> 00abaa
+[Upper] 0013db -> 00abab
+[Upper] 0013dc -> 00abac
+[Upper] 0013dd -> 00abad
+[Upper] 0013de -> 00abae
+[Upper] 0013df -> 00abaf
+[Upper] 0013e0 -> 00abb0
+[Upper] 0013e1 -> 00abb1
+[Upper] 0013e2 -> 00abb2
+[Upper] 0013e3 -> 00abb3
+[Upper] 0013e4 -> 00abb4
+[Upper] 0013e5 -> 00abb5
+[Upper] 0013e6 -> 00abb6
+[Upper] 0013e7 -> 00abb7
+[Upper] 0013e8 -> 00abb8
+[Upper] 0013e9 -> 00abb9
+[Upper] 0013ea -> 00abba
+[Upper] 0013eb -> 00abbb
+[Upper] 0013ec -> 00abbc
+[Upper] 0013ed -> 00abbd
+[Upper] 0013ee -> 00abbe
+[Upper] 0013ef -> 00abbf
+[Upper] 0013f0 -> 0013f8
+[Upper] 0013f1 -> 0013f9
+[Upper] 0013f2 -> 0013fa
+[Upper] 0013f3 -> 0013fb
+[Upper] 0013f4 -> 0013fc
+[Upper] 0013f5 -> 0013fd
+[Upper] 0013f8 -> 0013f8
+[Upper] 0013f9 -> 0013f9
+[Upper] 0013fa -> 0013fa
+[Upper] 0013fb -> 0013fb
+[Upper] 0013fc -> 0013fc
+[Upper] 0013fd -> 0013fd
+[Upper] 001c80 -> 000432
+[Upper] 001c81 -> 000434
+[Upper] 001c82 -> 00043e
+[Upper] 001c83 -> 000441
+[Upper] 001c84..001c85 -> 000442
+[Upper] 001c86 -> 00044a
+[Upper] 001c87 -> 000463
+[Upper] 001c88 -> 00a64b
[Upper] 001d79 -> 001d79
[Upper] 001d7d -> 001d7d
[Upper] 001e00..001e01 -> 001e01
@@ -4027,6 +4426,8 @@
[Upper] 001e90..001e91 -> 001e91
[Upper] 001e92..001e93 -> 001e93
[Upper] 001e94..001e95 -> 001e95
+[Upper] 001e9b -> 001e61
+[Upper] 001e9e -> 0000df
[Upper] 001ea0..001ea1 -> 001ea1
[Upper] 001ea2..001ea3 -> 001ea3
[Upper] 001ea4..001ea5 -> 001ea5
@@ -4072,6 +4473,9 @@
[Upper] 001ef4..001ef5 -> 001ef5
[Upper] 001ef6..001ef7 -> 001ef7
[Upper] 001ef8..001ef9 -> 001ef9
+[Upper] 001efa..001efb -> 001efb
+[Upper] 001efc..001efd -> 001efd
+[Upper] 001efe..001eff -> 001eff
[Upper] 001f00 -> 001f00
[Upper] 001f01 -> 001f01
[Upper] 001f02 -> 001f02
@@ -4138,22 +4542,20 @@
[Upper] 001f43 -> 001f43
[Upper] 001f44 -> 001f44
[Upper] 001f45 -> 001f45
-[Upper] 001f46 -> 001f46
-[Upper] 001f47 -> 001f47
[Upper] 001f48 -> 001f40
[Upper] 001f49 -> 001f41
[Upper] 001f4a -> 001f42
[Upper] 001f4b -> 001f43
[Upper] 001f4c -> 001f44
[Upper] 001f4d -> 001f45
-[Upper] 001f4e -> 001f46
-[Upper] 001f4f -> 001f47
[Upper] 001f51 -> 001f51
[Upper] 001f53 -> 001f53
[Upper] 001f55 -> 001f55
+[Upper] 001f57 -> 001f57
[Upper] 001f59 -> 001f51
[Upper] 001f5b -> 001f53
[Upper] 001f5d -> 001f55
+[Upper] 001f5f -> 001f57
[Upper] 001f60 -> 001f60
[Upper] 001f61 -> 001f61
[Upper] 001f62 -> 001f62
@@ -4240,6 +4642,7 @@
[Upper] 001fba -> 001f70
[Upper] 001fbb -> 001f71
[Upper] 001fbc -> 001fb3
+[Upper] 001fbe -> 0003b9
[Upper] 001fc3 -> 001fc3
[Upper] 001fc8 -> 001f72
[Upper] 001fc9 -> 001f73
@@ -4378,6 +4781,10 @@
[Upper] 00206d -> 00206d
[Upper] 00206e -> 00206e
[Upper] 00206f -> 00206f
+[Upper] 002126 -> 0003c9
+[Upper] 00212b -> 0000e5
+[Upper] 002132 -> 00214e
+[Upper] 00214e -> 00214e
[Upper] 002160 -> 002170
[Upper] 002161 -> 002171
[Upper] 002162 -> 002172
@@ -4410,6 +4817,7 @@
[Upper] 00217d -> 00217d
[Upper] 00217e -> 00217e
[Upper] 00217f -> 00217f
+[Upper] 002183..002184 -> 002184
[Upper] 002190 -> 002190
[Upper] 002191 -> 002191
[Upper] 002192 -> 002192
@@ -5310,10 +5718,10 @@
[Upper] 002cdc..002cdd -> 002cdd
[Upper] 002cde..002cdf -> 002cdf
[Upper] 002ce0..002ce1 -> 002ce1
+[Upper] 002ce2..002ce3 -> 002ce3
[Upper] 002ceb..002cec -> 002cec
[Upper] 002ced..002cee -> 002cee
-[Upper] 002cef..002cf0 -> 002cf0
-[Upper] 002cf1..002cf2 -> 002cf2
+[Upper] 002cf2..002cf3 -> 002cf3
[Upper] 002d00 -> 002d00
[Upper] 002d01 -> 002d01
[Upper] 002d02 -> 002d02
@@ -5352,13 +5760,7 @@
[Upper] 002d23 -> 002d23
[Upper] 002d24 -> 002d24
[Upper] 002d25 -> 002d25
-[Upper] 002d26 -> 002d26
[Upper] 002d27 -> 002d27
-[Upper] 002d28 -> 002d28
-[Upper] 002d29 -> 002d29
-[Upper] 002d2a -> 002d2a
-[Upper] 002d2b -> 002d2b
-[Upper] 002d2c -> 002d2c
[Upper] 002d2d -> 002d2d
[Upper] 00a640..00a641 -> 00a641
[Upper] 00a642..00a643 -> 00a643
@@ -5444,7 +5846,7 @@
[Upper] 00a784..00a785 -> 00a785
[Upper] 00a786..00a787 -> 00a787
[Upper] 00a78b..00a78c -> 00a78c
-[Upper] 00a78d..00a78e -> 00a78e
+[Upper] 00a78d -> 000265
[Upper] 00a790..00a791 -> 00a791
[Upper] 00a792..00a793 -> 00a793
[Upper] 00a796..00a797 -> 00a797
@@ -5469,6 +5871,86 @@
[Upper] 00a7b4..00a7b5 -> 00a7b5
[Upper] 00a7b6..00a7b7 -> 00a7b7
[Upper] 00ab53 -> 00ab53
+[Upper] 00ab70 -> 00ab70
+[Upper] 00ab71 -> 00ab71
+[Upper] 00ab72 -> 00ab72
+[Upper] 00ab73 -> 00ab73
+[Upper] 00ab74 -> 00ab74
+[Upper] 00ab75 -> 00ab75
+[Upper] 00ab76 -> 00ab76
+[Upper] 00ab77 -> 00ab77
+[Upper] 00ab78 -> 00ab78
+[Upper] 00ab79 -> 00ab79
+[Upper] 00ab7a -> 00ab7a
+[Upper] 00ab7b -> 00ab7b
+[Upper] 00ab7c -> 00ab7c
+[Upper] 00ab7d -> 00ab7d
+[Upper] 00ab7e -> 00ab7e
+[Upper] 00ab7f -> 00ab7f
+[Upper] 00ab80 -> 00ab80
+[Upper] 00ab81 -> 00ab81
+[Upper] 00ab82 -> 00ab82
+[Upper] 00ab83 -> 00ab83
+[Upper] 00ab84 -> 00ab84
+[Upper] 00ab85 -> 00ab85
+[Upper] 00ab86 -> 00ab86
+[Upper] 00ab87 -> 00ab87
+[Upper] 00ab88 -> 00ab88
+[Upper] 00ab89 -> 00ab89
+[Upper] 00ab8a -> 00ab8a
+[Upper] 00ab8b -> 00ab8b
+[Upper] 00ab8c -> 00ab8c
+[Upper] 00ab8d -> 00ab8d
+[Upper] 00ab8e -> 00ab8e
+[Upper] 00ab8f -> 00ab8f
+[Upper] 00ab90 -> 00ab90
+[Upper] 00ab91 -> 00ab91
+[Upper] 00ab92 -> 00ab92
+[Upper] 00ab93 -> 00ab93
+[Upper] 00ab94 -> 00ab94
+[Upper] 00ab95 -> 00ab95
+[Upper] 00ab96 -> 00ab96
+[Upper] 00ab97 -> 00ab97
+[Upper] 00ab98 -> 00ab98
+[Upper] 00ab99 -> 00ab99
+[Upper] 00ab9a -> 00ab9a
+[Upper] 00ab9b -> 00ab9b
+[Upper] 00ab9c -> 00ab9c
+[Upper] 00ab9d -> 00ab9d
+[Upper] 00ab9e -> 00ab9e
+[Upper] 00ab9f -> 00ab9f
+[Upper] 00aba0 -> 00aba0
+[Upper] 00aba1 -> 00aba1
+[Upper] 00aba2 -> 00aba2
+[Upper] 00aba3 -> 00aba3
+[Upper] 00aba4 -> 00aba4
+[Upper] 00aba5 -> 00aba5
+[Upper] 00aba6 -> 00aba6
+[Upper] 00aba7 -> 00aba7
+[Upper] 00aba8 -> 00aba8
+[Upper] 00aba9 -> 00aba9
+[Upper] 00abaa -> 00abaa
+[Upper] 00abab -> 00abab
+[Upper] 00abac -> 00abac
+[Upper] 00abad -> 00abad
+[Upper] 00abae -> 00abae
+[Upper] 00abaf -> 00abaf
+[Upper] 00abb0 -> 00abb0
+[Upper] 00abb1 -> 00abb1
+[Upper] 00abb2 -> 00abb2
+[Upper] 00abb3 -> 00abb3
+[Upper] 00abb4 -> 00abb4
+[Upper] 00abb5 -> 00abb5
+[Upper] 00abb6 -> 00abb6
+[Upper] 00abb7 -> 00abb7
+[Upper] 00abb8 -> 00abb8
+[Upper] 00abb9 -> 00abb9
+[Upper] 00abba -> 00abba
+[Upper] 00abbb -> 00abbb
+[Upper] 00abbc -> 00abbc
+[Upper] 00abbd -> 00abbd
+[Upper] 00abbe -> 00abbe
+[Upper] 00abbf -> 00abbf
[Upper] 00ff21 -> 00ff41
[Upper] 00ff22 -> 00ff42
[Upper] 00ff23 -> 00ff43
@@ -5521,46 +6003,46 @@
[Upper] 00ff58 -> 00ff58
[Upper] 00ff59 -> 00ff59
[Upper] 00ff5a -> 00ff5a
-[Upper] 010400 -> 010438
-[Upper] 010401 -> 010439
-[Upper] 010402 -> 01043a
-[Upper] 010403 -> 01043b
-[Upper] 010404 -> 01043c
-[Upper] 010405 -> 01043d
-[Upper] 010406 -> 01043e
-[Upper] 010407 -> 01043f
-[Upper] 010408 -> 010440
-[Upper] 010409 -> 010441
-[Upper] 01040a -> 010442
-[Upper] 01040b -> 010443
-[Upper] 01040c -> 010428
-[Upper] 01040d -> 010429
-[Upper] 01040e -> 01042a
-[Upper] 01040f -> 01042b
-[Upper] 010410 -> 01042c
-[Upper] 010411 -> 01042d
-[Upper] 010412 -> 01042e
-[Upper] 010413 -> 01042f
-[Upper] 010414 -> 010430
-[Upper] 010415 -> 010431
-[Upper] 010416 -> 010432
-[Upper] 010417 -> 010433
-[Upper] 010418 -> 010434
-[Upper] 010419 -> 010435
-[Upper] 01041a -> 010436
-[Upper] 01041b -> 010437
-[Upper] 01041c -> 010438
-[Upper] 01041d -> 010439
-[Upper] 01041e -> 01043a
-[Upper] 01041f -> 01043b
-[Upper] 010420 -> 01043c
-[Upper] 010421 -> 01043d
-[Upper] 010422 -> 01043e
-[Upper] 010423 -> 01043f
-[Upper] 010424 -> 010440
-[Upper] 010425 -> 010441
-[Upper] 010426 -> 010442
-[Upper] 010427 -> 010443
+[Upper] 010400 -> 010428
+[Upper] 010401 -> 010429
+[Upper] 010402 -> 01042a
+[Upper] 010403 -> 01042b
+[Upper] 010404 -> 01042c
+[Upper] 010405 -> 01042d
+[Upper] 010406 -> 01042e
+[Upper] 010407 -> 01042f
+[Upper] 010408 -> 010430
+[Upper] 010409 -> 010431
+[Upper] 01040a -> 010432
+[Upper] 01040b -> 010433
+[Upper] 01040c -> 010434
+[Upper] 01040d -> 010435
+[Upper] 01040e -> 010436
+[Upper] 01040f -> 010437
+[Upper] 010410 -> 010438
+[Upper] 010411 -> 010439
+[Upper] 010412 -> 01043a
+[Upper] 010413 -> 01043b
+[Upper] 010414 -> 01043c
+[Upper] 010415 -> 01043d
+[Upper] 010416 -> 01043e
+[Upper] 010417 -> 01043f
+[Upper] 010418 -> 010440
+[Upper] 010419 -> 010441
+[Upper] 01041a -> 010442
+[Upper] 01041b -> 010443
+[Upper] 01041c -> 010444
+[Upper] 01041d -> 010445
+[Upper] 01041e -> 010446
+[Upper] 01041f -> 010447
+[Upper] 010420 -> 010448
+[Upper] 010421 -> 010449
+[Upper] 010422 -> 01044a
+[Upper] 010423 -> 01044b
+[Upper] 010424 -> 01044c
+[Upper] 010425 -> 01044d
+[Upper] 010426 -> 01044e
+[Upper] 010427 -> 01044f
[Upper] 010428 -> 010428
[Upper] 010429 -> 010429
[Upper] 01042a -> 01042a
@@ -5589,6 +6071,18 @@
[Upper] 010441 -> 010441
[Upper] 010442 -> 010442
[Upper] 010443 -> 010443
+[Upper] 010444 -> 010444
+[Upper] 010445 -> 010445
+[Upper] 010446 -> 010446
+[Upper] 010447 -> 010447
+[Upper] 010448 -> 010448
+[Upper] 010449 -> 010449
+[Upper] 01044a -> 01044a
+[Upper] 01044b -> 01044b
+[Upper] 01044c -> 01044c
+[Upper] 01044d -> 01044d
+[Upper] 01044e -> 01044e
+[Upper] 01044f -> 01044f
[Upper] 0104b0 -> 0104d8
[Upper] 0104b1 -> 0104d9
[Upper] 0104b2 -> 0104da
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data
2016-10-04 14:54 ` Michal Nazarewicz
@ 2016-10-04 15:06 ` Eli Zaretskii
2016-10-04 16:57 ` Michal Nazarewicz
0 siblings, 1 reply; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-04 15:06 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Cc: 24603@debbugs.gnu.org
> Date: Tue, 04 Oct 2016 16:54:31 +0200
>
> + ;; Ⅰ through Ⅻ had word syntax in the past so set it here as well.
> + ;; General category of those characters is Number, Letter.
> + (modify-syntax-entry '(#x2160 . #x216b) "w " syn-tab)
> +
> + ;; ⓐ through ⓩ are symbols, other according to Unicode but Emacs set
> + ;; their syntax to word in the past so keep backwards compatibility.
> + (modify-syntax-entry '(#x24D0 . #x24E9) "w " syn-tab))
I think we should document all the changes. If the list of changes is
too long, and cannot be made short enough by summarizing (instead of
showing each individual character), then it probably should go into
some separate file in admin/unidata/. If it can be short enough, then
a comment in characters.el is a better place, I think.
> I get the following (annotated) differences:
Can you add the name of each character (just one, the leftmost one) to
its line and post the result? It's hard to read the report when it
only shows codepoints.
Thanks.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data
2016-10-04 15:06 ` Eli Zaretskii
@ 2016-10-04 16:57 ` Michal Nazarewicz
2016-10-04 17:27 ` Eli Zaretskii
0 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 16:57 UTC (permalink / raw)
To: Eli Zaretskii; +Cc: 24603
On Tue, Oct 04 2016, Eli Zaretskii wrote:
>> From: Michal Nazarewicz <mina86@mina86.com>
>> Cc: 24603@debbugs.gnu.org
>> Date: Tue, 04 Oct 2016 16:54:31 +0200
>>
>> + ;; Ⅰ through Ⅻ had word syntax in the past so set it here as well.
>> + ;; General category of those characters is Number, Letter.
>> + (modify-syntax-entry '(#x2160 . #x216b) "w " syn-tab)
>> +
>> + ;; ⓐ through ⓩ are symbols, other according to Unicode but Emacs set
>> + ;; their syntax to word in the past so keep backwards compatibility.
>> + (modify-syntax-entry '(#x24D0 . #x24E9) "w " syn-tab))
>
> I think we should document all the changes.
I wouldn’t know where to put such documentation. syntax.texi mentions
the existence of the standard syntax table but does not describe its content.
standard-syntax-table’s docstring is even less descriptive.
>> I get the following (annotated) differences:
>
> Can you add the name of each character (just one, the leftmost one) to
> its line and post the result? It's hard to read the report when it
> only shows codepoints.
--- orig-tables.txt 2016-10-04 18:40:27.276408290 +0200
+++ modified-tables.txt 2016-10-04 18:41:34.651421547 +0200
@@ -1,4 +1,4 @@
-GNU Emacs 26.0.50.10 (x86_64-unknown-linux-gnu)
+GNU Emacs 26.0.50.11 (x86_64-unknown-linux-gnu)
of 2016-10-04
Syntax: 000000..000008 -> (1)
@@ -44,7 +44,9 @@
Syntax: 0000a7 -> (1) Section Sign
Syntax: 0000a8..0000aa -> (3) Diaeresis
Syntax: 0000ab -> (1) Left-Pointing Double Angle Quotation Mark
-Syntax: 0000ac..0000b6 -> (3) Not Sign
+Syntax: 0000ac..0000b4 -> (3) Not Sign
+Syntax: 0000b5 -> (2) Micro Sign
+Syntax: 0000b6 -> (3) Pilcrow Sign
Syntax: 0000b7 -> (2) Middle Dot
Syntax: 0000b8..0000ba -> (3) Cedilla
Syntax: 0000bb -> (1) Right-Pointing Double Angle Quotation Mark
@@ -54,9 +56,7 @@
Syntax: 0000d7 -> (3) Multiplication Sign
Syntax: 0000d8..0000f6 -> (2) Latin Capital Letter O With Stroke
Syntax: 0000f7 -> (3) Division Sign
-Syntax: 0000f8..000148 -> (2) Latin Small Letter O With Stroke
-Syntax: 000149 -> (3) Latin Small Letter N Preceded By Apostrophe
-Syntax: 00014a..0002c6 -> (2) Latin Capital Letter Eng
+Syntax: 0000f8..0002c6 -> (2) Latin Small Letter O With Stroke
Syntax: 0002c7 -> (3) Caron
Syntax: 0002c8 -> (2) Modifier Letter Vertical Line
Syntax: 0002c9 -> (3) Modifier Letter Macron
@@ -136,17 +136,11 @@
Syntax: 002103 -> (3) Degree Celsius
Syntax: 002104..002108 -> (2) Centre Line Symbol
Syntax: 002109 -> (3) Degree Fahrenheit
-Syntax: 00210a..002112 -> (2) Script Small G
-Syntax: 002113 -> (3) Script Small L
-Syntax: 002114..002115 -> (2) L B Bar Symbol
+Syntax: 00210a..002115 -> (2) Script Small G
Syntax: 002116 -> (1) Numero Sign
Syntax: 002117..002120 -> (2) Sound Recording Copyright
Syntax: 002121..002122 -> (3) Telephone Sign
-Syntax: 002123..002125 -> (2) Versicle
-Syntax: 002126 -> (3) Ohm Sign
-Syntax: 002127..00212a -> (2) Inverted Ohm Sign
-Syntax: 00212b -> (3) Angstrom Sign
-Syntax: 00212c..002152 -> (2) Script Capital B
+Syntax: 002123..002152 -> (2) Versicle
Syntax: 002153..002154 -> (3) Vulgar Fraction One Third
Syntax: 002155..00215a -> (2) Vulgar Fraction One Fifth
Syntax: 00215b..00215e -> (3) Vulgar Fraction One Eighth
@@ -677,6 +671,7 @@
Lower: 00019a -> 00019a Latin Small Letter L With Bar
Lower: 00019c -> 00026f Latin Capital Letter Turned M
Lower: 00019d -> 000272 Latin Capital Letter N With Left Hook
+Lower: 00019e -> 00019e Latin Small Letter N With Long Right Leg
Lower: 00019f -> 000275 Latin Capital Letter O With Middle Tilde
Lower: 0001a0..0001a1 -> 0001a1 Latin Capital Letter O With Horn
Lower: 0001a2..0001a3 -> 0001a3 Latin Capital Letter Oi
@@ -740,6 +735,7 @@
Lower: 00021a..00021b -> 00021b Latin Capital Letter T With Comma Below
Lower: 00021c..00021d -> 00021d Latin Capital Letter Yogh
Lower: 00021e..00021f -> 00021f Latin Capital Letter H With Caron
+Lower: 000220 -> 00019e Latin Capital Letter N With Long Right Leg
Lower: 000222..000223 -> 000223 Latin Capital Letter Ou
Lower: 000224..000225 -> 000225 Latin Capital Letter Z With Hook
Lower: 000226..000227 -> 000227 Latin Capital Letter A With Dot Above
@@ -777,6 +773,7 @@
Lower: 000260 -> 000260 Latin Small Letter G With Hook
Lower: 000261 -> 000261 Latin Small Letter Script G
Lower: 000263 -> 000263 Latin Small Letter Gamma
+Lower: 000265 -> 000265 Latin Small Letter Turned H
Lower: 000266 -> 000266 Latin Small Letter H With Hook
Lower: 000268 -> 000268 Latin Small Letter I With Stroke
Lower: 000269 -> 000269 Latin Small Letter Iota
@@ -799,6 +796,14 @@
Lower: 000292 -> 000292 Latin Small Letter Ezh
Lower: 00029d -> 00029d Latin Small Letter J With Crossed-Tail
Lower: 00029e -> 00029e Latin Small Letter Turned K
+Lower: 000345 -> 000345 Combining Greek Ypogegrammeni
+Lower: 000370..000371 -> 000371 Greek Capital Letter Heta
+Lower: 000372..000373 -> 000373 Greek Capital Letter Archaic Sampi
+Lower: 000376..000377 -> 000377 Greek Capital Letter Pamphylian Digamma
+Lower: 00037b -> 00037b Greek Small Reversed Lunate Sigma Symbol
+Lower: 00037c -> 00037c Greek Small Dotted Lunate Sigma Symbol
+Lower: 00037d -> 00037d Greek Small Reversed Dotted Lunate Sigma Symbol
+Lower: 00037f -> 0003f3 Greek Capital Letter Yot
Lower: 000386 -> 0003ac Greek Capital Letter Alpha With Tonos
Lower: 000388 -> 0003ad Greek Capital Letter Epsilon With Tonos
Lower: 000389 -> 0003ae Greek Capital Letter Eta With Tonos
@@ -853,6 +858,7 @@
Lower: 0003bf -> 0003bf Greek Small Letter Omicron
Lower: 0003c0 -> 0003c0 Greek Small Letter Pi
Lower: 0003c1 -> 0003c1 Greek Small Letter Rho
+Lower: 0003c2 -> 0003c2 Greek Small Letter Final Sigma
Lower: 0003c3 -> 0003c3 Greek Small Letter Sigma
Lower: 0003c4 -> 0003c4 Greek Small Letter Tau
Lower: 0003c5 -> 0003c5 Greek Small Letter Upsilon
@@ -865,6 +871,13 @@
Lower: 0003cc -> 0003cc Greek Small Letter Omicron With Tonos
Lower: 0003cd -> 0003cd Greek Small Letter Upsilon With Tonos
Lower: 0003ce -> 0003ce Greek Small Letter Omega With Tonos
+Lower: 0003cf -> 0003d7 Greek Capital Kai Symbol
+Lower: 0003d0 -> 0003d0 Greek Beta Symbol
+Lower: 0003d1 -> 0003d1 Greek Theta Symbol
+Lower: 0003d5 -> 0003d5 Greek Phi Symbol
+Lower: 0003d6 -> 0003d6 Greek Pi Symbol
+Lower: 0003d7 -> 0003d7 Greek Kai Symbol
+Lower: 0003d8..0003d9 -> 0003d9 Greek Letter Archaic Koppa
Lower: 0003da..0003db -> 0003db Greek Letter Stigma
Lower: 0003dc..0003dd -> 0003dd Greek Letter Digamma
Lower: 0003de..0003df -> 0003df Greek Letter Koppa
@@ -876,6 +889,18 @@
Lower: 0003ea..0003eb -> 0003eb Coptic Capital Letter Gangia
Lower: 0003ec..0003ed -> 0003ed Coptic Capital Letter Shima
Lower: 0003ee..0003ef -> 0003ef Coptic Capital Letter Dei
+Lower: 0003f0 -> 0003f0 Greek Kappa Symbol
+Lower: 0003f1 -> 0003f1 Greek Rho Symbol
+Lower: 0003f2 -> 0003f2 Greek Lunate Sigma Symbol
+Lower: 0003f3 -> 0003f3 Greek Letter Yot
+Lower: 0003f4 -> 0003b8 Greek Capital Theta Symbol
+Lower: 0003f5 -> 0003f5 Greek Lunate Epsilon Symbol
+Lower: 0003f7..0003f8 -> 0003f8 Greek Capital Letter Sho
+Lower: 0003f9 -> 0003f2 Greek Capital Lunate Sigma Symbol
+Lower: 0003fa..0003fb -> 0003fb Greek Capital Letter San
+Lower: 0003fd -> 00037b Greek Capital Reversed Lunate Sigma Symbol
+Lower: 0003fe -> 00037c Greek Capital Dotted Lunate Sigma Symbol
+Lower: 0003ff -> 00037d Greek Capital Reversed Dotted Lunate Sigma Symbol
Lower: 000400 -> 000450 Cyrillic Capital Letter Ie With Grave
Lower: 000401 -> 000451 Cyrillic Capital Letter Io
Lower: 000402 -> 000452 Cyrillic Capital Letter Dje
@@ -989,6 +1014,7 @@
Lower: 00047c..00047d -> 00047d Cyrillic Capital Letter Omega With Titlo
Lower: 00047e..00047f -> 00047f Cyrillic Capital Letter Ot
Lower: 000480..000481 -> 000481 Cyrillic Capital Letter Koppa
+Lower: 00048a..00048b -> 00048b Cyrillic Capital Letter Short I With Tail
Lower: 00048c..00048d -> 00048d Cyrillic Capital Letter Semisoft Sign
Lower: 00048e..00048f -> 00048f Cyrillic Capital Letter Er With Tick
Lower: 000490..000491 -> 000491 Cyrillic Capital Letter Ghe With Upturn
@@ -1015,10 +1041,15 @@
Lower: 0004ba..0004bb -> 0004bb Cyrillic Capital Letter Shha
Lower: 0004bc..0004bd -> 0004bd Cyrillic Capital Letter Abkhasian Che
Lower: 0004be..0004bf -> 0004bf Cyrillic Capital Letter Abkhasian Che With Descender
+Lower: 0004c0 -> 0004cf Cyrillic Letter Palochka
Lower: 0004c1..0004c2 -> 0004c2 Cyrillic Capital Letter Zhe With Breve
Lower: 0004c3..0004c4 -> 0004c4 Cyrillic Capital Letter Ka With Hook
+Lower: 0004c5..0004c6 -> 0004c6 Cyrillic Capital Letter El With Tail
Lower: 0004c7..0004c8 -> 0004c8 Cyrillic Capital Letter En With Hook
+Lower: 0004c9..0004ca -> 0004ca Cyrillic Capital Letter En With Tail
Lower: 0004cb..0004cc -> 0004cc Cyrillic Capital Letter Khakassian Che
+Lower: 0004cd..0004ce -> 0004ce Cyrillic Capital Letter Em With Tail
+Lower: 0004cf -> 0004cf Cyrillic Small Letter Palochka
Lower: 0004d0..0004d1 -> 0004d1 Cyrillic Capital Letter A With Breve
Lower: 0004d2..0004d3 -> 0004d3 Cyrillic Capital Letter A With Diaeresis
Lower: 0004d4..0004d5 -> 0004d5 Cyrillic Capital Ligature A Ie
@@ -1043,6 +1074,30 @@
Lower: 0004fa..0004fb -> 0004fb Cyrillic Capital Letter Ghe With Stroke And Hook
Lower: 0004fc..0004fd -> 0004fd Cyrillic Capital Letter Ha With Hook
Lower: 0004fe..0004ff -> 0004ff Cyrillic Capital Letter Ha With Stroke
+Lower: 000500..000501 -> 000501 Cyrillic Capital Letter Komi De
+Lower: 000502..000503 -> 000503 Cyrillic Capital Letter Komi Dje
+Lower: 000504..000505 -> 000505 Cyrillic Capital Letter Komi Zje
+Lower: 000506..000507 -> 000507 Cyrillic Capital Letter Komi Dzje
+Lower: 000508..000509 -> 000509 Cyrillic Capital Letter Komi Lje
+Lower: 00050a..00050b -> 00050b Cyrillic Capital Letter Komi Nje
+Lower: 00050c..00050d -> 00050d Cyrillic Capital Letter Komi Sje
+Lower: 00050e..00050f -> 00050f Cyrillic Capital Letter Komi Tje
+Lower: 000510..000511 -> 000511 Cyrillic Capital Letter Reversed Ze
+Lower: 000512..000513 -> 000513 Cyrillic Capital Letter El With Hook
+Lower: 000514..000515 -> 000515 Cyrillic Capital Letter Lha
+Lower: 000516..000517 -> 000517 Cyrillic Capital Letter Rha
+Lower: 000518..000519 -> 000519 Cyrillic Capital Letter Yae
+Lower: 00051a..00051b -> 00051b Cyrillic Capital Letter Qa
+Lower: 00051c..00051d -> 00051d Cyrillic Capital Letter We
+Lower: 00051e..00051f -> 00051f Cyrillic Capital Letter Aleut Ka
+Lower: 000520..000521 -> 000521 Cyrillic Capital Letter El With Middle Hook
+Lower: 000522..000523 -> 000523 Cyrillic Capital Letter En With Middle Hook
+Lower: 000524..000525 -> 000525 Cyrillic Capital Letter Pe With Descender
+Lower: 000526..000527 -> 000527 Cyrillic Capital Letter Shha With Descender
+Lower: 000528..000529 -> 000529 Cyrillic Capital Letter En With Left Hook
+Lower: 00052a..00052b -> 00052b Cyrillic Capital Letter Dzzhe
+Lower: 00052c..00052d -> 00052d Cyrillic Capital Letter Dche
+Lower: 00052e..00052f -> 00052f Cyrillic Capital Letter El With Descender
Lower: 000531 -> 000561 Armenian Capital Letter Ayb
Lower: 000532 -> 000562 Armenian Capital Letter Ben
Lower: 000533 -> 000563 Armenian Capital Letter Gim
@@ -1157,14 +1212,109 @@
Lower: 0010c3 -> 002d23 Georgian Capital Letter We
Lower: 0010c4 -> 002d24 Georgian Capital Letter Har
Lower: 0010c5 -> 002d25 Georgian Capital Letter Hoe
-Lower: 0010c6 -> 002d26
Lower: 0010c7 -> 002d27 Georgian Capital Letter Yn
-Lower: 0010c8 -> 002d28
-Lower: 0010c9 -> 002d29
-Lower: 0010ca -> 002d2a
-Lower: 0010cb -> 002d2b
-Lower: 0010cc -> 002d2c
Lower: 0010cd -> 002d2d Georgian Capital Letter Aen
+Lower: 0013a0 -> 00ab70 Cherokee Letter A
+Lower: 0013a1 -> 00ab71 Cherokee Letter E
+Lower: 0013a2 -> 00ab72 Cherokee Letter I
+Lower: 0013a3 -> 00ab73 Cherokee Letter O
+Lower: 0013a4 -> 00ab74 Cherokee Letter U
+Lower: 0013a5 -> 00ab75 Cherokee Letter V
+Lower: 0013a6 -> 00ab76 Cherokee Letter Ga
+Lower: 0013a7 -> 00ab77 Cherokee Letter Ka
+Lower: 0013a8 -> 00ab78 Cherokee Letter Ge
+Lower: 0013a9 -> 00ab79 Cherokee Letter Gi
+Lower: 0013aa -> 00ab7a Cherokee Letter Go
+Lower: 0013ab -> 00ab7b Cherokee Letter Gu
+Lower: 0013ac -> 00ab7c Cherokee Letter Gv
+Lower: 0013ad -> 00ab7d Cherokee Letter Ha
+Lower: 0013ae -> 00ab7e Cherokee Letter He
+Lower: 0013af -> 00ab7f Cherokee Letter Hi
+Lower: 0013b0 -> 00ab80 Cherokee Letter Ho
+Lower: 0013b1 -> 00ab81 Cherokee Letter Hu
+Lower: 0013b2 -> 00ab82 Cherokee Letter Hv
+Lower: 0013b3 -> 00ab83 Cherokee Letter La
+Lower: 0013b4 -> 00ab84 Cherokee Letter Le
+Lower: 0013b5 -> 00ab85 Cherokee Letter Li
+Lower: 0013b6 -> 00ab86 Cherokee Letter Lo
+Lower: 0013b7 -> 00ab87 Cherokee Letter Lu
+Lower: 0013b8 -> 00ab88 Cherokee Letter Lv
+Lower: 0013b9 -> 00ab89 Cherokee Letter Ma
+Lower: 0013ba -> 00ab8a Cherokee Letter Me
+Lower: 0013bb -> 00ab8b Cherokee Letter Mi
+Lower: 0013bc -> 00ab8c Cherokee Letter Mo
+Lower: 0013bd -> 00ab8d Cherokee Letter Mu
+Lower: 0013be -> 00ab8e Cherokee Letter Na
+Lower: 0013bf -> 00ab8f Cherokee Letter Hna
+Lower: 0013c0 -> 00ab90 Cherokee Letter Nah
+Lower: 0013c1 -> 00ab91 Cherokee Letter Ne
+Lower: 0013c2 -> 00ab92 Cherokee Letter Ni
+Lower: 0013c3 -> 00ab93 Cherokee Letter No
+Lower: 0013c4 -> 00ab94 Cherokee Letter Nu
+Lower: 0013c5 -> 00ab95 Cherokee Letter Nv
+Lower: 0013c6 -> 00ab96 Cherokee Letter Qua
+Lower: 0013c7 -> 00ab97 Cherokee Letter Que
+Lower: 0013c8 -> 00ab98 Cherokee Letter Qui
+Lower: 0013c9 -> 00ab99 Cherokee Letter Quo
+Lower: 0013ca -> 00ab9a Cherokee Letter Quu
+Lower: 0013cb -> 00ab9b Cherokee Letter Quv
+Lower: 0013cc -> 00ab9c Cherokee Letter Sa
+Lower: 0013cd -> 00ab9d Cherokee Letter S
+Lower: 0013ce -> 00ab9e Cherokee Letter Se
+Lower: 0013cf -> 00ab9f Cherokee Letter Si
+Lower: 0013d0 -> 00aba0 Cherokee Letter So
+Lower: 0013d1 -> 00aba1 Cherokee Letter Su
+Lower: 0013d2 -> 00aba2 Cherokee Letter Sv
+Lower: 0013d3 -> 00aba3 Cherokee Letter Da
+Lower: 0013d4 -> 00aba4 Cherokee Letter Ta
+Lower: 0013d5 -> 00aba5 Cherokee Letter De
+Lower: 0013d6 -> 00aba6 Cherokee Letter Te
+Lower: 0013d7 -> 00aba7 Cherokee Letter Di
+Lower: 0013d8 -> 00aba8 Cherokee Letter Ti
+Lower: 0013d9 -> 00aba9 Cherokee Letter Do
+Lower: 0013da -> 00abaa Cherokee Letter Du
+Lower: 0013db -> 00abab Cherokee Letter Dv
+Lower: 0013dc -> 00abac Cherokee Letter Dla
+Lower: 0013dd -> 00abad Cherokee Letter Tla
+Lower: 0013de -> 00abae Cherokee Letter Tle
+Lower: 0013df -> 00abaf Cherokee Letter Tli
+Lower: 0013e0 -> 00abb0 Cherokee Letter Tlo
+Lower: 0013e1 -> 00abb1 Cherokee Letter Tlu
+Lower: 0013e2 -> 00abb2 Cherokee Letter Tlv
+Lower: 0013e3 -> 00abb3 Cherokee Letter Tsa
+Lower: 0013e4 -> 00abb4 Cherokee Letter Tse
+Lower: 0013e5 -> 00abb5 Cherokee Letter Tsi
+Lower: 0013e6 -> 00abb6 Cherokee Letter Tso
+Lower: 0013e7 -> 00abb7 Cherokee Letter Tsu
+Lower: 0013e8 -> 00abb8 Cherokee Letter Tsv
+Lower: 0013e9 -> 00abb9 Cherokee Letter Wa
+Lower: 0013ea -> 00abba Cherokee Letter We
+Lower: 0013eb -> 00abbb Cherokee Letter Wi
+Lower: 0013ec -> 00abbc Cherokee Letter Wo
+Lower: 0013ed -> 00abbd Cherokee Letter Wu
+Lower: 0013ee -> 00abbe Cherokee Letter Wv
+Lower: 0013ef -> 00abbf Cherokee Letter Ya
+Lower: 0013f0 -> 0013f8 Cherokee Letter Ye
+Lower: 0013f1 -> 0013f9 Cherokee Letter Yi
+Lower: 0013f2 -> 0013fa Cherokee Letter Yo
+Lower: 0013f3 -> 0013fb Cherokee Letter Yu
+Lower: 0013f4 -> 0013fc Cherokee Letter Yv
+Lower: 0013f5 -> 0013fd Cherokee Letter Mv
+Lower: 0013f8 -> 0013f8 Cherokee Small Letter Ye
+Lower: 0013f9 -> 0013f9 Cherokee Small Letter Yi
+Lower: 0013fa -> 0013fa Cherokee Small Letter Yo
+Lower: 0013fb -> 0013fb Cherokee Small Letter Yu
+Lower: 0013fc -> 0013fc Cherokee Small Letter Yv
+Lower: 0013fd -> 0013fd Cherokee Small Letter Mv
+Lower: 001c80 -> 001c80 Cyrillic Small Letter Rounded Ve
+Lower: 001c81 -> 001c81 Cyrillic Small Letter Long-Legged De
+Lower: 001c82 -> 001c82 Cyrillic Small Letter Narrow O
+Lower: 001c83 -> 001c83 Cyrillic Small Letter Wide Es
+Lower: 001c84 -> 001c84 Cyrillic Small Letter Tall Te
+Lower: 001c85 -> 001c85 Cyrillic Small Letter Three-Legged Te
+Lower: 001c86 -> 001c86 Cyrillic Small Letter Tall Hard Sign
+Lower: 001c87 -> 001c87 Cyrillic Small Letter Tall Yat
+Lower: 001c88 -> 001c88 Cyrillic Small Letter Unblended Uk
Lower: 001d79 -> 001d79 Latin Small Letter Insular G
Lower: 001d7d -> 001d7d Latin Small Letter P With Stroke
Lower: 001e00..001e01 -> 001e01 Latin Capital Letter A With Ring Below
@@ -1242,6 +1392,8 @@
Lower: 001e90..001e91 -> 001e91 Latin Capital Letter Z With Circumflex
Lower: 001e92..001e93 -> 001e93 Latin Capital Letter Z With Dot Below
Lower: 001e94..001e95 -> 001e95 Latin Capital Letter Z With Line Below
+Lower: 001e9b -> 001e9b Latin Small Letter Long S With Dot Above
+Lower: 001e9e -> 0000df Latin Capital Letter Sharp S
Lower: 001ea0..001ea1 -> 001ea1 Latin Capital Letter A With Dot Below
Lower: 001ea2..001ea3 -> 001ea3 Latin Capital Letter A With Hook Above
Lower: 001ea4..001ea5 -> 001ea5 Latin Capital Letter A With Circumflex And Acute
@@ -1287,6 +1439,9 @@
Lower: 001ef4..001ef5 -> 001ef5 Latin Capital Letter Y With Dot Below
Lower: 001ef6..001ef7 -> 001ef7 Latin Capital Letter Y With Hook Above
Lower: 001ef8..001ef9 -> 001ef9 Latin Capital Letter Y With Tilde
+Lower: 001efa..001efb -> 001efb Latin Capital Letter Middle-Welsh Ll
+Lower: 001efc..001efd -> 001efd Latin Capital Letter Middle-Welsh V
+Lower: 001efe..001eff -> 001eff Latin Capital Letter Y With Loop
Lower: 001f00 -> 001f00 Greek Small Letter Alpha With Psili
Lower: 001f01 -> 001f01 Greek Small Letter Alpha With Dasia
Lower: 001f02 -> 001f02 Greek Small Letter Alpha With Psili And Varia
@@ -1353,22 +1508,20 @@
Lower: 001f43 -> 001f43 Greek Small Letter Omicron With Dasia And Varia
Lower: 001f44 -> 001f44 Greek Small Letter Omicron With Psili And Oxia
Lower: 001f45 -> 001f45 Greek Small Letter Omicron With Dasia And Oxia
-Lower: 001f46 -> 001f46
-Lower: 001f47 -> 001f47
Lower: 001f48 -> 001f40 Greek Capital Letter Omicron With Psili
Lower: 001f49 -> 001f41 Greek Capital Letter Omicron With Dasia
Lower: 001f4a -> 001f42 Greek Capital Letter Omicron With Psili And Varia
Lower: 001f4b -> 001f43 Greek Capital Letter Omicron With Dasia And Varia
Lower: 001f4c -> 001f44 Greek Capital Letter Omicron With Psili And Oxia
Lower: 001f4d -> 001f45 Greek Capital Letter Omicron With Dasia And Oxia
-Lower: 001f4e -> 001f46
-Lower: 001f4f -> 001f47
Lower: 001f51 -> 001f51 Greek Small Letter Upsilon With Dasia
Lower: 001f53 -> 001f53 Greek Small Letter Upsilon With Dasia And Varia
Lower: 001f55 -> 001f55 Greek Small Letter Upsilon With Dasia And Oxia
+Lower: 001f57 -> 001f57 Greek Small Letter Upsilon With Dasia And Perispomeni
Lower: 001f59 -> 001f51 Greek Capital Letter Upsilon With Dasia
Lower: 001f5b -> 001f53 Greek Capital Letter Upsilon With Dasia And Varia
Lower: 001f5d -> 001f55 Greek Capital Letter Upsilon With Dasia And Oxia
+Lower: 001f5f -> 001f57 Greek Capital Letter Upsilon With Dasia And Perispomeni
Lower: 001f60 -> 001f60 Greek Small Letter Omega With Psili
Lower: 001f61 -> 001f61 Greek Small Letter Omega With Dasia
Lower: 001f62 -> 001f62 Greek Small Letter Omega With Psili And Varia
@@ -1455,6 +1608,7 @@
Lower: 001fba -> 001f70 Greek Capital Letter Alpha With Varia
Lower: 001fbb -> 001f71 Greek Capital Letter Alpha With Oxia
Lower: 001fbc -> 001fb3 Greek Capital Letter Alpha With Prosgegrammeni
+Lower: 001fbe -> 001fbe Greek Prosgegrammeni
Lower: 001fc3 -> 001fc3 Greek Small Letter Eta With Ypogegrammeni
Lower: 001fc8 -> 001f72 Greek Capital Letter Epsilon With Varia
Lower: 001fc9 -> 001f73 Greek Capital Letter Epsilon With Oxia
@@ -1593,6 +1747,10 @@
Lower: 00206d -> 00206d Activate Arabic Form Shaping
Lower: 00206e -> 00206e National Digit Shapes
Lower: 00206f -> 00206f Nominal Digit Shapes
+Lower: 002126 -> 0003c9 Ohm Sign
+Lower: 00212b -> 0000e5 Angstrom Sign
+Lower: 002132 -> 00214e Turned Capital F
+Lower: 00214e -> 00214e Turned Small F
Lower: 002160 -> 002170 Roman Numeral One
Lower: 002161 -> 002171 Roman Numeral Two
Lower: 002162 -> 002172 Roman Numeral Three
@@ -1625,6 +1783,7 @@
Lower: 00217d -> 00217d Small Roman Numeral One Hundred
Lower: 00217e -> 00217e Small Roman Numeral Five Hundred
Lower: 00217f -> 00217f Small Roman Numeral One Thousand
+Lower: 002183..002184 -> 002184 Roman Numeral Reversed One Hundred
Lower: 002190 -> 002190 Leftwards Arrow
Lower: 002191 -> 002191 Upwards Arrow
Lower: 002192 -> 002192 Rightwards Arrow
@@ -2525,10 +2684,10 @@
Lower: 002cdc..002cdd -> 002cdd Coptic Capital Letter Old Nubian Shima
Lower: 002cde..002cdf -> 002cdf Coptic Capital Letter Old Nubian Ngi
Lower: 002ce0..002ce1 -> 002ce1 Coptic Capital Letter Old Nubian Nyi
+Lower: 002ce2..002ce3 -> 002ce3 Coptic Capital Letter Old Nubian Wau
Lower: 002ceb..002cec -> 002cec Coptic Capital Letter Cryptogrammic Shei
Lower: 002ced..002cee -> 002cee Coptic Capital Letter Cryptogrammic Gangia
-Lower: 002cef..002cf0 -> 002cf0 Coptic Combining Ni Above
-Lower: 002cf1..002cf2 -> 002cf2 Coptic Combining Spiritus Lenis
+Lower: 002cf2..002cf3 -> 002cf3 Coptic Capital Letter Bohairic Khei
Lower: 002d00 -> 002d00 Georgian Small Letter An
Lower: 002d01 -> 002d01 Georgian Small Letter Ban
Lower: 002d02 -> 002d02 Georgian Small Letter Gan
@@ -2567,13 +2726,7 @@
Lower: 002d23 -> 002d23 Georgian Small Letter We
Lower: 002d24 -> 002d24 Georgian Small Letter Har
Lower: 002d25 -> 002d25 Georgian Small Letter Hoe
-Lower: 002d26 -> 002d26
Lower: 002d27 -> 002d27 Georgian Small Letter Yn
-Lower: 002d28 -> 002d28
-Lower: 002d29 -> 002d29
-Lower: 002d2a -> 002d2a
-Lower: 002d2b -> 002d2b
-Lower: 002d2c -> 002d2c
Lower: 002d2d -> 002d2d Georgian Small Letter Aen
Lower: 00a640..00a641 -> 00a641 Cyrillic Capital Letter Zemlya
Lower: 00a642..00a643 -> 00a643 Cyrillic Capital Letter Dzelo
@@ -2659,7 +2812,7 @@
Lower: 00a784..00a785 -> 00a785 Latin Capital Letter Insular S
Lower: 00a786..00a787 -> 00a787 Latin Capital Letter Insular T
Lower: 00a78b..00a78c -> 00a78c Latin Capital Letter Saltillo
-Lower: 00a78d..00a78e -> 00a78e Latin Capital Letter Turned H
+Lower: 00a78d -> 000265 Latin Capital Letter Turned H
Lower: 00a790..00a791 -> 00a791 Latin Capital Letter N With Descender
Lower: 00a792..00a793 -> 00a793 Latin Capital Letter C With Bar
Lower: 00a796..00a797 -> 00a797 Latin Capital Letter B With Flourish
@@ -2684,6 +2837,86 @@
Lower: 00a7b4..00a7b5 -> 00a7b5 Latin Capital Letter Beta
Lower: 00a7b6..00a7b7 -> 00a7b7 Latin Capital Letter Omega
Lower: 00ab53 -> 00ab53 Latin Small Letter Chi
+Lower: 00ab70 -> 00ab70 Cherokee Small Letter A
+Lower: 00ab71 -> 00ab71 Cherokee Small Letter E
+Lower: 00ab72 -> 00ab72 Cherokee Small Letter I
+Lower: 00ab73 -> 00ab73 Cherokee Small Letter O
+Lower: 00ab74 -> 00ab74 Cherokee Small Letter U
+Lower: 00ab75 -> 00ab75 Cherokee Small Letter V
+Lower: 00ab76 -> 00ab76 Cherokee Small Letter Ga
+Lower: 00ab77 -> 00ab77 Cherokee Small Letter Ka
+Lower: 00ab78 -> 00ab78 Cherokee Small Letter Ge
+Lower: 00ab79 -> 00ab79 Cherokee Small Letter Gi
+Lower: 00ab7a -> 00ab7a Cherokee Small Letter Go
+Lower: 00ab7b -> 00ab7b Cherokee Small Letter Gu
+Lower: 00ab7c -> 00ab7c Cherokee Small Letter Gv
+Lower: 00ab7d -> 00ab7d Cherokee Small Letter Ha
+Lower: 00ab7e -> 00ab7e Cherokee Small Letter He
+Lower: 00ab7f -> 00ab7f Cherokee Small Letter Hi
+Lower: 00ab80 -> 00ab80 Cherokee Small Letter Ho
+Lower: 00ab81 -> 00ab81 Cherokee Small Letter Hu
+Lower: 00ab82 -> 00ab82 Cherokee Small Letter Hv
+Lower: 00ab83 -> 00ab83 Cherokee Small Letter La
+Lower: 00ab84 -> 00ab84 Cherokee Small Letter Le
+Lower: 00ab85 -> 00ab85 Cherokee Small Letter Li
+Lower: 00ab86 -> 00ab86 Cherokee Small Letter Lo
+Lower: 00ab87 -> 00ab87 Cherokee Small Letter Lu
+Lower: 00ab88 -> 00ab88 Cherokee Small Letter Lv
+Lower: 00ab89 -> 00ab89 Cherokee Small Letter Ma
+Lower: 00ab8a -> 00ab8a Cherokee Small Letter Me
+Lower: 00ab8b -> 00ab8b Cherokee Small Letter Mi
+Lower: 00ab8c -> 00ab8c Cherokee Small Letter Mo
+Lower: 00ab8d -> 00ab8d Cherokee Small Letter Mu
+Lower: 00ab8e -> 00ab8e Cherokee Small Letter Na
+Lower: 00ab8f -> 00ab8f Cherokee Small Letter Hna
+Lower: 00ab90 -> 00ab90 Cherokee Small Letter Nah
+Lower: 00ab91 -> 00ab91 Cherokee Small Letter Ne
+Lower: 00ab92 -> 00ab92 Cherokee Small Letter Ni
+Lower: 00ab93 -> 00ab93 Cherokee Small Letter No
+Lower: 00ab94 -> 00ab94 Cherokee Small Letter Nu
+Lower: 00ab95 -> 00ab95 Cherokee Small Letter Nv
+Lower: 00ab96 -> 00ab96 Cherokee Small Letter Qua
+Lower: 00ab97 -> 00ab97 Cherokee Small Letter Que
+Lower: 00ab98 -> 00ab98 Cherokee Small Letter Qui
+Lower: 00ab99 -> 00ab99 Cherokee Small Letter Quo
+Lower: 00ab9a -> 00ab9a Cherokee Small Letter Quu
+Lower: 00ab9b -> 00ab9b Cherokee Small Letter Quv
+Lower: 00ab9c -> 00ab9c Cherokee Small Letter Sa
+Lower: 00ab9d -> 00ab9d Cherokee Small Letter S
+Lower: 00ab9e -> 00ab9e Cherokee Small Letter Se
+Lower: 00ab9f -> 00ab9f Cherokee Small Letter Si
+Lower: 00aba0 -> 00aba0 Cherokee Small Letter So
+Lower: 00aba1 -> 00aba1 Cherokee Small Letter Su
+Lower: 00aba2 -> 00aba2 Cherokee Small Letter Sv
+Lower: 00aba3 -> 00aba3 Cherokee Small Letter Da
+Lower: 00aba4 -> 00aba4 Cherokee Small Letter Ta
+Lower: 00aba5 -> 00aba5 Cherokee Small Letter De
+Lower: 00aba6 -> 00aba6 Cherokee Small Letter Te
+Lower: 00aba7 -> 00aba7 Cherokee Small Letter Di
+Lower: 00aba8 -> 00aba8 Cherokee Small Letter Ti
+Lower: 00aba9 -> 00aba9 Cherokee Small Letter Do
+Lower: 00abaa -> 00abaa Cherokee Small Letter Du
+Lower: 00abab -> 00abab Cherokee Small Letter Dv
+Lower: 00abac -> 00abac Cherokee Small Letter Dla
+Lower: 00abad -> 00abad Cherokee Small Letter Tla
+Lower: 00abae -> 00abae Cherokee Small Letter Tle
+Lower: 00abaf -> 00abaf Cherokee Small Letter Tli
+Lower: 00abb0 -> 00abb0 Cherokee Small Letter Tlo
+Lower: 00abb1 -> 00abb1 Cherokee Small Letter Tlu
+Lower: 00abb2 -> 00abb2 Cherokee Small Letter Tlv
+Lower: 00abb3 -> 00abb3 Cherokee Small Letter Tsa
+Lower: 00abb4 -> 00abb4 Cherokee Small Letter Tse
+Lower: 00abb5 -> 00abb5 Cherokee Small Letter Tsi
+Lower: 00abb6 -> 00abb6 Cherokee Small Letter Tso
+Lower: 00abb7 -> 00abb7 Cherokee Small Letter Tsu
+Lower: 00abb8 -> 00abb8 Cherokee Small Letter Tsv
+Lower: 00abb9 -> 00abb9 Cherokee Small Letter Wa
+Lower: 00abba -> 00abba Cherokee Small Letter We
+Lower: 00abbb -> 00abbb Cherokee Small Letter Wi
+Lower: 00abbc -> 00abbc Cherokee Small Letter Wo
+Lower: 00abbd -> 00abbd Cherokee Small Letter Wu
+Lower: 00abbe -> 00abbe Cherokee Small Letter Wv
+Lower: 00abbf -> 00abbf Cherokee Small Letter Ya
Lower: 00ff21 -> 00ff41 Fullwidth Latin Capital Letter A
Lower: 00ff22 -> 00ff42 Fullwidth Latin Capital Letter B
Lower: 00ff23 -> 00ff43 Fullwidth Latin Capital Letter C
@@ -2736,46 +2969,46 @@
Lower: 00ff58 -> 00ff58 Fullwidth Latin Small Letter X
Lower: 00ff59 -> 00ff59 Fullwidth Latin Small Letter Y
Lower: 00ff5a -> 00ff5a Fullwidth Latin Small Letter Z
-Lower: 010400 -> 01041c Deseret Capital Letter Long I
-Lower: 010401 -> 01041d Deseret Capital Letter Long E
-Lower: 010402 -> 01041e Deseret Capital Letter Long A
-Lower: 010403 -> 01041f Deseret Capital Letter Long Ah
-Lower: 010404 -> 010420 Deseret Capital Letter Long O
-Lower: 010405 -> 010421 Deseret Capital Letter Long Oo
-Lower: 010406 -> 010422 Deseret Capital Letter Short I
-Lower: 010407 -> 010423 Deseret Capital Letter Short E
-Lower: 010408 -> 010424 Deseret Capital Letter Short A
-Lower: 010409 -> 010425 Deseret Capital Letter Short Ah
-Lower: 01040a -> 010426 Deseret Capital Letter Short O
-Lower: 01040b -> 010427 Deseret Capital Letter Short Oo
-Lower: 01040c -> 010428 Deseret Capital Letter Ay
-Lower: 01040d -> 010429 Deseret Capital Letter Ow
-Lower: 01040e -> 01042a Deseret Capital Letter Wu
-Lower: 01040f -> 01042b Deseret Capital Letter Yee
-Lower: 010410 -> 01042c Deseret Capital Letter H
-Lower: 010411 -> 01042d Deseret Capital Letter Pee
-Lower: 010412 -> 01042e Deseret Capital Letter Bee
-Lower: 010413 -> 01042f Deseret Capital Letter Tee
-Lower: 010414 -> 010430 Deseret Capital Letter Dee
-Lower: 010415 -> 010431 Deseret Capital Letter Chee
-Lower: 010416 -> 010432 Deseret Capital Letter Jee
-Lower: 010417 -> 010433 Deseret Capital Letter Kay
-Lower: 010418 -> 010434 Deseret Capital Letter Gay
-Lower: 010419 -> 010435 Deseret Capital Letter Ef
-Lower: 01041a -> 010436 Deseret Capital Letter Vee
-Lower: 01041b -> 010437 Deseret Capital Letter Eth
-Lower: 01041c -> 010438 Deseret Capital Letter Thee
-Lower: 01041d -> 010439 Deseret Capital Letter Es
-Lower: 01041e -> 01043a Deseret Capital Letter Zee
-Lower: 01041f -> 01043b Deseret Capital Letter Esh
-Lower: 010420 -> 01043c Deseret Capital Letter Zhee
-Lower: 010421 -> 01043d Deseret Capital Letter Er
-Lower: 010422 -> 01043e Deseret Capital Letter El
-Lower: 010423 -> 01043f Deseret Capital Letter Em
-Lower: 010424 -> 010440 Deseret Capital Letter En
-Lower: 010425 -> 010441 Deseret Capital Letter Eng
-Lower: 010426 -> 010442 Deseret Capital Letter Oi
-Lower: 010427 -> 010443 Deseret Capital Letter Ew
+Lower: 010400 -> 010428 Deseret Capital Letter Long I
+Lower: 010401 -> 010429 Deseret Capital Letter Long E
+Lower: 010402 -> 01042a Deseret Capital Letter Long A
+Lower: 010403 -> 01042b Deseret Capital Letter Long Ah
+Lower: 010404 -> 01042c Deseret Capital Letter Long O
+Lower: 010405 -> 01042d Deseret Capital Letter Long Oo
+Lower: 010406 -> 01042e Deseret Capital Letter Short I
+Lower: 010407 -> 01042f Deseret Capital Letter Short E
+Lower: 010408 -> 010430 Deseret Capital Letter Short A
+Lower: 010409 -> 010431 Deseret Capital Letter Short Ah
+Lower: 01040a -> 010432 Deseret Capital Letter Short O
+Lower: 01040b -> 010433 Deseret Capital Letter Short Oo
+Lower: 01040c -> 010434 Deseret Capital Letter Ay
+Lower: 01040d -> 010435 Deseret Capital Letter Ow
+Lower: 01040e -> 010436 Deseret Capital Letter Wu
+Lower: 01040f -> 010437 Deseret Capital Letter Yee
+Lower: 010410 -> 010438 Deseret Capital Letter H
+Lower: 010411 -> 010439 Deseret Capital Letter Pee
+Lower: 010412 -> 01043a Deseret Capital Letter Bee
+Lower: 010413 -> 01043b Deseret Capital Letter Tee
+Lower: 010414 -> 01043c Deseret Capital Letter Dee
+Lower: 010415 -> 01043d Deseret Capital Letter Chee
+Lower: 010416 -> 01043e Deseret Capital Letter Jee
+Lower: 010417 -> 01043f Deseret Capital Letter Kay
+Lower: 010418 -> 010440 Deseret Capital Letter Gay
+Lower: 010419 -> 010441 Deseret Capital Letter Ef
+Lower: 01041a -> 010442 Deseret Capital Letter Vee
+Lower: 01041b -> 010443 Deseret Capital Letter Eth
+Lower: 01041c -> 010444 Deseret Capital Letter Thee
+Lower: 01041d -> 010445 Deseret Capital Letter Es
+Lower: 01041e -> 010446 Deseret Capital Letter Zee
+Lower: 01041f -> 010447 Deseret Capital Letter Esh
+Lower: 010420 -> 010448 Deseret Capital Letter Zhee
+Lower: 010421 -> 010449 Deseret Capital Letter Er
+Lower: 010422 -> 01044a Deseret Capital Letter El
+Lower: 010423 -> 01044b Deseret Capital Letter Em
+Lower: 010424 -> 01044c Deseret Capital Letter En
+Lower: 010425 -> 01044d Deseret Capital Letter Eng
+Lower: 010426 -> 01044e Deseret Capital Letter Oi
+Lower: 010427 -> 01044f Deseret Capital Letter Ew
Lower: 010428 -> 010428 Deseret Small Letter Long I
Lower: 010429 -> 010429 Deseret Small Letter Long E
Lower: 01042a -> 01042a Deseret Small Letter Long A
@@ -2804,6 +3037,18 @@
Lower: 010441 -> 010441 Deseret Small Letter Ef
Lower: 010442 -> 010442 Deseret Small Letter Vee
Lower: 010443 -> 010443 Deseret Small Letter Eth
+Lower: 010444 -> 010444 Deseret Small Letter Thee
+Lower: 010445 -> 010445 Deseret Small Letter Es
+Lower: 010446 -> 010446 Deseret Small Letter Zee
+Lower: 010447 -> 010447 Deseret Small Letter Esh
+Lower: 010448 -> 010448 Deseret Small Letter Zhee
+Lower: 010449 -> 010449 Deseret Small Letter Er
+Lower: 01044a -> 01044a Deseret Small Letter El
+Lower: 01044b -> 01044b Deseret Small Letter Em
+Lower: 01044c -> 01044c Deseret Small Letter En
+Lower: 01044d -> 01044d Deseret Small Letter Eng
+Lower: 01044e -> 01044e Deseret Small Letter Oi
+Lower: 01044f -> 01044f Deseret Small Letter Ew
Lower: 0104b0 -> 0104d8 Osage Capital Letter A
Lower: 0104b1 -> 0104d9 Osage Capital Letter Ai
Lower: 0104b2 -> 0104da Osage Capital Letter Ain
@@ -3307,7 +3552,7 @@
Upper: 0000ae -> 0000ae Registered Sign
Upper: 0000b0 -> 0000b0 Degree Sign
Upper: 0000b1 -> 0000b1 Plus-Minus Sign
-Upper: 0000b5 -> 0000b5 Micro Sign
+Upper: 0000b5 -> 00039c Micro Sign
Upper: 0000b7 -> 0000b7 Middle Dot
Upper: 0000bb -> 0000bb Right-Pointing Double Angle Quotation Mark
Upper: 0000bc -> 0000bc Vulgar Fraction One Quarter
@@ -3462,6 +3707,7 @@
Upper: 00019a -> 00023d Latin Small Letter L With Bar
Upper: 00019c -> 00019c Latin Capital Letter Turned M
Upper: 00019d -> 00019d Latin Capital Letter N With Left Hook
+Upper: 00019e -> 000220 Latin Small Letter N With Long Right Leg
Upper: 00019f -> 00019f Latin Capital Letter O With Middle Tilde
Upper: 0001a0..0001a1 -> 0001a0 Latin Capital Letter O With Horn
Upper: 0001a2..0001a3 -> 0001a2 Latin Capital Letter Oi
@@ -3480,15 +3726,9 @@
Upper: 0001b8..0001b9 -> 0001b8 Latin Capital Letter Ezh Reversed
Upper: 0001bc..0001bd -> 0001bc Latin Capital Letter Tone Five
Upper: 0001bf -> 0001f7 Latin Letter Wynn
-Upper: 0001c4 -> 0001c4 Latin Capital Letter Dz With Caron
-Upper: 0001c5 -> 0001c5 Latin Capital Letter D With Small Letter Z With Caron
-Upper: 0001c6 -> 0001c4 Latin Small Letter Dz With Caron
-Upper: 0001c7 -> 0001c7 Latin Capital Letter Lj
-Upper: 0001c8 -> 0001c8 Latin Capital Letter L With Small Letter J
-Upper: 0001c9 -> 0001c7 Latin Small Letter Lj
-Upper: 0001ca -> 0001ca Latin Capital Letter Nj
-Upper: 0001cb -> 0001cb Latin Capital Letter N With Small Letter J
-Upper: 0001cc -> 0001ca Latin Small Letter Nj
+Upper: 0001c4..0001c6 -> 0001c4 Latin Capital Letter Dz With Caron
+Upper: 0001c7..0001c9 -> 0001c7 Latin Capital Letter Lj
+Upper: 0001ca..0001cc -> 0001ca Latin Capital Letter Nj
Upper: 0001cd..0001ce -> 0001cd Latin Capital Letter A With Caron
Upper: 0001cf..0001d0 -> 0001cf Latin Capital Letter I With Caron
Upper: 0001d1..0001d2 -> 0001d1 Latin Capital Letter O With Caron
@@ -3507,9 +3747,7 @@
Upper: 0001ea..0001eb -> 0001ea Latin Capital Letter O With Ogonek
Upper: 0001ec..0001ed -> 0001ec Latin Capital Letter O With Ogonek And Macron
Upper: 0001ee..0001ef -> 0001ee Latin Capital Letter Ezh With Caron
-Upper: 0001f1 -> 0001f1 Latin Capital Letter Dz
-Upper: 0001f2 -> 0001f2 Latin Capital Letter D With Small Letter Z
-Upper: 0001f3 -> 0001f1 Latin Small Letter Dz
+Upper: 0001f1..0001f3 -> 0001f1 Latin Capital Letter Dz
Upper: 0001f4..0001f5 -> 0001f4 Latin Capital Letter G With Acute
Upper: 0001f6 -> 0001f6 Latin Capital Letter Hwair
Upper: 0001f7 -> 0001f7 Latin Capital Letter Wynn
@@ -3533,6 +3771,7 @@
Upper: 00021a..00021b -> 00021a Latin Capital Letter T With Comma Below
Upper: 00021c..00021d -> 00021c Latin Capital Letter Yogh
Upper: 00021e..00021f -> 00021e Latin Capital Letter H With Caron
+Upper: 000220 -> 000220 Latin Capital Letter N With Long Right Leg
Upper: 000222..000223 -> 000222 Latin Capital Letter Ou
Upper: 000224..000225 -> 000224 Latin Capital Letter Z With Hook
Upper: 000226..000227 -> 000226 Latin Capital Letter A With Dot Above
@@ -3570,6 +3809,7 @@
Upper: 000260 -> 000193 Latin Small Letter G With Hook
Upper: 000261 -> 00a7ac Latin Small Letter Script G
Upper: 000263 -> 000194 Latin Small Letter Gamma
+Upper: 000265 -> 00a78d Latin Small Letter Turned H
Upper: 000266 -> 00a7aa Latin Small Letter H With Hook
Upper: 000268 -> 000197 Latin Small Letter I With Stroke
Upper: 000269 -> 000196 Latin Small Letter Iota
@@ -3592,6 +3832,14 @@
Upper: 000292 -> 0001b7 Latin Small Letter Ezh
Upper: 00029d -> 00a7b2 Latin Small Letter J With Crossed-Tail
Upper: 00029e -> 00a7b0 Latin Small Letter Turned K
+Upper: 000345 -> 000399 Combining Greek Ypogegrammeni
+Upper: 000370..000371 -> 000370 Greek Capital Letter Heta
+Upper: 000372..000373 -> 000372 Greek Capital Letter Archaic Sampi
+Upper: 000376..000377 -> 000376 Greek Capital Letter Pamphylian Digamma
+Upper: 00037b -> 0003fd Greek Small Reversed Lunate Sigma Symbol
+Upper: 00037c -> 0003fe Greek Small Dotted Lunate Sigma Symbol
+Upper: 00037d -> 0003ff Greek Small Reversed Dotted Lunate Sigma Symbol
+Upper: 00037f -> 00037f Greek Capital Letter Yot
Upper: 000386 -> 000386 Greek Capital Letter Alpha With Tonos
Upper: 000388 -> 000388 Greek Capital Letter Epsilon With Tonos
Upper: 000389 -> 000389 Greek Capital Letter Eta With Tonos
@@ -3646,7 +3894,7 @@
Upper: 0003bf -> 00039f Greek Small Letter Omicron
Upper: 0003c0 -> 0003a0 Greek Small Letter Pi
Upper: 0003c1 -> 0003a1 Greek Small Letter Rho
-Upper: 0003c3 -> 0003a3 Greek Small Letter Sigma
+Upper: 0003c2..0003c3 -> 0003a3 Greek Small Letter Final Sigma
Upper: 0003c4 -> 0003a4 Greek Small Letter Tau
Upper: 0003c5 -> 0003a5 Greek Small Letter Upsilon
Upper: 0003c6 -> 0003a6 Greek Small Letter Phi
@@ -3658,6 +3906,13 @@
Upper: 0003cc -> 00038c Greek Small Letter Omicron With Tonos
Upper: 0003cd -> 00038e Greek Small Letter Upsilon With Tonos
Upper: 0003ce -> 00038f Greek Small Letter Omega With Tonos
+Upper: 0003cf -> 0003cf Greek Capital Kai Symbol
+Upper: 0003d0 -> 000392 Greek Beta Symbol
+Upper: 0003d1 -> 000398 Greek Theta Symbol
+Upper: 0003d5 -> 0003a6 Greek Phi Symbol
+Upper: 0003d6 -> 0003a0 Greek Pi Symbol
+Upper: 0003d7 -> 0003cf Greek Kai Symbol
+Upper: 0003d8..0003d9 -> 0003d8 Greek Letter Archaic Koppa
Upper: 0003da..0003db -> 0003da Greek Letter Stigma
Upper: 0003dc..0003dd -> 0003dc Greek Letter Digamma
Upper: 0003de..0003df -> 0003de Greek Letter Koppa
@@ -3669,6 +3924,18 @@
Upper: 0003ea..0003eb -> 0003ea Coptic Capital Letter Gangia
Upper: 0003ec..0003ed -> 0003ec Coptic Capital Letter Shima
Upper: 0003ee..0003ef -> 0003ee Coptic Capital Letter Dei
+Upper: 0003f0 -> 00039a Greek Kappa Symbol
+Upper: 0003f1 -> 0003a1 Greek Rho Symbol
+Upper: 0003f2 -> 0003f9 Greek Lunate Sigma Symbol
+Upper: 0003f3 -> 00037f Greek Letter Yot
+Upper: 0003f4 -> 0003f4 Greek Capital Theta Symbol
+Upper: 0003f5 -> 000395 Greek Lunate Epsilon Symbol
+Upper: 0003f7..0003f8 -> 0003f7 Greek Capital Letter Sho
+Upper: 0003f9 -> 0003f9 Greek Capital Lunate Sigma Symbol
+Upper: 0003fa..0003fb -> 0003fa Greek Capital Letter San
+Upper: 0003fd -> 0003fd Greek Capital Reversed Lunate Sigma Symbol
+Upper: 0003fe -> 0003fe Greek Capital Dotted Lunate Sigma Symbol
+Upper: 0003ff -> 0003ff Greek Capital Reversed Dotted Lunate Sigma Symbol
Upper: 000400 -> 000400 Cyrillic Capital Letter Ie With Grave
Upper: 000401 -> 000401 Cyrillic Capital Letter Io
Upper: 000402 -> 000402 Cyrillic Capital Letter Dje
@@ -3782,6 +4049,7 @@
Upper: 00047c..00047d -> 00047c Cyrillic Capital Letter Omega With Titlo
Upper: 00047e..00047f -> 00047e Cyrillic Capital Letter Ot
Upper: 000480..000481 -> 000480 Cyrillic Capital Letter Koppa
+Upper: 00048a..00048b -> 00048a Cyrillic Capital Letter Short I With Tail
Upper: 00048c..00048d -> 00048c Cyrillic Capital Letter Semisoft Sign
Upper: 00048e..00048f -> 00048e Cyrillic Capital Letter Er With Tick
Upper: 000490..000491 -> 000490 Cyrillic Capital Letter Ghe With Upturn
@@ -3808,10 +4076,15 @@
Upper: 0004ba..0004bb -> 0004ba Cyrillic Capital Letter Shha
Upper: 0004bc..0004bd -> 0004bc Cyrillic Capital Letter Abkhasian Che
Upper: 0004be..0004bf -> 0004be Cyrillic Capital Letter Abkhasian Che With Descender
+Upper: 0004c0 -> 0004c0 Cyrillic Letter Palochka
Upper: 0004c1..0004c2 -> 0004c1 Cyrillic Capital Letter Zhe With Breve
Upper: 0004c3..0004c4 -> 0004c3 Cyrillic Capital Letter Ka With Hook
+Upper: 0004c5..0004c6 -> 0004c5 Cyrillic Capital Letter El With Tail
Upper: 0004c7..0004c8 -> 0004c7 Cyrillic Capital Letter En With Hook
+Upper: 0004c9..0004ca -> 0004c9 Cyrillic Capital Letter En With Tail
Upper: 0004cb..0004cc -> 0004cb Cyrillic Capital Letter Khakassian Che
+Upper: 0004cd..0004ce -> 0004cd Cyrillic Capital Letter Em With Tail
+Upper: 0004cf -> 0004c0 Cyrillic Small Letter Palochka
Upper: 0004d0..0004d1 -> 0004d0 Cyrillic Capital Letter A With Breve
Upper: 0004d2..0004d3 -> 0004d2 Cyrillic Capital Letter A With Diaeresis
Upper: 0004d4..0004d5 -> 0004d4 Cyrillic Capital Ligature A Ie
@@ -3836,6 +4109,30 @@
Upper: 0004fa..0004fb -> 0004fa Cyrillic Capital Letter Ghe With Stroke And Hook
Upper: 0004fc..0004fd -> 0004fc Cyrillic Capital Letter Ha With Hook
Upper: 0004fe..0004ff -> 0004fe Cyrillic Capital Letter Ha With Stroke
+Upper: 000500..000501 -> 000500 Cyrillic Capital Letter Komi De
+Upper: 000502..000503 -> 000502 Cyrillic Capital Letter Komi Dje
+Upper: 000504..000505 -> 000504 Cyrillic Capital Letter Komi Zje
+Upper: 000506..000507 -> 000506 Cyrillic Capital Letter Komi Dzje
+Upper: 000508..000509 -> 000508 Cyrillic Capital Letter Komi Lje
+Upper: 00050a..00050b -> 00050a Cyrillic Capital Letter Komi Nje
+Upper: 00050c..00050d -> 00050c Cyrillic Capital Letter Komi Sje
+Upper: 00050e..00050f -> 00050e Cyrillic Capital Letter Komi Tje
+Upper: 000510..000511 -> 000510 Cyrillic Capital Letter Reversed Ze
+Upper: 000512..000513 -> 000512 Cyrillic Capital Letter El With Hook
+Upper: 000514..000515 -> 000514 Cyrillic Capital Letter Lha
+Upper: 000516..000517 -> 000516 Cyrillic Capital Letter Rha
+Upper: 000518..000519 -> 000518 Cyrillic Capital Letter Yae
+Upper: 00051a..00051b -> 00051a Cyrillic Capital Letter Qa
+Upper: 00051c..00051d -> 00051c Cyrillic Capital Letter We
+Upper: 00051e..00051f -> 00051e Cyrillic Capital Letter Aleut Ka
+Upper: 000520..000521 -> 000520 Cyrillic Capital Letter El With Middle Hook
+Upper: 000522..000523 -> 000522 Cyrillic Capital Letter En With Middle Hook
+Upper: 000524..000525 -> 000524 Cyrillic Capital Letter Pe With Descender
+Upper: 000526..000527 -> 000526 Cyrillic Capital Letter Shha With Descender
+Upper: 000528..000529 -> 000528 Cyrillic Capital Letter En With Left Hook
+Upper: 00052a..00052b -> 00052a Cyrillic Capital Letter Dzzhe
+Upper: 00052c..00052d -> 00052c Cyrillic Capital Letter Dche
+Upper: 00052e..00052f -> 00052e Cyrillic Capital Letter El With Descender
Upper: 000531 -> 000531 Armenian Capital Letter Ayb
Upper: 000532 -> 000532 Armenian Capital Letter Ben
Upper: 000533 -> 000533 Armenian Capital Letter Gim
@@ -3950,14 +4247,108 @@
Upper: 0010c3 -> 0010c3 Georgian Capital Letter We
Upper: 0010c4 -> 0010c4 Georgian Capital Letter Har
Upper: 0010c5 -> 0010c5 Georgian Capital Letter Hoe
-Upper: 0010c6 -> 0010c6
Upper: 0010c7 -> 0010c7 Georgian Capital Letter Yn
-Upper: 0010c8 -> 0010c8
-Upper: 0010c9 -> 0010c9
-Upper: 0010ca -> 0010ca
-Upper: 0010cb -> 0010cb
-Upper: 0010cc -> 0010cc
Upper: 0010cd -> 0010cd Georgian Capital Letter Aen
+Upper: 0013a0 -> 0013a0 Cherokee Letter A
+Upper: 0013a1 -> 0013a1 Cherokee Letter E
+Upper: 0013a2 -> 0013a2 Cherokee Letter I
+Upper: 0013a3 -> 0013a3 Cherokee Letter O
+Upper: 0013a4 -> 0013a4 Cherokee Letter U
+Upper: 0013a5 -> 0013a5 Cherokee Letter V
+Upper: 0013a6 -> 0013a6 Cherokee Letter Ga
+Upper: 0013a7 -> 0013a7 Cherokee Letter Ka
+Upper: 0013a8 -> 0013a8 Cherokee Letter Ge
+Upper: 0013a9 -> 0013a9 Cherokee Letter Gi
+Upper: 0013aa -> 0013aa Cherokee Letter Go
+Upper: 0013ab -> 0013ab Cherokee Letter Gu
+Upper: 0013ac -> 0013ac Cherokee Letter Gv
+Upper: 0013ad -> 0013ad Cherokee Letter Ha
+Upper: 0013ae -> 0013ae Cherokee Letter He
+Upper: 0013af -> 0013af Cherokee Letter Hi
+Upper: 0013b0 -> 0013b0 Cherokee Letter Ho
+Upper: 0013b1 -> 0013b1 Cherokee Letter Hu
+Upper: 0013b2 -> 0013b2 Cherokee Letter Hv
+Upper: 0013b3 -> 0013b3 Cherokee Letter La
+Upper: 0013b4 -> 0013b4 Cherokee Letter Le
+Upper: 0013b5 -> 0013b5 Cherokee Letter Li
+Upper: 0013b6 -> 0013b6 Cherokee Letter Lo
+Upper: 0013b7 -> 0013b7 Cherokee Letter Lu
+Upper: 0013b8 -> 0013b8 Cherokee Letter Lv
+Upper: 0013b9 -> 0013b9 Cherokee Letter Ma
+Upper: 0013ba -> 0013ba Cherokee Letter Me
+Upper: 0013bb -> 0013bb Cherokee Letter Mi
+Upper: 0013bc -> 0013bc Cherokee Letter Mo
+Upper: 0013bd -> 0013bd Cherokee Letter Mu
+Upper: 0013be -> 0013be Cherokee Letter Na
+Upper: 0013bf -> 0013bf Cherokee Letter Hna
+Upper: 0013c0 -> 0013c0 Cherokee Letter Nah
+Upper: 0013c1 -> 0013c1 Cherokee Letter Ne
+Upper: 0013c2 -> 0013c2 Cherokee Letter Ni
+Upper: 0013c3 -> 0013c3 Cherokee Letter No
+Upper: 0013c4 -> 0013c4 Cherokee Letter Nu
+Upper: 0013c5 -> 0013c5 Cherokee Letter Nv
+Upper: 0013c6 -> 0013c6 Cherokee Letter Qua
+Upper: 0013c7 -> 0013c7 Cherokee Letter Que
+Upper: 0013c8 -> 0013c8 Cherokee Letter Qui
+Upper: 0013c9 -> 0013c9 Cherokee Letter Quo
+Upper: 0013ca -> 0013ca Cherokee Letter Quu
+Upper: 0013cb -> 0013cb Cherokee Letter Quv
+Upper: 0013cc -> 0013cc Cherokee Letter Sa
+Upper: 0013cd -> 0013cd Cherokee Letter S
+Upper: 0013ce -> 0013ce Cherokee Letter Se
+Upper: 0013cf -> 0013cf Cherokee Letter Si
+Upper: 0013d0 -> 0013d0 Cherokee Letter So
+Upper: 0013d1 -> 0013d1 Cherokee Letter Su
+Upper: 0013d2 -> 0013d2 Cherokee Letter Sv
+Upper: 0013d3 -> 0013d3 Cherokee Letter Da
+Upper: 0013d4 -> 0013d4 Cherokee Letter Ta
+Upper: 0013d5 -> 0013d5 Cherokee Letter De
+Upper: 0013d6 -> 0013d6 Cherokee Letter Te
+Upper: 0013d7 -> 0013d7 Cherokee Letter Di
+Upper: 0013d8 -> 0013d8 Cherokee Letter Ti
+Upper: 0013d9 -> 0013d9 Cherokee Letter Do
+Upper: 0013da -> 0013da Cherokee Letter Du
+Upper: 0013db -> 0013db Cherokee Letter Dv
+Upper: 0013dc -> 0013dc Cherokee Letter Dla
+Upper: 0013dd -> 0013dd Cherokee Letter Tla
+Upper: 0013de -> 0013de Cherokee Letter Tle
+Upper: 0013df -> 0013df Cherokee Letter Tli
+Upper: 0013e0 -> 0013e0 Cherokee Letter Tlo
+Upper: 0013e1 -> 0013e1 Cherokee Letter Tlu
+Upper: 0013e2 -> 0013e2 Cherokee Letter Tlv
+Upper: 0013e3 -> 0013e3 Cherokee Letter Tsa
+Upper: 0013e4 -> 0013e4 Cherokee Letter Tse
+Upper: 0013e5 -> 0013e5 Cherokee Letter Tsi
+Upper: 0013e6 -> 0013e6 Cherokee Letter Tso
+Upper: 0013e7 -> 0013e7 Cherokee Letter Tsu
+Upper: 0013e8 -> 0013e8 Cherokee Letter Tsv
+Upper: 0013e9 -> 0013e9 Cherokee Letter Wa
+Upper: 0013ea -> 0013ea Cherokee Letter We
+Upper: 0013eb -> 0013eb Cherokee Letter Wi
+Upper: 0013ec -> 0013ec Cherokee Letter Wo
+Upper: 0013ed -> 0013ed Cherokee Letter Wu
+Upper: 0013ee -> 0013ee Cherokee Letter Wv
+Upper: 0013ef -> 0013ef Cherokee Letter Ya
+Upper: 0013f0 -> 0013f0 Cherokee Letter Ye
+Upper: 0013f1 -> 0013f1 Cherokee Letter Yi
+Upper: 0013f2 -> 0013f2 Cherokee Letter Yo
+Upper: 0013f3 -> 0013f3 Cherokee Letter Yu
+Upper: 0013f4 -> 0013f4 Cherokee Letter Yv
+Upper: 0013f5 -> 0013f5 Cherokee Letter Mv
+Upper: 0013f8 -> 0013f0 Cherokee Small Letter Ye
+Upper: 0013f9 -> 0013f1 Cherokee Small Letter Yi
+Upper: 0013fa -> 0013f2 Cherokee Small Letter Yo
+Upper: 0013fb -> 0013f3 Cherokee Small Letter Yu
+Upper: 0013fc -> 0013f4 Cherokee Small Letter Yv
+Upper: 0013fd -> 0013f5 Cherokee Small Letter Mv
+Upper: 001c80 -> 000412 Cyrillic Small Letter Rounded Ve
+Upper: 001c81 -> 000414 Cyrillic Small Letter Long-Legged De
+Upper: 001c82 -> 00041e Cyrillic Small Letter Narrow O
+Upper: 001c83 -> 000421 Cyrillic Small Letter Wide Es
+Upper: 001c84..001c85 -> 000422 Cyrillic Small Letter Tall Te
+Upper: 001c86 -> 00042a Cyrillic Small Letter Tall Hard Sign
+Upper: 001c87 -> 000462 Cyrillic Small Letter Tall Yat
+Upper: 001c88 -> 00a64a Cyrillic Small Letter Unblended Uk
Upper: 001d79 -> 00a77d Latin Small Letter Insular G
Upper: 001d7d -> 002c63 Latin Small Letter P With Stroke
Upper: 001e00..001e01 -> 001e00 Latin Capital Letter A With Ring Below
@@ -4035,6 +4426,8 @@
Upper: 001e90..001e91 -> 001e90 Latin Capital Letter Z With Circumflex
Upper: 001e92..001e93 -> 001e92 Latin Capital Letter Z With Dot Below
Upper: 001e94..001e95 -> 001e94 Latin Capital Letter Z With Line Below
+Upper: 001e9b -> 001e60 Latin Small Letter Long S With Dot Above
+Upper: 001e9e -> 001e9e Latin Capital Letter Sharp S
Upper: 001ea0..001ea1 -> 001ea0 Latin Capital Letter A With Dot Below
Upper: 001ea2..001ea3 -> 001ea2 Latin Capital Letter A With Hook Above
Upper: 001ea4..001ea5 -> 001ea4 Latin Capital Letter A With Circumflex And Acute
@@ -4080,6 +4473,9 @@
Upper: 001ef4..001ef5 -> 001ef4 Latin Capital Letter Y With Dot Below
Upper: 001ef6..001ef7 -> 001ef6 Latin Capital Letter Y With Hook Above
Upper: 001ef8..001ef9 -> 001ef8 Latin Capital Letter Y With Tilde
+Upper: 001efa..001efb -> 001efa Latin Capital Letter Middle-Welsh Ll
+Upper: 001efc..001efd -> 001efc Latin Capital Letter Middle-Welsh V
+Upper: 001efe..001eff -> 001efe Latin Capital Letter Y With Loop
Upper: 001f00 -> 001f08 Greek Small Letter Alpha With Psili
Upper: 001f01 -> 001f09 Greek Small Letter Alpha With Dasia
Upper: 001f02 -> 001f0a Greek Small Letter Alpha With Psili And Varia
@@ -4146,22 +4542,20 @@
Upper: 001f43 -> 001f4b Greek Small Letter Omicron With Dasia And Varia
Upper: 001f44 -> 001f4c Greek Small Letter Omicron With Psili And Oxia
Upper: 001f45 -> 001f4d Greek Small Letter Omicron With Dasia And Oxia
-Upper: 001f46 -> 001f4e
-Upper: 001f47 -> 001f4f
Upper: 001f48 -> 001f48 Greek Capital Letter Omicron With Psili
Upper: 001f49 -> 001f49 Greek Capital Letter Omicron With Dasia
Upper: 001f4a -> 001f4a Greek Capital Letter Omicron With Psili And Varia
Upper: 001f4b -> 001f4b Greek Capital Letter Omicron With Dasia And Varia
Upper: 001f4c -> 001f4c Greek Capital Letter Omicron With Psili And Oxia
Upper: 001f4d -> 001f4d Greek Capital Letter Omicron With Dasia And Oxia
-Upper: 001f4e -> 001f4e
-Upper: 001f4f -> 001f4f
Upper: 001f51 -> 001f59 Greek Small Letter Upsilon With Dasia
Upper: 001f53 -> 001f5b Greek Small Letter Upsilon With Dasia And Varia
Upper: 001f55 -> 001f5d Greek Small Letter Upsilon With Dasia And Oxia
+Upper: 001f57 -> 001f5f Greek Small Letter Upsilon With Dasia And Perispomeni
Upper: 001f59 -> 001f59 Greek Capital Letter Upsilon With Dasia
Upper: 001f5b -> 001f5b Greek Capital Letter Upsilon With Dasia And Varia
Upper: 001f5d -> 001f5d Greek Capital Letter Upsilon With Dasia And Oxia
+Upper: 001f5f -> 001f5f Greek Capital Letter Upsilon With Dasia And Perispomeni
Upper: 001f60 -> 001f68 Greek Small Letter Omega With Psili
Upper: 001f61 -> 001f69 Greek Small Letter Omega With Dasia
Upper: 001f62 -> 001f6a Greek Small Letter Omega With Psili And Varia
@@ -4248,6 +4642,7 @@
Upper: 001fba -> 001fba Greek Capital Letter Alpha With Varia
Upper: 001fbb -> 001fbb Greek Capital Letter Alpha With Oxia
Upper: 001fbc -> 001fbc Greek Capital Letter Alpha With Prosgegrammeni
+Upper: 001fbe -> 000399 Greek Prosgegrammeni
Upper: 001fc3 -> 001fcc Greek Small Letter Eta With Ypogegrammeni
Upper: 001fc8 -> 001fc8 Greek Capital Letter Epsilon With Varia
Upper: 001fc9 -> 001fc9 Greek Capital Letter Epsilon With Oxia
@@ -4386,6 +4781,10 @@
Upper: 00206d -> 00206d Activate Arabic Form Shaping
Upper: 00206e -> 00206e National Digit Shapes
Upper: 00206f -> 00206f Nominal Digit Shapes
+Upper: 002126 -> 002126 Ohm Sign
+Upper: 00212b -> 00212b Angstrom Sign
+Upper: 002132 -> 002132 Turned Capital F
+Upper: 00214e -> 002132 Turned Small F
Upper: 002160 -> 002160 Roman Numeral One
Upper: 002161 -> 002161 Roman Numeral Two
Upper: 002162 -> 002162 Roman Numeral Three
@@ -4418,6 +4817,7 @@
Upper: 00217d -> 00216d Small Roman Numeral One Hundred
Upper: 00217e -> 00216e Small Roman Numeral Five Hundred
Upper: 00217f -> 00216f Small Roman Numeral One Thousand
+Upper: 002183..002184 -> 002183 Roman Numeral Reversed One Hundred
Upper: 002190 -> 002190 Leftwards Arrow
Upper: 002191 -> 002191 Upwards Arrow
Upper: 002192 -> 002192 Rightwards Arrow
@@ -5318,10 +5718,10 @@
Upper: 002cdc..002cdd -> 002cdc Coptic Capital Letter Old Nubian Shima
Upper: 002cde..002cdf -> 002cde Coptic Capital Letter Old Nubian Ngi
Upper: 002ce0..002ce1 -> 002ce0 Coptic Capital Letter Old Nubian Nyi
+Upper: 002ce2..002ce3 -> 002ce2 Coptic Capital Letter Old Nubian Wau
Upper: 002ceb..002cec -> 002ceb Coptic Capital Letter Cryptogrammic Shei
Upper: 002ced..002cee -> 002ced Coptic Capital Letter Cryptogrammic Gangia
-Upper: 002cef..002cf0 -> 002cef Coptic Combining Ni Above
-Upper: 002cf1..002cf2 -> 002cf1 Coptic Combining Spiritus Lenis
+Upper: 002cf2..002cf3 -> 002cf2 Coptic Capital Letter Bohairic Khei
Upper: 002d00 -> 0010a0 Georgian Small Letter An
Upper: 002d01 -> 0010a1 Georgian Small Letter Ban
Upper: 002d02 -> 0010a2 Georgian Small Letter Gan
@@ -5360,13 +5760,7 @@
Upper: 002d23 -> 0010c3 Georgian Small Letter We
Upper: 002d24 -> 0010c4 Georgian Small Letter Har
Upper: 002d25 -> 0010c5 Georgian Small Letter Hoe
-Upper: 002d26 -> 0010c6
Upper: 002d27 -> 0010c7 Georgian Small Letter Yn
-Upper: 002d28 -> 0010c8
-Upper: 002d29 -> 0010c9
-Upper: 002d2a -> 0010ca
-Upper: 002d2b -> 0010cb
-Upper: 002d2c -> 0010cc
Upper: 002d2d -> 0010cd Georgian Small Letter Aen
Upper: 00a640..00a641 -> 00a640 Cyrillic Capital Letter Zemlya
Upper: 00a642..00a643 -> 00a642 Cyrillic Capital Letter Dzelo
@@ -5452,7 +5846,7 @@
Upper: 00a784..00a785 -> 00a784 Latin Capital Letter Insular S
Upper: 00a786..00a787 -> 00a786 Latin Capital Letter Insular T
Upper: 00a78b..00a78c -> 00a78b Latin Capital Letter Saltillo
-Upper: 00a78d..00a78e -> 00a78d Latin Capital Letter Turned H
+Upper: 00a78d -> 00a78d Latin Capital Letter Turned H
Upper: 00a790..00a791 -> 00a790 Latin Capital Letter N With Descender
Upper: 00a792..00a793 -> 00a792 Latin Capital Letter C With Bar
Upper: 00a796..00a797 -> 00a796 Latin Capital Letter B With Flourish
@@ -5477,6 +5871,86 @@
Upper: 00a7b4..00a7b5 -> 00a7b4 Latin Capital Letter Beta
Upper: 00a7b6..00a7b7 -> 00a7b6 Latin Capital Letter Omega
Upper: 00ab53 -> 00a7b3 Latin Small Letter Chi
+Upper: 00ab70 -> 0013a0 Cherokee Small Letter A
+Upper: 00ab71 -> 0013a1 Cherokee Small Letter E
+Upper: 00ab72 -> 0013a2 Cherokee Small Letter I
+Upper: 00ab73 -> 0013a3 Cherokee Small Letter O
+Upper: 00ab74 -> 0013a4 Cherokee Small Letter U
+Upper: 00ab75 -> 0013a5 Cherokee Small Letter V
+Upper: 00ab76 -> 0013a6 Cherokee Small Letter Ga
+Upper: 00ab77 -> 0013a7 Cherokee Small Letter Ka
+Upper: 00ab78 -> 0013a8 Cherokee Small Letter Ge
+Upper: 00ab79 -> 0013a9 Cherokee Small Letter Gi
+Upper: 00ab7a -> 0013aa Cherokee Small Letter Go
+Upper: 00ab7b -> 0013ab Cherokee Small Letter Gu
+Upper: 00ab7c -> 0013ac Cherokee Small Letter Gv
+Upper: 00ab7d -> 0013ad Cherokee Small Letter Ha
+Upper: 00ab7e -> 0013ae Cherokee Small Letter He
+Upper: 00ab7f -> 0013af Cherokee Small Letter Hi
+Upper: 00ab80 -> 0013b0 Cherokee Small Letter Ho
+Upper: 00ab81 -> 0013b1 Cherokee Small Letter Hu
+Upper: 00ab82 -> 0013b2 Cherokee Small Letter Hv
+Upper: 00ab83 -> 0013b3 Cherokee Small Letter La
+Upper: 00ab84 -> 0013b4 Cherokee Small Letter Le
+Upper: 00ab85 -> 0013b5 Cherokee Small Letter Li
+Upper: 00ab86 -> 0013b6 Cherokee Small Letter Lo
+Upper: 00ab87 -> 0013b7 Cherokee Small Letter Lu
+Upper: 00ab88 -> 0013b8 Cherokee Small Letter Lv
+Upper: 00ab89 -> 0013b9 Cherokee Small Letter Ma
+Upper: 00ab8a -> 0013ba Cherokee Small Letter Me
+Upper: 00ab8b -> 0013bb Cherokee Small Letter Mi
+Upper: 00ab8c -> 0013bc Cherokee Small Letter Mo
+Upper: 00ab8d -> 0013bd Cherokee Small Letter Mu
+Upper: 00ab8e -> 0013be Cherokee Small Letter Na
+Upper: 00ab8f -> 0013bf Cherokee Small Letter Hna
+Upper: 00ab90 -> 0013c0 Cherokee Small Letter Nah
+Upper: 00ab91 -> 0013c1 Cherokee Small Letter Ne
+Upper: 00ab92 -> 0013c2 Cherokee Small Letter Ni
+Upper: 00ab93 -> 0013c3 Cherokee Small Letter No
+Upper: 00ab94 -> 0013c4 Cherokee Small Letter Nu
+Upper: 00ab95 -> 0013c5 Cherokee Small Letter Nv
+Upper: 00ab96 -> 0013c6 Cherokee Small Letter Qua
+Upper: 00ab97 -> 0013c7 Cherokee Small Letter Que
+Upper: 00ab98 -> 0013c8 Cherokee Small Letter Qui
+Upper: 00ab99 -> 0013c9 Cherokee Small Letter Quo
+Upper: 00ab9a -> 0013ca Cherokee Small Letter Quu
+Upper: 00ab9b -> 0013cb Cherokee Small Letter Quv
+Upper: 00ab9c -> 0013cc Cherokee Small Letter Sa
+Upper: 00ab9d -> 0013cd Cherokee Small Letter S
+Upper: 00ab9e -> 0013ce Cherokee Small Letter Se
+Upper: 00ab9f -> 0013cf Cherokee Small Letter Si
+Upper: 00aba0 -> 0013d0 Cherokee Small Letter So
+Upper: 00aba1 -> 0013d1 Cherokee Small Letter Su
+Upper: 00aba2 -> 0013d2 Cherokee Small Letter Sv
+Upper: 00aba3 -> 0013d3 Cherokee Small Letter Da
+Upper: 00aba4 -> 0013d4 Cherokee Small Letter Ta
+Upper: 00aba5 -> 0013d5 Cherokee Small Letter De
+Upper: 00aba6 -> 0013d6 Cherokee Small Letter Te
+Upper: 00aba7 -> 0013d7 Cherokee Small Letter Di
+Upper: 00aba8 -> 0013d8 Cherokee Small Letter Ti
+Upper: 00aba9 -> 0013d9 Cherokee Small Letter Do
+Upper: 00abaa -> 0013da Cherokee Small Letter Du
+Upper: 00abab -> 0013db Cherokee Small Letter Dv
+Upper: 00abac -> 0013dc Cherokee Small Letter Dla
+Upper: 00abad -> 0013dd Cherokee Small Letter Tla
+Upper: 00abae -> 0013de Cherokee Small Letter Tle
+Upper: 00abaf -> 0013df Cherokee Small Letter Tli
+Upper: 00abb0 -> 0013e0 Cherokee Small Letter Tlo
+Upper: 00abb1 -> 0013e1 Cherokee Small Letter Tlu
+Upper: 00abb2 -> 0013e2 Cherokee Small Letter Tlv
+Upper: 00abb3 -> 0013e3 Cherokee Small Letter Tsa
+Upper: 00abb4 -> 0013e4 Cherokee Small Letter Tse
+Upper: 00abb5 -> 0013e5 Cherokee Small Letter Tsi
+Upper: 00abb6 -> 0013e6 Cherokee Small Letter Tso
+Upper: 00abb7 -> 0013e7 Cherokee Small Letter Tsu
+Upper: 00abb8 -> 0013e8 Cherokee Small Letter Tsv
+Upper: 00abb9 -> 0013e9 Cherokee Small Letter Wa
+Upper: 00abba -> 0013ea Cherokee Small Letter We
+Upper: 00abbb -> 0013eb Cherokee Small Letter Wi
+Upper: 00abbc -> 0013ec Cherokee Small Letter Wo
+Upper: 00abbd -> 0013ed Cherokee Small Letter Wu
+Upper: 00abbe -> 0013ee Cherokee Small Letter Wv
+Upper: 00abbf -> 0013ef Cherokee Small Letter Ya
Upper: 00ff21 -> 00ff21 Fullwidth Latin Capital Letter A
Upper: 00ff22 -> 00ff22 Fullwidth Latin Capital Letter B
Upper: 00ff23 -> 00ff23 Fullwidth Latin Capital Letter C
@@ -5569,34 +6043,46 @@
Upper: 010425 -> 010425 Deseret Capital Letter Eng
Upper: 010426 -> 010426 Deseret Capital Letter Oi
Upper: 010427 -> 010427 Deseret Capital Letter Ew
-Upper: 010428 -> 01040c Deseret Small Letter Long I
-Upper: 010429 -> 01040d Deseret Small Letter Long E
-Upper: 01042a -> 01040e Deseret Small Letter Long A
-Upper: 01042b -> 01040f Deseret Small Letter Long Ah
-Upper: 01042c -> 010410 Deseret Small Letter Long O
-Upper: 01042d -> 010411 Deseret Small Letter Long Oo
-Upper: 01042e -> 010412 Deseret Small Letter Short I
-Upper: 01042f -> 010413 Deseret Small Letter Short E
-Upper: 010430 -> 010414 Deseret Small Letter Short A
-Upper: 010431 -> 010415 Deseret Small Letter Short Ah
-Upper: 010432 -> 010416 Deseret Small Letter Short O
-Upper: 010433 -> 010417 Deseret Small Letter Short Oo
-Upper: 010434 -> 010418 Deseret Small Letter Ay
-Upper: 010435 -> 010419 Deseret Small Letter Ow
-Upper: 010436 -> 01041a Deseret Small Letter Wu
-Upper: 010437 -> 01041b Deseret Small Letter Yee
-Upper: 010438 -> 01041c Deseret Small Letter H
-Upper: 010439 -> 01041d Deseret Small Letter Pee
-Upper: 01043a -> 01041e Deseret Small Letter Bee
-Upper: 01043b -> 01041f Deseret Small Letter Tee
-Upper: 01043c -> 010420 Deseret Small Letter Dee
-Upper: 01043d -> 010421 Deseret Small Letter Chee
-Upper: 01043e -> 010422 Deseret Small Letter Jee
-Upper: 01043f -> 010423 Deseret Small Letter Kay
-Upper: 010440 -> 010424 Deseret Small Letter Gay
-Upper: 010441 -> 010425 Deseret Small Letter Ef
-Upper: 010442 -> 010426 Deseret Small Letter Vee
-Upper: 010443 -> 010427 Deseret Small Letter Eth
+Upper: 010428 -> 010400 Deseret Small Letter Long I
+Upper: 010429 -> 010401 Deseret Small Letter Long E
+Upper: 01042a -> 010402 Deseret Small Letter Long A
+Upper: 01042b -> 010403 Deseret Small Letter Long Ah
+Upper: 01042c -> 010404 Deseret Small Letter Long O
+Upper: 01042d -> 010405 Deseret Small Letter Long Oo
+Upper: 01042e -> 010406 Deseret Small Letter Short I
+Upper: 01042f -> 010407 Deseret Small Letter Short E
+Upper: 010430 -> 010408 Deseret Small Letter Short A
+Upper: 010431 -> 010409 Deseret Small Letter Short Ah
+Upper: 010432 -> 01040a Deseret Small Letter Short O
+Upper: 010433 -> 01040b Deseret Small Letter Short Oo
+Upper: 010434 -> 01040c Deseret Small Letter Ay
+Upper: 010435 -> 01040d Deseret Small Letter Ow
+Upper: 010436 -> 01040e Deseret Small Letter Wu
+Upper: 010437 -> 01040f Deseret Small Letter Yee
+Upper: 010438 -> 010410 Deseret Small Letter H
+Upper: 010439 -> 010411 Deseret Small Letter Pee
+Upper: 01043a -> 010412 Deseret Small Letter Bee
+Upper: 01043b -> 010413 Deseret Small Letter Tee
+Upper: 01043c -> 010414 Deseret Small Letter Dee
+Upper: 01043d -> 010415 Deseret Small Letter Chee
+Upper: 01043e -> 010416 Deseret Small Letter Jee
+Upper: 01043f -> 010417 Deseret Small Letter Kay
+Upper: 010440 -> 010418 Deseret Small Letter Gay
+Upper: 010441 -> 010419 Deseret Small Letter Ef
+Upper: 010442 -> 01041a Deseret Small Letter Vee
+Upper: 010443 -> 01041b Deseret Small Letter Eth
+Upper: 010444 -> 01041c Deseret Small Letter Thee
+Upper: 010445 -> 01041d Deseret Small Letter Es
+Upper: 010446 -> 01041e Deseret Small Letter Zee
+Upper: 010447 -> 01041f Deseret Small Letter Esh
+Upper: 010448 -> 010420 Deseret Small Letter Zhee
+Upper: 010449 -> 010421 Deseret Small Letter Er
+Upper: 01044a -> 010422 Deseret Small Letter El
+Upper: 01044b -> 010423 Deseret Small Letter Em
+Upper: 01044c -> 010424 Deseret Small Letter En
+Upper: 01044d -> 010425 Deseret Small Letter Eng
+Upper: 01044e -> 010426 Deseret Small Letter Oi
+Upper: 01044f -> 010427 Deseret Small Letter Ew
Upper: 0104b0 -> 0104b0 Osage Capital Letter A
Upper: 0104b1 -> 0104b1 Osage Capital Letter Ai
Upper: 0104b2 -> 0104b2 Osage Capital Letter Ain
--
Best regards
ミハウ “𝓶𝓲𝓷𝓪86” ナザレヴイツ
«If at first you don’t succeed, give up skydiving»
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data
2016-10-04 16:57 ` Michal Nazarewicz
@ 2016-10-04 17:27 ` Eli Zaretskii
2016-10-04 17:44 ` Eli Zaretskii
0 siblings, 1 reply; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-04 17:27 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Cc: 24603@debbugs.gnu.org
> Date: Tue, 04 Oct 2016 18:57:03 +0200
>
> > I think we should document all the changes.
>
> I wouldn’t know where to put such documentation.
On a separate file under admin/unidata/, if we cannot find a better
place.
> > Can you add the name of each character (just one, the leftmost one) to
> > its line and post the result? It's hard to read the report when it
> > only shows codepoints.
>
> --- orig-tables.txt 2016-10-04 18:40:27.276408290 +0200
> +++ modified-tables.txt 2016-10-04 18:41:34.651421547 +0200
Thanks, this all looks good to me: mostly additions for new
characters, most deletions are for reserved codepoints and the rest
are corrections for mistakes.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data
2016-10-04 17:27 ` Eli Zaretskii
@ 2016-10-04 17:44 ` Eli Zaretskii
2016-10-06 20:29 ` Michal Nazarewicz
0 siblings, 1 reply; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-04 17:44 UTC (permalink / raw)
To: mina86; +Cc: 24603
> Date: Tue, 04 Oct 2016 20:27:02 +0300
> From: Eli Zaretskii <eliz@gnu.org>
> Cc: 24603@debbugs.gnu.org
>
> > From: Michal Nazarewicz <mina86@mina86.com>
> > Cc: 24603@debbugs.gnu.org
> > Date: Tue, 04 Oct 2016 18:57:03 +0200
> >
> > > I think we should document all the changes.
> >
> > I wouldn’t know where to put such documentation.
>
> On a separate file under admin/unidata/, if we cannot find a better
> place.
Or maybe just mention in the commit log the URL of the message in the
bugtracker's records, where you posted the diffs, it might be good
enough.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data
2016-10-04 17:44 ` Eli Zaretskii
@ 2016-10-06 20:29 ` Michal Nazarewicz
2016-10-07 6:52 ` Eli Zaretskii
0 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-06 20:29 UTC (permalink / raw)
To: Eli Zaretskii; +Cc: 24603
[-- Attachment #1: Type: text/plain, Size: 812 bytes --]
On Tue, Oct 04 2016, Eli Zaretskii wrote:
>> Date: Tue, 04 Oct 2016 20:27:02 +0300
>> From: Eli Zaretskii <eliz@gnu.org>
>> Cc: 24603@debbugs.gnu.org
>>
>> > From: Michal Nazarewicz <mina86@mina86.com>
>> > Cc: 24603@debbugs.gnu.org
>> > Date: Tue, 04 Oct 2016 18:57:03 +0200
>> >
>> > > I think we should document all the changes.
>> >
>> > I wouldn’t know where to put such documentation.
>>
>> On a separate file under admin/unidata/, if we cannot find a better
>> place.
>
> Or maybe just mention in the commit log the URL of the message in the
> bugtracker's records, where you posted the diffs, it might be good
> enough.
That’s easy enough.
--
Best regards
ミハウ “𝓶𝓲𝓷𝓪86” ナザレヴイツ
«If at first you don’t succeed, give up skydiving»
[-- Attachment #2: 0002-Generate-upcase-and-downcase-tables-from-Unicode-dat.patch --]
[-- Type: text/x-diff, Size: 17671 bytes --]
From 9d2fd43c4d442543a650a4d3cb95b0c2aa6a0c4e Mon Sep 17 00:00:00 2001
From: Michal Nazarewicz <mina86@mina86.com>
Date: Mon, 19 Sep 2016 00:23:40 +0200
Subject: [PATCH 02/19] Generate upcase and downcase tables from Unicode data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Use Unicode data to generate case tables instead of mostly repeating
them in lisp code. Do that in a way which maps ‘Dz’ (and similar)
digraph to ‘dz’ when down- and ‘DZ’ when upcasing.
https://debbugs.gnu.org/cgi/bugreport.cgi?msg=89;bug=24603 lists all
changes to syntax table and case tables introduced by this commit.
* lisp/international/characters.el: Remove case-pairs defined with
explicit Lisp code and instead use Unicode character properties.
* test/src/casefiddle-tests.el (casefiddle-tests--characters,
casefiddle-tests-casing): Update test cases which are now working
as they should.
---
lisp/international/characters.el | 345 ++++++++-------------------------------
test/src/casefiddle-tests.el | 7 +-
2 files changed, 73 insertions(+), 279 deletions(-)
diff --git a/lisp/international/characters.el b/lisp/international/characters.el
index 1757d2b..8dd9c73 100644
--- a/lisp/international/characters.el
+++ b/lisp/international/characters.el
@@ -543,10 +543,6 @@ ?L
(set-case-syntax ?½ "_" tbl)
(set-case-syntax ?¾ "_" tbl)
(set-case-syntax ?¿ "." tbl)
- (let ((c 192))
- (while (<= c 222)
- (set-case-syntax-pair c (+ c 32) tbl)
- (setq c (1+ c))))
(set-case-syntax ?× "_" tbl)
(set-case-syntax ?ß "w" tbl)
(set-case-syntax ?÷ "_" tbl)
@@ -558,101 +554,8 @@ ?L
(modify-category-entry c ?l)
(setq c (1+ c)))
- (let ((pair-ranges '((#x0100 . #x012F)
- (#x0132 . #x0137)
- (#x0139 . #x0148)
- (#x014a . #x0177)
- (#x0179 . #x017E)
- (#x0182 . #x0185)
- (#x0187 . #x0188)
- (#x018B . #x018C)
- (#x0191 . #x0192)
- (#x0198 . #x0199)
- (#x01A0 . #x01A5)
- (#x01A7 . #x01A8)
- (#x01AC . #x01AD)
- (#x01AF . #x01B0)
- (#x01B3 . #x01B6)
- (#x01B8 . #x01B9)
- (#x01BC . #x01BD)
- (#x01CD . #x01DC)
- (#x01DE . #x01EF)
- (#x01F4 . #x01F5)
- (#x01F8 . #x021F)
- (#x0222 . #x0233)
- (#x023B . #x023C)
- (#x0241 . #x0242)
- (#x0246 . #x024F))))
- (dolist (elt pair-ranges)
- (let ((from (car elt)) (to (cdr elt)))
- (while (< from to)
- (set-case-syntax-pair from (1+ from) tbl)
- (setq from (+ from 2))))))
-
- (set-case-syntax-pair ?Ÿ ?ÿ tbl)
-
- ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
- ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
- ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
- ;; SMALL LETTER I.
-
- ;; We used to set up half of those correspondence unconditionally,
- ;; but that makes searches slow. So now we don't set up either half
- ;; of these correspondences by default.
-
- ;; (set-downcase-syntax ?İ ?i tbl)
- ;; (set-upcase-syntax ?I ?ı tbl)
-
- (set-case-syntax-pair ?Ɓ ?ɓ tbl)
- (set-case-syntax-pair ?Ɔ ?ɔ tbl)
- (set-case-syntax-pair ?Ɖ ?ɖ tbl)
- (set-case-syntax-pair ?Ɗ ?ɗ tbl)
- (set-case-syntax-pair ?Ǝ ?ǝ tbl)
- (set-case-syntax-pair ?Ə ?ə tbl)
- (set-case-syntax-pair ?Ɛ ?ɛ tbl)
- (set-case-syntax-pair ?Ɠ ?ɠ tbl)
- (set-case-syntax-pair ?Ɣ ?ɣ tbl)
- (set-case-syntax-pair ?Ɩ ?ɩ tbl)
- (set-case-syntax-pair ?Ɨ ?ɨ tbl)
- (set-case-syntax-pair ?Ɯ ?ɯ tbl)
- (set-case-syntax-pair ?Ɲ ?ɲ tbl)
- (set-case-syntax-pair ?Ɵ ?ɵ tbl)
- (set-case-syntax-pair ?Ʀ ?ʀ tbl)
- (set-case-syntax-pair ?Ʃ ?ʃ tbl)
- (set-case-syntax-pair ?Ʈ ?ʈ tbl)
- (set-case-syntax-pair ?Ʊ ?ʊ tbl)
- (set-case-syntax-pair ?Ʋ ?ʋ tbl)
- (set-case-syntax-pair ?Ʒ ?ʒ tbl)
- ;; We use set-downcase-syntax below, since we want upcase of dž
- ;; return DŽ, not Dž, and the same for the rest.
- (set-case-syntax-pair ?DŽ ?dž tbl)
- (set-downcase-syntax ?Dž ?dž tbl)
- (set-case-syntax-pair ?LJ ?lj tbl)
- (set-downcase-syntax ?Lj ?lj tbl)
- (set-case-syntax-pair ?NJ ?nj tbl)
- (set-downcase-syntax ?Nj ?nj tbl)
-
- ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
-
- (set-case-syntax-pair ?DZ ?dz tbl)
- (set-downcase-syntax ?Dz ?dz tbl)
- (set-case-syntax-pair ?Ƕ ?ƕ tbl)
- (set-case-syntax-pair ?Ƿ ?ƿ tbl)
- (set-case-syntax-pair ?Ⱥ ?ⱥ tbl)
- (set-case-syntax-pair ?Ƚ ?ƚ tbl)
- (set-case-syntax-pair ?Ⱦ ?ⱦ tbl)
- (set-case-syntax-pair ?Ƀ ?ƀ tbl)
- (set-case-syntax-pair ?Ʉ ?ʉ tbl)
- (set-case-syntax-pair ?Ʌ ?ʌ tbl)
-
;; Latin Extended Additional
(modify-category-entry '(#x1e00 . #x1ef9) ?l)
- (setq c #x1e00)
- (while (<= c #x1ef9)
- (and (zerop (% c 2))
- (or (<= c #x1e94) (>= c #x1ea0))
- (set-case-syntax-pair c (1+ c) tbl))
- (setq c (1+ c)))
;; Latin Extended-C
(setq c #x2C60)
@@ -660,57 +563,12 @@ ?L
(modify-category-entry c ?l)
(setq c (1+ c)))
- (let ((pair-ranges '((#x2C60 . #x2C61)
- (#x2C67 . #x2C6C)
- (#x2C72 . #x2C73)
- (#x2C75 . #x2C76))))
- (dolist (elt pair-ranges)
- (let ((from (car elt)) (to (cdr elt)))
- (while (< from to)
- (set-case-syntax-pair from (1+ from) tbl)
- (setq from (+ from 2))))))
-
- (set-case-syntax-pair ?Ɫ ?ɫ tbl)
- (set-case-syntax-pair ?Ᵽ ?ᵽ tbl)
- (set-case-syntax-pair ?Ɽ ?ɽ tbl)
- (set-case-syntax-pair ?Ɑ ?ɑ tbl)
- (set-case-syntax-pair ?Ɱ ?ɱ tbl)
- (set-case-syntax-pair ?Ɐ ?ɐ tbl)
- (set-case-syntax-pair ?Ɒ ?ɒ tbl)
- (set-case-syntax-pair ?Ȿ ?ȿ tbl)
- (set-case-syntax-pair ?Ɀ ?ɀ tbl)
-
;; Latin Extended-D
(setq c #xA720)
(while (<= c #xA7FF)
(modify-category-entry c ?l)
(setq c (1+ c)))
- (let ((pair-ranges '((#xA722 . #xA72F)
- (#xA732 . #xA76F)
- (#xA779 . #xA77C)
- (#xA77E . #xA787)
- (#xA78B . #xA78E)
- (#xA790 . #xA793)
- (#xA796 . #xA7A9)
- (#xA7B4 . #xA7B7))))
- (dolist (elt pair-ranges)
- (let ((from (car elt)) (to (cdr elt)))
- (while (< from to)
- (set-case-syntax-pair from (1+ from) tbl)
- (setq from (+ from 2))))))
-
- (set-case-syntax-pair ?Ᵹ ?ᵹ tbl)
- (set-case-syntax-pair ?Ɦ ?ɦ tbl)
- (set-case-syntax-pair ?Ɜ ?ɜ tbl)
- (set-case-syntax-pair ?Ɡ ?ɡ tbl)
- (set-case-syntax-pair ?Ɬ ?ɬ tbl)
- (set-case-syntax-pair ?Ɪ ?ɪ tbl)
- (set-case-syntax-pair ?Ʞ ?ʞ tbl)
- (set-case-syntax-pair ?Ʇ ?ʇ tbl)
- (set-case-syntax-pair ?Ʝ ?ʝ tbl)
- (set-case-syntax-pair ?Ꭓ ?ꭓ tbl)
-
;; Latin Extended-E
(setq c #xAB30)
(while (<= c #xAB64)
@@ -719,102 +577,19 @@ ?L
;; Greek
(modify-category-entry '(#x0370 . #x03ff) ?g)
- (setq c #x0370)
- (while (<= c #x03ff)
- (if (or (and (>= c #x0391) (<= c #x03a1))
- (and (>= c #x03a3) (<= c #x03ab)))
- (set-case-syntax-pair c (+ c 32) tbl))
- (and (>= c #x03da)
- (<= c #x03ee)
- (zerop (% c 2))
- (set-case-syntax-pair c (1+ c) tbl))
- (setq c (1+ c)))
- (set-case-syntax-pair ?Ά ?ά tbl)
- (set-case-syntax-pair ?Έ ?έ tbl)
- (set-case-syntax-pair ?Ή ?ή tbl)
- (set-case-syntax-pair ?Ί ?ί tbl)
- (set-case-syntax-pair ?Ό ?ό tbl)
- (set-case-syntax-pair ?Ύ ?ύ tbl)
- (set-case-syntax-pair ?Ώ ?ώ tbl)
;; Armenian
(setq c #x531)
- (while (<= c #x556)
- (set-case-syntax-pair c (+ c #x30) tbl)
- (setq c (1+ c)))
;; Greek Extended
(modify-category-entry '(#x1f00 . #x1fff) ?g)
- (setq c #x1f00)
- (while (<= c #x1fff)
- (and (<= (logand c #x000f) 7)
- (<= c #x1fa7)
- (not (memq c '(#x1f16 #x1f17 #x1f56 #x1f57
- #x1f50 #x1f52 #x1f54 #x1f56)))
- (/= (logand c #x00f0) #x70)
- (set-case-syntax-pair (+ c 8) c tbl))
- (setq c (1+ c)))
- (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
- (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
- (set-case-syntax-pair ?Ὰ ?ὰ tbl)
- (set-case-syntax-pair ?Ά ?ά tbl)
- (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
- (set-case-syntax-pair ?Ὲ ?ὲ tbl)
- (set-case-syntax-pair ?Έ ?έ tbl)
- (set-case-syntax-pair ?Ὴ ?ὴ tbl)
- (set-case-syntax-pair ?Ή ?ή tbl)
- (set-case-syntax-pair ?ῌ ?ῃ tbl)
- (set-case-syntax-pair ?Ῐ ?ῐ tbl)
- (set-case-syntax-pair ?Ῑ ?ῑ tbl)
- (set-case-syntax-pair ?Ὶ ?ὶ tbl)
- (set-case-syntax-pair ?Ί ?ί tbl)
- (set-case-syntax-pair ?Ῠ ?ῠ tbl)
- (set-case-syntax-pair ?Ῡ ?ῡ tbl)
- (set-case-syntax-pair ?Ὺ ?ὺ tbl)
- (set-case-syntax-pair ?Ύ ?ύ tbl)
- (set-case-syntax-pair ?Ῥ ?ῥ tbl)
- (set-case-syntax-pair ?Ὸ ?ὸ tbl)
- (set-case-syntax-pair ?Ό ?ό tbl)
- (set-case-syntax-pair ?Ὼ ?ὼ tbl)
- (set-case-syntax-pair ?Ώ ?ώ tbl)
- (set-case-syntax-pair ?ῼ ?ῳ tbl)
;; cyrillic
(modify-category-entry '(#x0400 . #x04FF) ?y)
- (setq c #x0400)
- (while (<= c #x04ff)
- (and (>= c #x0400)
- (<= c #x040f)
- (set-case-syntax-pair c (+ c 80) tbl))
- (and (>= c #x0410)
- (<= c #x042f)
- (set-case-syntax-pair c (+ c 32) tbl))
- (and (zerop (% c 2))
- (or (and (>= c #x0460) (<= c #x0480))
- (and (>= c #x048c) (<= c #x04be))
- (and (>= c #x04d0) (<= c #x052e)))
- (set-case-syntax-pair c (1+ c) tbl))
- (setq c (1+ c)))
- (set-case-syntax-pair ?Ӂ ?ӂ tbl)
- (set-case-syntax-pair ?Ӄ ?ӄ tbl)
- (set-case-syntax-pair ?Ӈ ?ӈ tbl)
- (set-case-syntax-pair ?Ӌ ?ӌ tbl)
-
(modify-category-entry '(#xA640 . #xA69F) ?y)
- (setq c #xA640)
- (while (<= c #xA66C)
- (set-case-syntax-pair c (+ c 1) tbl)
- (setq c (+ c 2)))
- (setq c #xA680)
- (while (<= c #xA69A)
- (set-case-syntax-pair c (+ c 1) tbl)
- (setq c (+ c 2)))
;; Georgian
(setq c #x10A0)
- (while (<= c #x10CD)
- (set-case-syntax-pair c (+ c #x1C60) tbl)
- (setq c (1+ c)))
;; Cyrillic Extended-C
(modify-category-entry '(#x1C80 . #x1C8F) ?y)
@@ -844,12 +619,6 @@ ?L
(set-case-syntax c "." tbl)
(setq c (1+ c)))
- ;; Roman numerals
- (setq c #x2160)
- (while (<= c #x216f)
- (set-case-syntax-pair c (+ c #x10) tbl)
- (setq c (1+ c)))
-
;; Fixme: The following blocks might be better as symbol rather than
;; punctuation.
;; Arrows
@@ -873,25 +642,11 @@ ?L
;; Circled Latin
(setq c #x24b6)
(while (<= c #x24cf)
- (set-case-syntax-pair c (+ c 26) tbl)
(modify-category-entry c ?l)
(modify-category-entry (+ c 26) ?l)
(setq c (1+ c)))
- ;; Glagolitic
- (setq c #x2C00)
- (while (<= c #x2C2E)
- (set-case-syntax-pair c (+ c 48) tbl)
- (setq c (1+ c)))
-
;; Coptic
- (let ((pair-ranges '((#x2C80 . #x2CE2)
- (#x2CEB . #x2CF2))))
- (dolist (elt pair-ranges)
- (let ((from (car elt)) (to (cdr elt)))
- (while (< from to)
- (set-case-syntax-pair from (1+ from) tbl)
- (setq from (+ from 2))))))
;; There's no Coptic category. However, Coptic letters that are
;; part of the Greek block above get the Greek category, and those
;; in this block are derived from Greek letters, so let's be
@@ -901,45 +656,85 @@ ?L
;; Fullwidth Latin
(setq c #xff21)
(while (<= c #xff3a)
- (set-case-syntax-pair c (+ c #x20) tbl)
(modify-category-entry c ?l)
(modify-category-entry (+ c #x20) ?l)
(setq c (1+ c)))
- ;; Deseret
- (setq c #x10400)
- (while (<= c #x10427)
- (set-case-syntax-pair c (+ c 28) tbl)
- (setq c (1+ c)))
+ ;; Combining diacritics
+ (modify-category-entry '(#x300 . #x362) ?^)
+ ;; Combining marks
+ (modify-category-entry '(#x20d0 . #x20ff) ?^)
- ;; Osage
- (setq c #x104B0)
- (while (<= c #x104D3)
- (set-case-syntax-pair c (+ c 40) tbl)
- (setq c (1+ c)))
+ ;; Set all Letter, uppercase; Letter, lowercase and Letter, titlecase syntax
+ ;; to word.
+ (let ((syn-tab (standard-syntax-table)))
+ (map-char-table
+ (lambda (ch cat)
+ (when (memq cat '(Lu Ll Lt))
+ (modify-syntax-entry ch "w " syn-tab)))
+ (unicode-property-table-internal 'general-category))
- ;; Old Hungarian
- (setq c #x10c80)
- (while (<= c #x10cb2)
- (set-case-syntax-pair c (+ c #x40) tbl)
- (setq c (1+ c)))
+ ;; Ⅰ through Ⅻ had word syntax in the past so set it here as well.
+ ;; General category of those characters is Number, Letter.
+ (modify-syntax-entry '(#x2160 . #x216b) "w " syn-tab)
- ;; Warang Citi
- (setq c #x118a0)
- (while (<= c #x118bf)
- (set-case-syntax-pair c (+ c #x20) tbl)
- (setq c (1+ c)))
+ ;; ⓐ through ⓩ are symbols, other according to Unicode but Emacs set
+ ;; their syntax to word in the past so keep backwards compatibility.
+ (modify-syntax-entry '(#x24D0 . #x24E9) "w " syn-tab))
- ;; Adlam
- (setq c #x1e900)
- (while (<= c #x1e921)
- (set-case-syntax-pair c (+ c #x22) tbl)
- (setq c (1+ c)))
+ ;; Set downcase and upcase from Unicode properties
- ;; Combining diacritics
- (modify-category-entry '(#x300 . #x362) ?^)
- ;; Combining marks
- (modify-category-entry '(#x20d0 . #x20ff) ?^)
+ ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I and
+ ;; U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so do U+0130
+ ;; LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN SMALL LETTER I.
+
+ ;; We used to set up half of those correspondences unconditionally, but that
+ ;; makes searches slow. So now we don't set up either half of these
+ ;; correspondences by default.
+
+ ;; (set-downcase-syntax ?İ ?i tbl)
+ ;; (set-upcase-syntax ?I ?ı tbl)
+
+ (let ((map-unicode-property
+ (lambda (property func)
+ (map-char-table
+ (lambda (ch cased)
+ ;; ASCII characters skipped due to reasons outlined above. As of
+ ;; Unicode 9.0, this exception affects the following:
+ ;; lc(U+0130 İ) = i
+ ;; uc(U+0131 ı) = I
+ ;; uc(U+017F ſ) = S
+ ;; lc(U+212A K) = k
+ (when (> cased 127)
+ (let ((end (if (consp ch) (cdr ch) ch)))
+ (setq ch (max 128 (if (consp ch) (car ch) ch)))
+ (while (<= ch end)
+ (funcall func ch cased)
+ (setq ch (1+ ch))))))
+ (unicode-property-table-internal property))))
+ (down tbl)
+ (up (case-table-get-table tbl 'up)))
+
+ ;; This works on an assumption that if toUpper(x) != x then toLower(x) ==
+ ;; x (and the opposite for toLower/toUpper). This doesn’t hold for title
+ ;; case characters but those incorrect mappings will be overwritten later.
+ (funcall map-unicode-property 'uppercase
+ (lambda (lc uc) (aset down lc lc) (aset up uc uc)))
+ (funcall map-unicode-property 'lowercase
+ (lambda (uc lc) (aset down lc lc) (aset up uc uc)))
+
+ ;; Now deal with the actual mapping. This will correctly assign casing for
+ ;; title-case characters.
+ (funcall map-unicode-property 'uppercase
+ (lambda (lc uc) (aset up lc uc) (aset up uc uc)))
+ (funcall map-unicode-property 'lowercase
+ (lambda (uc lc) (aset down uc lc) (aset down lc lc))))
+
+ ;; Clear out the extra slots so that they will be recomputed from the main
+ ;; (downcase) table and upcase table. Since we’re side-stepping the usual
+ ;; set-case-syntax-* functions, we need to do it explicitly.
+ (set-char-table-extra-slot tbl 1 nil)
+ (set-char-table-extra-slot tbl 2 nil)
;; Fixme: syntax for symbols &c
)
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index 4b2eeaf..ca3657d 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -72,8 +72,7 @@ casefiddle-tests--characters
(?Σ ?Σ ?σ ?Σ)
(?σ ?Σ ?σ ?Σ)
- ;; FIXME: Another broken one:
- ;;(?ς ?Σ ?ς ?Σ)
+ (?ς ?Σ ?ς ?Σ)
(?Ⅷ ?Ⅷ ?ⅷ ?Ⅷ)
(?ⅷ ?Ⅷ ?ⅷ ?Ⅷ)))
@@ -151,7 +150,6 @@ casefiddle-tests--characters
;;("fish" "FIsh" "fish" "Fish" "Fish")
;;("Straße" "STRASSE" "straße" "Straße" "Straße")
;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
- ;;("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
;; And here’s what is actually happening:
("DŽUNGLA" "DŽUNGLA" "džungla" "DŽungla" "DŽUNGLA")
("Džungla" "DžUNGLA" "džungla" "Džungla" "Džungla")
@@ -160,7 +158,8 @@ casefiddle-tests--characters
("fish" "fiSH" "fish" "fish" "fish")
("Straße" "STRAßE" "straße" "Straße" "Straße")
("ΌΣΟΣ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "ΌΣΟΣ")
- ("όσος" "ΌΣΟς" "όσος" "Όσος" "Όσος"))
+
+ ("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος"))
(nreverse errors))
(let* ((input (car test))
(expected (cdr test))
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data
2016-10-06 20:29 ` Michal Nazarewicz
@ 2016-10-07 6:52 ` Eli Zaretskii
0 siblings, 0 replies; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-07 6:52 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Cc: 24603@debbugs.gnu.org
> Date: Thu, 06 Oct 2016 22:29:06 +0200
>
> >> > > I think we should document all the changes.
> >> >
> >> > I wouldn’t know where to put such documentation.
> >>
> >> On a separate file under admin/unidata/, if we cannot find a better
> >> place.
> >
> > Or maybe just mention in the commit log the URL of the message in the
> > bugtracker's records, where you posted the diffs, it might be good
> > enough.
>
> That’s easy enough.
Thanks, this is what I had in mind, indeed.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 03/18] Don’t assume character can be either upper- or lower-case when casing
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 04/18] Split casify_object into multiple functions Michal Nazarewicz
` (14 subsequent siblings)
16 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
Compatibility digraph characters, such as Dž, are neither upper- nor
lower-case. At the moment however, those are reported as upper-case¹
despite the fact that they change when upper-cased.
Stop checking if a character is upper-case before trying to up-case it
so that title-case characters are handled correctly.
¹ Because they change when converted to lower-case. Notice an asymmetry
in that for a character to be considered lower-case it must not be
upper-case (plus the usual condition of changing when upper-cased).
* src/buffer.h (upcase1): Delete.
(upcase): Change to upcase character unconditionally just like downcase
does it. This is what upcase1 was.
* src/casefiddle.c (casify_object, casify_region): Use upcase instead
of upcase1 and don’t check !uppercasep(x) before calling upcase.
* src/keyboard.c (read_key_sequence): Don’t check if uppercasep(x), just
downcase(x) and see if it changed.
* test/src/casefiddle-tests.el (casefiddle-tests--characters,
casefiddle-tests-casing): Update test cases which are now passing.
---
etc/NEWS | 8 +++++++-
src/buffer.h | 18 +++++++++---------
src/casefiddle.c | 20 +++++++-------------
src/keyboard.c | 25 +++++++++++++++----------
test/src/casefiddle-tests.el | 8 ++++----
5 files changed, 42 insertions(+), 37 deletions(-)
diff --git a/etc/NEWS b/etc/NEWS
index bd94c94..61afcc6 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -235,6 +235,12 @@ same as in modes where the character is not whitespace.
Instead of only checking the modification time, Emacs now also checks
the file's actual content before prompting the user.
+** Title case characters are properly converted to upper case.
+'upcase', 'upcase-region' et al. convert title case characters (such
+as Dz) into their upper case form (such as DZ). As a downside,
+'capitalize' and 'upcase-initials' produce awkward words where first
+two letters are upper case, e.g. DŽungla (instead of Džungla).
+
\f
* Changes in Specialized Modes and Packages in Emacs 26.1
@@ -662,7 +668,7 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
\f
Local variables:
-coding: us-ascii
+coding: utf-8
mode: outline
paragraph-separate: "[ \f]*$"
end:
diff --git a/src/buffer.h b/src/buffer.h
index 6ac161c..1543f67 100644
--- a/src/buffer.h
+++ b/src/buffer.h
@@ -1349,28 +1349,28 @@ downcase (int c)
return NATNUMP (down) ? XFASTINT (down) : c;
}
-/* True if C is upper case. */
-INLINE bool uppercasep (int c) { return downcase (c) != c; }
-
-/* Upcase a character C known to be not upper case. */
+/* Upcase a character C, or make no change if that cannot be done. */
INLINE int
-upcase1 (int c)
+upcase (int c)
{
Lisp_Object upcase_table = BVAR (current_buffer, upcase_table);
Lisp_Object up = CHAR_TABLE_REF (upcase_table, c);
return NATNUMP (up) ? XFASTINT (up) : c;
}
+/* True if C is upper case. */
+INLINE bool uppercasep (int c)
+{
+ return downcase (c) != c;
+}
+
/* True if C is lower case. */
INLINE bool
lowercasep (int c)
{
- return !uppercasep (c) && upcase1 (c) != c;
+ return !uppercasep (c) && upcase (c) != c;
}
-/* Upcase a character C, or make no change if that cannot be done. */
-INLINE int upcase (int c) { return uppercasep (c) ? c : upcase1 (c); }
-
INLINE_HEADER_END
#endif /* EMACS_BUFFER_H */
diff --git a/src/casefiddle.c b/src/casefiddle.c
index 2d32f49..b86f485 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -64,13 +64,9 @@ casify_object (enum case_action flag, Lisp_Object obj)
multibyte = 1;
if (! multibyte)
MAKE_CHAR_MULTIBYTE (c1);
- c = downcase (c1);
- if (inword)
- XSETFASTINT (obj, c | flags);
- else if (c == (XFASTINT (obj) & ~flagbits))
+ c = flag == CASE_DOWN ? downcase (c1) : upcase (c1);
+ if (c != c1)
{
- if (! inword)
- c = upcase1 (c1);
if (! multibyte)
MAKE_CHAR_UNIBYTE (c);
XSETFASTINT (obj, c | flags);
@@ -95,7 +91,7 @@ casify_object (enum case_action flag, Lisp_Object obj)
c = downcase (c);
else if (!uppercasep (c)
&& (!inword || flag != CASE_CAPITALIZE_UP))
- c = upcase1 (c1);
+ c = upcase (c1);
if ((int) flag >= (int) CASE_CAPITALIZE)
inword = (SYNTAX (c) == Sword);
if (c != c1)
@@ -127,9 +123,8 @@ casify_object (enum case_action flag, Lisp_Object obj)
c = STRING_CHAR_AND_LENGTH (SDATA (obj) + i_byte, len);
if (inword && flag != CASE_CAPITALIZE_UP)
c = downcase (c);
- else if (!uppercasep (c)
- && (!inword || flag != CASE_CAPITALIZE_UP))
- c = upcase1 (c);
+ else if (!inword || flag != CASE_CAPITALIZE_UP)
+ c = upcase (c);
if ((int) flag >= (int) CASE_CAPITALIZE)
inword = (SYNTAX (c) == Sword);
o += CHAR_STRING (c, o);
@@ -236,9 +231,8 @@ casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
c2 = c;
if (inword && flag != CASE_CAPITALIZE_UP)
c = downcase (c);
- else if (!uppercasep (c)
- && (!inword || flag != CASE_CAPITALIZE_UP))
- c = upcase1 (c);
+ else if (!inword || flag != CASE_CAPITALIZE_UP)
+ c = upcase (c);
if ((int) flag >= (int) CASE_CAPITALIZE)
inword = ((SYNTAX (c) == Sword)
&& (inword || !syntax_prefix_flag_p (c)));
diff --git a/src/keyboard.c b/src/keyboard.c
index ca40c6e..2115fc9 100644
--- a/src/keyboard.c
+++ b/src/keyboard.c
@@ -9633,22 +9633,26 @@ read_key_sequence (Lisp_Object *keybuf, int bufsize, Lisp_Object prompt,
use the corresponding lower-case letter instead. */
if (NILP (current_binding)
&& /* indec.start >= t && fkey.start >= t && */ keytran.start >= t
- && INTEGERP (key)
- && ((CHARACTERP (make_number (XINT (key) & ~CHAR_MODIFIER_MASK))
- && uppercasep (XINT (key) & ~CHAR_MODIFIER_MASK))
- || (XINT (key) & shift_modifier)))
+ && INTEGERP (key))
{
Lisp_Object new_key;
+ int k = XINT (key);
+
+ if (k & shift_modifier)
+ XSETINT (new_key, k & ~shift_modifier);
+ else if (CHARACTERP (make_number (k & ~CHAR_MODIFIER_MASK)))
+ {
+ int dc = downcase(k & ~CHAR_MODIFIER_MASK);
+ if (dc == (k & ~CHAR_MODIFIER_MASK))
+ goto not_upcase;
+ XSETINT (new_key, dc | (k & CHAR_MODIFIER_MASK));
+ }
+ else
+ goto not_upcase;
original_uppercase = key;
original_uppercase_position = t - 1;
- if (XINT (key) & shift_modifier)
- XSETINT (new_key, XINT (key) & ~shift_modifier);
- else
- XSETINT (new_key, (downcase (XINT (key) & ~CHAR_MODIFIER_MASK)
- | (XINT (key) & CHAR_MODIFIER_MASK)));
-
/* We have to do this unconditionally, regardless of whether
the lower-case char is defined in the keymaps, because they
might get translated through function-key-map. */
@@ -9659,6 +9663,7 @@ read_key_sequence (Lisp_Object *keybuf, int bufsize, Lisp_Object prompt,
goto replay_sequence;
}
+ not_upcase:
if (NILP (current_binding)
&& help_char_p (EVENT_HEAD (key)) && t > 1)
{
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index ca3657d..8d9bf01 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -62,13 +62,13 @@ casefiddle-tests--characters
(?Ł ?Ł ?ł ?Ł)
(?ł ?Ł ?ł ?Ł)
- ;; FIXME: We should have:
+ ;; FIXME: Commented one is what we want.
;;(?DŽ ?DŽ ?dž ?Dž)
- ;; but instead we have:
(?DŽ ?DŽ ?dž ?DŽ)
- ;; FIXME: Those two are broken at the moment:
;;(?Dž ?DŽ ?dž ?Dž)
+ (?Dž ?DŽ ?dž ?DŽ)
;;(?dž ?DŽ ?dž ?Dž)
+ (?dž ?DŽ ?dž ?DŽ)
(?Σ ?Σ ?σ ?Σ)
(?σ ?Σ ?σ ?Σ)
@@ -152,7 +152,7 @@ casefiddle-tests--characters
;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
;; And here’s what is actually happening:
("DŽUNGLA" "DŽUNGLA" "džungla" "DŽungla" "DŽUNGLA")
- ("Džungla" "DžUNGLA" "džungla" "Džungla" "Džungla")
+ ("Džungla" "DŽUNGLA" "džungla" "DŽungla" "DŽungla")
("džungla" "DŽUNGLA" "džungla" "DŽungla" "DŽungla")
("define" "DEfiNE" "define" "Define" "Define")
("fish" "fiSH" "fish" "fish" "fish")
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 04/18] Split casify_object into multiple functions
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 03/18] Don’t assume character can be either upper- or lower-case when casing Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 05/18] Introduce case_character function Michal Nazarewicz
` (13 subsequent siblings)
16 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
casify_object had three major cases to cover and those were mostly
independent of each other. Move those branches to separate functions
so it’s easier to comprehend each individual case.
While at it, use somewhat more descriptive ch and cased variable names
rather than c and c1.
* src/casefiddle.c (casify_object): Split into…
(do_casify_integer, do_casify_multibyte_string,
do_casify_unibyte_string): …new functions.
---
src/casefiddle.c | 196 +++++++++++++++++++++++++++++--------------------------
1 file changed, 104 insertions(+), 92 deletions(-)
diff --git a/src/casefiddle.c b/src/casefiddle.c
index b86f485..47ebdf0 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -32,108 +32,120 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP};
\f
static Lisp_Object
-casify_object (enum case_action flag, Lisp_Object obj)
+do_casify_integer (enum case_action flag, Lisp_Object obj)
+{
+ int flagbits = (CHAR_ALT | CHAR_SUPER | CHAR_HYPER
+ | CHAR_SHIFT | CHAR_CTL | CHAR_META);
+ int flags, ch = XFASTINT (obj), cased;
+ bool multibyte;
+
+ /* If the character has higher bits set above the flags, return it unchanged.
+ It is not a real character. */
+ if (UNSIGNED_CMP (ch, >, flagbits))
+ return obj;
+
+ flags = ch & flagbits;
+ ch = ch & ~flagbits;
+
+ /* FIXME: Even if enable-multibyte-characters is nil, we may manipulate
+ multibyte chars. This means we have a bug for latin-1 chars since when we
+ receive an int 128-255 we can't tell whether it's an eight-bit byte or
+ a latin-1 char. */
+ multibyte = (ch >= 256 ||
+ !NILP (BVAR (current_buffer, enable_multibyte_characters)));
+ if (! multibyte)
+ MAKE_CHAR_MULTIBYTE (ch);
+ cased = flag == CASE_DOWN ? downcase (ch) : upcase (ch);
+ if (cased == ch)
+ return obj;
+
+ if (! multibyte)
+ MAKE_CHAR_UNIBYTE (cased);
+ XSETFASTINT (obj, cased | flags);
+ return obj;
+}
+
+static Lisp_Object
+do_casify_multibyte_string (enum case_action flag, Lisp_Object obj)
+{
+ ptrdiff_t i, i_byte, size = SCHARS (obj);
+ bool inword = flag == CASE_DOWN;
+ int len, ch, cased;
+ USE_SAFE_ALLOCA;
+ ptrdiff_t o_size;
+ if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &o_size))
+ o_size = PTRDIFF_MAX;
+ unsigned char *dst = SAFE_ALLOCA (o_size);
+ unsigned char *o = dst;
+
+ for (i = i_byte = 0; i < size; i++, i_byte += len)
+ {
+ if (o_size - MAX_MULTIBYTE_LENGTH < o - dst)
+ string_overflow ();
+ ch = STRING_CHAR_AND_LENGTH (SDATA (obj) + i_byte, len);
+ if (inword && flag != CASE_CAPITALIZE_UP)
+ cased = downcase (ch);
+ else if (!inword || flag != CASE_CAPITALIZE_UP)
+ cased = upcase (ch);
+ else
+ cased = ch;
+ if ((int) flag >= (int) CASE_CAPITALIZE)
+ inword = (SYNTAX (ch) == Sword);
+ o += CHAR_STRING (cased, o);
+ }
+ eassert (o - dst <= o_size);
+ obj = make_multibyte_string ((char *) dst, size, o - dst);
+ SAFE_FREE ();
+ return obj;
+}
+
+static Lisp_Object
+do_casify_unibyte_string (enum case_action flag, Lisp_Object obj)
{
- int c, c1;
+ ptrdiff_t i, size = SCHARS (obj);
bool inword = flag == CASE_DOWN;
+ int ch, cased;
+
+ obj = Fcopy_sequence (obj);
+ for (i = 0; i < size; i++)
+ {
+ ch = SREF (obj, i);
+ MAKE_CHAR_MULTIBYTE (ch);
+ cased = ch;
+ if (inword && flag != CASE_CAPITALIZE_UP)
+ ch = downcase (ch);
+ else if (!uppercasep (ch)
+ && (!inword || flag != CASE_CAPITALIZE_UP))
+ ch = upcase (cased);
+ if ((int) flag >= (int) CASE_CAPITALIZE)
+ inword = (SYNTAX (ch) == Sword);
+ if (ch == cased)
+ continue;
+ MAKE_CHAR_UNIBYTE (ch);
+ /* If the char can't be converted to a valid byte, just don't change it */
+ if (ch >= 0 && ch < 256)
+ SSET (obj, i, ch);
+ }
+ return obj;
+}
+static Lisp_Object
+casify_object (enum case_action flag, Lisp_Object obj)
+{
/* If the case table is flagged as modified, rescan it. */
if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
Fset_case_table (BVAR (current_buffer, downcase_table));
if (INTEGERP (obj))
- {
- int flagbits = (CHAR_ALT | CHAR_SUPER | CHAR_HYPER
- | CHAR_SHIFT | CHAR_CTL | CHAR_META);
- int flags = XINT (obj) & flagbits;
- bool multibyte = ! NILP (BVAR (current_buffer,
- enable_multibyte_characters));
-
- /* If the character has higher bits set
- above the flags, return it unchanged.
- It is not a real character. */
- if (UNSIGNED_CMP (XFASTINT (obj), >, flagbits))
- return obj;
-
- c1 = XFASTINT (obj) & ~flagbits;
- /* FIXME: Even if enable-multibyte-characters is nil, we may
- manipulate multibyte chars. This means we have a bug for latin-1
- chars since when we receive an int 128-255 we can't tell whether
- it's an eight-bit byte or a latin-1 char. */
- if (c1 >= 256)
- multibyte = 1;
- if (! multibyte)
- MAKE_CHAR_MULTIBYTE (c1);
- c = flag == CASE_DOWN ? downcase (c1) : upcase (c1);
- if (c != c1)
- {
- if (! multibyte)
- MAKE_CHAR_UNIBYTE (c);
- XSETFASTINT (obj, c | flags);
- }
- return obj;
- }
-
- if (!STRINGP (obj))
+ return do_casify_integer (flag, obj);
+ else if (!STRINGP (obj))
wrong_type_argument (Qchar_or_string_p, obj);
- else if (!STRING_MULTIBYTE (obj))
- {
- ptrdiff_t i;
- ptrdiff_t size = SCHARS (obj);
-
- obj = Fcopy_sequence (obj);
- for (i = 0; i < size; i++)
- {
- c = SREF (obj, i);
- MAKE_CHAR_MULTIBYTE (c);
- c1 = c;
- if (inword && flag != CASE_CAPITALIZE_UP)
- c = downcase (c);
- else if (!uppercasep (c)
- && (!inword || flag != CASE_CAPITALIZE_UP))
- c = upcase (c1);
- if ((int) flag >= (int) CASE_CAPITALIZE)
- inword = (SYNTAX (c) == Sword);
- if (c != c1)
- {
- MAKE_CHAR_UNIBYTE (c);
- /* If the char can't be converted to a valid byte, just don't
- change it. */
- if (c >= 0 && c < 256)
- SSET (obj, i, c);
- }
- }
- return obj;
- }
+ else if (!SCHARS (obj))
+ return obj;
+ else if (STRING_MULTIBYTE (obj))
+ return do_casify_multibyte_string (flag, obj);
else
- {
- ptrdiff_t i, i_byte, size = SCHARS (obj);
- int len;
- USE_SAFE_ALLOCA;
- ptrdiff_t o_size;
- if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &o_size))
- o_size = PTRDIFF_MAX;
- unsigned char *dst = SAFE_ALLOCA (o_size);
- unsigned char *o = dst;
-
- for (i = i_byte = 0; i < size; i++, i_byte += len)
- {
- if (o_size - MAX_MULTIBYTE_LENGTH < o - dst)
- string_overflow ();
- c = STRING_CHAR_AND_LENGTH (SDATA (obj) + i_byte, len);
- if (inword && flag != CASE_CAPITALIZE_UP)
- c = downcase (c);
- else if (!inword || flag != CASE_CAPITALIZE_UP)
- c = upcase (c);
- if ((int) flag >= (int) CASE_CAPITALIZE)
- inword = (SYNTAX (c) == Sword);
- o += CHAR_STRING (c, o);
- }
- eassert (o - dst <= o_size);
- obj = make_multibyte_string ((char *) dst, size, o - dst);
- SAFE_FREE ();
- return obj;
- }
+ return do_casify_unibyte_string (flag, obj);
}
DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 05/18] Introduce case_character function
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (2 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 04/18] Split casify_object into multiple functions Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 06/18] Add support for title-casing letters Michal Nazarewicz
` (12 subsequent siblings)
16 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
Move single-character casing logic into a separate function so that
it is collected in a single place. This will make future changes to
the logic easier.
* src/casefiddle.c (struct casing_context, prepare_casing_context): New
structure for saving casing context and function to initialise it.
(case_character): New function which cases a character based on the
provided context.
(do_casify_integer, do_casify_multibyte_string,
do_casify_unibyte_string, casify_object, casify_region): Convert to
use casing_context and case_character.
---
src/casefiddle.c | 135 +++++++++++++++++++++++++++++++------------------------
1 file changed, 77 insertions(+), 58 deletions(-)
diff --git a/src/casefiddle.c b/src/casefiddle.c
index 47ebdf0..2fbd23b 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -30,9 +30,56 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
#include "keymap.h"
enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP};
+
+/* State for casing individual characters. */
+struct casing_context {
+ /* User-requested action. */
+ enum case_action flag;
+ /* If true, function operates on a buffer as opposed to a string or character.
+ When run on a buffer, syntax_prefix_flag_p is taken into account when
+ determining the inword flag. */
+ bool inbuffer;
+ /* Conceptually, this denotes whether we are inside of a word except
+ that if flag is CASE_UP it’s always false and if flag is CASE_DOWN
+ this is always true. */
+ bool inword;
+};
+
+/* Initialise CTX structure and prepare related global data for casing
+ characters. */
+static void
+prepare_casing_context (struct casing_context *ctx,
+ enum case_action flag, bool inbuffer)
+{
+ ctx->flag = flag;
+ ctx->inbuffer = inbuffer;
+ ctx->inword = flag == CASE_DOWN;
+
+ /* If the case table is flagged as modified, rescan it. */
+ if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
+ Fset_case_table (BVAR (current_buffer, downcase_table));
+
+ if (inbuffer && (int) flag >= (int) CASE_CAPITALIZE)
+ SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */
+}
+
+/* Based on CTX, case character CH accordingly. Update CTX as necessary.
+ Return cased character. */
+static int
+case_character (struct casing_context *ctx, int ch)
+{
+ if (ctx->inword)
+ ch = ctx->flag == CASE_CAPITALIZE_UP ? ch : downcase (ch);
+ else
+ ch = upcase(ch);
+ if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
+ ctx->inword = SYNTAX (ch) == Sword &&
+ (!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch));
+ return ch;
+}
\f
static Lisp_Object
-do_casify_integer (enum case_action flag, Lisp_Object obj)
+do_casify_integer (struct casing_context *ctx, Lisp_Object obj)
{
int flagbits = (CHAR_ALT | CHAR_SUPER | CHAR_HYPER
| CHAR_SHIFT | CHAR_CTL | CHAR_META);
@@ -55,7 +102,7 @@ do_casify_integer (enum case_action flag, Lisp_Object obj)
!NILP (BVAR (current_buffer, enable_multibyte_characters)));
if (! multibyte)
MAKE_CHAR_MULTIBYTE (ch);
- cased = flag == CASE_DOWN ? downcase (ch) : upcase (ch);
+ cased = case_character (ctx, ch);
if (cased == ch)
return obj;
@@ -66,10 +113,9 @@ do_casify_integer (enum case_action flag, Lisp_Object obj)
}
static Lisp_Object
-do_casify_multibyte_string (enum case_action flag, Lisp_Object obj)
+do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
{
ptrdiff_t i, i_byte, size = SCHARS (obj);
- bool inword = flag == CASE_DOWN;
int len, ch, cased;
USE_SAFE_ALLOCA;
ptrdiff_t o_size;
@@ -83,14 +129,7 @@ do_casify_multibyte_string (enum case_action flag, Lisp_Object obj)
if (o_size - MAX_MULTIBYTE_LENGTH < o - dst)
string_overflow ();
ch = STRING_CHAR_AND_LENGTH (SDATA (obj) + i_byte, len);
- if (inword && flag != CASE_CAPITALIZE_UP)
- cased = downcase (ch);
- else if (!inword || flag != CASE_CAPITALIZE_UP)
- cased = upcase (ch);
- else
- cased = ch;
- if ((int) flag >= (int) CASE_CAPITALIZE)
- inword = (SYNTAX (ch) == Sword);
+ cased = case_character (ctx, ch);
o += CHAR_STRING (cased, o);
}
eassert (o - dst <= o_size);
@@ -100,10 +139,9 @@ do_casify_multibyte_string (enum case_action flag, Lisp_Object obj)
}
static Lisp_Object
-do_casify_unibyte_string (enum case_action flag, Lisp_Object obj)
+do_casify_unibyte_string (struct casing_context *ctx, Lisp_Object obj)
{
ptrdiff_t i, size = SCHARS (obj);
- bool inword = flag == CASE_DOWN;
int ch, cased;
obj = Fcopy_sequence (obj);
@@ -111,20 +149,13 @@ do_casify_unibyte_string (enum case_action flag, Lisp_Object obj)
{
ch = SREF (obj, i);
MAKE_CHAR_MULTIBYTE (ch);
- cased = ch;
- if (inword && flag != CASE_CAPITALIZE_UP)
- ch = downcase (ch);
- else if (!uppercasep (ch)
- && (!inword || flag != CASE_CAPITALIZE_UP))
- ch = upcase (cased);
- if ((int) flag >= (int) CASE_CAPITALIZE)
- inword = (SYNTAX (ch) == Sword);
+ cased = case_character (ctx, ch);
if (ch == cased)
continue;
- MAKE_CHAR_UNIBYTE (ch);
+ MAKE_CHAR_UNIBYTE (cased);
/* If the char can't be converted to a valid byte, just don't change it */
- if (ch >= 0 && ch < 256)
- SSET (obj, i, ch);
+ if (cased >= 0 && cased < 256)
+ SSET (obj, i, cased);
}
return obj;
}
@@ -132,20 +163,19 @@ do_casify_unibyte_string (enum case_action flag, Lisp_Object obj)
static Lisp_Object
casify_object (enum case_action flag, Lisp_Object obj)
{
- /* If the case table is flagged as modified, rescan it. */
- if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
- Fset_case_table (BVAR (current_buffer, downcase_table));
+ struct casing_context ctx;
+ prepare_casing_context (&ctx, flag, false);
if (INTEGERP (obj))
- return do_casify_integer (flag, obj);
+ return do_casify_integer (&ctx, obj);
else if (!STRINGP (obj))
wrong_type_argument (Qchar_or_string_p, obj);
else if (!SCHARS (obj))
return obj;
else if (STRING_MULTIBYTE (obj))
- return do_casify_multibyte_string (flag, obj);
+ return do_casify_multibyte_string (&ctx, obj);
else
- return do_casify_unibyte_string (flag, obj);
+ return do_casify_unibyte_string (&ctx, obj);
}
DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
@@ -196,8 +226,6 @@ The argument object is not altered--the value is a copy. */)
static void
casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
{
- int c;
- bool inword = flag == CASE_DOWN;
bool multibyte = !NILP (BVAR (current_buffer, enable_multibyte_characters));
ptrdiff_t start, end;
ptrdiff_t start_byte;
@@ -208,14 +236,12 @@ casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
ptrdiff_t opoint = PT;
ptrdiff_t opoint_byte = PT_BYTE;
+ struct casing_context ctx;
+
if (EQ (b, e))
/* Not modifying because nothing marked */
return;
- /* If the case table is flagged as modified, rescan it. */
- if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
- Fset_case_table (BVAR (current_buffer, downcase_table));
-
validate_region (&b, &e);
start = XFASTINT (b);
end = XFASTINT (e);
@@ -223,32 +249,25 @@ casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
record_change (start, end - start);
start_byte = CHAR_TO_BYTE (start);
- SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */
+ prepare_casing_context (&ctx, flag, true);
while (start < end)
{
- int c2, len;
+ int ch, cased, len;
if (multibyte)
{
- c = FETCH_MULTIBYTE_CHAR (start_byte);
- len = CHAR_BYTES (c);
+ ch = FETCH_MULTIBYTE_CHAR (start_byte);
+ len = CHAR_BYTES (ch);
}
else
{
- c = FETCH_BYTE (start_byte);
- MAKE_CHAR_MULTIBYTE (c);
+ ch = FETCH_BYTE (start_byte);
+ MAKE_CHAR_MULTIBYTE (ch);
len = 1;
}
- c2 = c;
- if (inword && flag != CASE_CAPITALIZE_UP)
- c = downcase (c);
- else if (!inword || flag != CASE_CAPITALIZE_UP)
- c = upcase (c);
- if ((int) flag >= (int) CASE_CAPITALIZE)
- inword = ((SYNTAX (c) == Sword)
- && (inword || !syntax_prefix_flag_p (c)));
- if (c != c2)
+ cased = case_character (&ctx, ch);
+ if (ch != cased)
{
last = start;
if (first < 0)
@@ -256,18 +275,18 @@ casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
if (! multibyte)
{
- MAKE_CHAR_UNIBYTE (c);
- FETCH_BYTE (start_byte) = c;
+ MAKE_CHAR_UNIBYTE (cased);
+ FETCH_BYTE (start_byte) = cased;
}
- else if (ASCII_CHAR_P (c2) && ASCII_CHAR_P (c))
- FETCH_BYTE (start_byte) = c;
+ else if (ASCII_CHAR_P (cased) && ASCII_CHAR_P (ch))
+ FETCH_BYTE (start_byte) = cased;
else
{
- int tolen = CHAR_BYTES (c);
+ int tolen = CHAR_BYTES (cased);
int j;
unsigned char str[MAX_MULTIBYTE_LENGTH];
- CHAR_STRING (c, str);
+ CHAR_STRING (cased, str);
if (len == tolen)
{
/* Length is unchanged. */
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 06/18] Add support for title-casing letters
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (3 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 05/18] Introduce case_character function Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 07/18] Split up casify_region function Michal Nazarewicz
` (11 subsequent siblings)
16 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
* src/casefiddle.c (struct casing_context, prepare_casing_context): Add
titlecase_char_table member. It’s set to the ‘titlecase’ Unicode
property table if capitalisation has been requested.
(case_character): Make use of the titlecase_char_table to title-case
initial characters when capitalising.
* test/src/casefiddle-tests.el (casefiddle-tests--characters,
casefiddle-tests-casing): Update test cases which are now passing.
---
etc/NEWS | 10 ++++++----
src/casefiddle.c | 27 +++++++++++++++++++++------
test/src/casefiddle-tests.el | 39 ++++++++++++++++++++++++++-------------
3 files changed, 53 insertions(+), 23 deletions(-)
diff --git a/etc/NEWS b/etc/NEWS
index 61afcc6..f2bbead 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -235,11 +235,13 @@ same as in modes where the character is not whitespace.
Instead of only checking the modification time, Emacs now also checks
the file's actual content before prompting the user.
-** Title case characters are properly converted to upper case.
+** Title case characters are properly cased (from and into).
'upcase', 'upcase-region' et al. convert title case characters (such
-as Dz) into their upper case form (such as DZ). As a downside,
-'capitalize' and 'upcase-initials' produce awkward words where first
-two letters are upper case, e.g. DŽungla (instead of Džungla).
+as Dz) into their upper case form (such as DZ).
+
+Similarly, 'capitalize', 'upcase-initials' et al. make use of
+title-case forms of initial characters (correctly producing for example
+Džungla instead of incorrect DŽungla).
\f
* Changes in Specialized Modes and Packages in Emacs 26.1
diff --git a/src/casefiddle.c b/src/casefiddle.c
index 2fbd23b..b3ffa86 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -33,6 +33,10 @@ enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP};
/* State for casing individual characters. */
struct casing_context {
+ /* A char-table with title-case character mappings or nil. It being non-nil
+ implies flag being CASE_CAPITALIZE or CASE_CAPITALIZE_UP (but the reverse
+ is not true). */
+ Lisp_Object titlecase_char_table;
/* User-requested action. */
enum case_action flag;
/* If true, function operates on a buffer as opposed to a string or character.
@@ -54,6 +58,9 @@ prepare_casing_context (struct casing_context *ctx,
ctx->flag = flag;
ctx->inbuffer = inbuffer;
ctx->inword = flag == CASE_DOWN;
+ ctx->titlecase_char_table = (int)flag >= (int)CASE_CAPITALIZE
+ ? uniprop_table (intern_c_string ("titlecase"))
+ : Qnil;
/* If the case table is flagged as modified, rescan it. */
if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
@@ -68,10 +75,16 @@ prepare_casing_context (struct casing_context *ctx,
static int
case_character (struct casing_context *ctx, int ch)
{
+ Lisp_Object prop;
+
if (ctx->inword)
ch = ctx->flag == CASE_CAPITALIZE_UP ? ch : downcase (ch);
+ else if (!NILP (ctx->titlecase_char_table) &&
+ CHARACTERP (prop = CHAR_TABLE_REF (ctx->titlecase_char_table, ch)))
+ ch = XFASTINT (prop);
else
ch = upcase(ch);
+
if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
ctx->inword = SYNTAX (ch) == Sword &&
(!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch));
@@ -199,8 +212,8 @@ The argument object is not altered--the value is a copy. */)
DEFUN ("capitalize", Fcapitalize, Scapitalize, 1, 1, 0,
doc: /* Convert argument to capitalized form and return that.
-This means that each word's first character is upper case
-and the rest is lower case.
+This means that each word's first character is upper case (more
+precisely, if available, title case) and the rest is lower case.
The argument may be a character or string. The result has the same type.
The argument object is not altered--the value is a copy. */)
(Lisp_Object obj)
@@ -212,7 +225,8 @@ The argument object is not altered--the value is a copy. */)
DEFUN ("upcase-initials", Fupcase_initials, Supcase_initials, 1, 1, 0,
doc: /* Convert the initial of each word in the argument to upper case.
-Do not change the other letters of each word.
+(More precisely, if available, initial of each word is converted to
+title-case). Do not change the other letters of each word.
The argument may be a character or string. The result has the same type.
The argument object is not altered--the value is a copy. */)
(Lisp_Object obj)
@@ -376,8 +390,8 @@ point and the mark is operated on. */)
DEFUN ("capitalize-region", Fcapitalize_region, Scapitalize_region, 2, 2, "r",
doc: /* Convert the region to capitalized form.
-Capitalized form means each word's first character is upper case
-and the rest of it is lower case.
+Capitalized form means each word's first character is upper case (more
+precisely, if available, title case) and the rest of it is lower case.
In programs, give two arguments, the starting and ending
character positions to operate on. */)
(Lisp_Object beg, Lisp_Object end)
@@ -391,7 +405,8 @@ character positions to operate on. */)
DEFUN ("upcase-initials-region", Fupcase_initials_region,
Supcase_initials_region, 2, 2, "r",
doc: /* Upcase the initial of each word in the region.
-Subsequent letters of each word are not changed.
+(More precisely, if available, initial of each word is converted to
+title-case). Subsequent letters of each word are not changed.
In programs, give two arguments, the starting and ending
character positions to operate on. */)
(Lisp_Object beg, Lisp_Object end)
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index 8d9bf01..def74a0 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -62,13 +62,9 @@ casefiddle-tests--characters
(?Ł ?Ł ?ł ?Ł)
(?ł ?Ł ?ł ?Ł)
- ;; FIXME: Commented one is what we want.
- ;;(?DŽ ?DŽ ?dž ?Dž)
- (?DŽ ?DŽ ?dž ?DŽ)
- ;;(?Dž ?DŽ ?dž ?Dž)
- (?Dž ?DŽ ?dž ?DŽ)
- ;;(?dž ?DŽ ?dž ?Dž)
- (?dž ?DŽ ?dž ?DŽ)
+ (?DŽ ?DŽ ?dž ?Dž)
+ (?Dž ?DŽ ?dž ?Dž)
+ (?dž ?DŽ ?dž ?Dž)
(?Σ ?Σ ?σ ?Σ)
(?σ ?Σ ?σ ?Σ)
@@ -141,19 +137,19 @@ casefiddle-tests--characters
;; input upcase downcase capitalize upcase-initials
'(("Foo baR" "FOO BAR" "foo bar" "Foo Bar" "Foo BaR")
("Ⅷ ⅷ" "Ⅷ Ⅷ" "ⅷ ⅷ" "Ⅷ Ⅷ" "Ⅷ Ⅷ")
+ ;; "DžUNGLA" is an unfortunate result but it’s really best we can
+ ;; do while still being consistent. Hopefully, users only ever
+ ;; use upcase-initials on camelCase identifiers not real words.
+ ("DŽUNGLA" "DŽUNGLA" "džungla" "Džungla" "DžUNGLA")
+ ("Džungla" "DŽUNGLA" "džungla" "Džungla" "Džungla")
+ ("džungla" "DŽUNGLA" "džungla" "Džungla" "Džungla")
;; FIXME: Everything below is broken at the moment. Here’s what
;; should happen:
- ;;("DŽUNGLA" "DŽUNGLA" "džungla" "Džungla" "DžUNGLA")
- ;;("Džungla" "DŽUNGLA" "džungla" "Džungla" "Džungla")
- ;;("džungla" "DŽUNGLA" "džungla" "Džungla" "Džungla")
;;("define" "DEFINE" "define" "Define" "Define")
;;("fish" "FIsh" "fish" "Fish" "Fish")
;;("Straße" "STRASSE" "straße" "Straße" "Straße")
;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
;; And here’s what is actually happening:
- ("DŽUNGLA" "DŽUNGLA" "džungla" "DŽungla" "DŽUNGLA")
- ("Džungla" "DŽUNGLA" "džungla" "DŽungla" "DŽungla")
- ("džungla" "DŽUNGLA" "džungla" "DŽungla" "DŽungla")
("define" "DEfiNE" "define" "Define" "Define")
("fish" "fiSH" "fish" "fish" "fish")
("Straße" "STRAßE" "straße" "Straße" "Straße")
@@ -186,4 +182,21 @@ casefiddle-tests--characters
(setq expected (cdr expected)))))))))
+(ert-deftest casefiddle-tests-char-casing ()
+ ;; input upcase downcase [titlecase]
+ (dolist (test '((?a ?A ?a) (?A ?A ?a)
+ (?ł ?Ł ?ł) (?Ł ?Ł ?ł)
+ (?ß ?ß ?ß) (?ẞ ?ẞ ?ß)
+ (?ⅷ ?Ⅷ ?ⅷ) (?Ⅷ ?Ⅷ ?ⅷ)
+ (?DŽ ?DŽ ?dž ?Dž) (?Dž ?DŽ ?dž ?Dž) (?dž ?DŽ ?dž ?Dž)))
+ (let ((ch (car test))
+ (up (nth 1 test))
+ (lo (nth 2 test))
+ (tc (or (nth 3 test) (nth 1 test))))
+ (should (eq up (upcase ch)))
+ (should (eq lo (downcase ch)))
+ (should (eq tc (capitalize ch)))
+ (should (eq tc (upcase-initials ch))))))
+
+
;;; casefiddle-tests.el ends here
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 07/18] Split up casify_region function.
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (4 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 06/18] Add support for title-casing letters Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 7:17 ` Eli Zaretskii
2016-10-04 1:10 ` bug#24603: [RFC 08/18] Support casing characters which map into multiple code points Michal Nazarewicz
` (10 subsequent siblings)
16 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
No functional changes at this time but splitting casify_region into
a function dealing with multibyte and another dealing with unibyte
buffers will make future code changes slightly easier.
* src/casefiddle.c (casify_region): Move most of the code into two
new functions:
(do_casify_multibyte_region, do_casify_unibyte_region): new functions.
---
src/casefiddle.c | 159 ++++++++++++++++++++++++++++++-------------------------
1 file changed, 86 insertions(+), 73 deletions(-)
diff --git a/src/casefiddle.c b/src/casefiddle.c
index b3ffa86..a016871 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -234,102 +234,115 @@ The argument object is not altered--the value is a copy. */)
return casify_object (CASE_CAPITALIZE_UP, obj);
}
\f
-/* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP.
- b and e specify range of buffer to operate on. */
-
-static void
-casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
+/* Based on CTX, case region in a unibyte buffer from POS to *ENDP. Return
+ first position that has changed and save last position in *ENDP. If no
+ characters were changed, return -1 and *ENDP is unspecified. */
+static ptrdiff_t
+do_casify_unibyte_region (struct casing_context *ctx,
+ ptrdiff_t pos, ptrdiff_t *endp)
{
- bool multibyte = !NILP (BVAR (current_buffer, enable_multibyte_characters));
- ptrdiff_t start, end;
- ptrdiff_t start_byte;
+ ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */
+ ptrdiff_t end = *endp;
+ int ch, cased;
- /* Position of first and last changes. */
- ptrdiff_t first = -1, last;
+ for (; pos < end; ++pos)
+ {
+ ch = FETCH_BYTE (pos);
+ MAKE_CHAR_MULTIBYTE (ch);
- ptrdiff_t opoint = PT;
- ptrdiff_t opoint_byte = PT_BYTE;
+ cased = case_character (ctx, ch);
+ if (cased == ch)
+ continue;
- struct casing_context ctx;
+ last = pos;
+ if (first < 0)
+ first = pos;
- if (EQ (b, e))
- /* Not modifying because nothing marked */
- return;
+ MAKE_CHAR_UNIBYTE (cased);
+ FETCH_BYTE (pos) = cased;
+ }
- validate_region (&b, &e);
- start = XFASTINT (b);
- end = XFASTINT (e);
- modify_text (start, end);
- record_change (start, end - start);
- start_byte = CHAR_TO_BYTE (start);
+ *endp = last;
+ return first;
+}
- prepare_casing_context (&ctx, flag, true);
+/* Based on CTX, case region in a multibyte buffer from POS to *ENDP. Return
+ first position that has changed and save last position in *ENDP. If no
+ characters were changed, return -1 and *ENDP is unspecified. */
+static ptrdiff_t
+do_casify_multibyte_region (struct casing_context *ctx,
+ ptrdiff_t pos, ptrdiff_t *endp)
+{
+ ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */
+ ptrdiff_t pos_byte = CHAR_TO_BYTE (pos), end = *endp;
+ ptrdiff_t opoint = PT;
+ int ch, cased, len;
- while (start < end)
+ while (pos < end)
{
- int ch, cased, len;
-
- if (multibyte)
- {
- ch = FETCH_MULTIBYTE_CHAR (start_byte);
- len = CHAR_BYTES (ch);
- }
- else
- {
- ch = FETCH_BYTE (start_byte);
- MAKE_CHAR_MULTIBYTE (ch);
- len = 1;
- }
- cased = case_character (&ctx, ch);
- if (ch != cased)
+ ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
+ cased = case_character (ctx, ch);
+ if (cased != ch)
{
- last = start;
+ last = pos;
if (first < 0)
- first = start;
+ first = pos;
- if (! multibyte)
- {
- MAKE_CHAR_UNIBYTE (cased);
- FETCH_BYTE (start_byte) = cased;
- }
- else if (ASCII_CHAR_P (cased) && ASCII_CHAR_P (ch))
- FETCH_BYTE (start_byte) = cased;
+ if (ASCII_CHAR_P (cased) && ASCII_CHAR_P (ch))
+ FETCH_BYTE (pos_byte) = cased;
else
{
- int tolen = CHAR_BYTES (cased);
- int j;
unsigned char str[MAX_MULTIBYTE_LENGTH];
-
- CHAR_STRING (cased, str);
- if (len == tolen)
- {
- /* Length is unchanged. */
- for (j = 0; j < len; ++j)
- FETCH_BYTE (start_byte + j) = str[j];
- }
+ int totlen = CHAR_STRING (cased, str);
+ if (len == totlen)
+ memcpy (BYTE_POS_ADDR (pos_byte), str, len);
else
- {
- /* Replace one character with the other,
- keeping text properties the same. */
- replace_range_2 (start, start_byte,
- start + 1, start_byte + len,
- (char *) str, 1, tolen,
- 0);
- len = tolen;
- }
+ /* Replace one character with the other(s), keeping text
+ properties the same. */
+ replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
+ (char *) str, 1, totlen, 0);
+ len = totlen;
}
}
- start++;
- start_byte += len;
+ pos++;
+ pos_byte += len;
}
if (PT != opoint)
- TEMP_SET_PT_BOTH (opoint, opoint_byte);
+ TEMP_SET_PT_BOTH (opoint, CHAR_TO_BYTE (opoint));
+
+ *endp = last;
+ return first;
+}
+
+/* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP.
+ b and e specify range of buffer to operate on. */
+static void
+casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
+{
+ struct casing_context ctx;
+ ptrdiff_t start, end;
+
+ if (EQ (b, e))
+ /* Not modifying because nothing marked */
+ return;
+
+ validate_region (&b, &e);
+ start = XFASTINT (b);
+ end = XFASTINT (e);
+ modify_text (start, end);
+ record_change (start, end - start);
+ prepare_casing_context (&ctx, flag, true);
+
+ if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
+ start = do_casify_unibyte_region (&ctx, start, &end);
+ else
+ start = do_casify_multibyte_region (&ctx, start, &end);
- if (first >= 0)
+ if (start >= 0)
{
- signal_after_change (first, last + 1 - first, last + 1 - first);
- update_compositions (first, last + 1, CHECK_ALL);
+ signal_after_change (start, end + 1 - start, end + 1 - start);
+ update_compositions (start, end + 1, CHECK_ALL);
}
}
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 07/18] Split up casify_region function.
2016-10-04 1:10 ` bug#24603: [RFC 07/18] Split up casify_region function Michal Nazarewicz
@ 2016-10-04 7:17 ` Eli Zaretskii
2016-10-18 2:27 ` Michal Nazarewicz
0 siblings, 1 reply; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-04 7:17 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Date: Tue, 4 Oct 2016 03:10:30 +0200
>
> No functional changes at this time but splitting casify_region into
> a function dealing with multibyte and another dealing with unibyte
> buffers will make future code changes slightly easier.
>
> * src/casefiddle.c (casify_region): Move most of the code into two
> new functions:
> (do_casify_multibyte_region, do_casify_unibyte_region): new functions.
Please make sure the multibyte version works with embedded eight-bit
characters, in both representations. Some of the code you removed
could handle those cases.
Thanks.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 07/18] Split up casify_region function.
2016-10-04 7:17 ` Eli Zaretskii
@ 2016-10-18 2:27 ` Michal Nazarewicz
0 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-18 2:27 UTC (permalink / raw)
To: Eli Zaretskii; +Cc: 24603
On Tue, Oct 04 2016, Eli Zaretskii wrote:
>> From: Michal Nazarewicz <mina86@mina86.com>
>> Date: Tue, 4 Oct 2016 03:10:30 +0200
>>
>> No functional changes at this time but splitting casify_region into
>> a function dealing with multibyte and another dealing with unibyte
>> buffers will make future code changes slightly easier.
>>
>> * src/casefiddle.c (casify_region): Move most of the code into two
>> new functions:
>> (do_casify_multibyte_region, do_casify_unibyte_region): new functions.
>
> Please make sure the multibyte version works with embedded eight-bit
> characters, in both representations. Some of the code you removed
> could handle those cases.
In the new version of the first patch, I’ve added a bunch of tests for
unibyte and multibyte strings including byte8 characters so this all
should be covered.
--
Best regards
ミハウ “𝓶𝓲𝓷𝓪86” ナザレヴイツ
«If at first you don’t succeed, give up skydiving»
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 08/18] Support casing characters which map into multiple code points
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (5 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 07/18] Split up casify_region function Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 7:38 ` Eli Zaretskii
2016-10-04 1:10 ` bug#24603: [RFC 09/18] Implement special sigma casing rule Michal Nazarewicz
` (9 subsequent siblings)
16 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
Implement unconditional special casing rules defined in the Unicode standard.
Among other things, they deal with cases when a single code point is
replaced by multiple ones because a simple character does not exist (e.g.
fi ligature turning into FI) or is not commonly used (e.g. ß turning into
SS).
* admin/unidata/SpecialCasing.txt: New data file pulled from Unicode
standard distribution.
* admin/unidata/README: Mention SpecialCasing.txt.
* src/make-special-casing.py: New script to generate special-casing.h
file from the SpecialCasing.txt data file.
* src/casefiddle.c: Include special-casing.h so special casing rules are
available and can be used in the translation unit.
(struct casing_str_buf): New structure for representing short strings.
It’s used to compactly encode special casing rules.
(case_character_impl): New function which can handle one-to-many
character mappings.
(case_character, case_single_character): Wrappers for the above
functions. The former may map one character to multiple code points
while the latter does what the former used to do (i.e. handles
one-to-one mappings only).
(do_casify_integer, do_casify_unibyte_string,
do_casify_unibyte_region): Use case_single_character.
(do_casify_multibyte_string, do_casify_multibyte_region): Support new
features of case_character.
* (do_casify_region): Updated after do_casify_multibyte_string changes.
(upcase, capitalize, upcase-initials): Update documentation to mention
limitations when working on characters.
* test/src/casefiddle-tests.el (casefiddle-tests-casing): Update test
cases which are now passing.
* test/lisp/char-fold-tests.el (char-fold--ascii-upcase,
char-fold--ascii-downcase): New functions which behave like old ‘upcase’
and ‘downcase’.
(char-fold--test-match-exactly): Use the new functions. This is needed
because otherwise fi and similar characters are turned into their multi-
-character representation.
---
.gitignore | 1 +
admin/unidata/README | 4 +
admin/unidata/SpecialCasing.txt | 281 ++++++++++++++++++++++++++++++++++++++++
etc/NEWS | 16 ++-
src/Makefile.in | 3 +
src/casefiddle.c | 218 ++++++++++++++++++++++---------
src/deps.mk | 2 +-
src/make-special-casing.py | 189 +++++++++++++++++++++++++++
test/lisp/char-fold-tests.el | 12 +-
test/src/casefiddle-tests.el | 9 +-
10 files changed, 658 insertions(+), 77 deletions(-)
create mode 100644 admin/unidata/SpecialCasing.txt
create mode 100644 src/make-special-casing.py
diff --git a/.gitignore b/.gitignore
index 15f9c56..a07f972 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,6 +79,7 @@ lib/warn-on-use.h
src/buildobj.h
src/globals.h
src/lisp.mk
+src/special-casing.h
# Lisp-level sources built by 'make'.
*cus-load.el
diff --git a/admin/unidata/README b/admin/unidata/README
index 534670c..06a6666 100644
--- a/admin/unidata/README
+++ b/admin/unidata/README
@@ -24,3 +24,7 @@ http://www.unicode.org/Public/8.0.0/ucd/Blocks.txt
NormalizationTest.txt
http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
2016-07-16
+
+SpecialCasing.txt
+http://unicode.org/Public/UNIDATA/SpecialCasing.txt
+2016-03-03
diff --git a/admin/unidata/SpecialCasing.txt b/admin/unidata/SpecialCasing.txt
new file mode 100644
index 0000000..b23fa7f
--- /dev/null
+++ b/admin/unidata/SpecialCasing.txt
@@ -0,0 +1,281 @@
+# SpecialCasing-9.0.0.txt
+# Date: 2016-03-02, 18:55:13 GMT
+# © 2016 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+# For documentation, see http://www.unicode.org/reports/tr44/
+#
+# Special Casing
+#
+# This file is a supplement to the UnicodeData.txt file. It does not define any
+# properties, but rather provides additional information about the casing of
+# Unicode characters, for situations when casing incurs a change in string length
+# or is dependent on context or locale. For compatibility, the UnicodeData.txt
+# file only contains simple case mappings for characters where they are one-to-one
+# and independent of context and language. The data in this file, combined with
+# the simple case mappings in UnicodeData.txt, defines the full case mappings
+# Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc).
+#
+# Note that the preferred mechanism for defining tailored casing operations is
+# the Unicode Common Locale Data Repository (CLDR). For more information, see the
+# discussion of case mappings and case algorithms in the Unicode Standard.
+#
+# All code points not listed in this file that do not have a simple case mappings
+# in UnicodeData.txt map to themselves.
+# ================================================================================
+# Format
+# ================================================================================
+# The entries in this file are in the following machine-readable format:
+#
+# <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
+#
+# <code>, <lower>, <title>, and <upper> provide the respective full case mappings
+# of <code>, expressed as character values in hex. If there is more than one character,
+# they are separated by spaces. Other than as used to separate elements, spaces are
+# to be ignored.
+#
+# The <condition_list> is optional. Where present, it consists of one or more language IDs
+# or casing contexts, separated by spaces. In these conditions:
+# - A condition list overrides the normal behavior if all of the listed conditions are true.
+# - The casing context is always the context of the characters in the original string,
+# NOT in the resulting string.
+# - Case distinctions in the condition list are not significant.
+# - Conditions preceded by "Not_" represent the negation of the condition.
+# The condition list is not represented in the UCD as a formal property.
+#
+# A language ID is defined by BCP 47, with '-' and '_' treated equivalently.
+#
+# A casing context for a character is defined by Section 3.13 Default Case Algorithms
+# of The Unicode Standard.
+#
+# Parsers of this file must be prepared to deal with future additions to this format:
+# * Additional contexts
+# * Additional fields
+# ================================================================================
+
+# ================================================================================
+# Unconditional mappings
+# ================================================================================
+
+# The German es-zed is special--the normal mapping is to SS.
+# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))
+
+00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
+
+# Preserve canonical equivalence for I with dot. Turkic is handled below.
+
+0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+
+# Ligatures
+
+FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF
+FB01; FB01; 0046 0069; 0046 0049; # LATIN SMALL LIGATURE FI
+FB02; FB02; 0046 006C; 0046 004C; # LATIN SMALL LIGATURE FL
+FB03; FB03; 0046 0066 0069; 0046 0046 0049; # LATIN SMALL LIGATURE FFI
+FB04; FB04; 0046 0066 006C; 0046 0046 004C; # LATIN SMALL LIGATURE FFL
+FB05; FB05; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE LONG S T
+FB06; FB06; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE ST
+
+0587; 0587; 0535 0582; 0535 0552; # ARMENIAN SMALL LIGATURE ECH YIWN
+FB13; FB13; 0544 0576; 0544 0546; # ARMENIAN SMALL LIGATURE MEN NOW
+FB14; FB14; 0544 0565; 0544 0535; # ARMENIAN SMALL LIGATURE MEN ECH
+FB15; FB15; 0544 056B; 0544 053B; # ARMENIAN SMALL LIGATURE MEN INI
+FB16; FB16; 054E 0576; 054E 0546; # ARMENIAN SMALL LIGATURE VEW NOW
+FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
+
+# No corresponding uppercase precomposed character
+
+0149; 0149; 02BC 004E; 02BC 004E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+0390; 0390; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+03B0; 03B0; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+01F0; 01F0; 004A 030C; 004A 030C; # LATIN SMALL LETTER J WITH CARON
+1E96; 1E96; 0048 0331; 0048 0331; # LATIN SMALL LETTER H WITH LINE BELOW
+1E97; 1E97; 0054 0308; 0054 0308; # LATIN SMALL LETTER T WITH DIAERESIS
+1E98; 1E98; 0057 030A; 0057 030A; # LATIN SMALL LETTER W WITH RING ABOVE
+1E99; 1E99; 0059 030A; 0059 030A; # LATIN SMALL LETTER Y WITH RING ABOVE
+1E9A; 1E9A; 0041 02BE; 0041 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING
+1F50; 1F50; 03A5 0313; 03A5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI
+1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
+1F54; 1F54; 03A5 0313 0301; 03A5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
+1F56; 1F56; 03A5 0313 0342; 03A5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
+1FB6; 1FB6; 0391 0342; 0391 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI
+1FC6; 1FC6; 0397 0342; 0397 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI
+1FD2; 1FD2; 0399 0308 0300; 0399 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
+1FD3; 1FD3; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+1FD6; 1FD6; 0399 0342; 0399 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI
+1FD7; 1FD7; 0399 0308 0342; 0399 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
+1FE2; 1FE2; 03A5 0308 0300; 03A5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
+1FE3; 1FE3; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+1FE4; 1FE4; 03A1 0313; 03A1 0313; # GREEK SMALL LETTER RHO WITH PSILI
+1FE6; 1FE6; 03A5 0342; 03A5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI
+1FE7; 1FE7; 03A5 0308 0342; 03A5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
+1FF6; 1FF6; 03A9 0342; 03A9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI
+
+# IMPORTANT-when iota-subscript (0345) is uppercased or titlecased,
+# the result will be incorrect unless the iota-subscript is moved to the end
+# of any sequence of combining marks. Otherwise, the accents will go on the capital iota.
+# This process can be achieved by first transforming the text to NFC before casing.
+# E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA>
+
+# The following cases are already in the UnicodeData.txt file, so are only commented here.
+
+# 0345; 0345; 0345; 0399; # COMBINING GREEK YPOGEGRAMMENI
+
+# All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript)
+# have special uppercases.
+# Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase!
+
+1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
+1F81; 1F81; 1F89; 1F09 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
+1F82; 1F82; 1F8A; 1F0A 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1F83; 1F83; 1F8B; 1F0B 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1F84; 1F84; 1F8C; 1F0C 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1F85; 1F85; 1F8D; 1F0D 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1F86; 1F86; 1F8E; 1F0E 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1F87; 1F87; 1F8F; 1F0F 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1F88; 1F80; 1F88; 1F08 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
+1F89; 1F81; 1F89; 1F09 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
+1F8A; 1F82; 1F8A; 1F0A 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1F8B; 1F83; 1F8B; 1F0B 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1F8C; 1F84; 1F8C; 1F0C 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1F8D; 1F85; 1F8D; 1F0D 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1F8E; 1F86; 1F8E; 1F0E 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1F8F; 1F87; 1F8F; 1F0F 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1F90; 1F90; 1F98; 1F28 0399; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
+1F91; 1F91; 1F99; 1F29 0399; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
+1F92; 1F92; 1F9A; 1F2A 0399; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1F93; 1F93; 1F9B; 1F2B 0399; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1F94; 1F94; 1F9C; 1F2C 0399; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1F95; 1F95; 1F9D; 1F2D 0399; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1F96; 1F96; 1F9E; 1F2E 0399; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1F97; 1F97; 1F9F; 1F2F 0399; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1F98; 1F90; 1F98; 1F28 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
+1F99; 1F91; 1F99; 1F29 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
+1F9A; 1F92; 1F9A; 1F2A 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1F9B; 1F93; 1F9B; 1F2B 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1F9C; 1F94; 1F9C; 1F2C 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1F9D; 1F95; 1F9D; 1F2D 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1F9E; 1F96; 1F9E; 1F2E 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1F9F; 1F97; 1F9F; 1F2F 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1FA0; 1FA0; 1FA8; 1F68 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
+1FA1; 1FA1; 1FA9; 1F69 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
+1FA2; 1FA2; 1FAA; 1F6A 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1FA3; 1FA3; 1FAB; 1F6B 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1FA4; 1FA4; 1FAC; 1F6C 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1FA5; 1FA5; 1FAD; 1F6D 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1FA6; 1FA6; 1FAE; 1F6E 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1FA7; 1FA7; 1FAF; 1F6F 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1FA8; 1FA0; 1FA8; 1F68 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
+1FA9; 1FA1; 1FA9; 1F69 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
+1FAA; 1FA2; 1FAA; 1F6A 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1FAB; 1FA3; 1FAB; 1F6B 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1FAC; 1FA4; 1FAC; 1F6C 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1FAD; 1FA5; 1FAD; 1F6D 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1FAE; 1FA6; 1FAE; 1F6E 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1FAF; 1FA7; 1FAF; 1F6F 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1FB3; 1FB3; 1FBC; 0391 0399; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
+1FBC; 1FB3; 1FBC; 0391 0399; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+1FC3; 1FC3; 1FCC; 0397 0399; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
+1FCC; 1FC3; 1FCC; 0397 0399; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
+1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+
+# Some characters with YPOGEGRAMMENI also have no corresponding titlecases
+
+1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
+1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+1FC2; 1FC2; 1FCA 0345; 1FCA 0399; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
+1FC4; 1FC4; 0389 0345; 0389 0399; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+1FF2; 1FF2; 1FFA 0345; 1FFA 0399; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
+1FF4; 1FF4; 038F 0345; 038F 0399; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+
+1FB7; 1FB7; 0391 0342 0345; 0391 0342 0399; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+1FC7; 1FC7; 0397 0342 0345; 0397 0342 0399; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+
+# ================================================================================
+# Conditional Mappings
+# The remainder of this file provides conditional casing data used to produce
+# full case mappings.
+# ================================================================================
+# Language-Insensitive Mappings
+# These are characters whose full case mappings do not depend on language, but do
+# depend on context (which characters come before or after). For more information
+# see the header of this file and the Unicode Standard.
+# ================================================================================
+
+# Special case for final form of sigma
+
+03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
+
+# Note: the following cases for non-final are already in the UnicodeData.txt file.
+
+# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
+# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
+# 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA
+
+# Note: the following cases are not included, since they would case-fold in lowercasing
+
+# 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA
+# 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA
+
+# ================================================================================
+# Language-Sensitive Mappings
+# These are characters whose full case mappings depend on language and perhaps also
+# context (which characters come before or after). For more information
+# see the header of this file and the Unicode Standard.
+# ================================================================================
+
+# Lithuanian
+
+# Lithuanian retains the dot in a lowercase i when followed by accents.
+
+# Remove DOT ABOVE after "i" with upper or titlecase
+
+0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
+
+# Introduce an explicit dot above when lowercasing capital I's and J's
+# whenever there are more accents above.
+# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
+
+0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
+004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
+012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
+00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
+00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
+0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
+
+# ================================================================================
+
+# Turkish and Azeri
+
+# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
+# The following rules handle those cases.
+
+0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+
+# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
+# This matches the behavior of the canonically equivalent I-dot_above
+
+0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
+0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
+
+# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
+
+0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
+0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
+
+# When uppercasing, i turns into a dotted capital I
+
+0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
+0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
+
+# Note: the following case is already in the UnicodeData.txt file.
+
+# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
+
+# EOF
+
diff --git a/etc/NEWS b/etc/NEWS
index f2bbead..3396f9f 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -235,13 +235,17 @@ same as in modes where the character is not whitespace.
Instead of only checking the modification time, Emacs now also checks
the file's actual content before prompting the user.
-** Title case characters are properly cased (from and into).
-'upcase', 'upcase-region' et al. convert title case characters (such
-as Dz) into their upper case form (such as DZ).
+** Various casing improvements.
-Similarly, 'capitalize', 'upcase-initials' et al. make use of
-title-case forms of initial characters (correctly producing for example
-Džungla instead of incorrect DŽungla).
+*** 'upcase', 'upcase-region' et al. convert title case characters
+(such as Dz) into their upper case form (such as DZ).
+
+*** 'capitalize', 'upcase-initials' et al. make use of title-case forms
+of initial characters (correctly producing for example Džungla instead
+of incorrect DŽungla).
+
+*** Characters which turn into multiple ones when cased are correctly handled.
+For example, fi ligature is converted to FI when upper cased.
\f
* Changes in Specialized Modes and Packages in Emacs 26.1
diff --git a/src/Makefile.in b/src/Makefile.in
index 89f7a92..98a6181 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -711,6 +711,9 @@ $(lwlibdir)/TAGS: FORCE
tags: TAGS ../lisp/TAGS $(lwlibdir)/TAGS
.PHONY: tags
+special-casing.h: make-special-casing.py ../admin/unidata/SpecialCasing.txt
+ $(AM_V_GEN)
+ python $^ $@
### Bootstrapping.
diff --git a/src/casefiddle.c b/src/casefiddle.c
index a016871..35ff674 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -29,8 +29,16 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
#include "composite.h"
#include "keymap.h"
+struct casing_str_buf {
+ unsigned char data[MAX_MULTIBYTE_LENGTH > 6 ? MAX_MULTIBYTE_LENGTH : 6];
+ unsigned char len_chars;
+ unsigned char len_bytes;
+};
+
enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP};
+#include "special-casing.h"
+
/* State for casing individual characters. */
struct casing_context {
/* A char-table with title-case character mappings or nil. It being non-nil
@@ -70,25 +78,90 @@ prepare_casing_context (struct casing_context *ctx,
SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */
}
-/* Based on CTX, case character CH accordingly. Update CTX as necessary.
- Return cased character. */
+/* Based on CTX, case character CH. If BUF is NULL, return cased character.
+ Otherwise, if BUF is non-NULL, save result in it and return whether the
+ character has been changed.
+
+ Since meaning of return value depends on arguments, it’s more convenient to
+ use case_single_character or case_character instead. */
static int
-case_character (struct casing_context *ctx, int ch)
+case_character_impl (struct casing_str_buf *buf,
+ struct casing_context *ctx, int ch)
{
+ enum case_action flag;
Lisp_Object prop;
+ bool was_inword;
+ int cased;
- if (ctx->inword)
- ch = ctx->flag == CASE_CAPITALIZE_UP ? ch : downcase (ch);
+ /* Update inword state */
+ was_inword = ctx->inword;
+ if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
+ ctx->inword = SYNTAX (ch) == Sword &&
+ (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
+
+ /* Normalise flag so it’s one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
+ if (!was_inword)
+ flag = ctx->flag == CASE_UP ? CASE_UP : CASE_CAPITALIZE;
+ else if (ctx->flag != CASE_CAPITALIZE_UP)
+ flag = CASE_DOWN;
+ else
+ {
+ cased = ch;
+ goto done;
+ }
+
+ /* Look through the special casing entries. */
+ if (buf)
+ {
+ const special_casing_char_t *it;
+ for (it = special_casing_code_points; *it && *it <= ch; ++it)
+ if (*it == ch)
+ {
+ const struct casing_str_buf *entry = special_casing_entries +
+ ((it - special_casing_code_points) * 3 + (int)flag);
+ memcpy (buf, entry, sizeof *buf);
+ buf->len_chars &= ~SPECIAL_CASING_NO_CHANGE_BIT;
+ return !(entry->len_chars & SPECIAL_CASING_NO_CHANGE_BIT);
+ }
+ }
+
+ /* Handle simple, one-to-one case. */
+ if (flag == CASE_DOWN)
+ cased = downcase (ch);
else if (!NILP (ctx->titlecase_char_table) &&
CHARACTERP (prop = CHAR_TABLE_REF (ctx->titlecase_char_table, ch)))
- ch = XFASTINT (prop);
+ cased = XFASTINT (prop);
else
- ch = upcase(ch);
+ cased = upcase(ch);
+
+ /* And we’re done. */
+ done:
+ if (!buf)
+ return cased;
+ buf->len_chars = 1;
+ buf->len_bytes = CHAR_STRING (cased, buf->data);
+ return cased != ch;
+}
- if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
- ctx->inword = SYNTAX (ch) == Sword &&
- (!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch));
- return ch;
+/* Based on CTX, case character CH accordingly. Update CTX as necessary.
+ Return cased character.
+
+ Special casing rules (such as upcase(fi) = FI) are not handled. For
+ characters whose casing results in multiple code points, the character is
+ returned unchanged. */
+static inline int
+case_single_character (struct casing_context *ctx, int ch)
+{
+ return case_character_impl (NULL, ctx, ch);
+}
+
+/* Save in BUF result of casing character CH. Return whether casing changed the
+ character. This is like case_single_character but also handles one-to-many
+ casing rules. */
+static inline bool
+case_character (struct casing_str_buf *buf, struct casing_context *ctx, int ch)
+{
+ return case_character_impl (buf, ctx, ch);
}
\f
static Lisp_Object
@@ -115,7 +188,7 @@ do_casify_integer (struct casing_context *ctx, Lisp_Object obj)
!NILP (BVAR (current_buffer, enable_multibyte_characters)));
if (! multibyte)
MAKE_CHAR_MULTIBYTE (ch);
- cased = case_character (ctx, ch);
+ cased = case_single_character (ctx, ch);
if (cased == ch)
return obj;
@@ -128,25 +201,34 @@ do_casify_integer (struct casing_context *ctx, Lisp_Object obj)
static Lisp_Object
do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
{
- ptrdiff_t i, i_byte, size = SCHARS (obj);
- int len, ch, cased;
+ /* We assume data is the first member of casing_str_buf structure so that if
+ we cast a (char *) into (struct casing_str_buf *) the representation of the
+ character is at the beginning of the buffer. This is why we don’t need
+ separate struct casing_str_buf object but rather write directly to o. */
+ typedef char static_assertion[offsetof(struct casing_str_buf, data) ? -1 : 1];
+
+ ptrdiff_t size = SCHARS (obj), n;
+ int ch;
USE_SAFE_ALLOCA;
- ptrdiff_t o_size;
- if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &o_size))
- o_size = PTRDIFF_MAX;
- unsigned char *dst = SAFE_ALLOCA (o_size);
+ if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &n) ||
+ INT_ADD_WRAPV (n, sizeof(struct casing_str_buf), &n))
+ n = PTRDIFF_MAX;
+ unsigned char *const dst = SAFE_ALLOCA (n), *const dst_end = dst + n;
unsigned char *o = dst;
- for (i = i_byte = 0; i < size; i++, i_byte += len)
+ const unsigned char *src = SDATA (obj);
+
+ for (n = 0; size; --size)
{
- if (o_size - MAX_MULTIBYTE_LENGTH < o - dst)
+ if (dst_end - o < sizeof(struct casing_str_buf))
string_overflow ();
- ch = STRING_CHAR_AND_LENGTH (SDATA (obj) + i_byte, len);
- cased = case_character (ctx, ch);
- o += CHAR_STRING (cased, o);
+ ch = STRING_CHAR_ADVANCE (src);
+ case_character ((void *)o, ctx, ch);
+ n += ((struct casing_str_buf *)o)->len_chars;
+ o += ((struct casing_str_buf *)o)->len_bytes;
}
- eassert (o - dst <= o_size);
- obj = make_multibyte_string ((char *) dst, size, o - dst);
+ eassert (o <= dst_end);
+ obj = make_multibyte_string ((char *) dst, n, o - dst);
SAFE_FREE ();
return obj;
}
@@ -162,7 +244,7 @@ do_casify_unibyte_string (struct casing_context *ctx, Lisp_Object obj)
{
ch = SREF (obj, i);
MAKE_CHAR_MULTIBYTE (ch);
- cased = case_character (ctx, ch);
+ cased = case_single_character (ctx, ch);
if (ch == cased)
continue;
MAKE_CHAR_UNIBYTE (cased);
@@ -194,7 +276,9 @@ casify_object (enum case_action flag, Lisp_Object obj)
DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
doc: /* Convert argument to upper case and return that.
The argument may be a character or string. The result has the same type.
-The argument object is not altered--the value is a copy.
+The argument object is not altered--the value is a copy. If argument
+is a character, characters which map to multiple code points when
+cased, e.g. fi, are returned unchanged.
See also `capitalize', `downcase' and `upcase-initials'. */)
(Lisp_Object obj)
{
@@ -215,7 +299,9 @@ DEFUN ("capitalize", Fcapitalize, Scapitalize, 1, 1, 0,
This means that each word's first character is upper case (more
precisely, if available, title case) and the rest is lower case.
The argument may be a character or string. The result has the same type.
-The argument object is not altered--the value is a copy. */)
+The argument object is not altered--the value is a copy. If argument
+is a character, characters which map to multiple code points when
+cased, e.g. fi, are returned unchanged. */)
(Lisp_Object obj)
{
return casify_object (CASE_CAPITALIZE, obj);
@@ -228,7 +314,9 @@ DEFUN ("upcase-initials", Fupcase_initials, Supcase_initials, 1, 1, 0,
(More precisely, if available, initial of each word is converted to
title-case). Do not change the other letters of each word.
The argument may be a character or string. The result has the same type.
-The argument object is not altered--the value is a copy. */)
+The argument object is not altered--the value is a copy. If argument
+is a character, characters which map to multiple code points when
+cased, e.g. fi, are returned unchanged. */)
(Lisp_Object obj)
{
return casify_object (CASE_CAPITALIZE_UP, obj);
@@ -250,7 +338,7 @@ do_casify_unibyte_region (struct casing_context *ctx,
ch = FETCH_BYTE (pos);
MAKE_CHAR_MULTIBYTE (ch);
- cased = case_character (ctx, ch);
+ cased = case_single_character (ctx, ch);
if (cased == ch)
continue;
@@ -271,48 +359,54 @@ do_casify_unibyte_region (struct casing_context *ctx,
characters were changed, return -1 and *ENDP is unspecified. */
static ptrdiff_t
do_casify_multibyte_region (struct casing_context *ctx,
- ptrdiff_t pos, ptrdiff_t *endp)
+ ptrdiff_t *startp, ptrdiff_t *endp)
{
ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */
- ptrdiff_t pos_byte = CHAR_TO_BYTE (pos), end = *endp;
- ptrdiff_t opoint = PT;
+ ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
+ ptrdiff_t opoint = PT, added;
+ struct casing_str_buf buf;
int ch, cased, len;
- while (pos < end)
+ for (; size; --size)
{
ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
- cased = case_character (ctx, ch);
- if (cased != ch)
+
+ if (!case_character (&buf, ctx, ch))
{
- last = pos;
- if (first < 0)
- first = pos;
-
- if (ASCII_CHAR_P (cased) && ASCII_CHAR_P (ch))
- FETCH_BYTE (pos_byte) = cased;
- else
- {
- unsigned char str[MAX_MULTIBYTE_LENGTH];
- int totlen = CHAR_STRING (cased, str);
- if (len == totlen)
- memcpy (BYTE_POS_ADDR (pos_byte), str, len);
- else
- /* Replace one character with the other(s), keeping text
- properties the same. */
- replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
- (char *) str, 9, totlen, 0);
- len = totlen;
- }
+ pos_byte += len;
+ ++pos;
+ continue;
}
- pos++;
- pos_byte += len;
+
+ last = pos + buf.len_chars;
+ if (first < 0)
+ first = pos;
+
+ if (buf.len_chars == 1 && buf.len_bytes == len)
+ memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len);
+ else
+ {
+ /* Replace one character with the other(s), keeping text
+ properties the same. */
+ replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
+ (const char *) buf.data, buf.len_chars,
+ buf.len_bytes,
+ 0);
+ added += buf.len_chars - 1;
+ if (opoint > pos)
+ opoint += buf.len_chars - 1;
+ }
+
+ pos_byte += buf.len_bytes;
+ pos += buf.len_chars;
}
if (PT != opoint)
TEMP_SET_PT_BOTH (opoint, CHAR_TO_BYTE (opoint));
+ *startp = first;
*endp = last;
- return first;
+ return added;
}
/* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP.
@@ -320,8 +414,8 @@ do_casify_multibyte_region (struct casing_context *ctx,
static void
casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
{
+ ptrdiff_t start, end, added;
struct casing_context ctx;
- ptrdiff_t start, end;
if (EQ (b, e))
/* Not modifying because nothing marked */
@@ -337,12 +431,12 @@ casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
start = do_casify_unibyte_region (&ctx, start, &end);
else
- start = do_casify_multibyte_region (&ctx, start, &end);
+ added = do_casify_multibyte_region (&ctx, &start, &end);
if (start >= 0)
{
- signal_after_change (start, end + 1 - start, end + 1 - start);
- update_compositions (start, end + 1, CHECK_ALL);
+ signal_after_change (start, end - start - added, end - start);
+ update_compositions (start, end, CHECK_ALL);
}
}
diff --git a/src/deps.mk b/src/deps.mk
index 72f68ca..1c24414 100644
--- a/src/deps.mk
+++ b/src/deps.mk
@@ -49,7 +49,7 @@ callproc.o: callproc.c epaths.h buffer.h commands.h lisp.h $(config_h) \
composite.h w32.h blockinput.h atimer.h systime.h frame.h termhooks.h \
buffer.h gnutls.h dispextern.h ../lib/unistd.h globals.h
casefiddle.o: casefiddle.c syntax.h commands.h buffer.h character.h \
- composite.h keymap.h lisp.h globals.h $(config_h)
+ composite.h keymap.h special-casing.h lisp.h globals.h $(config_h)
casetab.o: casetab.c buffer.h character.h lisp.h globals.h $(config_h)
category.o: category.c category.h buffer.h charset.h keymap.h \
character.h lisp.h globals.h $(config_h)
diff --git a/src/make-special-casing.py b/src/make-special-casing.py
new file mode 100644
index 0000000..e8725e3
--- /dev/null
+++ b/src/make-special-casing.py
@@ -0,0 +1,189 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""make-special-casing.py --- generate special-casing.h file
+Copyright (C) 2016 Free Software Foundation, Inc.
+
+This file is part of GNU Emacs.
+
+GNU Emacs is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or (at
+your option) any later version.
+
+GNU Emacs is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import os
+import re
+import sys
+import tempfile
+import textwrap
+
+TEMPLATE = '''\
+/* Special case mapping rules. Only unconditional entries are included.
+ This file is automatically generated from SpecialCasing.txt file
+ distributed with Unicode standard by %(generator)s.
+ Do not edit manually. */
+
+#include <stdint.h>
+
+struct special_casing_static_asserts {
+%(asserts)s
+};
+
+typedef %(code_point_type)s special_casing_char_t;
+
+/* Zero-terminated, sorted list. */
+static const special_casing_char_t special_casing_code_points[] = {
+%(code_points)s
+};
+
+/* If buf.len_chars has this bit set, the character maps to itself. */
+#define SPECIAL_CASING_NO_CHANGE_BIT 0x80
+
+static const struct casing_str_buf special_casing_entries[] = {
+%(entries)s
+};
+'''
+
+MAX_DATA_BYTES_LENGTH = 6
+
+ASSERTS = (
+ ('casing_str_buf_data_must_be_at_least_%d_chars' % MAX_DATA_BYTES_LENGTH,
+ 'sizeof ((struct casing_str_buf*)0)->data >= %d' % MAX_DATA_BYTES_LENGTH),
+ ('CASE_UP_must_equal_0', 'CASE_UP == 0'),
+ ('CASE_DOWN_must_equal_1', 'CASE_DOWN == 1'),
+ ('CASE_CAPITALIZE_must_equal_2', 'CASE_CAPITALIZE == 2')
+)
+
+
+def encode(code, code_points):
+ """Convert a space-separated list of code points into UTF-8 C-string.
+
+ Args:
+ code: Code point this mapping is for.
+ code_points: A space-separated list of hexadecimal numbers representing
+ code points in desired representation.
+
+ Returns:
+ A (literal, len_chars, len_bytes) tuple. len_chars has the 0x80 bit set
+ if the code point maps to itself.
+ """
+ code_points = [int(cp, 16) for cp in code_points.split()]
+ len_chars = len(code_points)
+ if len_chars == 1 and code_points[0] == code:
+ len_chars = len_chars | 0x80
+ val = ''.join(unichr(cp) for cp in code_points).encode('utf-8')
+ ret = ''
+ for ch in val:
+ o = ord(ch)
+ if o < 32 or o >= 127:
+ ch = '\\x%02x' % o
+ ret += ch
+ return '"%s"' % ret, len_chars, len(val)
+
+
+def read_entries(fd):
+ """Read entries from SpecialCasing.txt file.
+
+ Conditional entries are ignored.
+
+ Args:
+ fd: File object to read data from.
+
+ Returns:
+ A list of [code, up, down, title, comment] lists, where up, down and
+ title are (literal, len_chars, len_bytes) tuples as returned by encode.
+ """
+ idx_lower = 1
+ idx_title = 2
+ idx_upper = 3
+ # This order must match CASE_UP, CASE_DOWN and CASE_CAPITALIZE from
+ # casefiddle.c. This ordering (which is different than in SpecialCasing.txt
+ # file) is checked via static asserts included in the generated file.
+ indexes = (idx_upper, idx_lower, idx_title)
+
+ entries = []
+ for line in fd:
+ line = line.strip()
+ if not line or line[0] == '#':
+ continue
+
+ line = re.split(r';\s*', line)
+ if len(line) == 6:
+ # Conditional special casing don’t go into special-casing.h
+ #sys.stderr.write('make-special-casing: %s: conditions present '
+ # '(%s), ignoring\n' % (line[0], line[4]))
+ continue
+
+ code = int(line[0], 16)
+ entry = [code]
+
+ for i in indexes:
+ val = encode(code, line[i])
+ entry.append(val)
+ if val:
+ # The data structure we’re using assumes that all C strings are
+ # no more than six bytes (excluding NUL terminator). Enforce
+ # that here.
+ assert val[2] <= MAX_DATA_BYTES_LENGTH, (code, i, val)
+
+ entry.append(line[4].strip(' #').capitalize())
+ entries.append(entry)
+
+ entries.sort()
+ return entries
+
+
+def format_output(entries):
+ # If all code points are 16-bit prefer using uint16_t since it makes the
+ # array smaller and more cache friendly.
+ if all(entry[0] <= 0xffff for entry in entries):
+ cp_type, cp_fmt = 'uint16_t', '%04X'
+ else:
+ cp_type, cp_fmt = 'uint32_t', '%06X'
+
+ fmt = '0x%sU' % cp_fmt;
+ code_points = ', '.join(fmt % entry[0] for entry in entries)
+
+ lines = []
+ for entry in entries:
+ lines.append(' /* U+%s %%s */' % cp_fmt % (entry[0], entry[4]))
+ for val in entry[1:4]:
+ lines.append(' { %s, %d, %d },' % val)
+ lines[-1] = lines[-1].rstrip(',')
+
+ return TEMPLATE % {
+ 'generator': os.path.basename(__file__),
+ 'asserts': '\n'.join(' char %s[%s ? 1 : -1];' % p for p in ASSERTS),
+ 'code_point_type': cp_type,
+ 'code_points': textwrap.fill(code_points + ', 0', width=80,
+ initial_indent=' ',
+ subsequent_indent=' '),
+ 'entries': '\n'.join(lines)
+ }
+
+
+def main(argv):
+ if len(argv) != 3:
+ sys.stderr.write('usage: %s SpecialCasing.txt special-casing.h\n' %
+ argv[0])
+ sys.exit(1)
+
+ with open(argv[1]) as fd:
+ entries = read_entries(fd)
+
+ data = format_output(entries)
+
+ with open(argv[2], 'w') as fd:
+ fd.write(data)
+
+
+if __name__ == '__main__':
+ main(sys.argv)
diff --git a/test/lisp/char-fold-tests.el b/test/lisp/char-fold-tests.el
index 485254a..821c701 100644
--- a/test/lisp/char-fold-tests.el
+++ b/test/lisp/char-fold-tests.el
@@ -54,6 +54,14 @@ char-fold--test-search-with-contents
(concat w1 "\s\n\s\t\f\t\n\r\t" w2)
(concat w1 (make-string 10 ?\s) w2)))))
+(defun char-fold--ascii-upcase (string)
+ "Like `upcase' but acts on ASCII characters only."
+ (replace-regexp-in-string "[a-z]+" 'upcase string))
+
+(defun char-fold--ascii-downcase (string)
+ "Like `downcase' but acts on ASCII characters only."
+ (replace-regexp-in-string "[a-z]+" 'downcase string))
+
(defun char-fold--test-match-exactly (string &rest strings-to-match)
(let ((re (concat "\\`" (char-fold-to-regexp string) "\\'")))
(dolist (it strings-to-match)
@@ -61,8 +69,8 @@ char-fold--test-match-exactly
;; Case folding
(let ((case-fold-search t))
(dolist (it strings-to-match)
- (should (string-match (upcase re) (downcase it)))
- (should (string-match (downcase re) (upcase it)))))))
+ (should (string-match (char-fold--ascii-upcase re) (downcase it)))
+ (should (string-match (char-fold--ascii-downcase re) (upcase it)))))))
(ert-deftest char-fold--test-some-defaults ()
(dolist (it '(("ffl" . "ffl") ("ffi" . "ffi")
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index def74a0..ae557d7 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -143,16 +143,13 @@ casefiddle-tests--characters
("DŽUNGLA" "DŽUNGLA" "džungla" "Džungla" "DžUNGLA")
("Džungla" "DŽUNGLA" "džungla" "Džungla" "Džungla")
("džungla" "DŽUNGLA" "džungla" "Džungla" "Džungla")
+ ("define" "DEFINE" "define" "Define" "Define")
+ ("fish" "FISH" "fish" "Fish" "Fish")
+ ("Straße" "STRASSE" "straße" "Straße" "Straße")
;; FIXME: Everything below is broken at the moment. Here’s what
;; should happen:
- ;;("define" "DEFINE" "define" "Define" "Define")
- ;;("fish" "FIsh" "fish" "Fish" "Fish")
- ;;("Straße" "STRASSE" "straße" "Straße" "Straße")
;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
;; And here’s what is actually happening:
- ("define" "DEfiNE" "define" "Define" "Define")
- ("fish" "fiSH" "fish" "fish" "fish")
- ("Straße" "STRAßE" "straße" "Straße" "Straße")
("ΌΣΟΣ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "ΌΣΟΣ")
("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος"))
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 08/18] Support casing characters which map into multiple code points
2016-10-04 1:10 ` bug#24603: [RFC 08/18] Support casing characters which map into multiple code points Michal Nazarewicz
@ 2016-10-04 7:38 ` Eli Zaretskii
2016-10-06 21:40 ` Michal Nazarewicz
0 siblings, 1 reply; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-04 7:38 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Date: Tue, 4 Oct 2016 03:10:31 +0200
>
> * src/make-special-casing.py: New script to generate special-casing.h
> file from the SpecialCasing.txt data file.
Please do this without Python, either in Emacs Lisp and/or the tools
already used in admin/unidata, including awk. Python is still not
available as widely as the other tools.
> +special-casing.h: make-special-casing.py ../admin/unidata/SpecialCasing.txt
> + $(AM_V_GEN)
> + python $^ $@
Don't use a literal name of a program, so users could specify their
name and/or absolute file name at build time. See what we do with
awk, for example.
> +#include "special-casing.h"
Why not a shorter 'casing.h'?
Once again, this stores the casing rules in C, whereas I'd prefer to
have them in tables accessible from Lisp.
> @@ -194,7 +276,9 @@ casify_object (enum case_action flag, Lisp_Object obj)
> DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
> doc: /* Convert argument to upper case and return that.
> The argument may be a character or string. The result has the same type.
> -The argument object is not altered--the value is a copy.
> +The argument object is not altered--the value is a copy. If argument
> +is a character, characters which map to multiple code points when
> +cased, e.g. fi, are returned unchanged.
> See also `capitalize', `downcase' and `upcase-initials'. */)
I think this doc string should say what to do if the application wants
to convert fi into "FI".
Thanks.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 08/18] Support casing characters which map into multiple code points
2016-10-04 7:38 ` Eli Zaretskii
@ 2016-10-06 21:40 ` Michal Nazarewicz
2016-10-07 7:46 ` Eli Zaretskii
0 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-06 21:40 UTC (permalink / raw)
To: Eli Zaretskii; +Cc: 24603
On Tue, Oct 04 2016, Eli Zaretskii wrote:
>> From: Michal Nazarewicz <mina86@mina86.com>
>> Date: Tue, 4 Oct 2016 03:10:31 +0200
>>
>> * src/make-special-casing.py: New script to generate special-casing.h
>> file from the SpecialCasing.txt data file.
>
> Please do this without Python, either in Emacs Lisp and/or the tools
> already used in admin/unidata, including awk. Python is still not
> available as widely as the other tools.
Will do.
>> +special-casing.h: make-special-casing.py ../admin/unidata/SpecialCasing.txt
>> + $(AM_V_GEN)
>> + python $^ $@
>
> Don't use a literal name of a program, so users could specify their
> name and/or absolute file name at build time. See what we do with
> awk, for example.
Will do.
>> +#include "special-casing.h"
>
> Why not a shorter 'casing.h'?
It includes data from SpecialCasing.txt only so I figured
‘special-casing.h’ would be a more descriptive name. I can change it to
‘casing.h’ if you prefer.
> Once again, this stores the casing rules in C, whereas I'd prefer to
> have them in tables accessible from Lisp.
There are a few reasons to hard-code the special casing rules in C.
Some of them have conditions (those are implemented in later patches)
which are non-trivial to encode in Lisp. Some look backwards
(e.g. After_Soft_Dotted) and some look forward (e.g. Not_Before_Dot) and
not necessarily only one character forward (e.g. More_Above).
By hard-coding the implementation, each of the predicates can be handled
in a custom way such that the code only ever looks at current and one
character forward. Not to mention that is likely faster.
Furthermore, by not having the data in Lisp I can make certain
assumptions. For example that a single character will get changed into
a sequence of at most six bytes. Having to deal with arbitrary data
that a user may have put in the Lisp data would further complicate the
code, and the flexibility is not worth it.
There is also the aspect that not all of the language-dependent rules
implemented in this patchset are part of Unicode. Dutch IJ (when
spelled as separate ASCII characters) is not covered by
SpecialCasing.txt. Similarly, I might also get around to implementing
Irish rules¹. Mixing information from SpecialCasing.txt and other
sources feels a bit messy.
¹ https://bugzilla.mozilla.org/show_bug.cgi?id=1018805
>> @@ -194,7 +276,9 @@ casify_object (enum case_action flag, Lisp_Object obj)
>> DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
>> doc: /* Convert argument to upper case and return that.
>> The argument may be a character or string. The result has the same type.
>> -The argument object is not altered--the value is a copy.
>> +The argument object is not altered--the value is a copy. If argument
>> +is a character, characters which map to multiple code points when
>> +cased, e.g. fi, are returned unchanged.
>> See also `capitalize', `downcase' and `upcase-initials'. */)
>
> I think this doc string should say what to do if the application wants
> to convert fi into "FI".
Perhaps it would be better to describe it in Info page and link that
from the docstrings? The reason I’m suggesting that is that there are
11 functions defined in src/casefiddle.c and a lot of the documentation
like that (some of which upcoming in future patches) should be included
in all of them but that would mean either repeating the same thing over
and over or linking to one particular function, but then which one
should be the special one? If all of this was moved to Info page and it
linked from docstring, the problem would go away.
--
Best regards
ミハウ “𝓶𝓲𝓷𝓪86” ナザレヴイツ
«If at first you don’t succeed, give up skydiving»
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 08/18] Support casing characters which map into multiple code points
2016-10-06 21:40 ` Michal Nazarewicz
@ 2016-10-07 7:46 ` Eli Zaretskii
2017-01-28 23:48 ` Michal Nazarewicz
0 siblings, 1 reply; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-07 7:46 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Cc: 24603@debbugs.gnu.org
> Date: Thu, 06 Oct 2016 23:40:11 +0200
>
> >> +#include "special-casing.h"
> >
> > Why not a shorter 'casing.h'?
>
> It includes data from SpecialCasing.txt only so I figured
> ‘special-casing.h’ would be a more descriptive name. I can change it to
> ‘casing.h’ if you prefer.
Shorter names are easier to deal with. Also, the "special" part might
beg the question: where's the "normal" part. But it's a minor nit,
admittedly. If you feel strongly about your name, I won't fight that.
> > Once again, this stores the casing rules in C, whereas I'd prefer to
> > have them in tables accessible from Lisp.
>
> There are a few reasons to hard-code the special casing rules in C.
>
> Some of them have conditions (those are implemented in later patches)
> which are non-trivial to encode in Lisp. Some look backwards
> (e.g. After_Soft_Dotted) and some look forward (e.g. Not_Before_Dot) and
> not necessarily only one character forward (e.g. More_Above).
>
> By hard-coding the implementation, each of the predicates can be handled
> in a custom way such that the code only ever looks at current and one
> character forward. Not to mention that it is likely faster.
>
> Furthermore, by not having the data in Lisp I can make certain
> assumptions. For example that a single character will get changed into
> a sequence of at most six bytes. Having to deal with arbitrary data
> that user may have put in the lisp data would further complicate the
> code, and the flexibility is not worth it.
It doesn't have to be arbitrary Lisp data. It could be just a set of
flags stored in a Lisp structure whose implementation is in C.
It's IMO okay to have this hard-coded in C, if a Lisp based
implementation would be unreasonably complex and inelegant. But I
don't see it should be quite yet; maybe I'm missing something. May I
suggest that you try designing this, and if it turns out to be too
cumbersome, come back with the evidence?
> There is also the aspect that not all of the language-dependent rules
> implemented in this patchset are part of Unicode. Dutch IJ (when
> spelled as separate ASCII characters) is not covered by
> SpecialCasing.txt.
The way we deal with such augmentations is by having most of the data
auto-generated, and some of it maintained manually. One example is
the current characters.el and charscript.el it loads. Can we use a
similar approach in this case? Experience shows that maintaining
everything manually is error-prone and a huge maintenance head-ache in
the long run, what with a new version of the Unicode Standard
available at least once a year.
> >> @@ -194,7 +276,9 @@ casify_object (enum case_action flag, Lisp_Object obj)
> >> DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
> >> doc: /* Convert argument to upper case and return that.
> >> The argument may be a character or string. The result has the same type.
> >> -The argument object is not altered--the value is a copy.
> >> +The argument object is not altered--the value is a copy. If argument
> >> +is a character, characters which map to multiple code points when
> >> +cased, e.g. fi, are returned unchanged.
> >> See also `capitalize', `downcase' and `upcase-initials'. */)
> >
> > I think this doc string should say what to do if the application wants
> > to convert fi into "FI".
>
> Perhaps it would be better to describe it in Info page and link that
> from the docstrings?
Fine with me.
Thanks.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 08/18] Support casing characters which map into multiple code points
2016-10-07 7:46 ` Eli Zaretskii
@ 2017-01-28 23:48 ` Michal Nazarewicz
2017-02-10 9:12 ` Eli Zaretskii
0 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2017-01-28 23:48 UTC (permalink / raw)
To: Eli Zaretskii; +Cc: 24603
On Fri, Oct 07 2016, Eli Zaretskii wrote:
> The way we deal with such augmentations is by having most of the data
> auto-generated, and some of it maintained manually. One example is
> the current characters.el and charscript.el it loads. Can we use a
> similar approach in this case? Experience shows that maintaining
> everything manually is error-prone and a huge maintenance head-ache in
> the long run, what with a new version of the Unicode Standard
> available at least once a year.
The majority is handled automatically in both cases. My approach is
that rules that are conditional and those not included in Unicode are
manually maintained as C code.
In practice, if the Lisp data changes, C code that handles it would have
to change as well. For example, if Unicode adds rules for Dutch ‘ij’¹,
it would be done by adding an ‘After_Uppercased_I’ condition but then
for that rule to work it’s not enough to include it in Lisp data but it
has to be coded in C.
¹ ‘ij’ at the beginning of a word should be capitalised as ‘IJ’ not ‘Ij’.
There’s also the case of ‘More_Above’:
0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
The rule means that ‘I <combining chars>* <combining acute>’
(where <combining chars> is any sequence of combining characters)
should be lower-cased as ‘i <combining chars>* <combining dot
above> <combining acute>’².
The way the SpecialCasing rules are structured would want us to
scan the string from the point where we encountered I to look for any
combining characters and indeed this is how some libraries
implement it. The problem in Emacs is that the casefiddle.c
needs to work on strings as well as buffers which are different
data structures. As a result, scanning future characters needs
two different cases.
So instead, the way I implemented it is by flipping a bit in
casing_context so that case_character_impl knows to handle
combining characters correctly.
² Without addition of the <combining dot above>, the tittle (dot
above ‘i’) would disappear when rendering because of the
<combining acute> and that’s apparently not how Lithuanian is
supposed to work.
So, yeah… Of course, I’m a bit biased by the virtue of having
the code already written and not wanting to rewrite it (which
will probably take me another few months, *sighs*) but with the
conditional casing rules I’m honestly not convinced at the moment
that trying to keep them in Lisp data would be better.
Attached below is a new version of 08/18 with the unconditional
casing rules moved from C code to a uniprop char table (I haven’t
updated commit message yet). (Compared to previous version it’s
a bit more C code but overall 200-line AWK script is replaced by
around 50 lines of Elisp so overall the patch is shorter).
This also fixes issues with undo and cursor positioning that I’ve
mentioned before.
Both versions are available on GitHub:
- Elisp version: git://github.com/mina86/emacs.git master-el
- C version: git://github.com/mina86/emacs.git master
--
Best regards
ミハウ “𝓶𝓲𝓷𝓪86” ナザレヴイツ
«If at first you don’t succeed, give up skydiving»
From bbcf826071b158438a03ab3c9fea92528b915bc8 Mon Sep 17 00:00:00 2001
From: Michal Nazarewicz <mina86@mina86.com>
Date: Wed, 5 Oct 2016 00:06:01 +0200
Subject: [PATCH 08/19] Support casing characters which map into multiple code
points
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Implement unconditional special casing rules defined in Unicode standard.
Among other things, they deal with cases when a single code point is
replaced by multiple ones because simple character does not exist (e.g.
fi ligature turning into FI) or is not commonly used (e.g. ß turning into
SS).
* admin/unidata/SpecialCasing.txt: New data file pulled from Unicode
standard distribution.
* admin/unidata/README: Mention SpecialCasing.txt.
* src/make-special-casing.awk: New script to generate special-casing.h
file from the SpecialCasing.txt data file.
* src/casefiddle.c: Include special-casing.h so special casing rules are
available and can be used in the translation unit.
(struct casing_str_buf): New structure for representing short strings.
It’s used to compactly encode special casing rules.
(case_character_impl): New function which can handle one-to-many
character mappings.
(case_character, case_single_character): Wrappers for the above
functions. The former may map one character to multiple code points
while the latter does what the former used to do (i.e. handles
one-to-one mappings only).
(do_casify_integer, do_casify_unibyte_string,
do_casify_unibyte_region): Use case_single_character.
(do_casify_multibyte_string, do_casify_multibyte_region): Support new
features of case_character.
(do_casify_region): Update after do_casify_multibyte_string changes.
(upcase, capitalize, upcase-initials): Update documentation to mention
limitations when working on characters.
* test/src/casefiddle-tests.el (casefiddle-tests-casing): Update test
cases which are now passing.
* test/lisp/char-fold-tests.el (char-fold--ascii-upcase,
char-fold--ascii-downcase): New functions which behave like old ‘upcase’
and ‘downcase’.
(char-fold--test-match-exactly): Use the new functions. This is needed
because otherwise fi and similar characters are turned into their multi-
-character representation.
* doc/lispref/strings.texi: Describe issue with casing characters versus
strings.
---
admin/unidata/README | 4 +
admin/unidata/SpecialCasing.txt | 281 ++++++++++++++++++++++++++++++++++++
admin/unidata/unidata-gen.el | 40 ++++++
doc/lispref/strings.texi | 23 +++
etc/NEWS | 16 ++-
src/casefiddle.c | 305 +++++++++++++++++++++++++++++-----------
test/lisp/char-fold-tests.el | 12 +-
test/src/casefiddle-tests.el | 9 +-
8 files changed, 591 insertions(+), 99 deletions(-)
create mode 100644 admin/unidata/SpecialCasing.txt
diff --git a/admin/unidata/README b/admin/unidata/README
index 534670ce6db..06a66663a72 100644
--- a/admin/unidata/README
+++ b/admin/unidata/README
@@ -24,3 +24,7 @@ http://www.unicode.org/Public/8.0.0/ucd/Blocks.txt
NormalizationTest.txt
http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
2016-07-16
+
+SpecialCasing.txt
+http://unicode.org/Public/UNIDATA/SpecialCasing.txt
+2016-03-03
diff --git a/admin/unidata/SpecialCasing.txt b/admin/unidata/SpecialCasing.txt
new file mode 100644
index 00000000000..b23fa7f7680
--- /dev/null
+++ b/admin/unidata/SpecialCasing.txt
@@ -0,0 +1,281 @@
+# SpecialCasing-9.0.0.txt
+# Date: 2016-03-02, 18:55:13 GMT
+# © 2016 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+# For documentation, see http://www.unicode.org/reports/tr44/
+#
+# Special Casing
+#
+# This file is a supplement to the UnicodeData.txt file. It does not define any
+# properties, but rather provides additional information about the casing of
+# Unicode characters, for situations when casing incurs a change in string length
+# or is dependent on context or locale. For compatibility, the UnicodeData.txt
+# file only contains simple case mappings for characters where they are one-to-one
+# and independent of context and language. The data in this file, combined with
+# the simple case mappings in UnicodeData.txt, defines the full case mappings
+# Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc).
+#
+# Note that the preferred mechanism for defining tailored casing operations is
+# the Unicode Common Locale Data Repository (CLDR). For more information, see the
+# discussion of case mappings and case algorithms in the Unicode Standard.
+#
+# All code points not listed in this file that do not have a simple case mappings
+# in UnicodeData.txt map to themselves.
+# ================================================================================
+# Format
+# ================================================================================
+# The entries in this file are in the following machine-readable format:
+#
+# <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
+#
+# <code>, <lower>, <title>, and <upper> provide the respective full case mappings
+# of <code>, expressed as character values in hex. If there is more than one character,
+# they are separated by spaces. Other than as used to separate elements, spaces are
+# to be ignored.
+#
+# The <condition_list> is optional. Where present, it consists of one or more language IDs
+# or casing contexts, separated by spaces. In these conditions:
+# - A condition list overrides the normal behavior if all of the listed conditions are true.
+# - The casing context is always the context of the characters in the original string,
+# NOT in the resulting string.
+# - Case distinctions in the condition list are not significant.
+# - Conditions preceded by "Not_" represent the negation of the condition.
+# The condition list is not represented in the UCD as a formal property.
+#
+# A language ID is defined by BCP 47, with '-' and '_' treated equivalently.
+#
+# A casing context for a character is defined by Section 3.13 Default Case Algorithms
+# of The Unicode Standard.
+#
+# Parsers of this file must be prepared to deal with future additions to this format:
+# * Additional contexts
+# * Additional fields
+# ================================================================================
+
+# ================================================================================
+# Unconditional mappings
+# ================================================================================
+
+# The German es-zed is special--the normal mapping is to SS.
+# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))
+
+00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
+
+# Preserve canonical equivalence for I with dot. Turkic is handled below.
+
+0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+
+# Ligatures
+
+FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF
+FB01; FB01; 0046 0069; 0046 0049; # LATIN SMALL LIGATURE FI
+FB02; FB02; 0046 006C; 0046 004C; # LATIN SMALL LIGATURE FL
+FB03; FB03; 0046 0066 0069; 0046 0046 0049; # LATIN SMALL LIGATURE FFI
+FB04; FB04; 0046 0066 006C; 0046 0046 004C; # LATIN SMALL LIGATURE FFL
+FB05; FB05; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE LONG S T
+FB06; FB06; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE ST
+
+0587; 0587; 0535 0582; 0535 0552; # ARMENIAN SMALL LIGATURE ECH YIWN
+FB13; FB13; 0544 0576; 0544 0546; # ARMENIAN SMALL LIGATURE MEN NOW
+FB14; FB14; 0544 0565; 0544 0535; # ARMENIAN SMALL LIGATURE MEN ECH
+FB15; FB15; 0544 056B; 0544 053B; # ARMENIAN SMALL LIGATURE MEN INI
+FB16; FB16; 054E 0576; 054E 0546; # ARMENIAN SMALL LIGATURE VEW NOW
+FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
+
+# No corresponding uppercase precomposed character
+
+0149; 0149; 02BC 004E; 02BC 004E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+0390; 0390; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+03B0; 03B0; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+01F0; 01F0; 004A 030C; 004A 030C; # LATIN SMALL LETTER J WITH CARON
+1E96; 1E96; 0048 0331; 0048 0331; # LATIN SMALL LETTER H WITH LINE BELOW
+1E97; 1E97; 0054 0308; 0054 0308; # LATIN SMALL LETTER T WITH DIAERESIS
+1E98; 1E98; 0057 030A; 0057 030A; # LATIN SMALL LETTER W WITH RING ABOVE
+1E99; 1E99; 0059 030A; 0059 030A; # LATIN SMALL LETTER Y WITH RING ABOVE
+1E9A; 1E9A; 0041 02BE; 0041 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING
+1F50; 1F50; 03A5 0313; 03A5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI
+1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
+1F54; 1F54; 03A5 0313 0301; 03A5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
+1F56; 1F56; 03A5 0313 0342; 03A5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
+1FB6; 1FB6; 0391 0342; 0391 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI
+1FC6; 1FC6; 0397 0342; 0397 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI
+1FD2; 1FD2; 0399 0308 0300; 0399 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
+1FD3; 1FD3; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+1FD6; 1FD6; 0399 0342; 0399 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI
+1FD7; 1FD7; 0399 0308 0342; 0399 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
+1FE2; 1FE2; 03A5 0308 0300; 03A5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
+1FE3; 1FE3; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+1FE4; 1FE4; 03A1 0313; 03A1 0313; # GREEK SMALL LETTER RHO WITH PSILI
+1FE6; 1FE6; 03A5 0342; 03A5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI
+1FE7; 1FE7; 03A5 0308 0342; 03A5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
+1FF6; 1FF6; 03A9 0342; 03A9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI
+
+# IMPORTANT-when iota-subscript (0345) is uppercased or titlecased,
+# the result will be incorrect unless the iota-subscript is moved to the end
+# of any sequence of combining marks. Otherwise, the accents will go on the capital iota.
+# This process can be achieved by first transforming the text to NFC before casing.
+# E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA>
+
+# The following cases are already in the UnicodeData.txt file, so are only commented here.
+
+# 0345; 0345; 0345; 0399; # COMBINING GREEK YPOGEGRAMMENI
+
+# All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript)
+# have special uppercases.
+# Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase!
+
+1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
+1F81; 1F81; 1F89; 1F09 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
+1F82; 1F82; 1F8A; 1F0A 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1F83; 1F83; 1F8B; 1F0B 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1F84; 1F84; 1F8C; 1F0C 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1F85; 1F85; 1F8D; 1F0D 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1F86; 1F86; 1F8E; 1F0E 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1F87; 1F87; 1F8F; 1F0F 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1F88; 1F80; 1F88; 1F08 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
+1F89; 1F81; 1F89; 1F09 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
+1F8A; 1F82; 1F8A; 1F0A 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1F8B; 1F83; 1F8B; 1F0B 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1F8C; 1F84; 1F8C; 1F0C 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1F8D; 1F85; 1F8D; 1F0D 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1F8E; 1F86; 1F8E; 1F0E 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1F8F; 1F87; 1F8F; 1F0F 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1F90; 1F90; 1F98; 1F28 0399; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
+1F91; 1F91; 1F99; 1F29 0399; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
+1F92; 1F92; 1F9A; 1F2A 0399; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1F93; 1F93; 1F9B; 1F2B 0399; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1F94; 1F94; 1F9C; 1F2C 0399; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1F95; 1F95; 1F9D; 1F2D 0399; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1F96; 1F96; 1F9E; 1F2E 0399; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1F97; 1F97; 1F9F; 1F2F 0399; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1F98; 1F90; 1F98; 1F28 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
+1F99; 1F91; 1F99; 1F29 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
+1F9A; 1F92; 1F9A; 1F2A 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1F9B; 1F93; 1F9B; 1F2B 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1F9C; 1F94; 1F9C; 1F2C 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1F9D; 1F95; 1F9D; 1F2D 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1F9E; 1F96; 1F9E; 1F2E 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1F9F; 1F97; 1F9F; 1F2F 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1FA0; 1FA0; 1FA8; 1F68 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
+1FA1; 1FA1; 1FA9; 1F69 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
+1FA2; 1FA2; 1FAA; 1F6A 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1FA3; 1FA3; 1FAB; 1F6B 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1FA4; 1FA4; 1FAC; 1F6C 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1FA5; 1FA5; 1FAD; 1F6D 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1FA6; 1FA6; 1FAE; 1F6E 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1FA7; 1FA7; 1FAF; 1F6F 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1FA8; 1FA0; 1FA8; 1F68 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
+1FA9; 1FA1; 1FA9; 1F69 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
+1FAA; 1FA2; 1FAA; 1F6A 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1FAB; 1FA3; 1FAB; 1F6B 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1FAC; 1FA4; 1FAC; 1F6C 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1FAD; 1FA5; 1FAD; 1F6D 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1FAE; 1FA6; 1FAE; 1F6E 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1FAF; 1FA7; 1FAF; 1F6F 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1FB3; 1FB3; 1FBC; 0391 0399; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
+1FBC; 1FB3; 1FBC; 0391 0399; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+1FC3; 1FC3; 1FCC; 0397 0399; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
+1FCC; 1FC3; 1FCC; 0397 0399; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
+1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+
+# Some characters with YPOGEGRAMMENI also have no corresponding titlecases
+
+1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
+1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+1FC2; 1FC2; 1FCA 0345; 1FCA 0399; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
+1FC4; 1FC4; 0389 0345; 0389 0399; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+1FF2; 1FF2; 1FFA 0345; 1FFA 0399; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
+1FF4; 1FF4; 038F 0345; 038F 0399; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+
+1FB7; 1FB7; 0391 0342 0345; 0391 0342 0399; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+1FC7; 1FC7; 0397 0342 0345; 0397 0342 0399; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+
+# ================================================================================
+# Conditional Mappings
+# The remainder of this file provides conditional casing data used to produce
+# full case mappings.
+# ================================================================================
+# Language-Insensitive Mappings
+# These are characters whose full case mappings do not depend on language, but do
+# depend on context (which characters come before or after). For more information
+# see the header of this file and the Unicode Standard.
+# ================================================================================
+
+# Special case for final form of sigma
+
+03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
+
+# Note: the following cases for non-final are already in the UnicodeData.txt file.
+
+# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
+# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
+# 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA
+
+# Note: the following cases are not included, since they would case-fold in lowercasing
+
+# 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA
+# 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA
+
+# ================================================================================
+# Language-Sensitive Mappings
+# These are characters whose full case mappings depend on language and perhaps also
+# context (which characters come before or after). For more information
+# see the header of this file and the Unicode Standard.
+# ================================================================================
+
+# Lithuanian
+
+# Lithuanian retains the dot in a lowercase i when followed by accents.
+
+# Remove DOT ABOVE after "i" with upper or titlecase
+
+0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
+
+# Introduce an explicit dot above when lowercasing capital I's and J's
+# whenever there are more accents above.
+# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
+
+0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
+004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
+012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
+00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
+00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
+0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
+
+# ================================================================================
+
+# Turkish and Azeri
+
+# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
+# The following rules handle those cases.
+
+0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+
+# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
+# This matches the behavior of the canonically equivalent I-dot_above
+
+0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
+0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
+
+# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
+
+0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
+0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
+
+# When uppercasing, i turns into a dotted capital I
+
+0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
+0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
+
+# Note: the following case is already in the UnicodeData.txt file.
+
+# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
+
+# EOF
+
diff --git a/admin/unidata/unidata-gen.el b/admin/unidata/unidata-gen.el
index 3c5119a8a3d..5575f0e745a 100644
--- a/admin/unidata/unidata-gen.el
+++ b/admin/unidata/unidata-gen.el
@@ -268,6 +268,20 @@ unidata-prop-alist
The value nil means that the actual property value of a character
is the character itself."
string)
+ (special-casing
+ nil unidata-gen-table-special-casing "uni-special-casing.el"
+ "Unicode special casing mapping.
+
+Property value is nil or a three-element list of strings or characters. Each
+element denotes what characters maps into when upper-casing, lower-casing or
+title-casing respectively. String is used when the mapping is into an empty
+string or more than one character.
+
+The value nil means that no special casing rules exist for the character and
+`uppercase', `lowercase' or `titlecase' property needs to be consulted.
+
+The mapping includes only unconditional casing rules defined by Unicode."
+ nil)
(mirroring
unidata-gen-mirroring-list unidata-gen-table-character "uni-mirrored.el"
"Unicode bidi-mirroring characters.
@@ -1084,6 +1098,32 @@ unidata-gen-table-decomposition
\f
+
+(defun unidata-gen-table-special-casing (prop &rest ignore)
+ (let ((table (make-char-table 'char-code-property-table)))
+ (set-char-table-extra-slot table 0 prop)
+ (with-temp-buffer
+ (insert-file-contents (expand-file-name "SpecialCasing.txt" unidata-dir))
+ (goto-char (point-min))
+ (while (not (eobp))
+ (unless (or (eq (char-after) ?\n) (eq (char-after) ?#)) ;empty line or comment
+ (let ((line (split-string
+ (buffer-substring (point) (progn (end-of-line) (point)))
+ ";" "")))
+ ;; Ignore entries with conditions, i.e. those with six values.
+ (when (= (length line) 5)
+ (let ((ch (string-to-number (pop line) 16)) lo tc up)
+ (dolist (var '(lo tc up))
+ (let ((v (mapcar (lambda (num) (string-to-number num 16))
+ (split-string (pop line)))))
+ (set var (if (or (null v) (cdr v)) (apply 'string v) (car v)))))
+ ;; Order must match order of case_action enum fields defined in
+ ;; src/casefiddle.c
+ (set-char-table-range table ch (list up lo tc))))))
+ (forward-line)))
+ table))
+
+\f
(defun unidata-describe-general-category (val)
(cdr (assq val
'((nil . "Uknown")
diff --git a/doc/lispref/strings.texi b/doc/lispref/strings.texi
index cf47db4a814..ba1cf2606ce 100644
--- a/doc/lispref/strings.texi
+++ b/doc/lispref/strings.texi
@@ -1166,6 +1166,29 @@ Case Conversion
@end example
@end defun
+ Note that case conversion is not a one-to-one mapping and the length
+of the result may differ from the length of the argument (including
+being shorter). Furthermore, because passing a character forces
+return type to be a character, functions are unable to perform proper
+substitution and result may differ compared to treating
+a one-character string. For example:
+
+@example
+@group
+(upcase "ﬁ") ; note: single character, ligature "ﬁ" (U+FB01)
+ @result{} "FI"
+@end group
+@group
+(upcase ?ﬁ)
+ @result{} 64257 ; i.e. ?ﬁ
+@end group
+@end example
+
+ To avoid this, a character must first be converted into a string,
+using @code{string} function, before being passed to one of the casing
+functions. Of course, no assumptions on the length of the result may
+be made.
+
@xref{Text Comparison}, for functions that compare strings; some of
them ignore case differences, or can optionally ignore case differences.
diff --git a/etc/NEWS b/etc/NEWS
index 03790cac53f..bac396ecc18 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -325,13 +325,17 @@ same as in modes where the character is not whitespace.
Instead of only checking the modification time, Emacs now also checks
the file's actual content before prompting the user.
-** Title case characters are properly cased (from and into).
-'upcase', 'upcase-region' et al. convert title case characters (such
-as Dz) into their upper case form (such as DZ).
+** Various casing improvements.
-Similarly, 'capitalize', 'upcase-initials' et al. make use of
-title-case forms of initial characters (correctly producing for example
-Džungla instead of incorrect DŽungla).
+*** 'upcase', 'upcase-region' et al. convert title case characters
+(such as Dz) into their upper case form (such as DZ).
+
+*** 'capitalize', 'upcase-initials' et al. make use of title-case forms
+of initial characters (correctly producing for example Džungla instead
+of incorrect DŽungla).
+
+*** Characters which turn into multiple ones when cased are correctly handled.
+For example, fi ligature is converted to FI when upper cased.
\f
* Changes in Specialized Modes and Packages in Emacs 26.1
diff --git a/src/casefiddle.c b/src/casefiddle.c
index c09d0609367..8a03eaabeaf 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -29,6 +29,7 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
#include "composite.h"
#include "keymap.h"
+/* Order must match order in unidata-gen-table-special-casing. */
enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP};
/* State for casing individual characters. */
@@ -37,6 +38,9 @@ struct casing_context {
implies flag being CASE_CAPITALIZE or CASE_CAPITALIZE_UP (but the reverse
is not true). */
Lisp_Object titlecase_char_table;
+ /* The special-casing Unicode properties case table with unconditional special
+ casing rules defined by Unicode. */
+ Lisp_Object specialcase_char_table;
/* User-requested action. */
enum case_action flag;
/* If true, function operates on a buffer as opposed to a string or character.
@@ -61,6 +65,8 @@ prepare_casing_context (struct casing_context *ctx,
ctx->titlecase_char_table = (int)flag >= (int)CASE_CAPITALIZE
? uniprop_table (intern_c_string ("titlecase"))
: Qnil;
+ ctx->specialcase_char_table =
+ uniprop_table (intern_c_string ("special-casing"));
/* If the case table is flagged as modified, rescan it. */
if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
@@ -70,25 +76,117 @@ prepare_casing_context (struct casing_context *ctx,
SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */
}
-/* Based on CTX, case character CH accordingly. Update CTX as necessary.
- Return cased character. */
+struct casing_str_buf {
+ unsigned char data[MAX_MULTIBYTE_LENGTH > 6 ? MAX_MULTIBYTE_LENGTH : 6];
+ unsigned char len_chars;
+ unsigned char len_bytes;
+};
+
+/* Based on CTX, case character CH. If BUF is NULL, return cased character.
+ Otherwise, if BUF is non-NULL, save result in it and return whether the
+ character has been changed.
+
+ Since meaning of return value depends on arguments, it’s more convenient to
+ use case_single_character or case_character instead. */
static int
-case_character (struct casing_context *ctx, int ch)
+case_character_impl (struct casing_str_buf *buf,
+ struct casing_context *ctx, int ch)
{
+ enum case_action flag;
Lisp_Object prop;
+ bool was_inword;
+ int cased;
+
+ /* Update inword state */
+ was_inword = ctx->inword;
+ if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
+ ctx->inword = SYNTAX (ch) == Sword &&
+ (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
+
+ /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
+ if (!was_inword)
+ flag = ctx->flag == CASE_UP ? CASE_UP : CASE_CAPITALIZE;
+ else if (ctx->flag != CASE_CAPITALIZE_UP)
+ flag = CASE_DOWN;
+ else
+ {
+ cased = ch;
+ goto done;
+ }
- if (ctx->inword)
- ch = ctx->flag == CASE_CAPITALIZE_UP ? ch : downcase (ch);
+ /* Look through the special casing entries. */
+ if (buf && !NILP(ctx->specialcase_char_table))
+ {
+ prop = CHAR_TABLE_REF(ctx->specialcase_char_table, ch);
+ switch (flag) {
+ case CASE_CAPITALIZE:
+ case CASE_CAPITALIZE_UP:
+ if (!CONSP(prop))
+ break;
+ prop = XCDR(prop);
+ /* FALL THROUGH */
+ case CASE_DOWN:
+ if (!CONSP(prop))
+ break;
+ prop = XCDR(prop);
+ /* FALL THROUGH */
+ default:
+ if (!CONSP(prop))
+ break;
+ prop = XCAR(prop);
+ if (INTEGERP(prop)) {
+ cased = XINT(prop);
+ if (0 <= cased && cased <= MAX_CHAR)
+ goto done;
+ } else if (STRINGP(prop)) {
+ struct Lisp_String *str = XSTRING(prop);
+ if (STRING_BYTES(str) <= sizeof buf->data) {
+ buf->len_chars = str->size;
+ buf->len_bytes = STRING_BYTES(str);
+ memcpy(buf->data, str->data, buf->len_bytes);
+ return 1;
+ }
+ }
+ }
+ }
+
+ /* Handle simple, one-to-one case. */
+ if (flag == CASE_DOWN)
+ cased = downcase (ch);
else if (!NILP (ctx->titlecase_char_table) &&
CHARACTERP (prop = CHAR_TABLE_REF (ctx->titlecase_char_table, ch)))
- ch = XFASTINT (prop);
+ cased = XFASTINT (prop);
else
- ch = upcase(ch);
+ cased = upcase(ch);
+
+ /* And we’re done. */
+ done:
+ if (!buf)
+ return cased;
+ buf->len_chars = 1;
+ buf->len_bytes = CHAR_STRING (cased, buf->data);
+ return cased != ch;
+}
- if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
- ctx->inword = SYNTAX (ch) == Sword &&
- (!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch));
- return ch;
+/* Based on CTX, case character CH accordingly. Update CTX as necessary.
+ Return cased character.
+
+ Special casing rules (such as upcase(fi) = FI) are not handled. For
+ characters whose casing results in multiple code points, the character is
+ returned unchanged. */
+static inline int
+case_single_character (struct casing_context *ctx, int ch)
+{
+ return case_character_impl (NULL, ctx, ch);
+}
+
+/* Save in BUF result of casing character CH. Return whether casing changed the
+ character. This is like case_single_character but also handles one-to-many
+ casing rules. */
+static inline bool
+case_character (struct casing_str_buf *buf, struct casing_context *ctx, int ch)
+{
+ return case_character_impl (buf, ctx, ch);
}
\f
static Lisp_Object
@@ -115,7 +213,7 @@ do_casify_integer (struct casing_context *ctx, Lisp_Object obj)
!NILP (BVAR (current_buffer, enable_multibyte_characters)));
if (! multibyte)
MAKE_CHAR_MULTIBYTE (ch);
- cased = case_character (ctx, ch);
+ cased = case_single_character (ctx, ch);
if (cased == ch)
return obj;
@@ -128,25 +226,34 @@ do_casify_integer (struct casing_context *ctx, Lisp_Object obj)
static Lisp_Object
do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
{
- ptrdiff_t i, i_byte, size = SCHARS (obj);
- int len, ch, cased;
+ /* We assume data is the first member of casing_str_buf structure so that if
+ we cast a (char *) into (struct casing_str_buf *) the representation of the
+ character is at the beginning of the buffer. This is why we don’t need
+ separate struct casing_str_buf object but rather write directly to o. */
+ typedef char static_assertion[offsetof(struct casing_str_buf, data) ? -1 : 1];
+
+ ptrdiff_t size = SCHARS (obj), n;
+ int ch;
USE_SAFE_ALLOCA;
- ptrdiff_t o_size;
- if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &o_size))
- o_size = PTRDIFF_MAX;
- unsigned char *dst = SAFE_ALLOCA (o_size);
+ if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &n) ||
+ INT_ADD_WRAPV (n, sizeof(struct casing_str_buf), &n))
+ n = PTRDIFF_MAX;
+ unsigned char *const dst = SAFE_ALLOCA (n), *const dst_end = dst + n;
unsigned char *o = dst;
- for (i = i_byte = 0; i < size; i++, i_byte += len)
+ const unsigned char *src = SDATA (obj);
+
+ for (n = 0; size; --size)
{
- if (o_size - MAX_MULTIBYTE_LENGTH < o - dst)
+ if (dst_end - o < sizeof(struct casing_str_buf))
string_overflow ();
- ch = STRING_CHAR_AND_LENGTH (SDATA (obj) + i_byte, len);
- cased = case_character (ctx, ch);
- o += CHAR_STRING (cased, o);
+ ch = STRING_CHAR_ADVANCE (src);
+ case_character ((void *)o, ctx, ch);
+ n += ((struct casing_str_buf *)o)->len_chars;
+ o += ((struct casing_str_buf *)o)->len_bytes;
}
- eassert (o - dst <= o_size);
- obj = make_multibyte_string ((char *) dst, size, o - dst);
+ eassert (o <= dst_end);
+ obj = make_multibyte_string ((char *) dst, n, o - dst);
SAFE_FREE ();
return obj;
}
@@ -162,7 +269,7 @@ do_casify_unibyte_string (struct casing_context *ctx, Lisp_Object obj)
{
ch = SREF (obj, i);
MAKE_CHAR_MULTIBYTE (ch);
- cased = case_character (ctx, ch);
+ cased = case_single_character (ctx, ch);
if (ch == cased)
continue;
MAKE_CHAR_UNIBYTE (cased);
@@ -194,7 +301,9 @@ casify_object (enum case_action flag, Lisp_Object obj)
DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
doc: /* Convert argument to upper case and return that.
The argument may be a character or string. The result has the same type.
-The argument object is not altered--the value is a copy.
+The argument object is not altered--the value is a copy. If argument
+is a character, characters which map to multiple code points when
+cased, e.g. fi, are returned unchanged.
See also `capitalize', `downcase' and `upcase-initials'. */)
(Lisp_Object obj)
{
@@ -215,7 +324,9 @@ DEFUN ("capitalize", Fcapitalize, Scapitalize, 1, 1, 0,
This means that each word's first character is upper case (more
precisely, if available, title case) and the rest is lower case.
The argument may be a character or string. The result has the same type.
-The argument object is not altered--the value is a copy. */)
+The argument object is not altered--the value is a copy. If argument
+is a character, characters which map to multiple code points when
+cased, e.g. fi, are returned unchanged. */)
(Lisp_Object obj)
{
return casify_object (CASE_CAPITALIZE, obj);
@@ -228,21 +339,28 @@ DEFUN ("upcase-initials", Fupcase_initials, Supcase_initials, 1, 1, 0,
(More precisely, if available, initial of each word is converted to
title-case). Do not change the other letters of each word.
The argument may be a character or string. The result has the same type.
-The argument object is not altered--the value is a copy. */)
+The argument object is not altered--the value is a copy. If argument
+is a character, characters which map to multiple code points when
+cased, e.g. fi, are returned unchanged. */)
(Lisp_Object obj)
{
return casify_object (CASE_CAPITALIZE_UP, obj);
}
\f
-/* Based on CTX, case region in a unibyte buffer from POS to *ENDP. Return
- first position that has changed and save last position in *ENDP. If no
- characters were changed, return -1 and *ENDP is unspecified. */
+/* Based on CTX, case region in a unibyte buffer from *STARTP to *ENDP.
+
+ Save first and last positions that have changed in *STARTP and *ENDP
+ respectively. If no characters were changed, save -1 to *STARTP and leave
+ *ENDP unspecified.
+
+ Always return 0. This is so that interface of this function is the same as
+ do_casify_multibyte_region. */
static ptrdiff_t
do_casify_unibyte_region (struct casing_context *ctx,
- ptrdiff_t pos, ptrdiff_t *endp)
+ ptrdiff_t *startp, ptrdiff_t *endp)
{
ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */
- ptrdiff_t end = *endp;
+ ptrdiff_t pos = *startp, end = *endp;
int ch, cased;
for (; pos < end; ++pos)
@@ -250,11 +368,11 @@ do_casify_unibyte_region (struct casing_context *ctx,
ch = FETCH_BYTE (pos);
MAKE_CHAR_MULTIBYTE (ch);
- cased = case_character (ctx, ch);
+ cased = case_single_character (ctx, ch);
if (cased == ch)
continue;
- last = pos;
+ last = pos + 1;
if (first < 0)
first = pos;
@@ -262,88 +380,107 @@ do_casify_unibyte_region (struct casing_context *ctx,
FETCH_BYTE (pos) = cased;
}
- *endp = last + 1;
- return first;
+ *startp = first;
+ *endp = last;
+ return 0;
}
-/* Based on CTX, case region in a multibyte buffer from POS to *ENDP. Return
- first position that has changed and save last position in *ENDP. If no
- characters were changed, return -1 and *ENDP is unspecified. */
+/* Based on CTX, case region in a multibyte buffer from *STARTP to *ENDP.
+
+ Return number of added characters (may be negative if more characters were
+ deleted than inserted), save first and last positions that have changed in
+ *STARTP and *ENDP respectively. If no characters were changed, return 0,
+ save -1 to *STARTP and leave *ENDP unspecified. */
static ptrdiff_t
do_casify_multibyte_region (struct casing_context *ctx,
- ptrdiff_t pos, ptrdiff_t *endp)
+ ptrdiff_t *startp, ptrdiff_t *endp)
{
ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */
- ptrdiff_t pos_byte = CHAR_TO_BYTE (pos), end = *endp;
- ptrdiff_t opoint = PT;
+ ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
+ ptrdiff_t opoint = PT, added = 0;
+ struct casing_str_buf buf;
int ch, cased, len;
- while (pos < end)
+ for (; size; --size)
{
ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
- cased = case_character (ctx, ch);
- if (cased != ch)
+ if (!case_character (&buf, ctx, ch))
+ {
+ pos_byte += len;
+ ++pos;
+ continue;
+ }
+
+ last = pos + buf.len_chars;
+ if (first < 0)
+ first = pos;
+
+ if (buf.len_chars == 1 && buf.len_bytes == len)
+ memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len);
+ else
{
- last = pos;
- if (first < 0)
- first = pos;
-
- if (ASCII_CHAR_P (cased) && ASCII_CHAR_P (ch))
- FETCH_BYTE (pos_byte) = cased;
- else
- {
- unsigned char str[MAX_MULTIBYTE_LENGTH];
- int totlen = CHAR_STRING (cased, str);
- if (len == totlen)
- memcpy (BYTE_POS_ADDR (pos_byte), str, len);
- else
- /* Replace one character with the other(s), keeping text
- properties the same. */
- replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
- (char *) str, 9, totlen, 0);
- len = totlen;
- }
+ /* Replace one character with the other(s), keeping text
+ properties the same. */
+ replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
+ (const char *) buf.data, buf.len_chars,
+ buf.len_bytes,
+ 0);
+ added += (ptrdiff_t) buf.len_chars - 1;
+ if (opoint > pos)
+ opoint += (ptrdiff_t) buf.len_chars - 1;
}
- pos++;
- pos_byte += len;
+
+ pos_byte += buf.len_bytes;
+ pos += buf.len_chars;
}
if (PT != opoint)
TEMP_SET_PT_BOTH (opoint, CHAR_TO_BYTE (opoint));
+ *startp = first;
*endp = last;
- return first;
+ return added;
}
-/* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP.
- b and e specify range of buffer to operate on. */
-static void
+/* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP. b and
+ e specify range of buffer to operate on. Return character position of the
+ end of the region after changes. */
+static ptrdiff_t
casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
{
+ ptrdiff_t start, end, orig_end, added;
struct casing_context ctx;
- ptrdiff_t start, end;
-
- if (EQ (b, e))
- /* Not modifying because nothing marked */
- return;
validate_region (&b, &e);
start = XFASTINT (b);
end = XFASTINT (e);
+ if (start == end)
+ /* Not modifying because nothing marked */
+ return end;
modify_text (start, end);
- record_change (start, end - start);
prepare_casing_context (&ctx, flag, true);
+ orig_end = end;
+ record_delete (start, make_buffer_string (start, end, true), false);
if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
- start = do_casify_unibyte_region (&ctx, start, &end);
+ {
+ record_insert (start, end - start);
+ added = do_casify_unibyte_region (&ctx, &start, &end);
+ }
else
- start = do_casify_multibyte_region (&ctx, start, &end);
+ {
+ ptrdiff_t len = end - start, ostart = start;
+ added = do_casify_multibyte_region (&ctx, &start, &end);
+ record_insert (ostart, len + added);
+ }
if (start >= 0)
{
- signal_after_change (start, end + 1 - start, end + 1 - start);
- update_compositions (start, end + 1, CHECK_ALL);
+ signal_after_change (start, end - start - added, end - start);
+ update_compositions (start, end, CHECK_ALL);
}
+
+ return orig_end + added;
}
DEFUN ("upcase-region", Fupcase_region, Supcase_region, 2, 3,
@@ -435,9 +572,7 @@ casify_word (enum case_action flag, Lisp_Object arg)
ptrdiff_t farend = scan_words (PT, XINT (arg));
if (!farend)
farend = XINT (arg) <= 0 ? BEGV : ZV;
- ptrdiff_t newpoint = max (PT, farend);
- casify_region (flag, make_number (PT), make_number (farend));
- SET_PT (newpoint);
+ SET_PT (casify_region (flag, make_number (PT), make_number (farend)));
return Qnil;
}
diff --git a/test/lisp/char-fold-tests.el b/test/lisp/char-fold-tests.el
index d86c731b6e3..00bc3c83d05 100644
--- a/test/lisp/char-fold-tests.el
+++ b/test/lisp/char-fold-tests.el
@@ -54,6 +54,14 @@ char-fold--test-search-with-contents
(concat w1 "\s\n\s\t\f\t\n\r\t" w2)
(concat w1 (make-string 10 ?\s) w2)))))
+(defun char-fold--ascii-upcase (string)
+ "Like `upcase' but acts on ASCII characters only."
+ (replace-regexp-in-string "[a-z]+" 'upcase string))
+
+(defun char-fold--ascii-downcase (string)
+ "Like `downcase' but acts on ASCII characters only."
+ (replace-regexp-in-string "[a-z]+" 'downcase string))
+
(defun char-fold--test-match-exactly (string &rest strings-to-match)
(let ((re (concat "\\`" (char-fold-to-regexp string) "\\'")))
(dolist (it strings-to-match)
@@ -61,8 +69,8 @@ char-fold--test-match-exactly
;; Case folding
(let ((case-fold-search t))
(dolist (it strings-to-match)
- (should (string-match (upcase re) (downcase it)))
- (should (string-match (downcase re) (upcase it)))))))
+ (should (string-match (char-fold--ascii-upcase re) (downcase it)))
+ (should (string-match (char-fold--ascii-downcase re) (upcase it)))))))
(ert-deftest char-fold--test-some-defaults ()
(dolist (it '(("ffl" . "ffl") ("ffi" . "ffi")
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index d7fe55f97d7..e347ed7b875 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -188,16 +188,13 @@ casefiddle-tests--test-casing
("DŽUNGLA" "DŽUNGLA" "džungla" "Džungla" "DžUNGLA")
("Džungla" "DŽUNGLA" "džungla" "Džungla" "Džungla")
("džungla" "DŽUNGLA" "džungla" "Džungla" "Džungla")
+ ("define" "DEFINE" "define" "Define" "Define")
+ ("fish" "FISH" "fish" "Fish" "Fish")
+ ("Straße" "STRASSE" "straße" "Straße" "Straße")
;; FIXME: Everything below is broken at the moment. Here’s what
;; should happen:
- ;;("define" "DEFINE" "define" "Define" "Define")
- ;;("fish" "FIsh" "fish" "Fish" "Fish")
- ;;("Straße" "STRASSE" "straße" "Straße" "Straße")
;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
;; And here’s what is actually happening:
- ("define" "DEfiNE" "define" "Define" "Define")
- ("fish" "fiSH" "fish" "fish" "fish")
- ("Straße" "STRAßE" "straße" "Straße" "Straße")
("ΌΣΟΣ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "ΌΣΟΣ")
("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος"))))))
--
2.11.0.483.g087da7b7c-goog
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 08/18] Support casing characters which map into multiple code points
2017-01-28 23:48 ` Michal Nazarewicz
@ 2017-02-10 9:12 ` Eli Zaretskii
0 siblings, 0 replies; 89+ messages in thread
From: Eli Zaretskii @ 2017-02-10 9:12 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Cc: 24603@debbugs.gnu.org
> Date: Sun, 29 Jan 2017 00:48:02 +0100
>
> Attached below is a new version of 08/18 with the unconditional
> casing rules moved from C code to a uniprop char table (I haven’t
> updated commit message yet). (Compared to previous version it’s
> a bit more C code but overall 200-line AWK script is replaced by
> around 50 lines of Elisp so overall the patch is shorter).
Thanks, this looks good.
It seems you've forgotten a log entry for the change in
unidata-gen.el. Also, the new uni-special-casing.el file defines a
new uniprop table, whose user-visible manifestation -- the new
char-code property -- I think should be mentioned in the ELisp
manual.
Thanks again for working on this.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 09/18] Implement special sigma casing rule
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (6 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 08/18] Support casing characters which map into multiple code points Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 7:22 ` Eli Zaretskii
2016-10-04 1:10 ` bug#24603: [RFC 10/18] Implement Turkic dotless and dotted i handling when casing strings Michal Nazarewicz
` (8 subsequent siblings)
16 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
In Greek, a sigma character has two lower case forms which depend on
their position in the word. Implement logic determining it.
* src/casefiddle.c (struct casing_context, case_character_impl): Don’t
assume inword is true when flag is CASE_UP and false when flag is
CASE_DOWN. For final sigma detection we need this information tracked
reliably.
(CAPITAL_SIGMA, SMALL_SIGMA, SMALL_FINAL_SIGMA): New macros defining
Unicode code point of different forms of sigma letter.
(do_casify_multibyte_string, do_casify_multibyte_region): Update after
changes to case_character.
* test/src/casefiddle-tests.el (casefiddle-tests-casing): Add test
cases for final sigma.
---
etc/NEWS | 4 +++
src/casefiddle.c | 72 +++++++++++++++++++++++++++++++++-----------
test/src/casefiddle-tests.el | 15 +++++----
3 files changed, 67 insertions(+), 24 deletions(-)
diff --git a/etc/NEWS b/etc/NEWS
index 3396f9f..4516812 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -247,6 +247,10 @@ of incorrect DŽungla).
*** Characters which turn into multiple ones when cased are correctly handled.
For example, fi ligature is converted to FI when upper cased.
+*** Greek small sigma is correctly handled when at the end of the word.
+Strings such as ΌΣΟΣ are now correctly converted to Όσος when
+capitalised to follow rules of Greek spelling.
+
\f
* Changes in Specialized Modes and Packages in Emacs 26.1
diff --git a/src/casefiddle.c b/src/casefiddle.c
index 35ff674..ace589c 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -51,9 +51,7 @@ struct casing_context {
When run on a buffer, syntax_prefix_flag_p is taken into account when
determined inword flag. */
bool inbuffer;
- /* Conceptually, this denotes whether we are inside of a word except
- that if flag is CASE_UP it’s always false and if flag is CASE_DOWN
- this is always true. */
+ /* Whether we are inside of a word. */
bool inword;
};
@@ -65,7 +63,7 @@ prepare_casing_context (struct casing_context *ctx,
{
ctx->flag = flag;
ctx->inbuffer = inbuffer;
- ctx->inword = flag == CASE_DOWN;
+ ctx->inword = false;
ctx->titlecase_char_table = (int)flag >= (int)CASE_CAPITALIZE
? uniprop_table (intern_c_string ("titlecase"))
: Qnil;
@@ -95,15 +93,16 @@ case_character_impl (struct casing_str_buf *buf,
/* Update inword state */
was_inword = ctx->inword;
- if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
- ctx->inword = SYNTAX (ch) == Sword &&
- (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
+ ctx->inword = SYNTAX (ch) == Sword &&
+ (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
/* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
- if (!was_inword)
- flag = ctx->flag == CASE_UP ? CASE_UP : CASE_CAPITALIZE;
+ if (ctx->flag == CASE_CAPITALIZE)
+ flag = (enum case_action)((int)ctx->flag - was_inword);
else if (ctx->flag != CASE_CAPITALIZE_UP)
- flag = CASE_DOWN;
+ flag = ctx->flag;
+ else if (!was_inword)
+ flag = CASE_CAPITALIZE;
else
{
cased = ch;
@@ -142,7 +141,18 @@ case_character_impl (struct casing_str_buf *buf,
buf->len_bytes = CHAR_STRING (cased, buf->data);
return cased != ch;
}
+\f
+/* In Greek, lower case sigma has two forms: one when used in the middle and one
+ when used at the end of a word. Below is to help handle those cases when
+ casing.
+
+ The rule does not conflict with any other casing rules so while it is
+ a conditional one, it is independent of language. */
+#define CAPITAL_SIGMA 0x03A3
+#define SMALL_SIGMA 0x03C3
+#define SMALL_FINAL_SIGMA 0x03C2
+\f
/* Based on CTX, case character CH accordingly. Update CTX as necessary.
Return cased character.
@@ -156,12 +166,34 @@ case_single_character (struct casing_context *ctx, int ch)
}
/* Save in BUF result of casing character CH. Return whether casing changed the
- character. This is like case_single_character but also handles one-to-many
- casing rules. */
-static inline bool
-case_character (struct casing_str_buf *buf, struct casing_context *ctx, int ch)
+ character.
+
+ If non-NULL, NEXT points to the next character in the string being cased. If
+ NULL, it is assumed the current character is the last one being cased. This is
+ used to apply rules which depend on the character that follows.
+
+ This is like case_single_character but also handles one-to-many casing
+ rules. */
+static bool
+case_character (struct casing_str_buf *buf, struct casing_context *ctx,
+ int ch, const unsigned char *next)
{
- return case_character_impl (buf, ctx, ch);
+ bool changed, was_inword;
+
+ was_inword = ctx->inword;
+ changed = case_character_impl (buf, ctx, ch);
+
+ /* If we have just down-cased a capital sigma and the next character no longer
+ has a word syntax (i.e. current character is end of word), use final
+ sigma. */
+ if (was_inword && ch == CAPITAL_SIGMA && changed &&
+ (!next || SYNTAX (STRING_CHAR (next)) != Sword))
+ {
+ buf->len_bytes = CHAR_STRING (SMALL_FINAL_SIGMA, buf->data);
+ buf->len_chars = 1;
+ }
+
+ return changed;
}
\f
static Lisp_Object
@@ -223,7 +255,7 @@ do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
if (dst_end - o < sizeof(struct casing_str_buf))
string_overflow ();
ch = STRING_CHAR_ADVANCE (src);
- case_character ((void *)o, ctx, ch);
+ case_character ((void *)o, ctx, ch, size > 1 ? src : NULL);
n += ((struct casing_str_buf *)o)->len_chars;
o += ((struct casing_str_buf *)o)->len_bytes;
}
@@ -365,13 +397,17 @@ do_casify_multibyte_region (struct casing_context *ctx,
ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
ptrdiff_t opoint = PT, added;
struct casing_str_buf buf;
- int ch, cased, len;
+ bool changed;
+ int ch, len;
for (; size; --size)
{
ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
+ changed = case_character (
+ &buf, ctx, ch,
+ size > 1 ? BYTE_POS_ADDR (pos_byte + len) : NULL);
- if (!case_character (&buf, ctx, ch))
+ if (!changed)
{
pos_byte += len;
++pos;
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index ae557d7..1bd745e 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -146,13 +146,16 @@ casefiddle-tests--characters
("define" "DEFINE" "define" "Define" "Define")
("fish" "FISH" "fish" "Fish" "Fish")
("Straße" "STRASSE" "straße" "Straße" "Straße")
- ;; FIXME: Everything below is broken at the moment. Here’s what
- ;; should happen:
- ;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
- ;; And here’s what is actually happening:
- ("ΌΣΟΣ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "ΌΣΟΣ")
- ("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος"))
+ ;; The word repeated twice to test behaviour at the end of a word
+ ;; inside of an input string as well as at the end of the string.
+ ("ΌΣΟΣ ΌΣΟΣ" "ΌΣΟΣ ΌΣΟΣ" "όσος όσος" "Όσος Όσος" "ΌΣΟΣ ΌΣΟΣ")
+ ;; What should be done with sole sigma? It is ‘final’ but on the
+ ;; other hand it does not form a word. Let’s use regular sigma.
+ ("Σ Σ" "Σ Σ" "σ σ" "Σ Σ" "Σ Σ")
+ ("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
+ ;; If sigma is already lower case, we don’t want to change it.
+ ("όσοσ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "Όσοσ"))
(nreverse errors))
(let* ((input (car test))
(expected (cdr test))
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 09/18] Implement special sigma casing rule
2016-10-04 1:10 ` bug#24603: [RFC 09/18] Implement special sigma casing rule Michal Nazarewicz
@ 2016-10-04 7:22 ` Eli Zaretskii
0 siblings, 0 replies; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-04 7:22 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Date: Tue, 4 Oct 2016 03:10:32 +0200
>
> In Greek, a sigma character has two lower case forms which depend on
> their position in the word. Implement logic determining it.
Once again, can we store the needed conversions from
SpecialCasing.txt, instead of hard-coding them in C?
> +*** Greek small sigma is correctly handled when at the end of the word.
> +Strings such as ΌΣΟΣ are now correctly converted to Όσος when
> +capitalised to follow rules of Greek spelling.
^^^^^^^^^^^
US English spelling, please.
Also, I suggest to show the previous "capitalization" results as well,
and explain the difference, as the difference is subtle and may not be
clear to the reader.
Thanks.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 10/18] Implement Turkic dotless and dotted i handling when casing strings
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (7 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 09/18] Implement special sigma casing rule Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 7:12 ` Eli Zaretskii
2016-10-04 1:10 ` bug#24603: [RFC 11/18] Implement casing rules for Lithuanian Michal Nazarewicz
` (7 subsequent siblings)
16 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
Implement part of Unicode special handling rules for Azeri and Turkish
languages, namely ‘i’ is paired with ‘İ’ while ‘ı’ is paired with ‘I’.
* src/casefiddle.c (struct casing_context, prepare_casing_context): Add
flag for handling of Turkic i.
(case_character_impl): Extract flag normalisation to a new function:
(normalise_flag): New function.
(case_single_character): Update after above changes.
(case_character): Rename to:
(case_characters): Add handling of Turkic i.
(do_casify_multibyte_string, do_casify_multibyte_region): Update to use
renamed case_characters.
* test/src/casefiddle-tests.el (casefiddle-tests-casing): Add test
cases for dotless and dotted i’s.
---
src/casefiddle.c | 251 ++++++++++++++++++++++++++++++-------------
test/src/casefiddle-tests.el | 37 ++++++-
2 files changed, 213 insertions(+), 75 deletions(-)
diff --git a/src/casefiddle.c b/src/casefiddle.c
index ace589c..2a7aa64 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -35,7 +35,8 @@ struct casing_str_buf {
unsigned char len_bytes;
};
-enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP};
+enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP,
+ /* Only for internal use: */ CASE_NO_ACTION};
#include "special-casing.h"
@@ -53,6 +54,8 @@ struct casing_context {
bool inbuffer;
/* Whether we are inside of a word. */
bool inword;
+ /* Whether to apply Azeri/Turkish rules for dotted and dotless i. */
+ bool treat_turkic_i;
};
/* Initialise CTX structure and prepares related global data for casing
@@ -61,6 +64,8 @@ static void
prepare_casing_context (struct casing_context *ctx,
enum case_action flag, bool inbuffer)
{
+ Lisp_Object lang, l, tr, az;
+
ctx->flag = flag;
ctx->inbuffer = inbuffer;
ctx->inword = false;
@@ -68,42 +73,77 @@ prepare_casing_context (struct casing_context *ctx,
? uniprop_table (intern_c_string ("titlecase"))
: Qnil;
+ ctx->treat_turkic_i = false;
+
/* If the case table is flagged as modified, rescan it. */
if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
Fset_case_table (BVAR (current_buffer, downcase_table));
if (inbuffer && (int) flag >= (int) CASE_CAPITALIZE)
SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */
+
+ /* FIXME: Is current-iso639-language the best source of that information? */
+ lang = Vcurrent_iso639_language;
+ tr = intern_c_string ("tr");
+ az = intern_c_string ("az");
+ if (SYMBOLP (lang))
+ {
+ l = lang;
+ goto check_language;
+ }
+ while (CONSP (lang))
+ {
+ l = XCAR (lang);
+ lang = XCDR (lang);
+ check_language:
+ if (EQ (l, tr) || EQ (l, az))
+ {
+ ctx->treat_turkic_i = true;
+ break;
+ }
+ }
+}
+
+/* Normalise CTX->flag and return CASE_UP, CASE_DOWN, CASE_CAPITALIZE or
+ CASE_NO_ACTION. The latter is returned if CTX->flag is CASE_CAPITALIZE_UP and
+ we are inside of a word. */
+static enum case_action
+normalise_flag (struct casing_context *ctx)
+{
+ /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
+ switch (ctx->flag) {
+ case CASE_CAPITALIZE:
+ return (enum case_action)((int)ctx->flag - ctx->inword);
+ case CASE_CAPITALIZE_UP:
+ return ctx->inword ? CASE_NO_ACTION : CASE_CAPITALIZE;
+ default:
+ return ctx->flag;
+ }
}
-/* Based on CTX, case character CH. If BUF is NULL, return cased character.
- Otherwise, if BUF is non-NULL, save result in it and return whether the
- character has been changed.
+/* Based on CTX and FLAG, case character CH. If BUF is NULL, return cased
+ character. Otherwise, if BUF is non-NULL, save result in it and return 0 if
+ the character changed or -1 if it didn’t.
+
+ FLAG may be one of CASE_UP, CASE_DOWN, CASE_CAPITALIZE (title-case if
+ possible, upper-case otherwise) or CASE_NO_ACTION. CTX->inword is not taken
+ into account when interpreting FLAG (it may be taken into account for other
+ decisions though).
Since meaning of return value depends on arguments, it’s more convenient to
- use case_single_character or case_character instead. */
+ use case_single_character or case_characters instead. */
static int
case_character_impl (struct casing_str_buf *buf,
- struct casing_context *ctx, int ch)
+ struct casing_context *ctx, enum case_action flag, int ch)
{
- enum case_action flag;
Lisp_Object prop;
- bool was_inword;
int cased;
/* Update inword state */
- was_inword = ctx->inword;
ctx->inword = SYNTAX (ch) == Sword &&
- (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
+ (!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch));
- /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
- if (ctx->flag == CASE_CAPITALIZE)
- flag = (enum case_action)((int)ctx->flag - was_inword);
- else if (ctx->flag != CASE_CAPITALIZE_UP)
- flag = ctx->flag;
- else if (!was_inword)
- flag = CASE_CAPITALIZE;
- else
+ if (flag == CASE_NO_ACTION)
{
cased = ch;
goto done;
@@ -120,7 +160,7 @@ case_character_impl (struct casing_str_buf *buf,
((it - special_casing_code_points) * 3 + (int)flag);
memcpy (buf, entry, sizeof *buf);
buf->len_chars &= ~SPECIAL_CASING_NO_CHANGE_BIT;
- return !(entry->len_chars & SPECIAL_CASING_NO_CHANGE_BIT);
+ return entry->len_chars & SPECIAL_CASING_NO_CHANGE_BIT ? -1 : 0;
}
}
@@ -139,7 +179,7 @@ case_character_impl (struct casing_str_buf *buf,
return cased;
buf->len_chars = 1;
buf->len_bytes = CHAR_STRING (cased, buf->data);
- return cased != ch;
+ return cased == ch ? -1 : 0;
}
\f
/* In Greek, lower case sigma has two forms: one when used in the middle and one
@@ -152,6 +192,13 @@ case_character_impl (struct casing_str_buf *buf,
#define CAPITAL_SIGMA 0x03A3
#define SMALL_SIGMA 0x03C3
#define SMALL_FINAL_SIGMA 0x03C2
+
+/* Azeri and Turkish have dotless and dotted i. An upper case of i is
+ İ while lower case of I is ı. */
+
+#define CAPITAL_DOTTED_I 0x130
+#define SMALL_DOTLESS_I 0x131
+#define COMBINING_DOT_ABOVE 0x307
\f
/* Based on CTX, case character CH accordingly. Update CTX as necessary.
Return cased character.
@@ -162,38 +209,88 @@ case_character_impl (struct casing_str_buf *buf,
static inline int
case_single_character (struct casing_context *ctx, int ch)
{
- return case_character_impl (NULL, ctx, ch);
+ enum case_action flag = normalise_flag (ctx);
+ return case_character_impl (NULL, ctx, flag, ch);
}
-/* Save in BUF result of casing character CH. Return whether casing changed the
- character.
+/* Save in BUF result of casing character CH.
If not-NULL, NEXT points to the next character in the cased string. If NULL,
it is assumed current character is the last one being cased. This is used to
apply some rules which depend on proceeding state.
- This is like case_single_character but also handles one-to-many casing
- rules. */
-static bool
-case_character (struct casing_str_buf *buf, struct casing_context *ctx,
- int ch, const unsigned char *next)
+ Return:
+ - -1 if character has not been changed,
+ - 0 if the character has changed or
+ - a positive number if the character CH and the one following it (pointed by
+ NEXT) map to character saved in BUF. Returned value is the length in bytes
+ of the next character.
+
+ This is like case_single_character but also handles one-to-many as well as
+ many-to-one and many-to-many casing rules. */
+static int
+case_characters (struct casing_str_buf *buf, struct casing_context *ctx,
+ int ch, const unsigned char *next)
{
- bool changed, was_inword;
+ enum case_action flag = normalise_flag (ctx);
- was_inword = ctx->inword;
- changed = case_character_impl (buf, ctx, ch);
+ if (flag != CASE_NO_ACTION && __builtin_expect(ctx->treat_turkic_i, false))
+ {
+ bool dot_above = false;
+ int cased = ch;
- /* If we have just down-cased a capital sigma and the next character no longer
- has a word syntax (i.e. current character is end of word), use final
- sigma. */
- if (was_inword && ch == CAPITAL_SIGMA && changed &&
- (!next || SYNTAX (STRING_CHAR (next)) != Sword))
+ switch (ch) {
+ case 'I':
+ if (flag == CASE_DOWN)
+ {
+ dot_above = next && STRING_CHAR (next) == COMBINING_DOT_ABOVE;
+ cased = dot_above ? 'i' : SMALL_DOTLESS_I;
+ }
+ break;
+
+ case 'i':
+ if (flag == CASE_UP || flag == CASE_CAPITALIZE)
+ cased = CAPITAL_DOTTED_I;
+ break;
+
+ case CAPITAL_DOTTED_I:
+ if (flag == CASE_DOWN)
+ cased = 'i';
+ break;
+
+ case SMALL_DOTLESS_I:
+ if (flag == CASE_UP || flag == CASE_CAPITALIZE)
+ cased = 'I';
+ break;
+
+ default:
+ goto not_turkic_i;
+ }
+
+ ctx->inword = true;
+ buf->len_chars = 1;
+ buf->len_bytes = CHAR_STRING (cased, buf->data);
+ if (dot_above)
+ return CHAR_BYTES (COMBINING_DOT_ABOVE);
+ else
+ return ch == cased ? -1 : 0;
+ }
+
+ not_turkic_i:
+ /* Capital sigma down-cases differently based on whether it’s last
+ letter of a word or not. */
+ if (flag == CASE_DOWN && ch == CAPITAL_SIGMA)
{
- buf->len_bytes = CHAR_STRING (SMALL_FINAL_SIGMA, buf->data);
+ ch = (ctx->inword && (!next || SYNTAX (STRING_CHAR (next)) != Sword))
+ ? SMALL_FINAL_SIGMA : SMALL_SIGMA;
+ buf->len_bytes = CHAR_STRING (ch, buf->data);
buf->len_chars = 1;
+ ctx->inword = true;
+ return 0;
}
- return changed;
+ /* Do the casing. */
+ return case_character_impl (buf, ctx, flag, ch);
}
\f
static Lisp_Object
@@ -240,7 +337,7 @@ do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
typedef char static_assertion[offsetof(struct casing_str_buf, data) ? -1 : 1];
ptrdiff_t size = SCHARS (obj), n;
- int ch;
+ int ch, len_bytes;
USE_SAFE_ALLOCA;
if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &n) ||
INT_ADD_WRAPV (n, sizeof(struct casing_str_buf), &n))
@@ -250,12 +347,16 @@ do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
const unsigned char *src = SDATA (obj);
- for (n = 0; size; --size)
+ n = 0;
+ while (size)
{
if (dst_end - o < sizeof(struct casing_str_buf))
string_overflow ();
ch = STRING_CHAR_ADVANCE (src);
- case_character ((void *)o, ctx, ch, size > 1 ? src : NULL);
+ len_bytes = case_characters ((void *)o, ctx, ch, size > 1 ? src : NULL);
+ if (len_bytes > 0)
+ src += len_bytes;
+ size -= len_bytes > 0 ? 2 : 1;
n += ((struct casing_str_buf *)o)->len_chars;
o += ((struct casing_str_buf *)o)->len_bytes;
}
@@ -397,44 +498,50 @@ do_casify_multibyte_region (struct casing_context *ctx,
ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
ptrdiff_t opoint = PT, added;
struct casing_str_buf buf;
- bool changed;
- int ch, len;
+ int ch, len_bytes, len_chars, ret;
- for (; size; --size)
+ while (size)
{
- ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
- changed = case_character (
+ ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len_bytes);
+ ret = case_characters (
&buf, ctx, ch,
- size > 1 ? BYTE_POS_ADDR (pos_byte + len) : NULL);
+ size > 1 ? BYTE_POS_ADDR (pos_byte + len_bytes) : NULL);
+ len_chars = 1;
- if (!changed)
- {
- pos_byte += len;
- ++pos;
- continue;
- }
+ switch (ret) {
+ default:
+ len_chars += 1;
+ /* FALL THROUGH */
- last = pos + buf.len_chars;
- if (first < 0)
- first = pos;
+ case 0:
+ len_bytes += ret;
+ len_chars = ret ? 2 : 1;
- if (buf.len_chars == 1 && buf.len_bytes == len)
- memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len);
- else
- {
- /* Replace one character with the other(s), keeping text
- properties the same. */
- replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
- (const char *) buf.data, buf.len_chars,
- buf.len_bytes,
- 0);
- added += buf.len_chars - 1;
- if (opoint > pos)
- opoint += buf.len_chars - 1;
- }
+ last = pos + buf.len_chars;
+ if (first < 0)
+ first = pos;
+
+ if (ret == 0 && buf.len_chars == 1 && buf.len_bytes == len_bytes)
+ memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len_bytes);
+ else
+ {
+ /* Replace one character with the other(s), keeping text
+ properties the same. */
+ replace_range_2 (pos, pos_byte, pos + len_chars, pos_byte + len_bytes,
+ (const char *) buf.data, buf.len_chars,
+ buf.len_bytes,
+ 0);
+ added += buf.len_chars - len_chars;
+ if (opoint > pos)
+ opoint += buf.len_chars - len_chars;
+ }
- pos_byte += buf.len_bytes;
- pos += buf.len_chars;
+ /* FALL THROUGH */
+ case -1:
+ size -= len_chars;
+ pos += buf.len_chars;
+ pos_byte += buf.len_bytes;
+ }
}
if (PT != opoint)
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index 1bd745e..9f5e43f 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -134,7 +134,7 @@ casefiddle-tests--characters
(with-temp-buffer
(dolist
(test
- ;; input upcase downcase capitalize upcase-initials
+ ;; input upcase downcase capitalize upcase-initials [locale]
'(("Foo baR" "FOO BAR" "foo bar" "Foo Bar" "Foo BaR")
("Ⅷ ⅷ" "Ⅷ Ⅷ" "ⅷ ⅷ" "Ⅷ Ⅷ" "Ⅷ Ⅷ")
;; "DžUNGLA" is an unfortunate result but it’s really best we can
@@ -155,10 +155,41 @@ casefiddle-tests--characters
("Σ Σ" "Σ Σ" "σ σ" "Σ Σ" "Σ Σ")
("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
;; If sigma is already lower case, we don’t want to change it.
- ("όσοσ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "Όσοσ"))
+ ("όσοσ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "Όσοσ")
+
+ ;; There is a language-independent special casing rule which
+ ;; converts İ into i followed by combining dot above that’s why we
+ ;; get the weird \u0307. Conceptually, it converts i with
+ ;; a soft-dot into an i with a hard-dot so it makes some dose of
+ ;; sense.
+ ("İstanbul" "İSTANBUL" "i\u0307stanbul" "İstanbul" "İstanbul")
+ ("İstanbul" "İSTANBUL" "istanbul" "İstanbul" "İstanbul" 'tr)
+ ("İstanbul" "İSTANBUL" "istanbul" "İstanbul" "İstanbul" 'az)
+ ("istanbul" "ISTANBUL" "istanbul" "Istanbul" "Istanbul")
+ ("istanbul" "İSTANBUL" "istanbul" "İstanbul" "İstanbul" 'tr)
+ ("istanbul" "İSTANBUL" "istanbul" "İstanbul" "İstanbul" 'az)
+ ("Irmak" "IRMAK" "irmak" "Irmak" "Irmak")
+ ("Irmak" "IRMAK" "ırmak" "Irmak" "Irmak" 'tr)
+ ("Irmak" "IRMAK" "ırmak" "Irmak" "Irmak" 'az)
+ ;; FIXME: We explicitly exclude ı→I mapping from the case tables
+ ;; in characters.el which is why instead of:
+ ;;("ırmak" "IRMAK" "ırmak" "Irmak" "Irmak")
+ ;; we actually get:
+ ("ırmak" "ıRMAK" "ırmak" "Irmak" "Irmak")
+ ;; ‘But wait,’ you ask, ‘why do the capitalise examples work?’
+ ;; This is because those bypass the case table and use the
+ ;; character’s Unicode titlecase property.
+ ("ırmak" "IRMAK" "ırmak" "Irmak" "Irmak" 'tr)
+ ("ırmak" "IRMAK" "ırmak" "Irmak" "Irmak" 'az)
+ ;; And for some combining dot above removal.
+ ("I\u0307si\u0307s" "I\u0307Sİ\u0307S" "isi\u0307s"
+ "I\u0307si\u0307s" "I\u0307si\u0307s" 'tr)
+ ("I\u0307sI\u0307s" "I\u0307SI\u0307S" "isis"
+ "I\u0307sis" "I\u0307sI\u0307s" 'tr))
(nreverse errors))
- (let* ((input (car test))
+ (let* ((input (string-to-multibyte (car test)))
(expected (cdr test))
+ (current-iso639-language (or (nth 5 test) 'en))
(check (lambda (func got)
(unless (string-equal got (car expected))
(let ((fmt (length (symbol-name func))))
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 10/18] Implement Turkic dotless and dotted i handling when casing strings
2016-10-04 1:10 ` bug#24603: [RFC 10/18] Implement Turkic dotless and dotted i handling when casing strings Michal Nazarewicz
@ 2016-10-04 7:12 ` Eli Zaretskii
0 siblings, 0 replies; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-04 7:12 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Date: Tue, 4 Oct 2016 03:10:33 +0200
>
> +
> + /* FIXME: Is current-iso639-language the best source of that information? */
> + lang = Vcurrent_iso639_language;
> + tr = intern_c_string ("tr");
> + az = intern_c_string ("az");
> + if (SYMBOLP (lang))
> + {
> + l = lang;
> + goto check_language;
> + }
> + while (CONSP (lang))
> + {
> + l = XCAR (lang);
> + lang = XCDR (lang);
> + check_language:
> + if (EQ (l, tr) || EQ (l, az))
> + {
> + ctx->treat_turkic_i = true;
> + break;
> + }
> + }
I'm not sure I like this mechanism. AFAIU, current-iso639-language is
a read-only variable that conveys the outside locale's language. So
the above would limit this feature to users in the corresponding
locales, which is against Emacs's design as a multilingual system. We
should allow Lisp applications and users in _any_ locale take
advantage of this feature.
So I suggest a separate variable which, when non-nil, will cause these
conversions to take effect. Lisp applications could then bind that
variable when they want these special conversions. (With the eye
towards future developments, as hinted by the rest of Unicode's
SpecialCasing.txt file, perhaps don't make the variable's name mention
a specific language, but instead make its value a language symbol,
such as 'tr or 'az.) We could make it a defcustom, if we think users
will want to turn this on as their default.
> +/* Normalise CFG->flag and return CASE_UP, CASE_DOWN, CASE_CAPITALIZE or
^^^^^^^^^
A nit: we use US English spelling, so "Normalize".
> +static enum case_action
> +normalise_flag (struct casing_context *ctx)
^^^^^^^^^
Likewise.
> +{
> + /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
This comment repeats what was already said above.
> /* In Greek, lower case sigma has two forms: one when used in the middle and one
> @@ -152,6 +192,13 @@ case_character_impl (struct casing_str_buf *buf,
> #define CAPITAL_SIGMA 0x03A3
> #define SMALL_SIGMA 0x03C3
> #define SMALL_FINAL_SIGMA 0x03C2
> +
> +/* Azeri and Turkish have dotless and dotted i. An upper case of i is
> + İ while lower case of I is ı. */
> +
> +#define CAPITAL_DOTTED_I 0x130
> +#define SMALL_DOTLESS_I 0x131
> +#define COMBINING_DOT_ABOVE 0x307
How about deriving these rules from SpecialCasing.txt and storing them
in some char-table, instead of hard-coding them in C? That would
allow us to update these features more easily with each release of the
Unicode Standard.
> + if (flag != CASE_NO_ACTION && __builtin_expect(ctx->treat_turkic_i, false))
I don't think we can use __builtin_expect here, it's AFAIK
non-portable to any platform without glibc.
> + if (len_bytes > 0)
> + src += len_bytes;
> + size -= len_bytes > 0 ? 2 : 1;
Another nit: please use whitespace consistently in the indentation,
either all TABs and spaces, or just spaces. (I think our default is
the former for now.)
Thanks.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 11/18] Implement casing rules for Lithuanian
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (8 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 10/18] Implement Turkic dotless and dotted i handling when casing strings Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 12/18] Implement rules for title-casing Dutch ij ‘letter’ Michal Nazarewicz
` (6 subsequent siblings)
16 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
In Lithuanian, the tittle above lower case i and j is retained even if
other diacritics are present above the letter. For that to work, an explicit
combining dot above must be added after i and j; otherwise the
rendering engine will remove the tittle.
* src/casefiddle.c (struct casing_context, prepare_casing_context): Add
lithuanian_tittle member to hold state of Lithuanian rules handling.
(case_lithuanian): New function which implements Lithuanian rules.
(case_characters): Make use of case_lithuanian.
* test/src/casefiddle-tests.el (casefiddle-tests-casing): Add test cases
for Lithuanian rules.
---
src/casefiddle.c | 149 +++++++++++++++++++++++++++++++++++++++++--
test/src/casefiddle-tests.el | 27 +++++++-
2 files changed, 170 insertions(+), 6 deletions(-)
diff --git a/src/casefiddle.c b/src/casefiddle.c
index 2a7aa64..0377fe6 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -56,6 +56,16 @@ struct casing_context {
bool inword;
/* Whether to apply Azeri/Turkish rules for dotted and dotless i. */
bool treat_turkic_i;
+
+ /* Whether to use Lithuanian rules for i’s and j’s tittle. */
+ unsigned char lithuanian_tittle;
+#define LT_OFF 0 /* No */
+#define LT_ON 1 /* Yes */
+#define LT_DEL_DOT_ABOVE 2 /* Yes and look out for combining dot above to
+ delete. */
+#define LT_INS_DOT_ABOVE 3 /* Yes and look out for diacritics combining above
+ because we may need to inject dot above before
+ them. */
};
/* Initialise CTX structure and prepares related global data for casing
@@ -64,7 +74,7 @@ static void
prepare_casing_context (struct casing_context *ctx,
enum case_action flag, bool inbuffer)
{
- Lisp_Object lang, l, tr, az;
+ Lisp_Object lang, l, tr, az, lt;
ctx->flag = flag;
ctx->inbuffer = inbuffer;
@@ -74,6 +84,7 @@ prepare_casing_context (struct casing_context *ctx,
: Qnil;
ctx->treat_turkic_i = false;
+ ctx->lithuanian_tittle = LT_OFF;
/* If the case table is flagged as modified, rescan it. */
if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
@@ -86,6 +97,7 @@ prepare_casing_context (struct casing_context *ctx,
lang = Vcurrent_iso639_language;
tr = intern_c_string ("tr");
az = intern_c_string ("az");
+ lt = intern_c_string ("lt");
if (SYMBOLP (lang))
{
l = lang;
@@ -97,10 +109,9 @@ prepare_casing_context (struct casing_context *ctx,
lang = XCDR (lang);
check_language:
if (EQ (l, tr) || EQ (l, az))
- {
- ctx->treat_turkic_i = true;
- break;
- }
+ ctx->treat_turkic_i = true;
+ else if (EQ (l, lt))
+ ctx->lithuanian_tittle = LT_ON;
}
}
@@ -199,6 +210,131 @@ case_character_impl (struct casing_str_buf *buf,
#define CAPITAL_DOTTED_I 0x130
#define SMALL_DOTLESS_I 0x131
#define COMBINING_DOT_ABOVE 0x307
+
+/* Lithuanian retains tittle in lower case i and j when there are more
+ accents above those letters. */
+
+#define CAPITAL_I_WITH_GRAVE 0x0CC
+#define CAPITAL_I_WITH_ACUTE 0x0CD
+#define CAPITAL_I_WITH_TILDE 0x128
+#define CAPITAL_I_WITH_OGONEK 0x12E
+#define SMALL_I_WITH_OGONEK 0x12F
+#define COMBINING_GRAVE_ABOVE 0x300
+#define COMBINING_ACUTE_ABOVE 0x301
+#define COMBINING_TILDE_ABOVE 0x303
+#define COMBINING_OGONEK 0x328
+
+/* Attempt to case CH using rules for Lithuanian i and j. Return true if
+ character has been cased (in which case it’s saved in BUF), false otherwise.
+ If CTX->lithuanian_tittle is LT_OFF, return false. */
+static bool
+case_lithuanian (struct casing_str_buf *buf, struct casing_context *ctx,
+ enum case_action flag, int ch)
+{
+ switch (__builtin_expect(ctx->lithuanian_tittle, LT_OFF)) {
+ case LT_OFF:
+ return false;
+
+ case LT_DEL_DOT_ABOVE:
+ /* When upper-casing i or j, a combining dot above that follows it must be
+ removed. This is true even if there’s a combining ogonek in between.
+ But, if there’s another character combining above in between, combining
+ dot needs to stay (since the dot will be rendered above the other
+ diacritic). */
+ switch (ch) {
+ case COMBINING_DOT_ABOVE:
+ buf->len_chars = buf->len_bytes = 0;
+ ctx->lithuanian_tittle = LT_ON;
+ return true;
+ case COMBINING_GRAVE_ABOVE:
+ case COMBINING_ACUTE_ABOVE:
+ case COMBINING_TILDE_ABOVE:
+ ctx->lithuanian_tittle = LT_ON;
+ return false;
+ case COMBINING_OGONEK:
+ return false;
+ default:
+ ctx->lithuanian_tittle = LT_ON;
+ }
+ break;
+
+ case LT_INS_DOT_ABOVE:
+ /* When lower-casing I or J, if the letter has any accents above,
+ a combining dot above must be added before them. If we are here, it
+ means that we have lower cased I or J and we’re now on the lookout for
+ accents combining above. */
+ switch (ch) {
+ case COMBINING_GRAVE_ABOVE:
+ case COMBINING_ACUTE_ABOVE:
+ case COMBINING_TILDE_ABOVE:
+ buf->len_chars = 2;
+ buf->len_bytes = CHAR_STRING (COMBINING_DOT_ABOVE, buf->data);
+ buf->len_bytes += CHAR_STRING (ch, buf->data + buf->len_bytes);
+ ctx->lithuanian_tittle = LT_ON;
+ return true;
+ case COMBINING_OGONEK:
+ return false;
+ default:
+ ctx->lithuanian_tittle = LT_ON;
+ }
+ break;
+ }
+
+ switch (flag) {
+ case CASE_UP:
+ case CASE_CAPITALIZE:
+ if (ch == 'i' || ch == 'j')
+ {
+ buf->data[0] = ch ^ ('i' ^ 'I');
+ buf->len_bytes = 1;
+ }
+ else if (ch == SMALL_I_WITH_OGONEK)
+ buf->len_bytes = CHAR_STRING (CAPITAL_I_WITH_OGONEK, buf->data);
+ else
+ break;
+ buf->len_chars = 1;
+ /* Change the state so we’re on the lookout for combining dot above. */
+ ctx->lithuanian_tittle = LT_DEL_DOT_ABOVE;
+ return true;
+
+ case CASE_DOWN:
+ /* Turning I or J to lower case requires combining dot above to be included
+ IF there are any other characters combining above present. This is so
+ that the tittle is preserved. */
+ switch (ch) {
+ case CAPITAL_I_WITH_GRAVE:
+ ch = 0x80; /* U+300, "\xCC\x80", combining grave accent */
+ goto has_accent;
+ case CAPITAL_I_WITH_ACUTE:
+ ch = 0x81; /* U+301, "\xCC \x81", combining acute accent */
+ goto has_accent;
+ case CAPITAL_I_WITH_TILDE:
+ ch = 0x83; /* U+303, "\xCC\x83", combining tilde */
+ has_accent:
+ memcpy (buf->data, "i\xCC\x87\xCC", 4);
+ buf->data[4] = ch;
+ buf->len_chars = 3;
+ buf->len_bytes = 5;
+ return true;
+
+ case 'I':
+ case 'J':
+ buf->data[0] = ch ^ ('i' ^ 'I');
+ buf->len_bytes = 1;
+ if (false)
+ case CAPITAL_I_WITH_OGONEK:
+ buf->len_bytes = CHAR_STRING (SMALL_I_WITH_OGONEK, buf->data);
+ buf->len_chars = 1;
+ /* Change the state so we’re on the lookout for diacritics combining
+ above. If one is found, we need to add combining dot above. */
+ ctx->lithuanian_tittle = LT_INS_DOT_ABOVE;
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
\f
/* Based on CTX, case character CH accordingly. Update CTX as necessary.
Return cased character.
@@ -234,6 +370,9 @@ case_characters (struct casing_str_buf *buf, struct casing_context *ctx,
{
enum case_action flag = normalise_flag (ctx);
+ if (case_lithuanian (buf, ctx, flag, ch))
+ return 0;
+
if (flag != CASE_NO_ACTION && __builtin_expect(ctx->treat_turkic_i, false))
{
bool dot_above = false;
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index 9f5e43f..bae4242 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -185,7 +185,32 @@ casefiddle-tests--characters
("I\u0307si\u0307s" "I\u0307Sİ\u0307S" "isi\u0307s"
"I\u0307si\u0307s" "I\u0307si\u0307s" 'tr)
("I\u0307sI\u0307s" "I\u0307SI\u0307S" "isis"
- "I\u0307sis" "I\u0307sI\u0307s" 'tr))
+ "I\u0307sis" "I\u0307sI\u0307s" 'tr)
+
+ ;; Test that combining dot above is inserted when needed when lower
+ ;; casing I or J.
+ ("I\u0328\u0300" ; I + ogonek + grave
+ "I\u0328\u0300" "i\u0328\u0307\u0300"
+ "I\u0328\u0300" "I\u0328\u0300" 'lt)
+
+ ("J\u0328\u0300" ; J + ogonek + grave
+ "J\u0328\u0300" "j\u0328\u0307\u0300"
+ "J\u0328\u0300" "J\u0328\u0300" 'lt)
+
+ ("Į\u0300" ; I-ogonek + grave
+ "Į\u0300" "į\u0307\u0300" "Į\u0300" "Į\u0300" 'lt)
+
+ ("Ì Í Ĩ"
+ "Ì Í Ĩ" "i\u0307\u0300 i\u0307\u0301 i\u0307\u0303"
+ "Ì Í Ĩ" "Ì Í Ĩ" 'lt)
+
+ ;; Test that combining dot above is removed when upper casing i or j.
+ ("i\u0328\u0307" ; i + ogonek + dot above
+ "I\u0328" "i\u0328\u0307" "I\u0328" "I\u0328" 'lt)
+ ("j\u0328\u0307" ; j + ogonek + dot above
+ "J\u0328" "j\u0328\u0307" "J\u0328" "J\u0328" 'lt)
+ ("į\u0307" ; i-ogonek + dot above
+ "Į" "į\u0307" "Į" "Į" 'lt))
(nreverse errors))
(let* ((input (string-to-multibyte (car test)))
(expected (cdr test))
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 12/18] Implement rules for title-casing Dutch ij ‘letter’
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (9 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 11/18] Implement casing rules for Lithuanian Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 13/18] Add some tricky Unicode characters to regex test Michal Nazarewicz
` (5 subsequent siblings)
16 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
Dutch treats ‘ij’ as a single letter and when capitalising a word it
should be capitalised as a single letter (i.e. ‘ij’ becomes ‘IJ’).
Implement that.
* src/casefiddle.c (struct casing_context): Add treat_dutch_ij member for
determining whether special handling of ij is necessary.
(prepare_casing_context): Set treat_dutch_ij to true when in Dutch
locale and capitalising.
(dutch_ij_p_impl, dutch_ij_p, handle_dutch_ij_impl,
handle_dutch_ij): New routines for detecting and handling when ‘ij’
must be upcased together.
(do_casify_multibyte_string, do_casify_unibyte_string,
do_casify_unibyte_region, do_casify_multibyte_region): Implement
handling of Dutch ij.
---
src/casefiddle.c | 49 +++++++++++++++++++++++++++++++++++++++++++-
test/src/casefiddle-tests.el | 6 +++++-
2 files changed, 53 insertions(+), 2 deletions(-)
diff --git a/src/casefiddle.c b/src/casefiddle.c
index 0377fe6..0de7814 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -66,6 +66,27 @@ struct casing_context {
#define LT_INS_DOT_ABOVE 3 /* Yes and look out for diacritics combining above
because we may need to inject dot above before
them. */
+
+ /* In Dutch, ‘ij’ is a digraph and when capitalised the whole thing is upper
+ cased. Unicode has ‘ij’ and ‘IJ’ (with proper casing mappings) but they
+ aren’t always used so we cannot/should not rely on them.
+
+ Note that rule for capitalising ‘ij’ as a single letter is not present in
+ Unicode 9.0’s SpecialCasing.txt. On the flip side, Firefox implements this
+ as well so we’re not completely alone.
+
+ There are words where ‘ij’ are two separate letters (such as bijectie or
+ bijoux) in which case the capitalisation rules do not apply. I (mina86)
+ have googled this a little and couldn’t find a Dutch word which begins with
+ ‘ij’ that is not a digraph so we should be in the clear since we only care
+ about the initial. */
+
+ /* Whether to apply Dutch rules for title-casing ij as IJ. Non-zero
+ value implies flag is CASE_CAPITALIZE or CASE_CAPITALIZE_UP. */
+ unsigned char treat_dutch_ij;
+#define NL_OFF 0 /* No */
+#define NL_ON 1 /* Yes */
+#define NL_UPCASE_J 2 /* Yes and the previous character was upcased ‘i’. */
};
/* Initialise CTX structure and prepares related global data for casing
@@ -74,7 +95,7 @@ static void
prepare_casing_context (struct casing_context *ctx,
enum case_action flag, bool inbuffer)
{
- Lisp_Object lang, l, tr, az, lt;
+ Lisp_Object lang, l, tr, az, lt, nl;
ctx->flag = flag;
ctx->inbuffer = inbuffer;
@@ -85,6 +106,7 @@ prepare_casing_context (struct casing_context *ctx,
ctx->treat_turkic_i = false;
ctx->lithuanian_tittle = LT_OFF;
+ ctx->treat_dutch_ij = NL_OFF;
/* If the case table is flagged as modified, rescan it. */
if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
@@ -98,6 +120,7 @@ prepare_casing_context (struct casing_context *ctx,
tr = intern_c_string ("tr");
az = intern_c_string ("az");
lt = intern_c_string ("lt");
+ nl = intern_c_string ("nl");
if (SYMBOLP (lang))
{
l = lang;
@@ -112,6 +135,8 @@ prepare_casing_context (struct casing_context *ctx,
ctx->treat_turkic_i = true;
else if (EQ (l, lt))
ctx->lithuanian_tittle = LT_ON;
+ else if (EQ (l, nl))
+ ctx->treat_dutch_ij = (int) flag >= (int) CASE_CAPITALIZE;
}
}
@@ -154,6 +179,28 @@ case_character_impl (struct casing_str_buf *buf,
ctx->inword = SYNTAX (ch) == Sword &&
(!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch));
+ /* Handle dutch ij. We need to do it here before the flag == CASE_NO_ACTION
+ check. Note that non-zero treat_dutch_ij implies ctx->flag being ≥
+ CASE_CAPITALIZE. */
+ switch (__builtin_expect(ctx->treat_dutch_ij, NL_OFF)) {
+ case NL_ON:
+ if (ch == 'i' && flag == CASE_CAPITALIZE)
+ {
+ ctx->treat_dutch_ij = NL_UPCASE_J;
+ cased = 'I';
+ goto done;
+ }
+ break;
+ case NL_UPCASE_J:
+ ctx->treat_dutch_ij = NL_ON;
+ if (ch == 'j')
+ {
+ cased = 'J';
+ goto done;
+ }
+ }
+
+ /* We are inside of a word and capitalising initials only. */
if (flag == CASE_NO_ACTION)
{
cased = ch;
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index bae4242..3857f08 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -210,7 +210,11 @@ casefiddle-tests--characters
("j\u0328\u0307" ; j + ogonek + dot above
"J\u0328" "j\u0328\u0307" "J\u0328" "J\u0328" 'lt)
("į\u0307" ; i-ogonek + dot above
- "Į" "į\u0307" "Į" "Į" 'lt))
+ "Į" "į\u0307" "Į" "Į" 'lt)
+
+ ;; Dutch 'ij' is capitalised as single digraph.
+ ("ijsland" "IJSLAND" "ijsland" "Ijsland" "Ijsland")
+ ("ijsland" "IJSLAND" "ijsland" "IJsland" "IJsland" 'nl))
(nreverse errors))
(let* ((input (string-to-multibyte (car test)))
(expected (cdr test))
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 13/18] Add some tricky Unicode characters to regex test
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (10 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 12/18] Implement rules for title-casing Dutch ij ‘letter’ Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 14/18] Factor out character category lookup to separate function Michal Nazarewicz
` (4 subsequent siblings)
16 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
* test/src/regex-tests.el: Include capital ‘DZ’ digraph, sharp ‘s’,
capital ligature ‘IJ’, small ligature ‘fi’, title-case digraph ‘Dz’,
all three forms of Greek sigma and IPA ɕ symbol in the regex tests.
---
test/src/regex-tests.el | 25 ++++++++++++++-----------
1 file changed, 14 insertions(+), 11 deletions(-)
diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el
index c4844c7..fa66ff1 100644
--- a/test/src/regex-tests.el
+++ b/test/src/regex-tests.el
@@ -65,27 +65,30 @@ regex--test-cc
(skip-chars-forward (concat "[:" name ":]\u2622"))
(should (or (equal (point) p) (equal (point) (1+ p))))))))
-(dolist (test '(("alnum" "abcABC012łąka" "-, \t\n")
- ("alpha" "abcABCłąka" "-,012 \t\n")
+(dolist (test '(("alnum" "abcABC012łąkaDZßIJfiDzΣσςɕ" "-, \t\n")
+ ("alpha" "abcABCłąkaDZßIJfiDzΣσςɕ" "-,012 \t\n")
("digit" "012" "abcABCłąka-, \t\n")
("xdigit" "0123aBc" "łąk-, \t\n")
- ("upper" "ABCŁĄKA" "abc012-, \t\n")
- ("lower" "abcłąka" "ABC012-, \t\n")
+ ("upper" "ABCŁĄKADZIJΣ" "abcß0fiσςɕ12-, \t\n")
+ ;; FIXME: ßfiɕ are all lower case (even though they don’t have
+ ;; (single-character) upper-case form).
+ ("lower" "abcłąkaσς" "ABC012DZIJΣ-, \t\n")
- ("word" "abcABC012\u2620" "-, \t\n")
+ ("word" "abcABC012\u2620DZßIJfiDzΣσςɕ" "-, \t\n")
("punct" ".,-" "abcABC012\u2620 \t\n")
("cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ")
- ("graph" "abcłąka\u2620-," " \t\n\1")
- ("print" "abcłąka\u2620-, " "\t\n\1")
+ ("graph" "abcłąka\u2620-,DZßIJfiDzΣσςɕ" " \t\n\1")
+ ("print" "abcłąka\u2620-,DZßIJfiDzΣσςɕ " "\t\n\1")
("space" " \t\n\u2001" "abcABCł0123")
("blank" " \t" "\n\u2001")
- ("ascii" "abcABC012 \t\n\1" "łą\u2620")
- ("nonascii" "łą\u2622" "abcABC012 \t\n\1")
- ("unibyte" "abcABC012 \t\n\1" "łą\u2622")
- ("multibyte" "łą\u2622" "abcABC012 \t\n\1")))
+ ("ascii" "abcABC012 \t\n\1" "łą\u2620DZßIJfiDzΣσςɕ")
+ ("nonascii" "łą\u2622DZßIJfiDzΣσςɕ" "abcABC012 \t\n\1")
+ ;; Note: sharp s is unibyte since its code point is below 256.
+ ("unibyte" "abcABC012ß \t\n\1" "łą\u2622DZIJfiDzΣσςɕ")
+ ("multibyte" "łą\u2622DZIJfiDzΣσςɕ" "abcABC012ß \t\n\1")))
(let ((name (intern (concat "regex-tests-" (car test) "-character-class")))
(doc (concat "Perform sanity test of regexes using " (car test)
" character class.
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 14/18] Factor out character category lookup to separate function
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (11 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 13/18] Add some tricky Unicode characters to regex test Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 15/18] Base lower- and upper-case tests on Unicode properties Michal Nazarewicz
` (3 subsequent siblings)
16 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
* src/character.c (char_unicode_category): New function returning Unicode
general category of specified character.
(alphabeticp, alphanumericp, graphicp, printablep): Use the above.
---
src/character.c | 33 +++++++++++++++------------------
1 file changed, 15 insertions(+), 18 deletions(-)
diff --git a/src/character.c b/src/character.c
index 75a7dab..1e49536 100644
--- a/src/character.c
+++ b/src/character.c
@@ -960,14 +960,18 @@ character is not ASCII nor 8-bit character, an error is signaled. */)
return make_number (c);
}
+static unicode_category_t
+char_unicode_category (int c)
+{
+ Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
+ return INTEGERP (category) ? XINT (category) : UNICODE_CATEGORY_UNKNOWN;
+}
+
/* Return true if C is an alphabetic character. */
bool
alphabeticp (int c)
{
- Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
- if (! INTEGERP (category))
- return false;
- EMACS_INT gen_cat = XINT (category);
+ unicode_category_t gen_cat = char_unicode_category (c);
/* See UTS #18. There are additional characters that should be
here, those designated as Other_uppercase, Other_lowercase,
@@ -987,10 +991,7 @@ alphabeticp (int c)
bool
alphanumericp (int c)
{
- Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
- if (! INTEGERP (category))
- return false;
- EMACS_INT gen_cat = XINT (category);
+ unicode_category_t gen_cat = char_unicode_category (c);
/* See UTS #18. Same comment as for alphabeticp applies. FIXME. */
return (gen_cat == UNICODE_CATEGORY_Lu
@@ -1009,13 +1010,11 @@ alphanumericp (int c)
bool
graphicp (int c)
{
- Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
- if (! INTEGERP (category))
- return false;
- EMACS_INT gen_cat = XINT (category);
+ unicode_category_t gen_cat = char_unicode_category (c);
/* See UTS #18. */
- return (!(gen_cat == UNICODE_CATEGORY_Zs /* space separator */
+ return (!(gen_cat == UNICODE_CATEGORY_UNKNOWN
+ || gen_cat == UNICODE_CATEGORY_Zs /* space separator */
|| gen_cat == UNICODE_CATEGORY_Zl /* line separator */
|| gen_cat == UNICODE_CATEGORY_Zp /* paragraph separator */
|| gen_cat == UNICODE_CATEGORY_Cc /* control */
@@ -1027,13 +1026,11 @@ graphicp (int c)
bool
printablep (int c)
{
- Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
- if (! INTEGERP (category))
- return false;
- EMACS_INT gen_cat = XINT (category);
+ unicode_category_t gen_cat = char_unicode_category (c);
/* See UTS #18. */
- return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
+ return (!(gen_cat == UNICODE_CATEGORY_UNKNOWN
+ || gen_cat == UNICODE_CATEGORY_Cc /* control */
|| gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
|| gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
}
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 15/18] Base lower- and upper-case tests on Unicode properties
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (12 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 14/18] Factor out character category lookup to separate function Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 6:54 ` Eli Zaretskii
2016-10-04 1:10 ` bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case Michal Nazarewicz
` (2 subsequent siblings)
16 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
Not all lower case characters have a simple upper case form.
For example, an fi ligature has no one-character upper case mapping.
Similarly, ɕ from IPA block has no upper case form at all.
It isn’t therefore sufficient to look up a character’s upper-case form to
determine whether it’s lower-case or not. As such, rewrite the tests
to be based on Unicode properties.
* src/buffer.h (uppercasep, lowercasep): Delete.
* src/character.c (uppercasep, lowercasep): New functions which base
their test on Unicode character properties rather than case table.
* test/src/regex-tests.el (regex-tests-lower-character-class): Include
ß letter and fi ligature in the test.
---
etc/NEWS | 7 +++++++
src/buffer.h | 13 -------------
src/character.c | 24 ++++++++++++++++++++++++
src/character.h | 2 ++
test/src/regex-tests.el | 4 +---
5 files changed, 34 insertions(+), 16 deletions(-)
diff --git a/etc/NEWS b/etc/NEWS
index 4516812..727af59 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -251,6 +251,10 @@ For example, fi ligature is converted to FI when upper cased.
Strings such as ΌΣΟΣ are now correctly converted to Όσος when
capitalised to follow rules of Greek spelling.
+*** 'upper' and 'lower' classes match characters w/o simple cased forms.
+For instance, ß letter and fi ligature are now matched by [[:lower:]]
+regular expression.
+
\f
* Changes in Specialized Modes and Packages in Emacs 26.1
@@ -512,6 +516,9 @@ of curved quotes in format arguments to functions like 'message' and
now generate less chatter and more-compact diagnostics. The auxiliary
function 'check-declare-errmsg' has been removed.
+** 'upper' and 'lower' character classes are unaffected by case table
+since they are now based purely on Unicode properties.
+
\f
* Lisp Changes in Emacs 26.1
diff --git a/src/buffer.h b/src/buffer.h
index 1543f67..aade0ea 100644
--- a/src/buffer.h
+++ b/src/buffer.h
@@ -1358,19 +1358,6 @@ upcase (int c)
return NATNUMP (up) ? XFASTINT (up) : c;
}
-/* True if C is upper case. */
-INLINE bool uppercasep (int c)
-{
- return downcase (c) != c;
-}
-
-/* True if C is lower case. */
-INLINE bool
-lowercasep (int c)
-{
- return !uppercasep (c) && upcase (c) != c;
-}
-
INLINE_HEADER_END
#endif /* EMACS_BUFFER_H */
diff --git a/src/character.c b/src/character.c
index 1e49536..707ae10 100644
--- a/src/character.c
+++ b/src/character.c
@@ -967,6 +967,30 @@ char_unicode_category (int c)
return INTEGERP (category) ? XINT (category) : UNICODE_CATEGORY_UNKNOWN;
}
+/* Return true if C is a upper case character. This does not imply mean it
+ has a lower case form. */
+bool
+uppercasep (int c)
+{
+ unicode_category_t gen_cat = char_unicode_category (c);
+
+ /* See UTS #18. There are additional characters that should be
+ here, those designated as Other_uppercase; FIXME. */
+ return gen_cat == UNICODE_CATEGORY_Lu;
+}
+
+/* Return true if C is a lower case character. This does not imply mean it
+ has an upper case form. */
+bool
+lowercasep (int c)
+{
+ unicode_category_t gen_cat = char_unicode_category (c);
+
+ /* See UTS #18. There are additional characters that should be
+ here, those designated as Other_lowercase; FIXME. */
+ return gen_cat == UNICODE_CATEGORY_Ll;
+}
+
/* Return true if C is an alphabetic character. */
bool
alphabeticp (int c)
diff --git a/src/character.h b/src/character.h
index fc8a0dd..5931c5c 100644
--- a/src/character.h
+++ b/src/character.h
@@ -676,6 +676,8 @@ extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t,
extern Lisp_Object Vchar_unify_table;
extern Lisp_Object string_escape_byte8 (Lisp_Object);
+extern bool uppercasep (int);
+extern bool lowercasep (int);
extern bool alphabeticp (int);
extern bool alphanumericp (int);
extern bool graphicp (int);
diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el
index fa66ff1..fc50344 100644
--- a/test/src/regex-tests.el
+++ b/test/src/regex-tests.el
@@ -70,9 +70,7 @@ regex--test-cc
("digit" "012" "abcABCłąka-, \t\n")
("xdigit" "0123aBc" "łąk-, \t\n")
("upper" "ABCŁĄKADZIJΣ" "abcß0fiσςɕ12-, \t\n")
- ;; FIXME: ßfiɕ are all lower case (even though they don’t have
- ;; (single-character) upper-case form).
- ("lower" "abcłąkaσς" "ABC012DZIJΣ-, \t\n")
+ ("lower" "abcłąkaßfiσς" "ABC012DZIJΣ-, \t\n")
("word" "abcABC012\u2620DZßIJfiDzΣσςɕ" "-, \t\n")
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 15/18] Base lower- and upper-case tests on Unicode properties
2016-10-04 1:10 ` bug#24603: [RFC 15/18] Base lower- and upper-case tests on Unicode properties Michal Nazarewicz
@ 2016-10-04 6:54 ` Eli Zaretskii
0 siblings, 0 replies; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-04 6:54 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Date: Tue, 4 Oct 2016 03:10:38 +0200
>
> +** 'upper' and 'lower' character classes are unaffected by case table
> +since they are now based purely on Unicode properties.
This is actually a backward-incompatible change, isn't it? If so, it
should be in the corresponding section of NEWS. More importantly,
there should be a way to get back the old behavior, i.e. to force
'upper' and 'lower' to use the current case tables.
Better yet, can we use the Unicode properties only where case tables
are insufficient, like in the case of ligatures being broken up into
individual characters by case conversions? That'd be
backward-compatible, so won't risk breaking existing code.
I'm also okay with a defcustom, by default off, to prefer the Unicode
data, as you did, so that we could in the future make this the default
behavior. But doing this right now without any transition period and
no way of going back is too radical, I think.
Please also note that Unicode tables are global, very large, and in
many cases tricky to change from Lisp (as compared to simple
char-tables). So customizing the case conversions that are based
solely on the Unicode tables is much harder and/or has global
implications, unlike the case tables. With that in mind, I think we
should make the transition smoother, and we should probably add
convenience APIs for customizing the case conversions the Unicode way,
before we switch to that as the default.
Thanks.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (13 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 15/18] Base lower- and upper-case tests on Unicode properties Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 7:48 ` Eli Zaretskii
2016-10-04 1:10 ` bug#24603: [RFC 17/18] Optimise character class matching in regexes Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 18/18] Fix case-fold-search character class matching Michal Nazarewicz
16 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
Use a lookup table to map Unicode general categories to character
categories. This generalises lowercasep, uppercasep et al. functions.
Furthermore, provide another lookup table for ASCII characters such that
the common case can be optimised and Unicode general category lookup
avoided.
Using lookup table in place of conditions may theoretically improve
performance even though I have not measured it. More importantly, having
the lookup table will allow regex engine to be optimised in an upcoming
patch. Stay tuned. ;)
* src/character.c (alphanumericp, alphabeticp, uppercasep, lowercasep,
graphicp, printablep): Replaced with static inline functions defined in
the header file.
(category_char_bits): New lookup table mapping Unicode
general category to character classes.
(ascii_char_bits): New lookup table mapping ASCII characters to
character classes.
* src/character.h (unicode_alphanumericp, unicode_alphabeticp,
unicode_uppercasep, unicode_lowercasep, unicode_graphicp,
unicode_printablep, _ascii_alphanumericp, _ascii_alphabeticp,
_ascii_uppercasep, _ascii_lowercasep, _ascii_graphicp,
_ascii_printablep): New static inline functions which are special cases
of respective unprefixed functions.
* src/regex.c (ISALNUM, ISALPHA): Remove special cases for ASCII
characters since alphanumericp and alphabeticp already handle those.
---
src/character.c | 168 +++++++++++++++++++++++++-------------------------------
src/character.h | 77 +++++++++++++++++++++++---
src/regex.c | 20 ++-----
3 files changed, 151 insertions(+), 114 deletions(-)
diff --git a/src/character.c b/src/character.c
index 707ae10..63f89d3 100644
--- a/src/character.c
+++ b/src/character.c
@@ -960,104 +960,88 @@ character is not ASCII nor 8-bit character, an error is signaled. */)
return make_number (c);
}
-static unicode_category_t
+/* Return C’s Unicode general category (or UNICODE_CATEGORY_UNKNOWN). */
+unicode_category_t
char_unicode_category (int c)
{
Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
return INTEGERP (category) ? XINT (category) : UNICODE_CATEGORY_UNKNOWN;
}
-/* Return true if C is a upper case character. This does not imply mean it
- has a lower case form. */
-bool
-uppercasep (int c)
-{
- unicode_category_t gen_cat = char_unicode_category (c);
-
- /* See UTS #18. There are additional characters that should be
- here, those designated as Other_uppercase; FIXME. */
- return gen_cat == UNICODE_CATEGORY_Lu;
-}
-
-/* Return true if C is a lower case character. This does not imply mean it
- has an upper case form. */
-bool
-lowercasep (int c)
-{
- unicode_category_t gen_cat = char_unicode_category (c);
-
- /* See UTS #18. There are additional characters that should be
- here, those designated as Other_lowercase; FIXME. */
- return gen_cat == UNICODE_CATEGORY_Ll;
-}
-
-/* Return true if C is an alphabetic character. */
-bool
-alphabeticp (int c)
-{
- unicode_category_t gen_cat = char_unicode_category (c);
-
- /* See UTS #18. There are additional characters that should be
- here, those designated as Other_uppercase, Other_lowercase,
- and Other_alphabetic; FIXME. */
- return (gen_cat == UNICODE_CATEGORY_Lu
- || gen_cat == UNICODE_CATEGORY_Ll
- || gen_cat == UNICODE_CATEGORY_Lt
- || gen_cat == UNICODE_CATEGORY_Lm
- || gen_cat == UNICODE_CATEGORY_Lo
- || gen_cat == UNICODE_CATEGORY_Mn
- || gen_cat == UNICODE_CATEGORY_Mc
- || gen_cat == UNICODE_CATEGORY_Me
- || gen_cat == UNICODE_CATEGORY_Nl);
-}
-
-/* Return true if C is a alphabetic or decimal-number character. */
-bool
-alphanumericp (int c)
-{
- unicode_category_t gen_cat = char_unicode_category (c);
-
- /* See UTS #18. Same comment as for alphabeticp applies. FIXME. */
- return (gen_cat == UNICODE_CATEGORY_Lu
- || gen_cat == UNICODE_CATEGORY_Ll
- || gen_cat == UNICODE_CATEGORY_Lt
- || gen_cat == UNICODE_CATEGORY_Lm
- || gen_cat == UNICODE_CATEGORY_Lo
- || gen_cat == UNICODE_CATEGORY_Mn
- || gen_cat == UNICODE_CATEGORY_Mc
- || gen_cat == UNICODE_CATEGORY_Me
- || gen_cat == UNICODE_CATEGORY_Nl
- || gen_cat == UNICODE_CATEGORY_Nd);
-}
-
-/* Return true if C is a graphic character. */
-bool
-graphicp (int c)
-{
- unicode_category_t gen_cat = char_unicode_category (c);
-
- /* See UTS #18. */
- return (!(gen_cat == UNICODE_CATEGORY_UNKNOWN
- || gen_cat == UNICODE_CATEGORY_Zs /* space separator */
- || gen_cat == UNICODE_CATEGORY_Zl /* line separator */
- || gen_cat == UNICODE_CATEGORY_Zp /* paragraph separator */
- || gen_cat == UNICODE_CATEGORY_Cc /* control */
- || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
- || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
-}
-
-/* Return true if C is a printable character. */
-bool
-printablep (int c)
-{
- unicode_category_t gen_cat = char_unicode_category (c);
-
- /* See UTS #18. */
- return (!(gen_cat == UNICODE_CATEGORY_UNKNOWN
- || gen_cat == UNICODE_CATEGORY_Cc /* control */
- || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
- || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
-}
+#define CHAR_BIT_ALNUM_ CHAR_BIT_ALNUM | CHAR_BIT_GRAPH | CHAR_BIT_PRINT
+#define CHAR_BIT_ALPHA_ CHAR_BIT_ALPHA | CHAR_BIT_ALNUM_
+
+/* See UTS #18 and DerivedCoreProperties.txt. alpha, alnum, upper and
+ lower are missing some characters, namely those designated as
+ Other_uppercase, Other_lowercase and Other_alphabetic; FIXME. */
+
+const unsigned char category_char_bits[] = {
+ [UNICODE_CATEGORY_UNKNOWN] = 0,
+ [UNICODE_CATEGORY_Lu] = CHAR_BIT_ALPHA_ | CHAR_BIT_UPPER,
+ [UNICODE_CATEGORY_Ll] = CHAR_BIT_ALPHA_ | CHAR_BIT_LOWER,
+ [UNICODE_CATEGORY_Lt] = CHAR_BIT_ALPHA_,
+ [UNICODE_CATEGORY_Lm] = CHAR_BIT_ALPHA_,
+ [UNICODE_CATEGORY_Lo] = CHAR_BIT_ALPHA_,
+ [UNICODE_CATEGORY_Mn] = CHAR_BIT_ALPHA_,
+ [UNICODE_CATEGORY_Mc] = CHAR_BIT_ALPHA_,
+ [UNICODE_CATEGORY_Me] = CHAR_BIT_ALPHA_,
+ [UNICODE_CATEGORY_Nd] = CHAR_BIT_ALNUM_,
+ [UNICODE_CATEGORY_Nl] = CHAR_BIT_ALPHA_,
+ [UNICODE_CATEGORY_No] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Pc] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Pd] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Ps] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Pe] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Pi] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Pf] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Po] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Sm] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Sc] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Sk] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_So] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Zs] = CHAR_BIT_PRINT,
+ [UNICODE_CATEGORY_Zl] = CHAR_BIT_PRINT,
+ [UNICODE_CATEGORY_Zp] = CHAR_BIT_PRINT,
+ [UNICODE_CATEGORY_Cc] = 0,
+ [UNICODE_CATEGORY_Cf] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Cs] = 0,
+ [UNICODE_CATEGORY_Co] = CHAR_BIT_PRINT | CHAR_BIT_GRAPH,
+ [UNICODE_CATEGORY_Cn] = 0,
+};
+
+#undef CHAR_BIT_ALNUM_
+#undef CHAR_BIT_ALPHA_
+
+#define P_ CHAR_BIT_PRINT
+#define G_ CHAR_BIT_GRAPH | P_
+#define N_ CHAR_BIT_ALNUM | G_
+#define U_ CHAR_BIT_UPPER | CHAR_BIT_ALPHA | N_
+#define L_ CHAR_BIT_LOWER | CHAR_BIT_ALPHA | N_
+
+const unsigned char ascii_char_bits[] = {
+/*\0 ... \17 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*\20 ... \37 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/*' ' '!' '"' '#' '$' '%' '&' '´' '(' ')' '*' '+' ',' '-' '.' '/' */
+ P_, G_, G_, G_, G_, G_, G_, G_, G_, G_, G_, G_, G_, G_, G_, G_,
+/*'0' '1' '2' '3' '4' '5' '6' '7' '8' '9' ':' ';' '<' '=' '>' '?' */
+ N_, N_, N_, N_, N_, N_, N_, N_, N_, N_, G_, G_, G_, G_, G_, G_,
+/*'@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' */
+ G_, U_, U_, U_, U_, U_, U_, U_, U_, U_, U_, U_, U_, U_, U_, U_,
+/*'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z' '[' '\' ']' '^' '_' */
+ U_, U_, U_, U_, U_, U_, U_, U_, U_, U_, U_, G_, G_, G_, G_, G_,
+/*'`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' */
+ G_, L_, L_, L_, L_, L_, L_, L_, L_, L_, L_, L_, L_, L_, L_, L_,
+/*'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' '{' '|' '}' '~' \177 */
+ L_, L_, L_, L_, L_, L_, L_, L_, L_, L_, L_, G_, G_, G_, G_, 0,
+};
+
+#undef P_
+#undef G_
+#undef N_
+#undef U_
+#undef L_
void
syms_of_character (void)
diff --git a/src/character.h b/src/character.h
index 5931c5c..6dc95ad 100644
--- a/src/character.h
+++ b/src/character.h
@@ -652,8 +652,78 @@ typedef enum {
UNICODE_CATEGORY_Cs,
UNICODE_CATEGORY_Co,
UNICODE_CATEGORY_Cn
+ /* Don’t forget to extend category_char_bits in character.c when new entries
+ are added here. */
} unicode_category_t;
+extern unicode_category_t char_unicode_category (int);
+
+/* Limited set of character categories which syntax-independent. Testing of
+ * those characters do not require any run-time data, e.g. do not depend on
+ * syntax table. */
+#define CHAR_BIT_ALNUM (1 << 0)
+#define CHAR_BIT_ALPHA (1 << 1)
+#define CHAR_BIT_UPPER (1 << 2)
+#define CHAR_BIT_LOWER (1 << 3)
+#define CHAR_BIT_GRAPH (1 << 4)
+#define CHAR_BIT_PRINT (1 << 5)
+
+/* Map from Unicode general category to character classes the character is in.
+ *
+ * Only character classes defined by CHAR_BIT_* above are present.
+ *
+ * This is an array of bit fields so for example ‘category_char_bits[gc] &
+ * CHAR_BIT_ALPHA’ tells you whether characters in general category GC are
+ * alphabetic or not. */
+extern const unsigned char category_char_bits[];
+
+/* Map from ASCII character to character classes the character is in.
+ *
+ * Only character classes defined by CHAR_BIT_* above are present.
+ *
+ * This is an array of bit fields so for example ‘ascii_char_bits[ch] &
+ * CHAR_BIT_ALPHA’ tells you whether character CH is alphabetic or not. */
+extern const unsigned char ascii_char_bits[128];
+
+#define DEFINE_CHAR_TEST(name, bit) \
+ static inline bool unicode_ ## name (int c) { \
+ return category_char_bits[char_unicode_category(c)] & bit; \
+ } \
+ static inline bool _ascii_ ## name (int c) { \
+ return ascii_char_bits[c] & bit; \
+ } \
+ static inline bool name (int c) { \
+ return (unsigned)c < (sizeof ascii_char_bits / sizeof *ascii_char_bits) ? \
+ _ascii_ ## name (c) : unicode_ ## name (c); \
+ }
+
+/* For TEST being one of:
+ alphanumericp
+ alphabeticp
+ uppercasep
+ lowercasep
+ graphicp
+ printablep
+ define
+ bool TEST (int c);
+ bool unicode_TEST (int c);
+ bool _ascii_TEST (int c);
+ which test whether C has given character property. TEST works for any
+ character, Unicode or not. unicode_TEST works for any character as well but
+ is potentially slower for ASCII characters (since it requires Unicode
+ category lookup). _ascii_TEST works for ASCII characters only and creates
+ naked singularity if non-ASCII character is passed to it. */
+
+DEFINE_CHAR_TEST (alphanumericp, CHAR_BIT_ALNUM)
+DEFINE_CHAR_TEST (alphabeticp, CHAR_BIT_ALPHA)
+DEFINE_CHAR_TEST (uppercasep, CHAR_BIT_UPPER)
+DEFINE_CHAR_TEST (lowercasep, CHAR_BIT_LOWER)
+DEFINE_CHAR_TEST (graphicp, CHAR_BIT_GRAPH)
+DEFINE_CHAR_TEST (printablep, CHAR_BIT_PRINT)
+
+#undef DEFINE_CHAR_TEST
+
+
extern EMACS_INT char_resolve_modifier_mask (EMACS_INT) ATTRIBUTE_CONST;
extern int char_string (unsigned, unsigned char *);
extern int string_char (const unsigned char *,
@@ -676,13 +746,6 @@ extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t,
extern Lisp_Object Vchar_unify_table;
extern Lisp_Object string_escape_byte8 (Lisp_Object);
-extern bool uppercasep (int);
-extern bool lowercasep (int);
-extern bool alphabeticp (int);
-extern bool alphanumericp (int);
-extern bool graphicp (int);
-extern bool printablep (int);
-
/* Return a translation table of id number ID. */
#define GET_TRANSLATION_TABLE(id) \
(XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)]))
diff --git a/src/regex.c b/src/regex.c
index 1917a84..02da1fb 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -313,6 +313,11 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
/* The rest must handle multibyte characters. */
+# define ISALNUM(c) alphanumericp (c)
+# define ISALPHA(c) alphabeticp (c)
+# define ISUPPER(c) uppercasep (c)
+# define ISLOWER(c) lowercasep (c)
+
# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \
: graphicp (c))
@@ -321,19 +326,6 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
: printablep (c))
-# define ISALNUM(c) (IS_REAL_ASCII (c) \
- ? (((c) >= 'a' && (c) <= 'z') \
- || ((c) >= 'A' && (c) <= 'Z') \
- || ((c) >= '0' && (c) <= '9')) \
- : alphanumericp (c))
-
-# define ISALPHA(c) (IS_REAL_ASCII (c) \
- ? (((c) >= 'a' && (c) <= 'z') \
- || ((c) >= 'A' && (c) <= 'Z')) \
- : alphabeticp (c))
-
-# define ISLOWER(c) lowercasep (c)
-
# define ISPUNCT(c) (IS_REAL_ASCII (c) \
? ((c) > ' ' && (c) < 0177 \
&& !(((c) >= 'a' && (c) <= 'z') \
@@ -343,8 +335,6 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
-# define ISUPPER(c) uppercasep (c)
-
# define ISWORD(c) (SYNTAX (c) == Sword)
#else /* not emacs */
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case
2016-10-04 1:10 ` bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case Michal Nazarewicz
@ 2016-10-04 7:48 ` Eli Zaretskii
2016-10-17 13:22 ` Michal Nazarewicz
2016-11-06 19:26 ` Michal Nazarewicz
0 siblings, 2 replies; 89+ messages in thread
From: Eli Zaretskii @ 2016-10-04 7:48 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Date: Tue, 4 Oct 2016 03:10:39 +0200
>
> +const unsigned char category_char_bits[] = {
> + [UNICODE_CATEGORY_UNKNOWN] = 0,
> + [UNICODE_CATEGORY_Lu] = CHAR_BIT_ALPHA_ | CHAR_BIT_UPPER,
> + [UNICODE_CATEGORY_Ll] = CHAR_BIT_ALPHA_ | CHAR_BIT_LOWER,
Is this syntax portable enough for us to use it?
> +/* Limited set of character categories which syntax-independent. Testing of
^^^^^^^^^^^^^^^^^^^^^^^^
"which are syntax-independent"
> + * those characters do not require any run-time data, e.g. do not depend on
^^^^^^^^^^^^^^ ^^^^^^^^^^^^^
"does not require" and "does not depend"
Thanks. I think this change will require a benchmark to make sure we
don't lose too much in terms of performance.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case
2016-10-04 7:48 ` Eli Zaretskii
@ 2016-10-17 13:22 ` Michal Nazarewicz
2016-11-06 19:26 ` Michal Nazarewicz
1 sibling, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-17 13:22 UTC (permalink / raw)
To: Eli Zaretskii; +Cc: 24603
On Tue, Oct 04 2016, Eli Zaretskii wrote:
>> From: Michal Nazarewicz <mina86@mina86.com>
>> Date: Tue, 4 Oct 2016 03:10:39 +0200
>>
>> +const unsigned char category_char_bits[] = {
>> + [UNICODE_CATEGORY_UNKNOWN] = 0,
>> + [UNICODE_CATEGORY_Lu] = CHAR_BIT_ALPHA_ | CHAR_BIT_UPPER,
>> + [UNICODE_CATEGORY_Ll] = CHAR_BIT_ALPHA_ | CHAR_BIT_LOWER,
>
> Is this syntax portable enough for us to use it?
It’s a C99 feature and C99 compiler is required since Emacs 25.1.
>> +/* Limited set of character categories which syntax-independent. Testing of
> ^^^^^^^^^^^^^^^^^^^^^^^^
> "which are syntax-independent"
>
>> + * those characters do not require any run-time data, e.g. do not depend on
> ^^^^^^^^^^^^^^ ^^^^^^^^^^^^^
> "does not require" and "does not depend"
Both done.
> Thanks. I think this change will require a benchmark to make sure we
> don't lose too much in terms of performance.
Will do.
--
Best regards
ミハウ “𝓶𝓲𝓷𝓪86” ナザレヴイツ
«If at first you don’t succeed, give up skydiving»
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case
2016-10-04 7:48 ` Eli Zaretskii
2016-10-17 13:22 ` Michal Nazarewicz
@ 2016-11-06 19:26 ` Michal Nazarewicz
2016-11-06 19:44 ` Eli Zaretskii
1 sibling, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-11-06 19:26 UTC (permalink / raw)
To: Eli Zaretskii; +Cc: 24603
On Tue, Oct 04 2016, Eli Zaretskii wrote:
> Thanks. I think this change will require a benchmark to make sure we
> don't lose too much in terms of performance.
Benchmark and its results included below.
It’s a bit noisy and as all benchmarks of that kind it doesn’t really
measure the real usage, but I think it’s safe to say that things aren’t
getting worse.
---- >8 ------------------------------------------------------------------------
Class [[:cc:]] no-case [^[:cc:]] no-case
--------- --------- --------- --------- ---------
==== Add regex character class matching benchmark ====
alnum 59.870 60.148 63.548 64.048
alpha 60.355 60.137 63.333 62.684
digit 27.835 27.648 0.513 0.488
xdigit 27.160 27.320 0.969 0.883
upper 91.027 91.572 39.423 39.595
lower 60.591 61.307 60.332 59.730
word 36.201 36.046 108.118 109.396
punct 110.987 111.683 35.110 35.200
cntrl 27.005 26.756 1.212 1.176
graph 25.694 26.097 75.872 75.711
print 24.783 24.976 76.652 74.921
space 147.210 148.431 1.261 1.252
blank 27.602 27.722 0.373 0.189
ascii 23.243 23.302 4.550 4.486
nonascii 5.448 5.407 90.733 90.410
unibyte 22.986 23.342 4.559 4.655
multibyte 5.508 5.535 92.457 91.163
...all... 1.138 1.030 93.275 93.383
==== Refactor character class checking; optimise ASCII case ====
alnum 54.643 54.301 56.668 56.898
alpha 54.654 54.558 56.134 56.281
digit 26.103 26.044 0.495 0.443
xdigit 25.606 25.690 0.815 0.806
upper 83.269 83.306 36.704 36.487
lower 56.278 55.804 54.872 54.917
word 34.820 55.092 99.577 100.618
punct 103.410 103.465 31.673 31.590
cntrl 25.509 25.274 1.119 1.101
graph 23.593 23.673 69.335 69.481
print 23.003 23.123 69.962 70.132
space 132.224 132.458 1.143 1.120
blank 26.223 26.342 0.193 0.187
ascii 22.329 22.257 4.094 4.082
nonascii 4.910 4.897 84.633 84.515
unibyte 22.866 22.385 4.094 4.078
multibyte 4.913 4.886 95.385 85.341
...all... 0.942 0.936 88.979 88.744
==== Optimise character class matching in regexes ====
alnum 53.338 53.052 56.571 56.434
alpha 53.591 53.350 56.218 56.255
digit 26.266 26.502 0.438 0.438
xdigit 25.793 25.887 0.877 0.876
upper 82.539 82.700 31.994 32.200
lower 55.280 55.040 54.615 54.429
word 33.666 33.530 100.678 101.721
punct 101.714 101.715 31.766 31.620
cntrl 25.669 25.068 1.113 1.114
graph 27.848 28.067 81.669 81.619
print 27.128 28.297 82.326 82.306
space 131.847 132.242 1.124 1.128
blank 26.493 26.607 0.190 0.188
ascii 22.332 22.315 4.379 4.358
nonascii 5.169 5.159 84.872 85.488
unibyte 22.259 22.529 4.374 4.361
multibyte 5.193 5.181 86.421 86.568
...all... 0.945 0.939 92.903 93.209
==== Fix case-fold-search character class matching ====
alnum 53.553 53.527 56.918 56.886
alpha 53.657 53.758 56.541 57.107
digit 26.616 26.641 0.467 0.510
xdigit 27.255 26.271 0.894 0.923
upper 56.608 55.073 55.792 55.422
lower 55.419 55.330 55.486 55.018
word 35.537 35.434 103.414 103.516
punct 105.810 106.618 33.454 33.322
cntrl 25.875 26.020 1.274 1.271
graph 28.011 28.185 82.239 82.245
print 26.935 27.016 99.945 83.213
space 136.774 138.135 1.170 1.159
blank 26.984 26.976 0.192 0.204
ascii 22.365 22.661 4.652 4.652
nonascii 5.759 5.524 85.805 86.403
unibyte 22.568 22.375 4.995 4.909
multibyte 5.729 5.749 84.671 84.396
...all... 0.990 0.978 89.520 89.612
All times in ms; lower is better.
---- >8 ------------------------------------------------------------------------
From 23d8fe0b093730406b64e0e20207c2fb929f707f Mon Sep 17 00:00:00 2001
From: Michal Nazarewicz <mina86@mina86.com>
Date: Fri, 7 Oct 2016 02:44:30 +0200
Subject: [PATCH] Add regex character class matching benchmark
* test/src/regex-tests.el (regex-tests-benchmark-cc-match): New function
running character class matching benchmark.
---
test/src/regex-tests.el | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 59 insertions(+)
diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el
index fc50344..d0aad97 100644
--- a/test/src/regex-tests.el
+++ b/test/src/regex-tests.el
@@ -98,6 +98,65 @@ regex--test-cc
(eval `(ert-deftest ,name () ,doc ,(cons 'regex--test-cc test)) t)))
+(defun regex-tests-benchmark-cc-match ()
+ "Benchmark regex character class matching."
+ (interactive)
+ (let* ((prn (if (called-interactively-p)
+ 'insert
+ (lambda (&rest args) (mapc 'princ args))))
+ (strings
+ (nconc (list
+ (apply 'string (number-sequence 32 126))
+ (apply 'string (number-sequence 0 127))
+ (apply 'unibyte-string (number-sequence 128 255))
+ (concat (apply 'string (number-sequence 0 255))
+ (apply 'unibyte-string (number-sequence 128 255)))
+ (make-string 10000 #x3FFF80)
+ (make-string 10000 #x3FFFFF))
+ (mapcar (lambda (ch) (make-string 10000 ch))
+ (number-sequence 0 256))))
+
+ (ccs '("alnum" "alpha" "digit" "xdigit" "upper" "lower"
+ "word" "punct" "cntrl" "graph" "print" "space" "blank"
+ "ascii" "nonascii" "unibyte" "multibyte"))
+
+ (benchmark-re
+ (lambda (re)
+ (dolist (cf '(nil t))
+ ;; Compile the regex so it ends up in cache.
+ (string-match re "")
+ (let ((res (benchmark-run 10
+ (dolist (str strings) (string-match re str)))))
+ (funcall prn (format " %10.3f"
+ (* (- (nth 0 res) (nth 2 res)) 100))))))))
+
+ (when (called-interactively-p)
+ (switch-to-buffer (get-buffer-create "*Regex Benchmark*"))
+ (delete-region (point-min) (point-max)))
+
+ (funcall prn (format "%-9s %-9s %-9s %-9s %-9s\n"
+ "Class" "[[:cc:]]" "no-case"
+ "[^[:cc:]]" "no-case")
+ (make-string 9 ?-)
+ " " (make-string 9 ?-) " " (make-string 9 ?-)
+ " " (make-string 9 ?-) " " (make-string 9 ?-) "\n")
+
+ (dolist (cc ccs)
+ (funcall prn (format "%-9s" cc))
+ (dolist (re (list (format "[[:%s:]]" cc)
+ (format "[^[:%s:]]" cc)))
+ (funcall benchmark-re re))
+ (funcall prn "\n"))
+
+ (funcall prn (format "%-9s" "...all..."))
+ (let ((all-ccs (mapconcat (lambda (cc) (format "[:%s:]" cc)) ccs "")))
+ (funcall benchmark-re (concat "[" all-ccs "]"))
+ (funcall benchmark-re (concat "[^" all-ccs "]")))
+
+ (funcall prn "\n" (make-string 53 ?-)
+ "\nAll times in ms; lower is better.\n")))
+
+
(defmacro regex-tests-generic-line (comment-char test-file whitelist &rest body)
"Reads a line of the test file TEST-FILE, skipping
comments (defined by COMMENT-CHAR), and evaluates the tests in
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case
2016-11-06 19:26 ` Michal Nazarewicz
@ 2016-11-06 19:44 ` Eli Zaretskii
2016-12-20 14:32 ` Michal Nazarewicz
0 siblings, 1 reply; 89+ messages in thread
From: Eli Zaretskii @ 2016-11-06 19:44 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Cc: 24603@debbugs.gnu.org
> Date: Sun, 06 Nov 2016 20:26:11 +0100
>
> On Tue, Oct 04 2016, Eli Zaretskii wrote:
> > Thanks. I think this change will require a benchmark to make sure we
> > don't lose too much in terms of performance.
>
> Benchmark and its results included below.
>
> It’s a bit noisy and as all benchmarks of that kind it doesn’t really
> measure the real usage, but I think it’s safe to say that things aren’t
> getting worse.
Thanks. What happened here:
> ==== Refactor character class checking; optimise ASCII case ====
> alnum 54.643 54.301 56.668 56.898
> alpha 54.654 54.558 56.134 56.281
> digit 26.103 26.044 0.495 0.443
> xdigit 25.606 25.690 0.815 0.806
> upper 83.269 83.306 36.704 36.487
> lower 56.278 55.804 54.872 54.917
> word 34.820 55.092 99.577 100.618
^^^^^^
Is this slow-down real?
The rest of the results look fine to me.
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case
2016-11-06 19:44 ` Eli Zaretskii
@ 2016-12-20 14:32 ` Michal Nazarewicz
2016-12-20 16:39 ` Eli Zaretskii
0 siblings, 1 reply; 89+ messages in thread
From: Michal Nazarewicz @ 2016-12-20 14:32 UTC (permalink / raw)
To: Eli Zaretskii; +Cc: 24603
Sorry about the delay. I hope I’ll have some time during Xmas to work
on this more.
On Sun, Nov 06 2016, Eli Zaretskii wrote:
>> From: Michal Nazarewicz <mina86@mina86.com>
>> Cc: 24603@debbugs.gnu.org
>> Date: Sun, 06 Nov 2016 20:26:11 +0100
>>
>> On Tue, Oct 04 2016, Eli Zaretskii wrote:
>> > Thanks. I think this change will require a benchmark to make sure we
>> > don't lose too much in terms of performance.
>>
>> Benchmark and its results included below.
>>
>> It’s a bit noisy and as all benchmarks of that kind it doesn’t really
>> measure the real usage, but I think it’s safe to say that things aren’t
>> getting worse.
>
> Thanks. What happened here:
>
>> ==== Refactor character class checking; optimise ASCII case ====
>> alnum 54.643 54.301 56.668 56.898
>> alpha 54.654 54.558 56.134 56.281
>> digit 26.103 26.044 0.495 0.443
>> xdigit 25.606 25.690 0.815 0.806
>> upper 83.269 83.306 36.704 36.487
>> lower 56.278 55.804 54.872 54.917
>> word 34.820 55.092 99.577 100.618
> ^^^^^^
> Is this slow-down real?
I’ve re-run the benchmarks five times and taken averages. Based on that,
this slow-down does not appear to be real, but it seems there are some
others which I haven’t noticed previously:
Class [[:cc:]] no-case [^[:cc:]] no-case
--------- -------------------- -------------------- -------------------- --------------------
alnum 56.772 54.973 58.132 58.388
-1.385 -2.44% 0.571 +1.04% -0.041 -0.07% -0.346 -0.59%
-1.539 -2.71% 0.198 +0.36% -0.967 -1.66% -1.272 -2.18%
-3.017 -5.31% 2.990 +5.44% -1.013 -1.74% -1.681 -2.88%
-3.850 -6.78% -1.229 -2.24% -0.086 -0.15% -1.453 -2.49%
--------- -------------------- -------------------- -------------------- --------------------
alpha 54.386 54.380 56.698 58.332
1.135 +2.09% 0.892 +1.64% 0.587 +1.04% 0.667 +1.14%
1.052 +1.93% 1.108 +2.04% 0.661 +1.17% -1.555 -2.67%
-0.338 -0.62% -0.235 -0.43% -0.363 -0.64% -1.788 -3.06%
-1.068 -1.96% -0.541 -1.00% -0.182 -0.32% -2.659 -4.56%
--------- -------------------- -------------------- -------------------- --------------------
digit 26.416 26.574 0.454 0.455
0.203 +0.77% -0.030 -0.11% -0.010 -2.20% -0.007 -1.58%
0.138 +0.52% -0.013 -0.05% -0.006 -1.28% -0.008 -1.71%
-0.021 -0.08% -0.161 -0.61% -0.014 -3.08% -0.018 -4.04%
-0.293 -1.11% -0.417 -1.57% -0.003 -0.57% -0.009 -2.02%
--------- -------------------- -------------------- -------------------- --------------------
xdigit 26.015 25.956 0.902 0.898
0.194 +0.75% 0.186 +0.72% -0.074 -8.20% -0.075 -8.33%
1.092 +4.20% 0.191 +0.74% -0.073 -8.13% -0.070 -7.84%
-0.003 -0.01% 0.239 +0.92% -0.084 -9.35% -0.083 -9.22%
-0.345 -1.32% -0.124 -0.48% -0.069 -7.62% -0.060 -6.64%
--------- -------------------- -------------------- -------------------- --------------------
upper 83.257 82.562 41.189 41.284
3.298 +3.96% 0.683 +0.83% -4.733 -11.49% -3.970 -9.62%
1.791 +2.15% 3.616 +4.38% -3.875 -9.41% -3.845 -9.31%
0.045 +0.05% 5.854 +7.09% -8.977 -21.80% -9.105 -22.05%
-28.204 -33.88% -27.548 -33.37% 13.052 +31.69% 12.946 +31.36%
--------- -------------------- -------------------- -------------------- --------------------
lower 64.299 64.218 61.111 62.093
-7.671 -11.93% -8.443 -13.15% -6.356 -10.40% -7.320 -11.79%
-7.251 -11.28% -5.967 -9.29% -5.593 -9.15% -6.500 -10.47%
-7.901 -12.29% -8.447 -13.15% -6.268 -10.26% -7.304 -11.76%
-9.213 -14.33% -9.183 -14.30% -4.879 -7.98% -7.422 -11.95%
--------- -------------------- -------------------- -------------------- --------------------
word 35.618 37.086 104.661 105.706
0.198 +0.55% -1.206 -3.25% 1.497 +1.43% 2.618 +2.48%
0.614 +1.72% 0.263 +0.71% 1.618 +1.55% 2.099 +1.99%
0.692 +1.94% -0.403 -1.09% -2.975 -2.84% -3.099 -2.93%
-1.210 -3.40% -1.759 -4.74% -3.491 -3.34% -3.722 -3.52%
--------- -------------------- -------------------- -------------------- --------------------
punct 107.447 107.661 33.509 33.453
3.037 +2.83% 1.931 +1.79% 0.640 +1.91% 0.596 +1.78%
3.106 +2.89% 4.309 +4.00% 0.539 +1.61% 0.680 +2.03%
-0.588 -0.55% 3.730 +3.46% -1.138 -3.40% -1.046 -3.13%
1.013 +0.94% 2.857 +2.65% 1.679 +5.01% -1.142 -3.41%
--------- -------------------- -------------------- -------------------- --------------------
cntrl 25.770 25.718 1.246 1.229
0.115 +0.45% 0.150 +0.58% -0.068 -5.47% -0.063 -5.11%
0.031 +0.12% 0.112 +0.44% -0.087 -7.00% -0.057 -4.64%
-0.089 -0.35% -0.034 -0.13% -0.103 -8.30% -0.088 -7.16%
-0.410 -1.59% -0.334 -1.30% -0.047 -3.77% -0.043 -3.53%
--------- -------------------- -------------------- -------------------- --------------------
graph 23.703 23.595 69.221 70.017
0.306 +1.29% 0.245 +1.04% 0.592 +0.85% -0.146 -0.21%
1.838 +7.75% 0.641 +2.71% 0.517 +0.75% -0.316 -0.45%
4.503 +19.00% 4.599 +19.49% 13.219 +19.10% 15.108 +21.58%
6.628 +27.96% 4.209 +17.84% 12.004 +17.34% 11.160 +15.94%
--------- -------------------- -------------------- -------------------- --------------------
print 22.798 22.781 69.873 69.795
0.670 +2.94% 0.607 +2.67% 0.826 +1.18% 0.699 +1.00%
1.225 +5.37% 1.171 +5.14% 2.049 +2.93% 1.427 +2.04%
4.540 +19.91% 4.574 +20.08% 14.046 +20.10% 17.268 +24.74%
4.178 +18.33% 4.188 +18.38% 12.189 +17.44% 12.351 +17.70%
--------- -------------------- -------------------- -------------------- --------------------
space 141.314 144.661 1.130 1.125
0.331 +0.23% -3.395 -2.35% 0.011 +0.99% 0.011 +0.94%
2.535 +1.79% 0.202 +0.14% 0.029 +2.53% 0.029 +2.60%
-5.808 -4.11% -6.856 -4.74% -0.001 -0.11% 0.076 +6.79%
-6.470 -4.58% -9.847 -6.81% 0.010 +0.85% 0.005 +0.48%
--------- -------------------- -------------------- -------------------- --------------------
blank 26.706 26.740 0.183 0.181
0.147 +0.55% -0.009 -0.04% 0.003 +1.74% 0.004 +2.10%
1.461 +5.47% 0.091 +0.34% 0.006 +3.05% 0.007 +3.99%
3.021 +11.31% 0.591 +2.21% -0.002 -0.98% 0.000 +0.11%
-0.305 -1.14% -0.372 -1.39% -0.001 -0.33% 0.000 +0.22%
--------- -------------------- -------------------- -------------------- --------------------
ascii 22.202 22.140 4.722 4.751
0.489 +2.20% 0.601 +2.71% -0.493 -10.44% -0.436 -9.18%
0.625 +2.81% 0.597 +2.69% -0.397 -8.41% -0.436 -9.18%
0.348 +1.57% 1.043 +4.71% 0.287 +6.08% 0.249 +5.25%
-0.033 -0.15% 0.826 +3.73% 0.398 +8.42% 0.251 +5.29%
--------- -------------------- -------------------- -------------------- --------------------
nonascii 5.586 5.544 85.792 83.721
-0.392 -7.02% -0.405 -7.30% 5.600 +6.53% 1.420 +1.70%
-0.459 -8.21% 0.213 +3.84% 5.553 +6.47% 3.031 +3.62%
0.461 +8.25% -0.144 -2.59% 4.086 +4.76% 1.803 +2.15%
-0.368 -6.58% -0.296 -5.35% -0.947 -1.10% 1.088 +1.30%
--------- -------------------- -------------------- -------------------- --------------------
unibyte 22.166 22.172 5.299 5.403
0.545 +2.46% 0.533 +2.40% -1.041 -19.65% -1.140 -21.09%
1.187 +5.36% 0.843 +3.80% -1.068 -20.16% -1.182 -21.87%
0.429 +1.94% 0.385 +1.74% -1.043 -19.69% -1.163 -21.52%
0.237 +1.07% 0.063 +0.28% -0.915 -17.26% -1.025 -18.98%
--------- -------------------- -------------------- -------------------- --------------------
multibyte 6.031 5.571 83.834 85.391
-0.875 -14.50% -0.432 -7.75% 1.855 +2.21% -0.073 -0.09%
-0.902 -14.96% -0.440 -7.89% 7.195 +8.58% 1.665 +1.95%
-0.904 -14.99% -0.531 -9.53% 2.005 +2.39% 0.094 +0.11%
-0.786 -13.03% -0.336 -6.04% 0.692 +0.83% 1.607 +1.88%
--------- -------------------- -------------------- -------------------- --------------------
...all... 0.928 0.927 89.115 89.857
0.080 +8.60% 0.076 +8.22% -0.314 -0.35% 5.126 +5.70%
0.058 +6.23% 0.058 +6.30% -0.304 -0.34% 0.038 +0.04%
0.002 +0.19% 0.001 +0.11% -0.413 -0.46% -1.742 -1.94%
0.037 +3.97% 0.034 +3.64% 0.824 +0.92% -1.253 -1.39%
(The first line in each group is the absolute result with Emacs before my
changes. The other lines show the absolute and relative change from that
baseline (i.e. negative is good).)
Slow-downs in intermediate commits aren’t that big of an issue as long
as the last line shows an improvement (or at least negligible
regression) but sadly that is not always the case.
As can be seen, [[:graph:]] slows down by almost 28% :( and I don’t
quite understand where all that can come from.
--
Best regards
ミハウ “𝓶𝓲𝓷𝓪86” ナザレヴイツ
«If at first you don’t succeed, give up skydiving»
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case
2016-12-20 14:32 ` Michal Nazarewicz
@ 2016-12-20 16:39 ` Eli Zaretskii
2016-12-22 14:02 ` Michal Nazarewicz
0 siblings, 1 reply; 89+ messages in thread
From: Eli Zaretskii @ 2016-12-20 16:39 UTC (permalink / raw)
To: Michal Nazarewicz; +Cc: 24603
> From: Michal Nazarewicz <mina86@mina86.com>
> Cc: 24603@debbugs.gnu.org
> Date: Tue, 20 Dec 2016 15:32:27 +0100
>
> As can be seen, [[:graph:]] slows-down by almost 28% :( and I don’t
> quite understand where all that can come from.
If no ideas come up, perhaps run this under perf?
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case
2016-12-20 16:39 ` Eli Zaretskii
@ 2016-12-22 14:02 ` Michal Nazarewicz
0 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-12-22 14:02 UTC (permalink / raw)
To: Eli Zaretskii; +Cc: 24603
On Tue, Dec 20 2016, Eli Zaretskii wrote:
>> From: Michal Nazarewicz <mina86@mina86.com>
>> Cc: 24603@debbugs.gnu.org
>> Date: Tue, 20 Dec 2016 15:32:27 +0100
>>
>> As can be seen, [[:graph:]] slows-down by almost 28% :( and I don’t
>> quite understand where all that can come from.
>
> If no ideas come up, perhaps run this under perf?
Yes, that’s what I’m trying to do.
It gets a bit convoluted since the benchmark attempts to test C code
with Elisp code, which adds various superfluous entries to the profile
(e.g. Finotify_valid_p takes a third of the time but I doubt it has
anything to do with character class matching).
On top of that, the generated gmon.out doesn’t include call traces which
makes it harder to track what is actually going on.
But hopefully I’ll figure something out.
--
Best regards
ミハウ “𝓶𝓲𝓷𝓪86” ナザレヴイツ
«If at first you don’t succeed, give up skydiving»
^ permalink raw reply [flat|nested] 89+ messages in thread
* bug#24603: [RFC 17/18] Optimise character class matching in regexes
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (14 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
2016-10-04 1:10 ` bug#24603: [RFC 18/18] Fix case-fold-search character class matching Michal Nazarewicz
16 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
Use lookup tables defined in src/character.h to bundle checks together
if possible. For example, ‘[[:lower:][:digit:]]’ used to perform the
equivalent of a ‘lowercasep(ch) || numericp(ch)’ check. Now, such checks
are done all at once with at most one Unicode general category lookup.
Similarly, do at most one syntax table lookup by unrolling macros
testing character properties.
* src/regex.c (execute_charset): Use category_char_bits and call SYNTAX
at most once.
* test/src/regex-tests.el (regex-tests--letter-character-classes): New
test case for various character classes relating to letters etc.
---
src/regex.c | 86 +++++++++++++++++++++++++++++++------------------
test/src/regex-tests.el | 45 ++++++++++++++++++++++++++
2 files changed, 100 insertions(+), 31 deletions(-)
diff --git a/src/regex.c b/src/regex.c
index 02da1fb..bfd04a1 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -1789,16 +1789,26 @@ struct range_table_work_area
/* Bits used to implement the multibyte-part of the various character classes
such as [:alnum:] in a charset's range table. The code currently assumes
that only the low 16 bits are used. */
-#define BIT_WORD 0x1
-#define BIT_LOWER 0x2
-#define BIT_PUNCT 0x4
-#define BIT_SPACE 0x8
-#define BIT_UPPER 0x10
-#define BIT_MULTIBYTE 0x20
-#define BIT_ALPHA 0x40
-#define BIT_ALNUM 0x80
-#define BIT_GRAPH 0x100
-#define BIT_PRINT 0x200
+#ifdef emacs
+# define BIT_ALNUM CHAR_BIT_ALNUM
+# define BIT_ALPHA CHAR_BIT_ALPHA
+# define BIT_UPPER CHAR_BIT_UPPER
+# define BIT_LOWER CHAR_BIT_LOWER
+# define BIT_GRAPH CHAR_BIT_GRAPH
+# define BIT_PRINT CHAR_BIT_PRINT
+#else
+# define BIT_ALNUM (1 << 0)
+# define BIT_ALPHA (1 << 1)
+# define BIT_UPPER (1 << 2)
+# define BIT_LOWER (1 << 3)
+# define BIT_GRAPH (1 << 4)
+# define BIT_PRINT (1 << 5)
+#endif
+#define BIT_WORD (BIT_PRINT << 1)
+#define BIT_PUNCT (BIT_PRINT << 2)
+#define BIT_SPACE (BIT_PRINT << 3)
+#define BIT_MULTIBYTE (BIT_PRINT << 4)
+
\f
/* Set the bit for character C in a list. */
@@ -1988,9 +1998,6 @@ re_wctype_parse (const unsigned char **strp, unsigned limit)
2 [:print:]
2 [:cntrl:]
1 [:ff:]
-
- If you update this list, consider also updating chain of or’ed conditions
- in execute_charset function.
*/
switch (it - beg) {
@@ -4657,28 +4664,45 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
else if (rtp)
{
int class_bits = CHARSET_RANGE_TABLE_BITS (p);
+ int bits;
re_wchar_t range_start, range_end;
- /* Sort tests by the most commonly used classes with some adjustment to which
- tests are easiest to perform. Take a look at comment in re_wctype_parse
- for table with frequencies of character class names. */
-
- if ((class_bits & BIT_MULTIBYTE) ||
- (class_bits & BIT_ALNUM && ISALNUM (c)) ||
- (class_bits & BIT_ALPHA && ISALPHA (c)) ||
- (class_bits & BIT_SPACE && ISSPACE (c)) ||
- (class_bits & BIT_WORD && ISWORD (c)) ||
- ((class_bits & BIT_UPPER) &&
- (ISUPPER (c) || (corig != c &&
- c == downcase (corig) && ISLOWER (c)))) ||
- ((class_bits & BIT_LOWER) &&
- (ISLOWER (c) || (corig != c &&
- c == upcase (corig) && ISUPPER(c)))) ||
- (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
- (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
- (class_bits & BIT_PRINT && ISPRINT (c)))
+ if (class_bits & BIT_MULTIBYTE)
return !not;
+ /* If we are at this point, the character is not an ASCII or single byte
+ character. This means that whenever ISFOO macros have special case for
+ IS_REAL_ASCII (c), we can ignore that. */
+
+ bits = class_bits & (BIT_ALNUM | BIT_ALPHA | BIT_UPPER | BIT_LOWER |
+ BIT_GRAPH | BIT_PRINT);
+ if (bits)
+ {
+ int char_bits = category_char_bits[char_unicode_category (c)];
+ if (bits & char_bits)
+ return !not;
+
+ /* Handle case folding. */
+ if (corig != c)
+ {
+ if ((bits & BIT_UPPER) && (char_bits & BIT_LOWER) &&
+ c == downcase (corig))
+ return !not;
+ if ((bits & BIT_LOWER) && (char_bits & BIT_UPPER) &&
+ c == upcase (corig))
+ return !not;
+ }
+ }
+
+ if (class_bits & (BIT_SPACE | BIT_WORD | BIT_PUNCT))
+ {
+ enum syntaxcode s = SYNTAX (c);
+ if (((class_bits & BIT_SPACE) && s == Swhitespace) ||
+ ((class_bits & BIT_WORD ) && s == Sword) ||
+ ((class_bits & BIT_PUNCT) && s != Sword))
+ return !not;
+ }
+
for (p = *pp; rtp < p; rtp += 2 * 3)
{
EXTRACT_CHARACTER (range_start, rtp);
diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el
index fc50344..7617823 100644
--- a/test/src/regex-tests.el
+++ b/test/src/regex-tests.el
@@ -98,6 +98,51 @@ regex--test-cc
(eval `(ert-deftest ,name () ,doc ,(cons 'regex--test-cc test)) t)))
+(ert-deftest regex-tests--letter-character-classes ()
+ "Test character classes against various letters types."
+ (should-not
+ (cl-remove-if
+ 'not
+ (let ((check-ccs (lambda (ch fold)
+ (mapconcat
+ (lambda (str) str)
+ (let ((case-fold-search fold))
+ (cl-remove-if-not
+ (lambda (cc)
+ (string-match-p (concat "[[:" cc ":]]")
+ (string ch)))
+ '("alnum" "alpha" "upper" "lower")))
+ " "))))
+ (mapcar
+ (lambda (entry)
+ (let ((ch (car entry)) (expected (cdr entry)))
+ (setq entry
+ (format "%s | %s | case-fold: %s"
+ (get-char-code-property ch 'general-category)
+ (funcall check-ccs ch nil) (funcall check-ccs ch t)))
+ (unless (string-equal expected entry)
+ (format "\n%c expected: %s\nU+%06X but got: %s"
+ ch expected ch entry))))
+ '((?A . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
+ (?ẞ . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
+ (?DZ . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
+ (?a . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ ;; FIXME: Should match upper when case-fold case
+ ;; (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ ;; (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ ;; (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ ;; (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ ;; (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?Dz . "Lt | alnum alpha | case-fold: alnum alpha upper lower")
+ (?ʰ . "Lm | alnum alpha | case-fold: alnum alpha")
+ (?º . "Lo | alnum alpha | case-fold: alnum alpha")))))))
+
+
(defmacro regex-tests-generic-line (comment-char test-file whitelist &rest body)
"Reads a line of the test file TEST-FILE, skipping
comments (defined by COMMENT-CHAR), and evaluates the tests in
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread
* bug#24603: [RFC 18/18] Fix case-fold-search character class matching
2016-10-04 1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
` (15 preceding siblings ...)
2016-10-04 1:10 ` bug#24603: [RFC 17/18] Optimise character class matching in regexes Michal Nazarewicz
@ 2016-10-04 1:10 ` Michal Nazarewicz
16 siblings, 0 replies; 89+ messages in thread
From: Michal Nazarewicz @ 2016-10-04 1:10 UTC (permalink / raw)
To: 24603
The upper and lower character classes should match any cased characters
when case-fold-search is enabled. So ‘[[:upper:]]’ should match ‘a’ but
also ‘ł’, ‘ß’ and ‘fi’. Fix character class tests to make that happen.
* src/character.h (CHAR_BIT_TITLE): New character bit for title case
characters (such as Dz).
* src/character.c (category_char_bits): Characters in Lt category are
title case; update lookup table.
* src/regex.c (re_wctype_to_bit): When case-folding is enabled return
any-case bits pattern for RECC_LOWER and RECC_UPPER.
(regex_compile): Update re_wctype_to_bit calls (it has new argument).
(execute_charset): Simplify case-folding case since now it’s encoded
in the bits. corig argument is no longer necessary.
(mutually_exclusive_p, re_match_2_internal): Update execute_charset calls
(it no longer has the corig argument).
* test/src/regex-tests.el (regex-tests--letter-character-classes): Fix
case-fold letter matching.
---
src/character.c | 2 +-
src/character.h | 5 +++--
src/regex.c | 53 ++++++++++++++++++++-----------------------------
test/src/regex-tests.el | 16 +++++----------
4 files changed, 30 insertions(+), 46 deletions(-)
diff --git a/src/character.c b/src/character.c
index 63f89d3..cf42f30 100644
--- a/src/character.c
+++ b/src/character.c
@@ -979,7 +979,7 @@ const unsigned char category_char_bits[] = {
[UNICODE_CATEGORY_UNKNOWN] = 0,
[UNICODE_CATEGORY_Lu] = CHAR_BIT_ALPHA_ | CHAR_BIT_UPPER,
[UNICODE_CATEGORY_Ll] = CHAR_BIT_ALPHA_ | CHAR_BIT_LOWER,
- [UNICODE_CATEGORY_Lt] = CHAR_BIT_ALPHA_,
+ [UNICODE_CATEGORY_Lt] = CHAR_BIT_ALPHA_ | CHAR_BIT_TITLE,
[UNICODE_CATEGORY_Lm] = CHAR_BIT_ALPHA_,
[UNICODE_CATEGORY_Lo] = CHAR_BIT_ALPHA_,
[UNICODE_CATEGORY_Mn] = CHAR_BIT_ALPHA_,
diff --git a/src/character.h b/src/character.h
index 6dc95ad..f2849e5 100644
--- a/src/character.h
+++ b/src/character.h
@@ -665,8 +665,9 @@ extern unicode_category_t char_unicode_category (int);
#define CHAR_BIT_ALPHA (1 << 1)
#define CHAR_BIT_UPPER (1 << 2)
#define CHAR_BIT_LOWER (1 << 3)
-#define CHAR_BIT_GRAPH (1 << 4)
-#define CHAR_BIT_PRINT (1 << 5)
+#define CHAR_BIT_TITLE (1 << 4)
+#define CHAR_BIT_GRAPH (1 << 5)
+#define CHAR_BIT_PRINT (1 << 6)
/* Map from Unicode general category to character classes the character is in.
*
diff --git a/src/regex.c b/src/regex.c
index bfd04a1..aa8c6ef 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -1794,6 +1794,7 @@ struct range_table_work_area
# define BIT_ALPHA CHAR_BIT_ALPHA
# define BIT_UPPER CHAR_BIT_UPPER
# define BIT_LOWER CHAR_BIT_LOWER
+# define BIT_TITLE CHAR_BIT_TITLE
# define BIT_GRAPH CHAR_BIT_GRAPH
# define BIT_PRINT CHAR_BIT_PRINT
#else
@@ -1801,8 +1802,9 @@ struct range_table_work_area
# define BIT_ALPHA (1 << 1)
# define BIT_UPPER (1 << 2)
# define BIT_LOWER (1 << 3)
-# define BIT_GRAPH (1 << 4)
-# define BIT_PRINT (1 << 5)
+# define BIT_TITLE (1 << 4)
+# define BIT_GRAPH (1 << 5)
+# define BIT_PRINT (1 << 6)
#endif
#define BIT_WORD (BIT_PRINT << 1)
#define BIT_PUNCT (BIT_PRINT << 2)
@@ -2067,7 +2069,7 @@ re_iswctype (int ch, re_wctype_t cc)
/* Return a bit-pattern to use in the range-table bits to match multibyte
chars of class CC. */
static int
-re_wctype_to_bit (re_wctype_t cc)
+re_wctype_to_bit (re_wctype_t cc, bool case_fold)
{
switch (cc)
{
@@ -2076,8 +2078,10 @@ re_wctype_to_bit (re_wctype_t cc)
case RECC_ALPHA: return BIT_ALPHA;
case RECC_ALNUM: return BIT_ALNUM;
case RECC_WORD: return BIT_WORD;
- case RECC_LOWER: return BIT_LOWER;
- case RECC_UPPER: return BIT_UPPER;
+ case RECC_LOWER:
+ return case_fold ? BIT_LOWER | BIT_UPPER | BIT_TITLE : BIT_LOWER;
+ case RECC_UPPER:
+ return case_fold ? BIT_LOWER | BIT_UPPER | BIT_TITLE : BIT_UPPER;
case RECC_PUNCT: return BIT_PUNCT;
case RECC_SPACE: return BIT_SPACE;
case RECC_GRAPH: return BIT_GRAPH;
@@ -2886,7 +2890,8 @@ regex_compile (const_re_char *pattern, size_t size,
SET_LIST_BIT (c1);
}
SET_RANGE_TABLE_WORK_AREA_BIT
- (range_table_work, re_wctype_to_bit (cc));
+ (range_table_work,
+ re_wctype_to_bit (cc, RE_TRANSLATE_P (translate)));
#endif /* emacs */
/* In most cases the matching rule for char classes only
uses the syntax table for multibyte chars, so that the
@@ -4633,11 +4638,10 @@ skip_noops (const_re_char *p, const_re_char *pend)
/* Test if C matches charset op. *PP points to the charset or charset_not
opcode. When the function finishes, *PP will be advanced past that opcode.
- C is character to test (possibly after translations) and CORIG is original
- character (i.e. without any translations). UNIBYTE denotes whether c is
- unibyte or multibyte character. */
+ C is character to test. UNIBYTE denotes whether c is unibyte or multibyte
+ character. */
static bool
-execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
+execute_charset (const_re_char **pp, unsigned c, bool unibyte)
{
re_char *p = *pp, *rtp = NULL;
bool not = (re_opcode_t) *p == charset_not;
@@ -4675,24 +4679,9 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
IS_REAL_ASCII (c), we can ignore that. */
bits = class_bits & (BIT_ALNUM | BIT_ALPHA | BIT_UPPER | BIT_LOWER |
- BIT_GRAPH | BIT_PRINT);
- if (bits)
- {
- int char_bits = category_char_bits[char_unicode_category (c)];
- if (bits & char_bits)
- return !not;
-
- /* Handle case folding. */
- if (corig != c)
- {
- if ((bits & BIT_UPPER) && (char_bits & BIT_LOWER) &&
- c == downcase (corig))
- return !not;
- if ((bits & BIT_LOWER) && (char_bits & BIT_UPPER) &&
- c == upcase (corig))
- return !not;
- }
- }
+ BIT_TITLE | BIT_GRAPH | BIT_PRINT);
+ if (bits && (category_char_bits[char_unicode_category (c)] & bits))
+ return !not;
if (class_bits & (BIT_SPACE | BIT_WORD | BIT_PUNCT))
{
@@ -4772,7 +4761,7 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
else if ((re_opcode_t) *p1 == charset
|| (re_opcode_t) *p1 == charset_not)
{
- if (!execute_charset (&p1, c, c, !multibyte || IS_REAL_ASCII (c)))
+ if (!execute_charset (&p1, c, !multibyte || IS_REAL_ASCII (c)))
{
DEBUG_PRINT (" No match => fast loop.\n");
return 1;
@@ -5482,7 +5471,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
case charset:
case charset_not:
{
- register unsigned int c, corig;
+ register unsigned int c;
int len;
/* Whether matching against a unibyte character. */
@@ -5492,7 +5481,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
(re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
PREFETCH ();
- corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
+ c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
if (target_multibyte)
{
int c1;
@@ -5524,7 +5513,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
}
p -= 1;
- if (!execute_charset (&p, c, corig, unibyte_char))
+ if (!execute_charset (&p, c, unibyte_char))
goto fail;
d += len;
diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el
index 7617823..4da9ab3 100644
--- a/test/src/regex-tests.el
+++ b/test/src/regex-tests.el
@@ -127,17 +127,11 @@ regex--test-cc
(?ẞ . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
(?DZ . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
(?a . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
- ;; FIXME: Should match upper when case-fold case
- ;; (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
- ;; (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
- ;; (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
- ;; (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
- ;; (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
- (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
- (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
- (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
- (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
- (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
(?Dz . "Lt | alnum alpha | case-fold: alnum alpha upper lower")
(?ʰ . "Lm | alnum alpha | case-fold: alnum alpha")
(?º . "Lo | alnum alpha | case-fold: alnum alpha")))))))
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related [flat|nested] 89+ messages in thread