unofficial mirror of bug-gnu-emacs@gnu.org 
 help / color / mirror / code / Atom feed
* bug#20140: 24.4; M17n shaper output rejected
@ 2015-03-18 22:20 Richard Wordingham
  2015-03-19  3:43 ` Eli Zaretskii
                   ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: Richard Wordingham @ 2015-03-18 22:20 UTC (permalink / raw)
  To: 20140

I am running Emacs 24.4 in a Ubuntu 12.04 Precise Pangolin
installation, for which the version of libm17n-0 is 1.6.3-1.  I am
attempting to induce Emacs to render the Tai Tham script.  There
appears to be a bug/feature in Emacs which makes this unnecessarily
difficult.

To achieve Tai Tham rendering, I added the following in new, loaded file
tai-tham.el:

(defvar tai-tham-composable-pattern
  (let ((table
	 ;; C is letters, independent vowels, digits, punctuation and symbols.
	 '(("C" . "[\u1A20-\u1A54\u1A80-\u1A89\u1A90-\u1A99\u1AA0-\u1AAD]")
	   ("M" . "[\u1A55-\u1A5E\u1A61-\u1A7C\u1A7F]") ; Mark
	   ("S" . "[\u1A75-\u1A7C]") ; Marks commuting with sakot
	   ("H" . "\u1A60") ; sakot
	   ("N" . "\u1A58"))) ; mai kang lai - also included in M.
	;; Which orthographic syllable mai kang lai belongs to can depend on the font!
	(regexp "C\\(M\\|HS*C?\\)*\\(NC\\(M\\|HS*C?\\)*\\)*N?"))
    (let ((case-fold-search nil))
      (dolist (elt table)
	(setq regexp (replace-regexp-in-string (car elt) (cdr elt)
					       regexp t t))))
    regexp))

(let ((elt (list (vector tai-tham-composable-pattern 0
'font-shape-gstring) (vector "." 0 'font-shape-gstring)
		 )))
  (set-char-table-range composition-function-table '(#x1A20 . #x1AAD)
  elt))

I added the following (cut-down) file LANA-OTF.flt to the m17n database:

(font layouter lana-otf nil
      (font (nil nil unicode-bmp :otf=lana)))
(category
 ;; H: SAKOT
 ;; N: Other character with non-zero canonical combining class
 ;; Z: Character with ccc=0 or other with ccc=9 
 (0x0000 0x1A5F ?Z)
 (0x1A60        ?H)
 (0x1A61 0x1A74 ?Z)
 (0x1A75 0x1A7C ?N)
 (0x1A7D 0xFFFF ?Z)
)

(generator
  (0
    (cond
      ("(H)(N+)" (2 = *) (1 =))
      ("." =)
    ) *
  )
)

(category
 ;; C: Consonant and non-mark (lenient processing)
 ;; H: SAKOT
 ;; P: Preposed vowel
 ;; R: Medial RA (preposed dependent consonant)
 ;; M: Mark
 (0x1A20 0x1A54 ?C)
 (0x1A55 0x1A55 ?R)
 (0x1A56 0x1A5E ?M)
 (0x1A5F        ?C) ; Unassigned
 (0x1A60        ?H)
 (0x1A61 0x1A6D ?M)
 (0x1A6E 0x1A72 ?P)
 (0x1A73 0x1A7C ?M)
 (0x1A7D 0x1A7E ?C) ; Unassigned
 (0x1A7F        ?M)
 (0x1A80 0x1A89 ?C)
 (0x1A8A 0x1A8F ?C) ; Unassigned
 (0x1A90 0x1A99 ?C)
 (0x1A9A 0x1A9F ?C) ; Unassigned
 (0x1AA0 0x1AAC ?C) ; Punctuation
 (0x1AAD        ?C) ; Can take a vowel!
 (0x1AAE 0x1AAF ?C) ; Unassigned
)

(generator
  (0
    (cond
      ("(C)(R|P)" (2 =) (1 =) )
      ("." =)
    )*
  )
)

(generator (0 otf:lana))

However, much Tai Tham text failed to render properly.  To determine
what was wrong, I added some monitoring code to ftfont.c:

*** ftfont.c.orig	2014-03-21 05:34:40.000000000 +0000
--- ftfont.c	2015-03-18 19:47:30.032718995 +0000
***************
*** 2516,2522 ****
--- 2516,2553 ----
      flt = mflt_get (msymbol ("combining"));
    for (i = 0; i < 3; i++)
      {
+       int k;
+       fprintf(stdout, "mflt_run(");
+       if (gstring.glyphs[0].encoded) {
+ 	for (k = 0; k < len; k++) {
+ 	  fprintf(stdout, " %d", gstring.glyphs[k].code);
+ 	}
+       } else {
+ 	for (k = 0; k < len; k++) {
+ 	  fprintf(stdout, " %4.4X", gstring.glyphs[k].c);
+ 	}
+       }
        int result = mflt_run (&gstring, 0, len, &flt_font_ft.flt_font, flt);
+       if (-1 == result) {
+ 	fprintf(stdout, ") failed.\n");
+       } else if (result >= 0) {
+ 	fprintf(stdout, ") produced (");
+ 	for (k = 0; k < result; k++) {
+ #if 0
+ 	  fprintf(stdout, " %d", gstring.glyphs[k].code);
+ #else
+ 	  fprintf(stdout, " %4.4X>%d:%d:%d",
+ 		  gstring.glyphs[k].c, gstring.glyphs[k].code,
+ 		  gstring.glyphs[k].from, gstring.glyphs[k].to);
+ #endif
+ 	}
+ 	fprintf(stdout, ")\n");
+ 	if (result != gstring.used) {
+ 	  fprintf(stdout, "Anomalously, gstring.used = %d\n",
+ 		  (int) gstring.used);
+ 	}
+ 	fflush(0);
+       }
        if (result != -2)
  	break;
        if (INT_MAX / 2 < gstring.allocated)

The sample Tai Tham text was:
;; ᩈᩣᩴᩁᩢ᩠ᨷᨽᩣᩈᩣᩃ᩶ᩣ᩠ᨶᨶᩣ / ᨣᩣᩴᨾᩮᩬᩥᨦ - ᩈᩢᨬ᩠ᨬᩣ ᨠ᩠᩵ᨷ ᩃ᩠᩶ᨯ ᨮ᩠ ᨳᩫ᩠᩵ᨶ
ᨠᩢ᩠᩵ᨷᨠᩫ᩠᩶ᨯᨿᩥ᩠ᨷᨶᩦ᩠᩵ᨷ
;; ᨣᩕ   ᨲᩱ

I extract and analyse what was rendered as shaped ('accepted') and what
was not ('rejected'), quoting the monitoring output.  I suspect the
problem is the strict testing of the from and to fields in Lisp function
font-shape-gstring, which is defined in file font.c.

The shaping of the following was accepted:
mflt_run( 1A48 1A63 1A74) produced ( 1A48>820:0:0 1A63>858:1:1 1A74>878:2:2)

mflt_run( 1A41 1A62 1A60 1A37) produced ( 1A41>813:0:1 1A62>853:0:1
0000>953:2:3)

mflt_run( 1A3D 1A63) produced ( 1A3D>808:0:0 1A63>858:1:1)

mflt_run( 1A48 1A63) produced ( 1A48>820:0:0 1A63>858:1:1)

mflt_run( 1A43 1A76 1A63 1A60 1A36) produced ( 1A43>815:0:1
1A76>890:0:1 1A63>858:2:4 0000>952:2:4) 

mflt_run( 1A36 1A63) produced ( 1A36>800:0:0 1A63>858:1:1)

mflt_run( 1A23 1A63 1A74) produced ( 1A23>777:0:0 0000>859:1:2)

mflt_run( 1A26) produced ( 1A26>780:0:0)

mflt_run( 1A48 1A62) produced ( 1A48>820:0:1 1A62>853:0:1)

mflt_run( 1A2C 1A60 1A2C 1A63) produced ( 0000>789:0:2 1A63>858:3:3)

mflt_run( 1A43 1A60 1A76 1A2F) produced ( 1A43>815:0:3 1A76>890:0:3
0000>941:0:3) 

mflt_run( 1A2E 1A60) produced ( 1A2E>792:0:1 1A60>851:0:1)

mflt_run( 1A33 1A6B 1A60 1A75 1A36) produced ( 1A33>797:0:4
1A6B>868:0:4 1A75>889:0:4 0000>952:0:4) 

mflt_run( 1A20 1A6B 1A76 1A60 1A2F) produced ( 1A20>774:0:4
1A6B>868:0:4 1A76>890:0:4 0000>941:0:4)

mflt_run( 1A3F 1A65 1A60 1A37) produced ( 1A3F>811:0:1 1A65>862:0:1
0000>953:2:3)

The shaping of the following, with vowels or MEDIAL RA that should be
rendered before the consonant, was rejected:

mflt_run( 1A3E 1A6E 1A6C 1A65) produced ( 1A6E>872:1:1 1A3E>810:0:3
1A6C>869:0:3 1A65>862:0:3) 

mflt_run( 1A23 1A55) produced ( 1A55>835:1:1 1A23>777:0:0)

mflt_run( 1A32 1A71) produced ( 1A71>875:1:1 1A32>796:0:0)

The problem is that the first glyph does not derive from the first
character.

The shaping of the following was rejected:

mflt_run( 1A20 1A60 1A75 1A37) produced ( 1A20>774:0:2 1A75>889:0:2
0000>953:1:3)

In this case, character 2 is stacked below character 0,
and characters 1 and 3 combine to form a spacing glyph.

mflt_run( 1A20 1A62 1A60 1A75 1A37) produced ( 1A20>774:0:1
1A62>853:0:3 1A75>889:0:3 0000>953:2:4)

Character 1 is mounted on character 0, and character 3 on character 1.
Characters 2 and 4 combine to form a spacing glyph.  

mflt_run( 1A36 1A66 1A75 1A60 1A37) produced ( 1A36>800:0:1
1A66>863:0:2 1A75>889:0:2 0000>953:3:4)

Character 1 is mounted on character 0, and character 2 on character 1.
Characters 3 and 4 form a spacing glyph.

There does appear to be a work around, which is to have m17n declare
the orthographic syllables it receives to be 'grapheme clusters'.  It
solves at least some of the problems above.  However, it then makes
editing of the 'clusters' more difficult.  Note that there are examples
above with 5 characters in a cluster, and this is by no means the limit.

Richard.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2015-03-18 22:20 bug#20140: 24.4; M17n shaper output rejected Richard Wordingham
@ 2015-03-19  3:43 ` Eli Zaretskii
  2015-03-21  8:33 ` K. Handa
  2022-02-03 21:21 ` Lars Ingebrigtsen
  2 siblings, 0 replies; 35+ messages in thread
From: Eli Zaretskii @ 2015-03-19  3:43 UTC (permalink / raw)
  To: Richard Wordingham, Kenichi Handa; +Cc: 20140

> Date: Wed, 18 Mar 2015 22:20:40 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> 
> I am running Emacs 24.4 in a Ubuntu 12.04 Precise Pangolin
> installation, for which the version of libm17n-0 is 1.6.3-1.  I am
> attempting to induce Emacs to render the Tai Tham script.  There
> appears to be a bug/feature in Emacs which makes this unnecessarily
> difficult.

Thanks for the report.  I hope Handa-san (CC'ed) could look into it.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2015-03-18 22:20 bug#20140: 24.4; M17n shaper output rejected Richard Wordingham
  2015-03-19  3:43 ` Eli Zaretskii
@ 2015-03-21  8:33 ` K. Handa
  2015-03-21 17:20   ` Wolfgang Jenkner
  2015-03-21 17:58   ` Richard Wordingham
  2022-02-03 21:21 ` Lars Ingebrigtsen
  2 siblings, 2 replies; 35+ messages in thread
From: K. Handa @ 2015-03-21  8:33 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140

In article <20150318222040.4066e6e9@JRWUBU2>, Richard Wordingham <richard.wordingham@ntlworld.com> writes:
[...]
> I extract and analyse what was rendered as shaped ('accepted') and what
> was not ('rejected'), quoting the monitoring output.  I suspect the
> problem is the strict testing of the from and to fields in Lisp function
> font-shape-gstring, which is defined in file font.c.
[...]
> The shaping of the following, with vowels or MEDIAL RA that should be
> rendered before the consonant, was rejected:

> mflt_run( 1A3E 1A6E 1A6C 1A65) produced ( 1A6E>872:1:1 1A3E>810:0:3
1A6C>869:0:3 1A65>862:0:3) 

If U+1A6E is displayed before U+1A3E, and they are in
different grapheme cluster, when you move point forward one
step by one, the cursor must move back and forth as below
(cursor is indicated by dashes):

 display: SPC 1A6E 1A3E+1A6C+1A65 SPC
 step 1:  ---    
 step 2:           --------------
 step 3:      ----
 step 4:                          ---

Is that what you want?

At least, the support for all Indic scripts (they have
characters in logical order as your example of Tai Tham
text) treats re-ordered glyphs as one grapheme cluster.
That is not only Emacs but also gtk (pango) applications.
Please try to move cursor over this Devanagari text "हिंदी" on
Emacs, gedit, and, for instance, firefox.  They all treat
that text as 2 grapheme clusters "हिं" and "दी".  The first
one corresponds to the character sequence U+935 U+93F, and
U+93F (vowel I) is displayed before U+935 (base consonant).

[...]

> There does appear to be a work around, which is to have m17n declare
> the orthographic syllables it receives to be 'grapheme clusters'.

I think that's the right solution; i.e. make all combined
and out-of-ordered glyphs as one cluster.

> It solves at least some of the problems above.

Which one is not solved by it?

> However, it then makes editing of the 'clusters' more
> difficult.  Note that there are examples above with 5
> characters in a cluster, and this is by no means the
> limit.

But, it seems that the current behavior is accepted, at
least, by Indic people.

By the way, I long ago proposed these commands which enable
you to move point into a grapheme cluster (by suppressing
composing of a cluster temporarily).  It worked in old Emacs (I
don't remember the version), but not in the latest Emacs.

(defun forward-char-intrusive ()
  (interactive)
  (setq disable-point-adjustment t)
  (forward-char 1))

(defun backward-char-intrusive ()
  (interactive)
  (setq disable-point-adjustment t)
  (forward-char -1))

(global-set-key (kbd "C-S-f") 'forward-char-intrusive)
(global-set-key (kbd "C-S-b") 'backward-char-intrusive)

---
K. Handa
handa@gnu.org





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2015-03-21  8:33 ` K. Handa
@ 2015-03-21 17:20   ` Wolfgang Jenkner
  2015-03-21 17:58   ` Richard Wordingham
  1 sibling, 0 replies; 35+ messages in thread
From: Wolfgang Jenkner @ 2015-03-21 17:20 UTC (permalink / raw)
  To: K. Handa; +Cc: 20140, Richard Wordingham

On Sat, Mar 21 2015, K. Handa wrote:

> By the way, I long ago proposed these commands which enables
> you to move point into a grapheme cluster (by suppressing
> composing of a cluster temporarily).  It worked in old Emacs (I
> don't remember the version), but not in the latest Emacs.
>
> (defun forward-char-intrusive ()
>   (interactive)
>   (setq disable-point-adjustment t)
>   (forward-char 1))

It actually works in trunk emacs, I think.  If we start with point at
the beginning of the word (I use the itrans transcription for clarity)

-!-hiMdI

then calling the function once /appears/ to leave point here

hiM-!-dI

but C-x = shows that it is really here

h-!-iMdI

And so on.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2015-03-21  8:33 ` K. Handa
  2015-03-21 17:20   ` Wolfgang Jenkner
@ 2015-03-21 17:58   ` Richard Wordingham
  2015-03-21 18:26     ` Eli Zaretskii
  2015-03-25 14:25     ` K. Handa
  1 sibling, 2 replies; 35+ messages in thread
From: Richard Wordingham @ 2015-03-21 17:58 UTC (permalink / raw)
  To: K. Handa; +Cc: 20140

On Sat, 21 Mar 2015 17:33:17 +0900
handa@gnu.org (K. Handa) wrote:

> In article <20150318222040.4066e6e9@JRWUBU2>, Richard Wordingham
> <richard.wordingham@ntlworld.com> writes: [...]
> > I extract and analyse what was rendered as shaped ('accepted') and
> > what was not ('rejected'), quoting the monitoring output.  I
> > suspect the problem is the strict testing of the from and to fields
> > in Lisp function font-shape-gstring, which is defined in file
> > font.c.
> [...]
> > The shaping of the following, with vowels or MEDIAL RA that should
> > be rendered before the consonant, was rejected:
> 
> > mflt_run( 1A3E 1A6E 1A6C 1A65) produced ( 1A6E>872:1:1 1A3E>810:0:3
> 1A6C>869:0:3 1A65>862:0:3) 
> 
> If U+1A6E is displayed before U+1A3E, and they are in
> different grapheme cluster, when you move point forward one
> step by one, the cursor must move back and forth as below
> (cursor is indicated by dashes):
> 
>  display: SPC 1A6E 1A3E+1A6C+1A65 SPC
>  step 1:  ---    
>  step 2:           --------------
>  step 3:      ----
>  step 4:                          ---
> 
> Is that what you want?

It gives me more control for editing in Emacs.  Another implementation
could choose to move in visual order. The policing function could
choose to merge the 'out of order' clusters
- that is what new HarfBuzz does, though I think that should only be
done if the client requests it.

What I ought to want is SIL's split cursor scheme, which indicated the
next ('point') and previous characters, even in bidirectional text.
Unfortunately, that's not compatible with m17n, which seems to assume
that cursor position will be a single number.  The Emacs functions
forward-char-intrusive and backward-char-intrusive provided a pleasant,
more intuitive, alternative, and I am sad to hear they are gone.
Perhaps I'll have to start using toggle-auto-composition.

The one consolation in Emacs is that delete-forward-char
deletes a single character, rather than a whole cluster.  That
greatly reduces the disadvantage of having clusters.  Also,
search still works by characters rather than by clusters.  If I want to
search for a character in LibreOffice, I have to go into the
special regular expression find and replace menu.  That is unpleasant.

> At least, the support for all Indic scripts (they have
> characters in logical order as your example of Tai Tham
> text) treats re-ordered glyphs as one grapheme cluster.
> That is not only Emacs but also gtk (pango) applications.

That's a nasty fault with HarfBuzz.

> Please try to move cursor over this Devanagri text "हिंदी" on
> Emacs, gedit, and, for instance, firefox.  They all treat
> that text as 2 grapheme clusters "हिं" and "दी".  The first
> one corresponds to character the sequence U+935 U+93F, and
> U+93F (vowel I) is displayed before U+935 (base cosonant).

Note that those clusters are only 3 and 2 characters long.  Retyping
them is tolerable.  Now consider the Sanskrit Devanagari text स्त्री,
which contains two consonant-combining viramas.  Emacs moves across it
in 1 step, but Claws e-mail (GTK-based, I believe) and LibreOffice
(HarfBuzz-based, at least for linux) both take 3 steps to move across
it.  Claws and LibreOffice use different algorithms to position the
cursor.  That of LibreOffice seems more reasonable, but that of
Claws works better!  The reason is that Unicode did not declare virama
as forming grapheme clusters.

> [...]
> 
> > There does appear to be a work around, which is to have m17n declare
> > the orthographic syllables it receives to be 'grapheme clusters'.
> 
> I think that's the right solution; i.e. make all combined
> and out-of-ordered glyphs as one cluster.
> 
> > It solves at least some of the problems above.
> 
> Which one is not solved by it?

It seems to have solved all of them.  When I reported the bug, I was
having problems with my font because libotf was silently ignoring half
the lookups in my font.

I thought I might have problems with U+1A58 TAI THAM SIGN MAI KANG LAI,
which in Lao visually groups (usually) with the following base
consonant and in Tai Khuen groups with the preceding base consonant. My
clustering in Emacs follows the Tai Khuen scheme.  (I compose two
orthographic clusters together in Emacs, but declare two grapheme
clusters in the FLT processing.)  However, my font follows a major
Northern Thai dictionary and places it on the following base consonant
if there is nothing above it, but otherwise places it on the preceding
base consonant.  However, my implementation is too dirty to cause
problems - the second cluster is not reported as deriving from the
mai kang lai character.

I wonder, though, what will happen if I manage to implement the
Universal Shaping Engine's (USE) rphf feature. The author of a Lao-style
Tai Tham font wanted this feature in HarfBuzz.  The desired effect seems
easy to achieve in m17n-flt, but placing it under font control is more
difficult.  I'm studying MLM2-OTF.flt to see how to do it.

> > However, it then makes editing of the 'clusters' more
> > difficult.  Note that there are examples above with 5
> > characters in a cluster, and this is by no means the
> > limit.
> 
> But, it seems that the current behavior is accepted, at
> least, by Indic people.

Who do you mean by 'Indic people'?  I can see at least three groups:

1) Indian speakers of Indic languages who use Indic scripts, thus
including users of Hindi, Gujarati and Bengali.

See my comments above.

2) Indian users of Indic scripts, thus also including speakers of
Malayalam and Tamil.

In Tamil, a phonetically CVCCV word will normally naturally split into
clusters as CV.C+virama.CV.  I must admit I am surprised that they have
accepted CV.CCV - or do Tamils not use Emacs for Tamil?

Tamils are notorious for regarding their writing system as a syllabary
rather than as an abugida.

I haven't studied the Malayalam script - that does seem a fairly
complicated Indian script, as one might expect when Dravidians use a
script tailored to Middle Indic and stretched to cover Old Indic.

3) Users of Indic scripts, thus also including the Burmese, Thai,
Cambodians and Lao as well as the users of the Tai Tham script.

Rebellion is rampant.  The original Unicode encoding of Thai
followed the phonetic order (allegedly - it was probably the
collation order instead).  This was rapidly thrown out as
incompatible with the current, working encoding.  Unicode responded
with the derogatory property of 'logical order exception'.

Around Unicode 5.1, the preposed vowels of Thai and Lao were suddenly
included in grapheme clusters with the base consonant. As the
consequences started to appear in applications, there were howls of
rage from Thais, and the characters were restored to their original
status as fully independent characters.

It doesn't seem so long ago that the Cambodian government imposed
Unicode on Cambodia.  You'd have thought that access to applications
would have made Unicode the obvious choice.

New Tai Lue is an interesting case.  Microsoft delayed support for this
simple Indic script for so long that most apparently Unicode-encoded
New Tai Lue text was actually encoded in visual order.  With Unicode
8.0, New Tai Lue is changing from phonetic order to visual order, and
it will no longer need any clusters at all!  Emacs 23.3 (which is what
is in long-term support Ubuntu 12.04) offers no support for New Tai
Lue, so I am not sure that there is yet a New Tai Lue view on
composition in Emacs.

Richard.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2015-03-21 17:58   ` Richard Wordingham
@ 2015-03-21 18:26     ` Eli Zaretskii
  2015-03-25 14:25     ` K. Handa
  1 sibling, 0 replies; 35+ messages in thread
From: Eli Zaretskii @ 2015-03-21 18:26 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140

> Date: Sat, 21 Mar 2015 17:58:18 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: 20140@debbugs.gnu.org
> 
> Another implementation could choose to move in visual order.

Emacs 24.4 does have visual-order cursor movement.  Customize the
variable visual-order-cursor-movement to get that.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2015-03-21 17:58   ` Richard Wordingham
  2015-03-21 18:26     ` Eli Zaretskii
@ 2015-03-25 14:25     ` K. Handa
  2015-03-25 21:45       ` Richard Wordingham
  2015-04-05 19:48       ` Richard Wordingham
  1 sibling, 2 replies; 35+ messages in thread
From: K. Handa @ 2015-03-25 14:25 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140

Hi, thank you for the detailed explanation.

In article <20150321175818.1b125eba@JRWUBU2>, Richard Wordingham <richard.wordingham@ntlworld.com> writes:

> What I ought to want is SIL's split cursor scheme, which indicated the
> next ('point') and previous characters, even in bidirectional text.
> Unfortunately, that's not compatible with m17n, which seems to assume
> that cursor position will be a single number.  The Emacs functions
> forward-char-intrusive and backward-char-intrusive provided a pleasant,
> more intuitive, alternative, and I am sad to hear they are gone.
> Perhaps I'll have to start using toggle-auto-composition.

Those Emacs functions are just my idea for improving Emacs
for CTL users, and have never been included in the official
Emacs version.  I checked the code and found two problems:

(1) When the command sets disable-point-adjustment to t,
command_loop_1 should force updating the display if point is
within a grapheme cluster.  So we need this patch:

diff --git a/src/keyboard.c b/src/keyboard.c
index bf65df1..13125c1 100644
--- a/src/keyboard.c
+++ b/src/keyboard.c
@@ -1636,6 +1636,16 @@ command_loop_1 (void)
 	    adjust_point_for_property (last_point_position,
 				       MODIFF != prev_modiff);
 	}
+      else if (current_buffer == prev_buffer
+	       && last_point_position != PT)
+	{
+	  if (PT > BEGV && PT < ZV
+	      && (composition_adjust_point (last_point_position, PT) != PT))
+	    /* Now point is within a grapheme cluster.  We must update
+	       the display so that this cluster is decomposed on the
+	       screen and the cursor is correctly placed at point.  */
+	    windows_or_buffers_changed = 22;
+	}
 
       /* Install chars successfully executed in kbd macro.  */
 
(2) We should break a grapheme cluster at point.  So we need
this patch.

diff --git a/src/xdisp.c b/src/xdisp.c
index a17f5a9..0c56395 100644
--- a/src/xdisp.c
+++ b/src/xdisp.c
@@ -3408,6 +3408,9 @@ compute_stop_pos (struct it *it)
       pos = next_overlay_change (charpos);
       if (pos < it->stop_charpos)
 	it->stop_charpos = pos;
+      /* If point is in front of the current stop pos, stop there.  */
+      if (charpos < PT && PT < it->stop_charpos)
+	it->stop_charpos = PT;
 
       /* Set up variables for computing the stop position from text
          property changes.  */
@@ -8166,7 +8169,12 @@ next_element_from_buffer (struct it *it)
 	  && IT_CHARPOS (*it) >= it->redisplay_end_trigger_charpos)
 	run_redisplay_end_trigger_hook (it);
 
-      stop = it->bidi_it.scan_dir < 0 ? -1 : it->end_charpos;
+      /* Set stop position considering the bidi direction and point.  */
+      if (it->bidi_it.scan_dir < 0)
+	stop = (PT < IT_CHARPOS (*it)) ? PT : -1;
+      else
+	stop = ((IT_CHARPOS (*it) < PT && PT < it->end_charpos)
+		? PT : it->end_charpos);
       if (CHAR_COMPOSED_P (it, IT_CHARPOS (*it), IT_BYTEPOS (*it),
 			   stop)
 	  && next_element_from_composition (it))

Could you try these patches and test the usability of
forward-char-intrusive and backward-char-intrusive?

> > Please try to move cursor over this Devanagri text "हिंदी" on
> > Emacs, gedit, and, for instance, firefox.  They all treat
> > that text as 2 grapheme clusters "हिं" and "दी".  The first
> > one corresponds to character the sequence U+935 U+93F, and
> > U+93F (vowel I) is displayed before U+935 (base cosonant).

> Note that those clusters are only 3 and 2 characters long.  Retyping
> them is tolerable.  Now consider the Sanskrit Devanagari text स्त्री,
> which contains two consonant-combining viramas.  Emacs moves across it
> in 1 step, but Claws e-mail (GTK-based, I believe) and LibreOffice
> (HarfBuzz-based, at least for linux) both take 3 steps to move across
> it.  Claws and LibreOffice use different algorithms to position the
> cursor.  That of LibreOffice seems more reasonable, but that of
> Claws works better!  The reason is that Unicode did not declare virama
> as forming grapheme clusters.

Ah, hmmm, that's a problem with DEVA-OTF.flt and DEV2-OTF.flt of
the m17n library.  I'll try to fix them.

> It seems to have solved all of them.  When I reported the bug, I was
> having problems with my font because libotf was silently ignoring half
> the lookups in my font.

Could you please send me (not on this list) an appropriate
bug/problem report if libotf should be fixed?

> I though I might have problems with U+1A58 TAI THAM SIGN MAI KANG LAI,
> which in Lao visually groups (usually) with the following base
> consonant and in Tai Khuen groups with the preceding base consonant. My
> clustering in Emacs follows the Tai Khuen scheme.  (I compose two
> orthographic clusters together in Emacs, but declare two grapheme
> clusters in the FLT processing.)  However, my font follows a major
> Northern Thai dictionary and places it on the following base consonant
> if there is nothing above it, but otherwise places it on the preceding
> base consonant.  However, my implementation is too dirty to cause
> problems - the second cluster is not reported as deriving from the
> mai kang lai character.

> I wonder, though, what will happen if I manage to implement the
> Universal Shaping Engine's (USE) rphf feature. The author of a Lao-style
> Tai Tham font wanted this feature in HarfBuzz.  The desired effect seems
> easy to achieve in m17n-flt, but placing it under font control is more
> difficult.  I'm studying MLM2-OTF.flt to see how to do it.

I've just started to study the Universal Shaping Engine.  It
seems that we can implement it by a proper FLT file.

> > > However, it then makes editing of the 'clusters' more
> > > difficult.  Note that there are examples above with 5
> > > characters in a cluster, and this is by no means the
> > > limit.
> > 
> > But, it seems that the current behavior is accepted, at
> > least, by Indic people.

> Who do you mean by 'Indic people'?

I just mean that I have not heard any complaints about that
"too long cluster problem" of Emacs.  No one is using Emacs
for Indic scripts?

> New Tai Lue is an interesting case.  Microsoft delayed support for this
> simple Indic script for so long that most apparently Unicode-encoded
> New Tai Lue text was actually encoded in visual order.  With Unicode
> 8.0, New Tai Lue is changing from phonetic order to visual order, and
> it will no longer need any clusters at all!  

Wow, I didn't know that.

> Emacs 23.3 (which is what is in long-term support Ubuntu
> 12.04) offers no support for New Tai Lue, so I am not sure
> that there is yet a New Tai Lue view on composition in
> Emacs.

We may be able to provide supports for new scripts in elpa.

---
K. Handa
handa@gnu.org





^ permalink raw reply related	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2015-03-25 14:25     ` K. Handa
@ 2015-03-25 21:45       ` Richard Wordingham
  2015-04-05 19:48       ` Richard Wordingham
  1 sibling, 0 replies; 35+ messages in thread
From: Richard Wordingham @ 2015-03-25 21:45 UTC (permalink / raw)
  To: 20140

On Wed, 25 Mar 2015 23:25:54 +0900
handa@gnu.org (K. Handa) wrote:

> Hi, thank you for the detailed explanation.
> 
> In article <20150321175818.1b125eba@JRWUBU2>, Richard Wordingham
> <richard.wordingham@ntlworld.com> writes:
> 
> > What I ought to want is SIL's split cursor scheme, which indicated
> > the next ('point') and previous characters, even in bidirectional
> > text. Unfortunately, that's not compatible with m17n, which seems
> > to assume that cursor position will be a single number.  

> > The Emacs
> > functions forward-char-intrusive and backward-char-intrusive
> > provided a pleasant, more intuitive, alternative, and I am sad to
> > hear they are gone. Perhaps I'll have to start using
> > toggle-auto-composition.
> 
> Those Emacs functions are just my idea for improving Emacs
> for CTL users, and have never been included in the official
> Emacs verison.

I think I must have confused them with the behaviour of Emacs 22.1
on Windows XP.  I didn't do anything to enable the visual decomposition
of the clusters - it just happened when moving with the arrow keys.
Indeed, it is conceivable that the characters weren't decomposed, but
were simply being rendered by Windows without any need for composition.

I haven't had time to try out the experimental code yet.

Richard.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2015-03-25 14:25     ` K. Handa
  2015-03-25 21:45       ` Richard Wordingham
@ 2015-04-05 19:48       ` Richard Wordingham
  1 sibling, 0 replies; 35+ messages in thread
From: Richard Wordingham @ 2015-04-05 19:48 UTC (permalink / raw)
  To: K. Handa; +Cc: 20140

[-- Attachment #1: Type: text/plain, Size: 1546 bytes --]

On Wed, 25 Mar 2015 23:25:54 +0900
handa@gnu.org (K. Handa) wrote:

> Could you try these patches and test the usability of
> forward-char-intrusive and backward-char-intrusive?

The results weren't quite what I'd hoped for, but the results are
usable.  Thank you.

The text I principally tried the commands out on was Tai Tham text <LOW
PA, SIGN II, TONE-1, SAKOT, NA, OA BELOW, TONE-2>.  I used corrections
for the bugs that had been affecting its rendering.

It seems that the commands prevent shaping across the cursor, but do
not inhibit shaping within the former cluster.  I was only doing shaping
on complete orthographic syllables, so entering a cluster chiefly had
the effect of losing all positioning of marks and making the text
unreadable.  However, the behaviour may make a good teaching aid!

I then tried the command on Thai, and there the commands worked well.
I therefore added to LANA-OTF.flt, for marks not in complete syllables,
the command:

    ("(M)" [ (1 = ) ] ) ; For stepping through.

I attach the results of not stepping through (labelled 0), and
stepping through by 1 to 7 characters (labelled 1 to 7).  The result
is not so good at 3 steps - I think because the extra rendering
command does not handle SAKOT.  Aggressive use of dotted circles might
improve the display.  I don't know why, at the end, there is a delay in
TONE-2 rising to its proper height.

C-S-f is not a good key sequence for me.  C-S is one of my X-keyboard
switching combinations - I chose it for compatibility with the Xming X
server.

Richard.

[-- Attachment #2: emacs_entry.png --]
[-- Type: image/png, Size: 5776 bytes --]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2015-03-18 22:20 bug#20140: 24.4; M17n shaper output rejected Richard Wordingham
  2015-03-19  3:43 ` Eli Zaretskii
  2015-03-21  8:33 ` K. Handa
@ 2022-02-03 21:21 ` Lars Ingebrigtsen
  2022-02-04  7:37   ` Eli Zaretskii
  2 siblings, 1 reply; 35+ messages in thread
From: Lars Ingebrigtsen @ 2022-02-03 21:21 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140

Richard Wordingham <richard.wordingham@ntlworld.com> writes:

> I am running Emacs 24.4 in a Ubuntu 12.04 Precise Pangolin
> installation, for which the version of libm17n-0 is 1.6.3-1.  I am
> attempting to induce Emacs to render the Tai Tham script.  There
> appears to be a bug/feature in Emacs which makes this unnecessarily
> difficult.

(I'm going through old bug reports that unfortunately weren't resolved
at the time.)

I vaguely remember there having been some fixes in this area since this
bug report was opened -- does this work better for you in more recent
versions of Emacs?

-- 
(domestic pets only, the antidote for overdose, milk.)
   bloggy blog: http://lars.ingebrigtsen.no





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-03 21:21 ` Lars Ingebrigtsen
@ 2022-02-04  7:37   ` Eli Zaretskii
  2022-02-05 22:52     ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  0 siblings, 1 reply; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-04  7:37 UTC (permalink / raw)
  To: Lars Ingebrigtsen; +Cc: 20140, richard.wordingham

> From: Lars Ingebrigtsen <larsi@gnus.org>
> Date: Thu, 03 Feb 2022 22:21:28 +0100
> Cc: 20140@debbugs.gnu.org
> 
> Richard Wordingham <richard.wordingham@ntlworld.com> writes:
> 
> > I am running Emacs 24.4 in a Ubuntu 12.04 Precise Pangolin
> > installation, for which the version of libm17n-0 is 1.6.3-1.  I am
> > attempting to induce Emacs to render the Tai Tham script.  There
> > appears to be a bug/feature in Emacs which makes this unnecessarily
> > difficult.
> 
> (I'm going through old bug reports that unfortunately weren't resolved
> at the time.)
> 
> I vaguely remember there having been some fixes in this area since this
> bug report was opened -- does this work better for you in more recent
> versions of Emacs?

The most important change is that we now use HarfBuzz by default.

Richard didn't contribute the Tai Tham composition rules to us
(AFAIR), so I cannot test what happens now in Emacs with HarfBuzz.
Maybe we should revisit this issue, but first I hope Richard could
tell whether the issue still exists, and if so, what composition rules
he uses or suggests to use for Tai Tham.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-04  7:37   ` Eli Zaretskii
@ 2022-02-05 22:52     ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-06  8:11       ` Eli Zaretskii
                         ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2022-02-05 22:52 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: 20140, Lars Ingebrigtsen

[-- Attachment #1: Type: text/plain, Size: 2805 bytes --]

On Fri, 04 Feb 2022 09:37:03 +0200
Eli Zaretskii <eliz@gnu.org> wrote:

> > From: Lars Ingebrigtsen <larsi@gnus.org>
> > Date: Thu, 03 Feb 2022 22:21:28 +0100
> > Cc: 20140@debbugs.gnu.org
> > 
> > Richard Wordingham <richard.wordingham@ntlworld.com> writes:
> >   
> > > I am running Emacs 24.4 in a Ubuntu 12.04 Precise Pangolin
> > > installation, for which the version of libm17n-0 is 1.6.3-1.  I am
> > > attempting to induce Emacs to render the Tai Tham script.  There
> > > appears to be a bug/feature in Emacs which makes this
> > > unnecessarily difficult.  
> > 
> > (I'm going through old bug reports that unfortunately weren't
> > resolved at the time.)
> > 
> > I vaguely remember there having been some fixes in this area since
> > this bug report was opened -- does this work better for you in more
> > recent versions of Emacs?  

I'm currently using the vanilla emacs on Ubuntu Focal, which is
described as 'GNU Emacs 26.3 (build 2, x86_64-pc-linux-gnu, GTK+
Version 3.24.14) of 2020-03-26, modified by Debian'.  The key good news
is that the commands forward-char-intrusive and backward-char-intrusive
are now standard, so I can position the cursor by dead-reckoning.  You
can reasonably mark the issue as solved.

> The most important change is that we now use HarfBuzz by default.

Isn't that only true for Emacs 27.1 and above?

> Richard didn't contribute the Tai Tham composition rules to us
> (AFAIR), so I cannot test what happens now in Emacs with HarfBuzz.
> Maybe we should revisit this issue, but first I hope Richard could
> tell whether the issue still exists, and if so, what composition rules
> he uses or suggests to use for Tai Tham.

Sad to see that Khaled Hosny's suggestion not to use composition rules
seems not to have been taken.

You're welcome to include my composition rules.  They're complicated by
the facts that the 'regular expressions' are not interpreted as regular
expressions and they are not interpreted as closed under canonical
equivalence.  I therefore calculate the regular expression.  My
composition rules are attached as tai-tham.el, which was last modified
on 20 March 2015.  (It would need reformatting to paste into this
email.)

There are some deficiencies; I've a feeling there may be a problem with
adding ZWNJ and CGJ as marks; ZWJ should also be added for
completeness.  I need ZWNJ to write 4-column ᨴᩣᩴᨶ᩠ᩅ‌ᩣ᩠ᨿ as opposed to
3-column ᨴᩣᩴᨶ᩠ᩅᩣ᩠ᨿ, and even with my font, HarfBuzz will need CGJ for
the suppression of jack-booted dotted circles. Additionally, for
didactic text, what can I do for U+25CC for explicit display of marks
and their equivalents on a dotted circle, and for that matter, for
display on NBSP?

Richard.

Richard.


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: tai-tham.el --]
[-- Type: text/x-emacs-lisp, Size: 4070 bytes --]

;;; tai-tham.el --- support for Tai Tham -*- coding: utf-8 -*-

;; Copyright (C) 2008, 2009, 2010, 2011
;;   National Institute of Advanced Industrial Science and Technology (AIST)
;;   Registration Number H13PRO009

;; Keywords: multilingual, Tai Tham, i18n

;; This file is part of GNU Emacs.

;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.

;;; Code:

;; (set-language-info-alist
;;  "Northern Thai" '((charset unicode)
;; 		   (coding-system utf-8)
;;		   (coding-priority utf-8)
;;		   (sample-text .
;;		     "Northern Thai (ᨣᩣᩴᨾᩮᩬᩥᨦ / ᨽᩣᩈᩣᩃ᩶ᩣ᩠ᨶᨶᩣ)	ᩈ᩠ᩅᩢᩔ᩠ᨯᩦᨣᩕᩢ᩠ᨸ")
;;		   (documentation . t)))

;; To load:
;; (load-file "~/tham/tai-tham.el") tai-tham-composable-pattern
;; 

(defvar tai-tham-composable-pattern
  ;; The pattern is written as a short template over single-letter
  ;; placeholders and then expanded by literal, case-sensitive
  ;; substitution, in this order:
  ;;
  ;;   X  a basic syllable (the `syllable' template below)
  ;;   C  letters, independent vowels, digits, punctuation and symbols
  ;;   M  marks; the class leaves out sakot (U+1A60) and mai kang lai
  ;;      (U+1A58)
  ;;   H  sakot
  ;;   S  marks commuting with sakot (a sub-range of M)
  ;;   N  mai kang lai
  ;;
  ;; The basic syllable structure should just be C(M|HC)*.  The author's
  ;; notes give three reasons why the template is more complicated:
  ;;
  ;; 1. Backtracking.  Emacs uses a backtracking regexp engine, but, as
  ;;    the author observed, it does not backtrack while the characters
  ;;    accepted so far already match.  If M included sakot, CHC would be
  ;;    parsed as CH followed by C, with no cause to backtrack.  On the
  ;;    other hand a missing consonant should not disrupt display: the
  ;;    sakot glyph will normally alert the user that text entry is
  ;;    incomplete (presumably the reason for the trailing "H?").
  ;;
  ;; 2. Canonical equivalence.  Some marks can be swapped with sakot
  ;;    without changing what the character sequence signifies.  The
  ;;    regexp works on plain strings, not on traces of fully decomposed
  ;;    characters under Unicode canonical equivalence, so "HS*C" spells
  ;;    the swapped order out explicitly.
  ;;
  ;; 3. Mai kang lai.  Which syllable it belongs to depends on the font.
  ;;    If M included it, CNC would be parsed as CN and C.  The word
  ;;    ᨴᩘ᩠ᩃᩣ᩠ᨿ has mai kang lai in the middle of an orthographic syllable.
  ;;
  ;; A variant of the basic syllable that makes the consonant after
  ;; sakot optional, "C\\(N*\\(M\\|HS*C?\\)\\)*", was tried by the author
  ;; and left disabled.
  (let ((case-fold-search nil)          ; placeholders are upper-case only
        (classes
         '(("C" . "[\u1A20-\u1A54\u1A80-\u1A89\u1A90-\u1A99\u1AA0-\u1AAD]")
           ("M" . "[\u1A55-\u1A57\u1A59-\u1A5E\u1A61-\u1A7C\u1A7F]")
           ("H" . "\u1A60")
           ("S" . "[\u1A75-\u1A7C]")
           ("N" . "\u1A58")))
        (syllable "C\\(N*\\(M\\|HS*C\\)\\)*")
        (pattern "X\\(N\\(X\\)?\\)*H?"))
    ;; Expand X first so that the placeholders it introduces are in turn
    ;; replaced by their character classes.  The classes contain only
    ;; Tai Tham characters, so no substitution can disturb an earlier one.
    (dolist (rule (cons (cons "X" syllable) classes) pattern)
      (setq pattern (replace-regexp-in-string (car rule) (cdr rule)
                                              pattern t t))))
  "Regexp matching a run of Tai Tham characters to be shaped together.
It matches a basic syllable, then any number of further syllables
glued on by mai kang lai (U+1A58), then an optional trailing
sakot (U+1A60).")

; Failed attempt to get proper composition for incomplete word ᨴᩘ᩠ᩃᩣ᩠.
;(let ((elt (list (vector tai-tham-composable-pattern 3 'font-shape-gstring)
;		 (vector tai-tham-composable-pattern 2 'font-shape-gstring)
;		 (vector tai-tham-composable-pattern 1 'font-shape-gstring)
;		 (vector tai-tham-composable-pattern 0 'font-shape-gstring)
;		 (vector "." 0 'font-shape-gstring)
;		 )))
;  (set-char-table-range composition-function-table '(#x1A20 . #x1AAD) elt))

;; Install the composition rules for every character in the range
;; U+1A20..U+1AAD.  Two rules are listed: the full Tai Tham pattern, then
;; a one-character "." pattern.  Both use `font-shape-gstring' with the
;; middle vector element 0.
(set-char-table-range
 composition-function-table
 '(#x1A20 . #x1AAD)
 (list (vector tai-tham-composable-pattern 0 'font-shape-gstring)
       (vector "." 0 'font-shape-gstring)))

^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-05 22:52     ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-06  8:11       ` Eli Zaretskii
  2022-02-06 22:09         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-08 22:13         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-13 16:04       ` Eli Zaretskii
  2022-02-13 19:49       ` Eli Zaretskii
  2 siblings, 2 replies; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-06  8:11 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, Kenichi Handa, larsi

> Date: Sat, 5 Feb 2022 22:52:51 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: Lars Ingebrigtsen <larsi@gnus.org>, 20140@debbugs.gnu.org
> 
> I'm currently using the vanilla emacs on Ubuntu Focal, which is
> described as 'GNU Emacs 26.3 (build 2, x86_64-pc-linux-gnu, GTK+
> Version 3.24.14) of 2020-03-26, modified by Debian'.  The key good news
> is that the commands forward-char-intrusive and backward-char-intrusive
> are now standard, so I can position the cursor by dead-reckoning.  You
> can reasonably mark the issue as solved.

I don't see the commands forward-char-intrusive and
backward-char-intrusive anywhere in Emacs, so I guess they are your
local changes, based on the code posted by Handa-san in this
discussion?

> > The most important change is that we now use HarfBuzz by default.
> 
> Isn't that only true for Emacs 27.1 and above?

That's true, but Emacs 26 is ancient history; Emacs 28.1 is about to
be released.  So from our perspective, HarfBuzz is the default shaping
engine, and since it's available on all the supported platforms we
care about, we are phasing out m17n-flt shapers.

> > Richard didn't contribute the Tai Tham composition rules to us
> > (AFAIR), so I cannot test what happens now in Emacs with HarfBuzz.
> > Maybe we should revisit this issue, but first I hope Richard could
> > tell whether the issue still exists, and if so, what composition rules
> > he uses or suggests to use for Tai Tham.
> 
> Sad to see that Khaled Hosny's suggestion not to use composition rules
> seems not to have been taken.

You mean, to pass all the text via HarfBuzz instead?  That makes the
Emacs redisplay painfully slow, and would require a complete redesign
of how we render text to be bearable.  So as long as such a redesign
is not available, we cannot use that advice.

> You're welcome to include my composition rules.

Thanks.

> They're complicated by the facts that the 'regular expressions' are
> not interpreted as regular expressions and they are not interpreted
> as closed under canonical equivalence.  I therefore calculate the
> regular expression.

I'm not sure I understand the issue: what you do seems to be very
similar to what we do for the Indic scripts in indian.el, so what kind
of complications are you talking about here?

Also, your rules seem to follow the description in the "Structuring
Tai Tham Unicode" document (Revision 7), a.k.a. "L2/19-365", dated Oct
2019, is that right?  Is that document the latest word on shaping Tai
Tham, or are there any additional sources?

> There are some deficiencies; I've a feeling there may be a problem with
> adding ZWNJ and CGJ as marks; ZWJ should also be added for
> completeness.

These are barely mentioned in the L2/19-365 document, and not
mentioned at all in the Tai Tham section of the Unicode Standard.
Does it mean they are not very important in contemporary Tai Tham
texts?

> I need ZWNJ to write 4-column ᨴᩣᩴᨶ᩠ᩅ‌ᩣ᩠ᨿ as opposed to
> 3-column ᨴᩣᩴᨶ᩠ᩅᩣ᩠ᨿ, and even with my font, HarfBuzz will need CGJ for
> the suppression of jack-booted dotted circles. Additionally, for
> didactic text, what can I do for U+25CC for explicit display of marks
> and their equivalents on a dotted circle, and for that matter, for
> display on NBSP?

At least for the dotted circle case, Emacs has a general composition
rule; see compose-gstring-for-dotted-circle and the corresponding rule
in composite.c.  So I'm not sure we need anything specific to Tai Tham
there.

Can you recommend good fonts for Tai Tham?  Are they free fonts?

Thanks.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-06  8:11       ` Eli Zaretskii
@ 2022-02-06 22:09         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-07 14:04           ` Eli Zaretskii
  2022-02-08 22:13         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  1 sibling, 1 reply; 35+ messages in thread
From: Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2022-02-06 22:09 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: 20140, Kenichi Handa, larsi

On Sun, 06 Feb 2022 10:11:08 +0200
Eli Zaretskii <eliz@gnu.org> wrote:

> > Date: Sat, 5 Feb 2022 22:52:51 +0000
> > From: Richard Wordingham <richard.wordingham@ntlworld.com>
> > Cc: Lars Ingebrigtsen <larsi@gnus.org>, 20140@debbugs.gnu.org
> > 
> > I'm currently using the vanilla emacs on Ubuntu Focal, which is
> > described as 'GNU Emacs 26.3 (build 2, x86_64-pc-linux-gnu, GTK+
> > Version 3.24.14) of 2020-03-26, modified by Debian'.  The key good
> > news is that the commands forward-char-intrusive and
> > backward-char-intrusive are now standard, so I can position the
> > cursor by dead-reckoning.  You can reasonably mark the issue as
> > solved.  
> 
> I don't see the commands forward-char-intrusive and
> backward-char-intrusive anywhere in Emacs, so I guess they are your
> local changes, based on the code posted by Handa-san in this
> discussion?
> 
> > > The most important change is that we now use HarfBuzz by default.
> > >  
> > 
> > Isn't that only true for Emacs 27.1 and above?  
> 
> That's true, but Emacs 26 is ancient history; Emacs 28.1 is about to
> be released.  So from our perspective, HarfBuzz is the default shaping
> engine, and since it's available on all the supported platforms we
> care about, we are phasing out m17n-flt shapers.
> 
> > > Richard didn't contribute the Tai Tham composition rules to us
> > > (AFAIR), so I cannot test what happens now in Emacs with HarfBuzz.
> > > Maybe we should revisit this issue, but first I hope Richard could
> > > tell whether the issue still exists, and if so, what composition
> > > rules he uses or suggests to use for Tai Tham.  
> > 
> > Sad to see that Khaled Hosny's suggestion not to use composition
> > rules seems not to have been taken.  
> 
> You mean, to pass all the text via HarfBuzz instead?  That makes the
> Emacs redisplay painfully slow, and would require a complete redesign
> of how we render text to be bearable.  So as long as such a redesign
> is not available, we cannot use that advice.

Except for Malayalam!  (Subexpression XX* in indian.el at the moment.)

> > You're welcome to include my composition rules.  
> 
> Thanks.
> 
> > They're complicated by the facts that the 'regular expressions' are
> > not interpreted as regular expressions and they are not interpreted
> > as closed under canonical equivalence.  I therefore calculate the
> > regular expression.  
> 
> I'm not sure I understand the issue: what you do seems to be very
> similar to what we do for the Indic scripts in indian.el, so what kind
> of complications are you talking about here?

Well, those rules themselves are a bit odd.  Why are you composing
single clusters?  Why are you breaking clusters where Microsoft
imitators are likely to insert dotted circles?

The basic structure for most Indic scripts is R*C(M|HC)*(M|H)* where R
is miscellaneous prefixed forms (e.g. dot reph, visarga variants), C is
consonants (and things that can act like them), H is the conjoining
operator, and M is miscellaneous marks, including ZWJ and ZWNJ.
"(M|H)*" accounts for explicit viramas and isolated half-forms.
Jackboots are then applied on the ground that spell checkers cannot be
relied upon.

The first problem for Tai Tham is that marks with a canonical combining
class (ccc) greater than 9 (note that script-specific nuktas
generally have ccc=7) do not mix with conjoining operators with ccc=9;
the conjoining operator (as opposed to visible virama) should not be
separated from the following consonant. Mark Davis ignored this
requirement from the proposals, so unless your 'regular' expression is
acting on traces under canonical equivalence rather than mere strings,
one has to complicate the expressions to cope.

The second issue is that the behaviour of U+1A58 TAI THAM SIGN MAI KANG
LAI is a stylistic variable.  It can act like a dot reph or a
phonetic-syllable-final mark. My composition rules therefore have to
treat it as gluing orthographic syllables together.

The third issue, that is less visible, is that I had a problem with
back-tracking.

> Also, your rules seem to follow the description in the "Structuring
> Tai Tham Unicode" document (Revision 7), a.k.a. "L2/19-365", dated Oct
> 2019, is that right?  Is that document the latest word on shaping Tai
> Tham, or are there any additional sources?

No, the document's a crime.  I tried to minimise its destructiveness,
which is why I got an acknowledgement in it.  I advocate sticking to
phonetic order, as in Khmer and Brahmi.  That scheme needs a couple of
formally unproposed characters to make some distinctions.

The best sources are the regular expressions in the proposals, but they
missed out the combination of tone mark and final consonant signs.
What do you mean by 'shaping'?  For Tai Tham, the only positive service
provided by rendering engines is the movement of preposed vowels and
MEDIAL RA to the start of the glyph sequence; all the other
resequencing has to be done by the fonts themselves.

> > There are some deficiencies; I've a feeling there may be a problem
> > with adding ZWNJ and CGJ as marks; ZWJ should also be added for
> > completeness.  
> 
> These are barely mentioned in the L2/19-365 document, and not
> mentioned at all in the Tai Tham section of the Unicode Standard.
> Does it mean they are not very important in contemporary Tai Tham
> texts?

The Tai Tham section is based on information before grammar
nazification disabled Tai Tham texts, or at least, those that were to
be rendered using restrictive shapers based on alleged knowledge of the
languages.  ZWNJ is a standard mechanism for disabling ligatures in
non-cursive scripts, though I'm not sure of the balance of ZWJ and ZWNJ
in Fraktur, e.g. the different renderings of the two meanings
of Antiqua German Wachstube.

CGJ is needed where there is no other character to mark the boundary of
two chained syllables and concatenating the vowel and tone marks of the
two together violates the ordering rules for a single syllable.  It
would also be needed to mark other differences relevant to collation,
e.g. if syllable-initial BA were sorted according to its pronunciation,
as in one major dictionary.  Automating an inconsistent hand-sort is
hard, slow work, especially as the CLDR tools choke on an easy Lao
sort.  (By contrast, the official Thai sort is very machine-friendly.)

> > I need ZWNJ to write 4-column ᨴᩣᩴᨶ᩠ᩅ‌ᩣ᩠ᨿ as opposed to
> > 3-column ᨴᩣᩴᨶ᩠ᩅᩣ᩠ᨿ, and even with my font, HarfBuzz will need CGJ
> > for the suppression of jack-booted dotted circles. Additionally, for
> > didactic text, what can I do for U+25CC for explicit display of
> > marks and their equivalents on a dotted circle, and for that
> > matter, for display on NBSP?  

This, the main use of ZWNJ, was unknown to the authors of the Tai Tham
proposals.  In Lao texts of the 1930s, non-ligation seems to mark an
enthusiasm for the spelling reforms, which one normally thinks of as
only applying to the Lao script.

Having looked at indian.el, it seems that it will be easy to add these
controls (CGJ, ZWJ and ZWNJ) to the composition tables.

> At least for the dotted circle case, Emacs has a general composition
> rule; see compose-gstring-for-dotted-circle and the corresponding rule
> in composite.c.  So I'm not sure we need anything specific to Tai Tham
> there.

Does the 3-character Khmer sequence "◌្ក" <U+25CC, U+17D2, U+1780> work
in Version 28?  It doesn't in Version 26.3.  It should look like a
dotted circle with the lower part of ក្ក below it.  In Version 26.3, I
don't even get the consonant U+1780 subscripted!

With HarfBuzz, if you don't compose U+25CC with the following mark, you
are very likely to get two dotted circles - are you deliberately
deleting one?  Doing so wouldn't be a reliable process.

Possibly I could fix the rendering problem by also composing sequences
starting with marks - to be investigated.  If it works, it might work
with NBSP, though it wouldn't help with my plan for <NBSP, ZWJ, spacing
mark> to render as just the spacing mark. 

> Can you recommend good fonts for Tai Tham?  Are they free fonts?

Almost all Tai Tham fonts have problems.  Probably the best is the
one used for the New Testament, which relies on the SIL Graphite
renderer. I'll dig into that one.

The nicest OTL shaper-based one for most words is Lamphun, which is
based on Hariphunchai.  Unfortunately, not even Lamphun distinguished
subscript HIGH RATHA from the subscript <HIGH RATA, SIGN HIGH RATHA OR
LOW PA>, and it is rather limited for interacting marks - Hariphunchai
lacks mark-to-mark positioning.  The commoner combinations of marks are
handled by glyph substitution, and Lamphun has made a start on
mark-to-mark positioning.  Hariphunchai and Lamphun are available under
the SIL Open Font licence.

For Lao and Pali, Khottabun is a nice font, but there are some
idiosyncrasies in its encoding of words.  (Unicode appears only to
define character encoding, and is largely silent on the encoding of Tai
Tham words.)  It is available under the SIL Open Font licence, so I
can and perhaps ought to add it to my renderer
(https://wrdingham.co.uk/renderer_test.htm) and font
(https://wrdingham.co.uk/font_test.htm) tests.  Unfortunately, it only
supports characters used for Lao or Pali.  It appears to evade the
jackboots of the HarfBuzz implementation of the Universal Shaping Engine
(USE) by not having a glyph for U+25CC - cunning!  I don't know whether
this trick works with the Windows renderers.

There's a clutch of Tai Khuen fonts released under the SIL Open Font
licence that are aesthetically satisfying, but have a tendency to rely
on Tai Khuen orthographic rules to avoid clashing glyphs, and don't
extend to supporting somewhat exceptional words like Pali _indriya_.
The fonts are:

A Tai Tham KH
A Tai Tham KH New
A Tai Tham KH New V3

They are unlikely to work with Uniscribe or DirectWrite, as they rely
on the ccmp or liga feature being enabled for the default script; I'm
not sure whether that's a problem for those using emacs on Windows.

If you don't mind the reactionary square nature of the glyphs, there
is also my Da Lekh family, with full coverage of the encoded character
set, and some support for language-specific glyphs that are very
different between the languages.  (Generally the glyphs aim to be an
'international' compromise.)  Features may be used instead of
language environment - I don't set out to punish Windows victims.  There
are four fonts:

Da Lekh
Da Lekh Si
Da Lekh Seri
Da Lekh Si Seri

The ones with Seri in the name have the same freedoms as the Deja Vu
fonts and none of the restrictions.  (I drew all their glyphs.) The
others have the same freedoms and, necessarily, restrictions as Deja
Vu Sans.  The Seri (meaning 'untrammelled') fonts were created for
unconstrained use by the Unicode Consortium and deliberately have no
defence against the jackboots of the Universal Shaping Engine.  They
should work fine with the M17n renderer.  Unfortunately, for its Latin
glyphs, one only gets what one pays for. The ones with 'Si' colour
conjoined syllables red so that one can see how words are spelt.  This
capability was added for use with spell checkers, and I use it
successfully for spell-checking in Firefox and LibreOffice.

Richard.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-06 22:09         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-07 14:04           ` Eli Zaretskii
  2022-02-07 23:38             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  0 siblings, 1 reply; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-07 14:04 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, handa, larsi

> Date: Sun, 6 Feb 2022 22:09:58 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: larsi@gnus.org, 20140@debbugs.gnu.org, Kenichi Handa <handa@gnu.org>
> 
> > > Sad to see that Khaled Hosny's suggestion not to use composition
> > > rules seems not to have been taken.  
> > 
> > You mean, to pass all the text via HarfBuzz instead?  That makes the
> > Emacs redisplay painfully slow, and would require a complete redesign
> > of how we render text to be bearable.  So as long as such a redesign
> > is not available, we cannot use that advice.
> 
> Except for Malayalam!  (Subexpression XX* in indian.el at the moment.)

(That was changed lately.  But it is a tangent.)

> > > They're complicated by the facts that the 'regular expressions' are
> > > not interpreted as regular expressions and they are not interpreted
> > > as closed under canonical equivalence.  I therefore calculate the
> > > regular expression.  
> > 
> > I'm not sure I understand the issue: what you do seems to be very
> > similar to what we do for the Indic scripts in indian.el, so what kind
> > of complications are you talking about here?
> 
> Well, those rules themselves are a bit odd.  Why are you composing
> single clusters?  Why are you breaking clusters where Microsoft
> imitators are likely to insert dotted circles?

I'm not sure this is what I asked.  I asked why you think this way of
defining patterns for composition rules is in any way exceptional.  It
seems pretty much boilerplate to me.

> The best sources are the regular expressions in the proposals, but they
> missed out the combination of tone mark and final consonant signs.

Can you be more specific about those proposals?  Any specific
pointers?

Also, does this mean there's currently no widely accepted agreement
regarding Tai Tham shaping?  What do native readers of that script
expect?

> What do you mean by 'shaping'?

Whatever is needed to produce correct display from a sequence of
codepoints in a given script.

> > At least for the dotted circle case, Emacs has a general composition
> > rule; see compose-gstring-for-dotted-circle and the corresponding rule
> > in composite.c.  So I'm not sure we need anything specific to Tai Tham
> > there.
> 
> Does the 3-character Khmer sequence "◌្ក" <U+25CC, U+17D2, U+1780> work
> in Version 28?  It doesn't in Version 26.3.  It should look like a
> dotted circle with the lower part of ក្ក below it.  In Version 26.3, I
> don't even get the consonant U+1780 subscripted!

No, it doesn't produce what you want (though the 2nd and the 3rd
characters do combine), but that's not surprising: the general rules
for U+25CC that we have cover only a single combining mark after it:

  (aset composition-function-table #x25CC
	`([,(purecopy ".\\c^") 0 compose-gstring-for-dotted-circle]))

So a sequence of more than one character after U+25CC needs an
explicit rule to work.  What is the rule in this case?  (And what does
Khmer have to do with the question I asked, which is about Tai Tham?)

> With HarfBuzz, if you don't compose U+25CC with the following mark, you
> are very likely to get two dotted circles - are you deliberately
> deleting one?

No.  And I don't get 2 dotted circles with the above in Emacs 28 with
HarfBuzz.

Anyway, Khmer is a separate issue.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-07 14:04           ` Eli Zaretskii
@ 2022-02-07 23:38             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  0 siblings, 0 replies; 35+ messages in thread
From: Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2022-02-07 23:38 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: 20140, handa, larsi

On Mon, 07 Feb 2022 16:04:35 +0200
Eli Zaretskii <eliz@gnu.org> wrote:

> > Date: Sun, 6 Feb 2022 22:09:58 +0000
> > From: Richard Wordingham <richard.wordingham@ntlworld.com>
> > Cc: larsi@gnus.org, 20140@debbugs.gnu.org, Kenichi Handa
> > <handa@gnu.org> 
> > > > Sad to see that Khaled Hosny's suggestion not to use composition
> > > > rules seems not to have been taken.    
> > > 
> > > You mean, to pass all the text via HarfBuzz instead?  That makes
> > > the Emacs redisplay painfully slow, and would require a complete
> > > redesign of how we render text to be bearable.  So as long as
> > > such a redesign is not available, we cannot use that advice.  
> > 
> > Except for Malayalam!  (Subexpression XX* in indian.el at the
> > moment.)  
> 
> (That was changed lately.  But it is a tangent.)
> 
> > > > They're complicated by the facts that the 'regular expressions'
> > > > are not interpreted as regular expressions and they are not
> > > > interpreted as closed under canonical equivalence.  I therefore
> > > > calculate the regular expression.    
> > > 
> > > I'm not sure I understand the issue: what you do seems to be very
> > > similar to what we do for the Indic scripts in indian.el, so what
> > > kind of complications are you talking about here?  
> > 
> > Well, those rules themselves are a bit odd.  Why are you composing
> > single clusters?  Why are you breaking clusters where Microsoft
> > imitators are likely to insert dotted circles?  
> 
> I'm not sure this is what I asked.  I asked why you think this way of
> defining patterns for composition rules is in any way exceptional.  It
> seems pretty much boilerplate to me.

Your 'boilerplate' rules look like a straightforward derivation from
the DirectWrite rules for valid subsequences - I haven't checked for
repair work.  That seems unlikely to handle prohibited dittograms
nicely.  It also wouldn't work well when 'well-formed' adjacent
clusters need to interact, as with virama-terminated clusters in
Kharoshthi and some styles of Brahmi.  I haven't hunted for their
definitions - I should probably download a recent tarball.

The exceptional features were the calculation of the regular
expression, especially the expression

(replace-regexp-in-string "X" basic_syllable regexp t t))

> > The best sources are the regular expressions in the proposals, but
> > they missed out the combination of tone mark and final consonant
> > signs.  
> 
> Can you be more specific about those proposals?  Any specific
> pointers?
> 
> Also, does this mean there's currently no widely accepted agreement
> regarding Tai Tham shaping?  What do native readers of that script
> expect?
> 
> > What do you mean by 'shaping'?  
> 
> Whatever is needed to produce correct display from a sequence of
> codepoints in a given script.

The main shaper writers refused to maintain such a service for Tai Tham,
though HarfBuzz did briefly provide such a service with its South East
Asian Shaper.  Windows still confesses its inability to render the full
range of orthographic syllables.  To work, fonts have to engage in
dotted circle removal by some means or other.

It seems that native readers expect a font encoding, where the key
sequence for a mark (or subscript consonant) specifies its position and
shape.  I was badly shocked when I found the backing store for the Tai
Tham Northern Thai New Testament.  I found examples of marks above
entered in the reverse order to what Unicode-savvy people would expect,
and the complete opposite to what one would type for Thai, for which
input systems generally enforce the rule of typing from base
character outwards.

The general pointer would be to look at the English Wikipedia entry for
<block_name>_(Unicode_block).  In this case, that becomes

https://www.unicode.org/L2/L2007/07007r-n3207r-lanna.pdf Section 13.

The codepoints have changed since then, but the names (apart from the
script name) and representative glyphs have been pretty stable.  The
relationship between the outermost subexpression and syllables needs
updating, and 'H' needs to be updated to include other subscript
consonants, but formally the expression as a whole still stands.

> > > At least for the dotted circle case, Emacs has a general
> > > composition rule; see compose-gstring-for-dotted-circle and the
> > > corresponding rule in composite.c.  So I'm not sure we need
> > > anything specific to Tai Tham there.  
> > 
> > Does the 3-character Khmer sequence "◌្ក" <U+25CC, U+17D2, U+1780>
> > work in Version 28?  It doesn't in Version 26.3.  It should look
> > like a dotted circle with the lower part of ក្ក below it.  In
> > Version 26.3, I don't even get the consonant U+1780 subscripted!  
> 
> No, it doesn't produce what you want (though the 2nd and the 3rd
> characters do combine), but that's not surprising: the general rules
> for U+25CC that we have cover only a single combining mark after it:
> 
>   (aset composition-function-table #x25CC
> 	`([,(purecopy ".\\c^") 0 compose-gstring-for-dotted-circle]))
> 
> So a sequence of more than one character after U+25CC needs an
> explicit rule to work.  What is the rule in this case?  (And what does
> Khmer have to do with the question I asked, which is about Tai Tham?)

You asked if there were any Tai Tham specific requirements.  The
requirement is general, but the need for Khmer is the most obvious.  The
rule for Brahmi, Kharoshthi and their descendants is fairly close to
'take any existing composition, and substitute dotted circle for the
first letter (Lo)'. For the important cases, it is:

(i) Dotted circle plus any sequence of marks (Let the shaper worry
about validity);
(ii) Dotted circle, conjoining operator, consonant, VS?;
(iii) Dotted circle, conjoining operator, consonant, VS?, any sequence
of marks; and
(iv) (i)-(iii) preceded by anything repha-like.

'Conjoining operator' is a virama or pure stacker optionally preceded or
followed by ZWJ or ZWNJ.  VS is a variation selector.

'Repha-like' includes U+0D4E MALAYALAM LETTER DOT REPH, the Mymr script
kinzi sequences, and the prototypical <U+0930 DEVANAGARI LETTER RA,
U+094D DEVANAGARI SIGN VIRAMA>.

The entire sequence would be best handled in the renderer, though you
may have problems with selecting the font and script.

Richard.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-06  8:11       ` Eli Zaretskii
  2022-02-06 22:09         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-08 22:13         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-12 18:54           ` Eli Zaretskii
  1 sibling, 1 reply; 35+ messages in thread
From: Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2022-02-08 22:13 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: 20140, Kenichi Handa, larsi

On Sun, 06 Feb 2022 10:11:08 +0200
Eli Zaretskii <eliz@gnu.org> wrote:

> > Date: Sat, 5 Feb 2022 22:52:51 +0000
> > From: Richard Wordingham <richard.wordingham@ntlworld.com>
> > Cc: Lars Ingebrigtsen <larsi@gnus.org>, 20140@debbugs.gnu.org
> > 
> > I'm currently using the vanilla emacs on Ubuntu Focal, which is
> > described as 'GNU Emacs 26.3 (build 2, x86_64-pc-linux-gnu, GTK+
> > Version 3.24.14) of 2020-03-26, modified by Debian'.  The key good
> > news is that the commands forward-char-intrusive and
> > backward-char-intrusive are now standard, so I can position the
> > cursor by dead-reckoning.  You can reasonably mark the issue as
> > solved.  
> 
> I don't see the commands forward-char-intrusive and
> backward-char-intrusive anywhere in Emacs, so I guess they are your
> local changes, based on the code posted by Handa-san in this
> discussion?

That's a shame; they are indeed local, sitting in my initialisation
file (.emacs).  (I future-proofed myself too well.)  They are well worth
adding to the general store of emacs commands, and mentioning in
documentation next to forward-char and backward-char.

Richard.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-08 22:13         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-12 18:54           ` Eli Zaretskii
  0 siblings, 0 replies; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-12 18:54 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, handa, larsi

> Date: Tue, 8 Feb 2022 22:13:10 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: larsi@gnus.org, 20140@debbugs.gnu.org, Kenichi Handa <handa@gnu.org>
> 
> On Sun, 06 Feb 2022 10:11:08 +0200
> Eli Zaretskii <eliz@gnu.org> wrote:
> 
> > I don't see the commands forward-char-intrusive and
> > backward-char-intrusive anywhere in Emacs, so I guess they are your
> > local changes, based on the code posted by Handa-san in this
> > discussion?
> 
> That's a shame; they are indeed local, sitting in my initialisation
> file (.emacs).  (I future-proofed myself too well.)  They are well worth
> adding to the general store of emacs commands, and mentioning in
> documentation next to forward-char and backward-char.

I've now added a similar feature to what will become Emacs 29 at some
future point.  The code is based on that old post by Handa-san, but I
decided to change its user-facing aspects: instead of new commands, I
added a new user option, which, if set non-nil, disables
auto-composition at point, and thus allows point to "enter" the
composed sequence.  I think this is better for 2 reasons:

  . no need to introduce new cursor motion commands, for which it will
    be hard to find a convenient key binding (using C-S-f/C-S-b will
    conflict with the shift-selection feature, for example);

  . the user option affects cursor motion by any means, so it's more
    general, and thus I hope it will be more convenient.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-05 22:52     ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-06  8:11       ` Eli Zaretskii
@ 2022-02-13 16:04       ` Eli Zaretskii
  2022-02-13 20:53         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-13 19:49       ` Eli Zaretskii
  2 siblings, 1 reply; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-13 16:04 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, larsi

> Date: Sat, 5 Feb 2022 22:52:51 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: Lars Ingebrigtsen <larsi@gnus.org>, 20140@debbugs.gnu.org
> 
> You're welcome to include my composition rules.

Thanks.  I started with your code:

> (defvar tai-tham-composable-pattern
>   (let ((table
> 	 ;; C is letters, independent vowels, digits, punctuation and symbols.
> 	 '(("C" . "[\u1A20-\u1A54\u1A80-\u1A89\u1A90-\u1A99\u1AA0-\u1AAD]")
> 	   ("M" . "[\u1A55-\u1A57\u1A59-\u1A5E\u1A61-\u1A7C\u1A7F]"); Mark
> 	   ("H" . "\u1A60") ; sakot
>            ("S" . "[\u1A75-\u1A7C]") ; Marks commuting with sakot
> 	   ("N" . "\u1A58"))) ; mai kang lai
> 	(basic_syllable "C\\(N*\\(M\\|HS*C\\)\\)*")
>         (regexp "X\\(N\\(X\\)?\\)*H?")) ; X is basic syllable
>     (let ((case-fold-search nil))
>       (setq regexp (replace-regexp-in-string "X" basic_syllable regexp t t))
>       (dolist (elt table)
> 	(setq regexp (replace-regexp-in-string (car elt) (cdr elt)
> 					       regexp t t))))
>     regexp))
> 
> (let ((elt (list (vector tai-tham-composable-pattern 0 'font-shape-gstring)
> 		 (vector "." 0 'font-shape-gstring)
> 		 )))
>   (set-char-table-range composition-function-table '(#x1A20 . #x1AAD) elt))

But that didn't seem to work well enough: e.g., some marks in your
"sample text" didn't combine with letters, as I think they should.
Then I tried this simplistic setting:

  (set-char-table-range composition-function-table
			'(#x1a20 . #x1aaf)
			(list (vector "[\u1a20-\u1aaf]+" 0 'font-shape-gstring)))

and it worked much better, including passing a small number of the
tests from your renderer test page that I threw on Emacs.  This is on
MS-Windows with Emacs 29 and HarfBuzz 2.4.0 (which is not even the
latest release of HarfBuzz), and with the A Tai Tham KH New V3 font.

Any reason not to use the above simple setup for Tai Tham text
composition?

I needed a couple more additions to Emacs to make Tai Tham support
work OOTB: for example, script-representative-chars lacked an entry
for Tai Tham, and the default fontset needed an addition.  (And on
MS-Windows, one needs to run the w32-find-non-USB-fonts magic once, to
notice the newly installed Tai Tham font.)

Other than that, assuming the above setting of
composition-function-table is okay, we are ready to officially add Tai
Tham to scripts supported by Emacs.

Btw, is there a way to get all the examples from your
https://wrdingham.co.uk/lanna/renderer_test.htm as a UTF-8 encoded
text file?  I'd like to test the Emacs rendering with all of the
examples, but copy-pasting each example separately from the browser is
not my idea of useful time investment.  So if you could provide the
examples as a downloadable text file, I'd appreciate it.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-05 22:52     ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-06  8:11       ` Eli Zaretskii
  2022-02-13 16:04       ` Eli Zaretskii
@ 2022-02-13 19:49       ` Eli Zaretskii
  2022-02-13 21:11         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2 siblings, 1 reply; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-13 19:49 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, larsi

> Date: Sat, 5 Feb 2022 22:52:51 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: Lars Ingebrigtsen <larsi@gnus.org>, 20140@debbugs.gnu.org
> 
> Sad to see that Khaled Hosny's suggestion not to use composition rules
> seems not to have been taken.

Btw, the _only_ reason Handa-san and now myself were able to implement
something like the forward/backward-char-intrusive commands is that we
DO control which parts of text are composed and which aren't.  If we
were to follow HarfBuzz developers' advice, and were to hand all the
text to HarfBuzz for shaping, we would need the HarfBuzz cooperation
to implement such features in the editor.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-13 16:04       ` Eli Zaretskii
@ 2022-02-13 20:53         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-14 13:19           ` Eli Zaretskii
  2022-02-16 15:11           ` Eli Zaretskii
  0 siblings, 2 replies; 35+ messages in thread
From: Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2022-02-13 20:53 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: 20140, larsi

On Sun, 13 Feb 2022 18:04:11 +0200
Eli Zaretskii <eliz@gnu.org> wrote:

> > Date: Sat, 5 Feb 2022 22:52:51 +0000
> > From: Richard Wordingham <richard.wordingham@ntlworld.com>
> > Cc: Lars Ingebrigtsen <larsi@gnus.org>, 20140@debbugs.gnu.org
> > 
> > You're welcome to include my composition rules.  
> 
> Thanks.  I started with your code:
> 
> > (defvar tai-tham-composable-pattern
> >   (let ((table
> > 	 ;; C is letters, independent vowels, digits, punctuation
> > and symbols. '(("C" .
> > "[\u1A20-\u1A54\u1A80-\u1A89\u1A90-\u1A99\u1AA0-\u1AAD]") ("M" .
> > "[\u1A55-\u1A57\u1A59-\u1A5E\u1A61-\u1A7C\u1A7F]"); Mark ("H" .
> > "\u1A60") ; sakot ("S" . "[\u1A75-\u1A7C]") ; Marks commuting with
> > sakot ("N" . "\u1A58"))) ; mai kang lai
> > 	(basic_syllable "C\\(N*\\(M\\|HS*C\\)\\)*")
> >         (regexp "X\\(N\\(X\\)?\\)*H?")) ; X is basic syllable
> >     (let ((case-fold-search nil))
> >       (setq regexp (replace-regexp-in-string "X" basic_syllable
> > regexp t t)) (dolist (elt table)
> > 	(setq regexp (replace-regexp-in-string (car elt) (cdr elt)
> > 					       regexp t t))))
> >     regexp))
> > 
> > (let ((elt (list (vector tai-tham-composable-pattern 0
> > 'font-shape-gstring) (vector "." 0 'font-shape-gstring)
> > 		 )))
> >   (set-char-table-range composition-function-table '(#x1A20 .
> > #x1AAD) elt))  
> 
> But that didn't seem to work well enough: e.g., some marks in your
> "sample text" didn't combine with letters, as I think they should.

Which ones?  Are you sure they didn't combine at the Emacs level?
I did suspect the problem was writing '\u1A7C' instead of
'\u1a7c', but I'm no longer so sure.  (The 'C' might get expanded, but
I'm beginning to think not.)

> Then I tried this simplistic setting:
> 
>   (set-char-table-range composition-function-table
> 			'(#x1a20 . #x1aaf)
> 			(list (vector "[\u1a20-\u1aaf]+" 0
> 'font-shape-gstring)))
> 
> and it worked much better, including passing a small number of the
> tests from your renderer test page that I threw on Emacs.  This is on
> MS-Windows with Emacs 29 and HarfBuzz 2.4.0 (which is not even the
> latest release of HarfBuzz), and with the A Tai Tham KH New V3 font.

> Any reason not to use the above simple setup for Tai Tham text
> composition?

Mostly only that you would have to edit the text with "autocomposition
at point disabled" or mark word boundaries, e.g. with U+200B ZERO WIDTH
SPACE. The Tai languages that use Tai Tham use scriptio continua.  While
modern Pali does separate words with visible white space, its words
tend to be polysyllabic; with discerning composition, it would be about
as tolerable as editing Hindi in Devanagari with autocomposition
enabled. (Quite a few people edit Devanagari in transliteration to
Latin!)

You should also add CGJ and ZWNJ, and some people may appreciate ZWJ -
the Khottabun font has ligatures involving ZWJ, though it may just be
an experimental feature - and ultimately WJ, for when someone writes a
Tai Tham word breaker. Oh, and Thai and Lao mai t(r)i and mai
chat(t)awa and U+0324 COMBINING DIAERESIS BELOW turn up occasionally -
U+0324 is supported in Thep's Khottabun font, and my Da Lekh series
supports Thai mai tri and mai chattawa. These characters seem to work
with HarfBuzz.

If using the native Windows renderer is an option with Emacs, then 'A
Tai Tham KH New' works better than 'A Tai Tham KH New V3'.  I've
created https://wrdingham.co.uk/lanna/font_test.htm to do _font_
comparisons.  I'd delayed because I've only recently satisfied myself
that it is lawful, at least under English law.  (The qualms were
with the samples taken from books.)  It's still very much a work in
progress.

> I needed a couple more additions to Emacs to make Tai Tham support
> work OOTB: for example, script-representative-chars lacked an entry
> for Tai Tham, and the default fontset needed an addition.  (And on
> MS-Windows, one needs to run the w32-find-non-USB-fonts magic once, to
> notice the newly installed Tai Tham font.)

> Other than that, assuming the above setting of
> composition-function-table is okay, we are ready to officially add Tai
> Tham to scripts supported by Emacs.

> Btw, is there a way to get all the examples from your
> https://wrdingham.co.uk/lanna/renderer_test.htm as a UTF-8 encoded
> text file?  I'd like to test the Emacs rendering with all of the
> examples, but copy-pasting each example separately from the browser is
> not my idea of useful time investment.  So if you could provide the
> examples as a downloadable text file, I'd appreciate.

As buried (you're not the only one to have overlooked it) in the
penultimate paragraph of the 'Content and Layout' section, "The test words
may, in principle, be extracted quite simply from this web page. Each
test 'word' is the content of the first cell in each row whose class is
tst1. For convenience*, I have extracted the first two cells in such
rows, along with titles, to a CSV file."  The file is rt.csv in the
same directory.  I included the meaning and pronunciation as those who
don't know the script may find it easier to refer to the words by
translation or transcription.  You may prefer to use the file more or
less as it is, but one can easily knock up an Emacs macro sequence to
delete the first comma and the rest of the line.  I left the
section titles in for easier navigation to the renderer test file.

*Some people claim to find XML files easy to use; they should then be
able to analyse a file conforming to HTML4 syntax.

Dodgy spellings go in pink rows whose class is 'tst2'.  The alternative
encodings demanded by the USE go in orange rows whose class is 'tst3'.
I have not extracted these.

Richard.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-13 19:49       ` Eli Zaretskii
@ 2022-02-13 21:11         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-14 13:26           ` Eli Zaretskii
  0 siblings, 1 reply; 35+ messages in thread
From: Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2022-02-13 21:11 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: 20140, larsi

On Sun, 13 Feb 2022 21:49:04 +0200
Eli Zaretskii <eliz@gnu.org> wrote:

> > Date: Sat, 5 Feb 2022 22:52:51 +0000
> > From: Richard Wordingham <richard.wordingham@ntlworld.com>
> > Cc: Lars Ingebrigtsen <larsi@gnus.org>, 20140@debbugs.gnu.org
> > 
> > Sad to see that Khaled Hosny's suggestion not to use composition
> > rules seems not to have been taken.  
> 
> Btw, the _only_ reason Handa-san and now myself were able to implement
> something like the forward/backward-char-intrusive commands is that we
> DO control which parts of text are composed and which aren't.  If we
> were to follow HarfBuzz developers' advice, and were to hand all the
> text to HarfBuzz for shaping, we would need the HarfBuzz cooperation
> to implement such features in the editor.

You mean the more sophisticated mechanisms which position the cursor
intelligently.  Those two commands you named work by completely
ignoring the composition mechanism.

Correct me if I am wrong, but for Arabic, is not Emacs restricted to
typewriter-like fonts?

There would be a similar problem with the use of Tai Khuen or other
tunnelling fonts for Northern Thai if you used the current mechanism
for advancing character by character.  Tunnelling fonts write parts of
one cluster under the next.  The Tai Khuen fonts I've seen do this by
relying on characteristics of Tai Khuen spelling.  The rules don't hold
for Northern Thai, and consequently the subscript portions of
successive orthographic syllables can overwrite one another.  A
sophisticated font could check for clashes, but that needs the
orthographic syllables to be passed to the shaper together.

Richard.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-13 20:53         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-14 13:19           ` Eli Zaretskii
  2022-02-14 22:14             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-16 15:11           ` Eli Zaretskii
  1 sibling, 1 reply; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-14 13:19 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, larsi

> Date: Sun, 13 Feb 2022 20:53:10 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: larsi@gnus.org, 20140@debbugs.gnu.org
> 
> On Sun, 13 Feb 2022 18:04:11 +0200
> Eli Zaretskii <eliz@gnu.org> wrote:
> 
> > But that didn't seem to work well enough: e.g., some marks in your
> > "sample text" didn't combine with letters, as I think they should.
> 
> Which ones?

Sorry, that was my faulty testing: I tested a half-baked change.  Your
rules do work correctly, AFAICT.

But I have 2 questions:

 1) Why do we need this part of the composition rules:

     (vector "." 0 'font-shape-gstring)

    This matches just one character, so what do we want to accomplish
    by this rule?  A single character cannot "self-compose", can it?

 2) Since tai-tham-composable-pattern always starts with what you
    denote as "C", how about setting up only entries of
    composition-function-table that correspond to those characters,
    i.e.:

     (let ((elt (list (vector tai-tham-composable-pattern 0 'font-shape-gstring)
		      )))
       (set-char-table-range composition-function-table '(#x1A20 . #x1A54) elt)
       (set-char-table-range composition-function-table '(#x1A80 . #x1A89) elt)
       (set-char-table-range composition-function-table '(#x1A90 . #x1A99) elt)
       (set-char-table-range composition-function-table '(#x1AA0 . #x1AAD) elt))

    Do you see any problems with that?

> I did suspect the problem was writing '\u1A7C' instead of
> '\u1a7c', but I'm no longer so sure.

No, that's not a problem.

> You should also add CGJ and ZWNJ, and some people may appreciate ZWJ -
> the Khottabun font has ligatures involving ZWJ, though it may just be
> an experimental feature - and ultimately WJ, for when someone writes a
> Tai Tham word breaker.

How should I add CGJ and ZWNJ?  What are the rules?

> Oh, and Thai and Lao mai t(r)i and mai chat(t)awa and U+0324
> COMBINING DIAERESIS BELOW turn up occasionally - U+0324 is supported
> in Thep's Khottabun font, and my Da Lekh series supports Thai mai
> tri and mai chattawa. These characters seem to work with HarfBuzz.

Not sure I understand: what patterns/rules should be added for these?

> If using the native Windows renderer is an option with Emacs, then 'A
> Tai Tham KH New' works better than 'A Tai Tham KH New V3'.

We still support Uniscribe, but prefer HarfBuzz, because MS deprecated
Uniscribe.  We cannot support DirectWrite, because its APIs are
C++-only, and no one has shown whether and how to call them from C.

> > Btw, is there a way to get all the examples from your
> > https://wrdingham.co.uk/lanna/renderer_test.htm as a UTF-8 encoded
> > text file?  I'd like to test the Emacs rendering with all of the
> > examples, but copy-pasting each example separately from the browser is
> > not my idea of useful time investment.  So if you could provide the
> > examples as a downloadable text file, I'd appreciate.
> 
> As buried (you're not the only one to have overlooked it) in the
> penultimate paragraph of 'Content and Layout' section, "The test words
> may, in principle, be extracted quite simply from this web page. Each
> test 'word' is the content of the first cell in each row whose class is
> tst1. For convenience*, I have extracted the first two cells in such
> rows, along with titles, to a CSV file."  The file is rt.csv in the
> same directory.

Thanks, I will use that.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-13 21:11         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-14 13:26           ` Eli Zaretskii
  2022-02-14 23:26             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  0 siblings, 1 reply; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-14 13:26 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, larsi

> Date: Sun, 13 Feb 2022 21:11:52 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: larsi@gnus.org, 20140@debbugs.gnu.org
> 
> > Btw, the _only_ reason Handa-san and now myself were able to implement
> > something like the forward/backward-char-intrusive commands is that we
> > DO control which parts of text are composed and which aren't.  If we
> > were to follow HarfBuzz developers' advice, and were to hand all the
> > text to HarfBuzz for shaping, we would need the HarfBuzz cooperation
> > to implement such features in the editor.
> 
> You mean the more sophisticated mechanisms which position the cursor
> intelligently.  Those two commands you named work by completely
> ignoring the composition mechanism.

Yes.  And the reason we can ignore compositions in certain portions
of the text is that we have control on what is passed to HarfBuzz.

> Correct me if I am wrong, but for Arabic, is not Emacs restricted to
> typewriter-like fonts?

No, that's not true.  I'm not aware of any such limitation; AFAIK
Arabic shaping works correctly in Emacs, certainly with HarfBuzz and
Emacs 27 or later.

Or maybe I misunderstand what you mean by "typewriter-like" fonts?
Can you give an example of a non-typewriter-like font for Arabic that
I can find on MS-Windows and try?

> There would be a similar problem with the use of Tai Khuen or other
> tunnelling fonts for Northern Thai if you used the current mechanism
> for advancing character by character.  Tunnelling fonts write parts of
> one cluster under the next.  The Tai Khuen fonts I've seen do this by
> relying on characteristics of Tai Khuen spelling.  The rules don't hold
> for Northern Thai, and consequently the subscript portions of
> successive orthographic syllables can overwrite one another.  A
> sophisticated font could check for clashes, but that needs the
> orthographic syllables to be passed to the shaper together.

I'm not sure I understand.  Does HarfBuzz know about these advancement
features?  We rely on HarfBuzz to give us back as many grapheme
clusters as it sees fit for a given chunk of text, and we expect each
grapheme cluster to include glyphs with relative offsets as needed by
the script and the font.

IOW, this job is delegated to the shaping engine, such as HarfBuzz;
Emacs just takes the glyphs and offsets HarfBuzz gives us and blindly
obeys them.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-14 13:19           ` Eli Zaretskii
@ 2022-02-14 22:14             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-15  1:27               ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-16 15:12               ` Eli Zaretskii
  0 siblings, 2 replies; 35+ messages in thread
From: Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2022-02-14 22:14 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: 20140, larsi

On Mon, 14 Feb 2022 15:19:36 +0200
Eli Zaretskii <eliz@gnu.org> wrote:

> > Date: Sun, 13 Feb 2022 20:53:10 +0000
> > From: Richard Wordingham <richard.wordingham@ntlworld.com>
> > Cc: larsi@gnus.org, 20140@debbugs.gnu.org
> > 
> > On Sun, 13 Feb 2022 18:04:11 +0200
> > Eli Zaretskii <eliz@gnu.org> wrote:
> >   
> > > But that didn't seem to work well enough: e.g., some marks in your
> > > "sample text" didn't combine with letters, as I think they
> > > should.  
> > 
> > Which ones?  
> 
> Sorry, that was my faulty testing: I tested a half-baked change.  Your
> rules do work correctly, AFAICT.
> 
> But I have 2 questions:
> 
>  1) Why do we need this part of the composition rules:
> 
>      (vector "." 0 'font-shape-gstring)
> 
>     This matches just one character, so what do we want to accomplish
>     by this rule?  A single character cannot "self-compose", can it?

No, but in general it may need shaping, e.g. to take advantage of the
locl feature.  If that's not needed for shaping to happen, then
dispense with it - unless it was needed for general consistency.

>  2) Since tai-tham-composable-pattern always starts with what you
>     denote as "C", how about setting up only entries of
>     composition-function-table that correspond to those characters,
>     i.e.:
> 
>      (let ((elt (list (vector tai-tham-composable-pattern 0
> 'font-shape-gstring) )))
>        (set-char-table-range composition-function-table '(#x1A20 .
> #x1A54) elt) (set-char-table-range composition-function-table
> '(#x1A80 . #x1A89) elt) (set-char-table-range
> composition-function-table '(#x1A90 . #x1A99) elt)
> (set-char-table-range composition-function-table '(#x1AA0 . #x1AAD)
> elt))
> 
>     Do you see any problems with that?

It may affect the rendering of isolated marks, particularly the
preposed ones like U+1A55 TAI THAM CONSONANT SIGN MEDIAL RA and
U+1A6E TAI THAM VOWEL SIGN E.  I'll have to investigate HarfBuzz-using
Emacs.  I can't think of any other possible problems.  My first thought
is that it is unnecessarily complicated, and sets up work for when (if?)
TAI THAM LAO LOW HA gets added.

> > You should also add CGJ and ZWNJ, and some people may appreciate
> > ZWJ - the Khottabun font has ligatures involving ZWJ, though it may
> > just be an experimental feature - and ultimately WJ, for when
> > someone writes a Tai Tham word breaker.  
> 
> How should I add CGJ and ZWNJ?  What are the rules?
> 
> > Oh, and Thai and Lao mai t(r)i and mai chat(t)awa and U+0324
> > COMBINING DIAERESIS BELOW turn up occasionally - U+0324 is supported
> > in Thep's Khottabun font, and my Da Lekh series supports Thai mai
> > tri and mai chattawa. These characters seem to work with HarfBuzz.  
> 
> Not sure I understand: what patterns/rules should be added for these?

Add them all to "M" in the definition of tai-tham-composable-pattern.
Strictly, U+0324 should also be added to "S", but I'd be surprised to
see it in a genuine spelling.

Richard.






^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-14 13:26           ` Eli Zaretskii
@ 2022-02-14 23:26             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-15 14:40               ` Eli Zaretskii
  0 siblings, 1 reply; 35+ messages in thread
From: Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2022-02-14 23:26 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: 20140, larsi

On Mon, 14 Feb 2022 15:26:07 +0200
Eli Zaretskii <eliz@gnu.org> wrote:

> > Date: Sun, 13 Feb 2022 21:11:52 +0000
> > From: Richard Wordingham <richard.wordingham@ntlworld.com>
> > Cc: larsi@gnus.org, 20140@debbugs.gnu.org

> No, that's not true.  I'm not aware of any such limitation; AFAIK
> Arabic shaping works correctly in Emacs, certainly with HarfBuzz and
> Emacs 27 or later.
> 
> Or maybe I misunderstand what you mean by "typewriter-like" fonts?
> Can you give an example of a non-typewriter-like font for Arabic that
> I can find on MS-Windows and try?

Not off the top of my head, but compare لحج with the presentation form
‎ﳊ U+FCCA ARABIC LIGATURE LAM WITH HAH INITIAL FORM for the first two
letters.  The lam part is a vertical line in the middle of the glyph;
the 'hah' part forms the lower part of the glyph.

> > There would be a similar problem with the use of Tai Khuen or other
> > tunnelling fonts for Northern Thai if you used the current mechanism
> > for advancing character by character.  Tunnelling fonts write parts
> > of one cluster under the next.  The Tai Khuen fonts I've seen do
> > this by relying on characteristics of Tai Khuen spelling.  The
> > rules don't hold for Northern Thai, and consequently the subscript
> > portions of successive orthographic syllables can overwrite one
> > another.  A sophisticated font could check for clashes, but that
> > needs the orthographic syllables to be passed to the shaper
> > together.  
> 
> I'm not sure I understand.  Does HarfBuzz know about these advancement
> features?  We rely on HarfBuzz to give us back as many grapheme
> clusters as it sees fit for a given chunk of text, and we expect each
> grapheme cluster to include glyphs with relative offsets as needed by
> the script and the font.

No, the fonts rely on the grammar of Tai Khuen.  If an orthographic
syllable contains U+1A6C TAI THAM VOWEL SIGN OA BELOW, there will be a
following orthographic syllable in the same phonetic syllable, and
it will consist of a single consonant with no tail and possibly some
marks above.  The font designers therefore do not worry about the
effect on the advance width; there will be room for U+1A6C below the
next orthographic syllable.  If you want to see details now, enter
ᩉ᩠ᨾᩬᩁ ᩉ᩠ᨾᩳᨶᩥ᩠ᨯ ᩉ᩠ᨾᩬᩴᨶᩥ᩠ᨯ in the 'Play Area' text box of
https://wrdingham.co.uk/lanna/renderer_test.htm.  The first word is
spelt the same in Northern Thai and Tai Khuen.  As you switch the font
from Lamphun to A Tai Tham KH (with ccmp enabled if you are using IE
11), the glyphs at the bottom of the word spread out to use the
available space.  The next two words are 'Dr Nit' written in Tai Khuen
and Northern Thai.  The word for 'Dr', /mɔː/, is spelt quite
differently in the two languages, though the consonants are the same.
Both have a vowel above, but the Northern Thai also has U+1A6C below,
as in the first word. When A Tai Tham KH is selected as the font, it
clashes badly with the bottom of the second syllable, 'Nit'. 

This phenomenon of a vowel below expanding below the next consonant
also occurs in Northern Thai, but I don't know of any Northern Thai
font that is clever enough to do this, because checking for space below
the next consonant is fiddly.

> IOW, this job is delegated to the shaping engine, such as HarfBuzz;
> Emacs just takes the glyphs and offsets HarfBuzz gives us and blindly
> obeys them.

The first problem is that font writers tend to make assumptions about
the language their font will be used for.  The second is that with a good
tunnelling font, HarfBuzz needs to know what comes in the next
syllable.  At present, using a tunnelling font for Tai Tham risks
clashes when used with Emacs.  The Tai Khuen fonts look good, but are
not suitable for writing Northern Thai.

Richard.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-14 22:14             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-15  1:27               ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-16 15:13                 ` Eli Zaretskii
  2022-02-16 15:12               ` Eli Zaretskii
  1 sibling, 1 reply; 35+ messages in thread
From: Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2022-02-15  1:27 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: 20140, larsi

On Mon, 14 Feb 2022 22:14:27 +0000
Richard Wordingham <richard.wordingham@ntlworld.com> wrote:

> On Mon, 14 Feb 2022 15:19:36 +0200
> Eli Zaretskii <eliz@gnu.org> wrote:
> 
> > > Date: Sun, 13 Feb 2022 20:53:10 +0000
> > > From: Richard Wordingham <richard.wordingham@ntlworld.com>
> > > Cc: larsi@gnus.org, 20140@debbugs.gnu.org

> > > You should also add CGJ and ZWNJ, and some people may appreciate
> > > ZWJ - the Khottabun font has ligatures involving ZWJ, though it
> > > may just be an experimental feature - and ultimately WJ, for when
> > > someone writes a Tai Tham word breaker.    
> > 
> > How should I add CGJ and ZWNJ?  What are the rules?
> >   
> > > Oh, and Thai and Lao mai t(r)i and mai chat(t)awa and U+0324
> > > COMBINING DIAERESIS BELOW turn up occasionally - U+0324 is
> > > supported in Thep's Khottabun font, and my Da Lekh series
> > > supports Thai mai tri and mai chattawa. These characters seem to
> > > work with HarfBuzz.    
> > 
> > Not sure I understand: what patterns/rules should be added for
> > these?  
> 
> Add them all to "M" in the definition of tai-tham-composable-pattern.
> Strictly, U+0324 should also be added to "S", but I'd be surprised to
> see it in a genuine spelling.

In view of Wyn Owen's report (A Description and Linguistic Analysis of
the Tai Khuen Writing System, JSEALS 10.1 (2017)
https://evols.library.manoa.hawaii.edu/bitstream/10524/52403/1/09_Owen2017description.pdf)
on Tai Khuen spelling, one should also add U+0E49 THAI CHARACTER MAI
THO to "M". And, of course, as all 5 non-Tai Tham tone marks used with
the Tai Tham script have canonical combining class greater than 9, they
should be added to "S" - i.e. add U+0E49 to U+0E4B and U+0EC9 and
U+0ECB to "S".

Richard.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-14 23:26             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-15 14:40               ` Eli Zaretskii
  2022-02-15 21:06                 ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  0 siblings, 1 reply; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-15 14:40 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, larsi

> Date: Mon, 14 Feb 2022 23:26:23 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: larsi@gnus.org, 20140@debbugs.gnu.org
> 
> > No, that's not true.  I'm not aware of any such limitation; AFAIK
> > Arabic shaping works correctly in Emacs, certainly with HarfBuzz and
> > Emacs 27 or later.
> > 
> > Or maybe I misunderstand what you mean by "typewriter-like" fonts?
> > Can you give an example of a non-typewriter-like font for Arabic that
> > I can find on MS-Windows and try?
> 
> Not off the top of my head, but compare لحج with the presentation form
> ‎ﳊ U+FCCA ARABIC LIGATURE LAM WITH HAH INITIAL FORM for the first two
> letters.  The lam part is a vertical line in the middle of the glyph;
> the 'hah' part forms the lower part of the glyph.

They look identical here (using the default Courier New font).  With
what font did you think they will look wrong?





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-15 14:40               ` Eli Zaretskii
@ 2022-02-15 21:06                 ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-16 13:15                   ` Eli Zaretskii
  0 siblings, 1 reply; 35+ messages in thread
From: Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2022-02-15 21:06 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: 20140, larsi

On Tue, 15 Feb 2022 16:40:09 +0200
Eli Zaretskii <eliz@gnu.org> wrote:

> > Date: Mon, 14 Feb 2022 23:26:23 +0000
> > From: Richard Wordingham <richard.wordingham@ntlworld.com>
> > Cc: larsi@gnus.org, 20140@debbugs.gnu.org
> >   
> > > No, that's not true.  I'm not aware of any such limitation; AFAIK
> > > Arabic shaping works correctly in Emacs, certainly with HarfBuzz
> > > and Emacs 27 or later.
> > > 
> > > Or maybe I misunderstand what you mean by "typewriter-like" fonts?
> > > Can you give an example of a non-typewriter-like font for Arabic
> > > that I can find on MS-Windows and try?  
> > 
> > Not off the top of my head, but compare لحج with the presentation
> > form ‎ﳊ U+FCCA ARABIC LIGATURE LAM WITH HAH INITIAL FORM for the
> > first two letters.  The lam part is a vertical line in the middle
> > of the glyph; the 'hah' part forms the lower part of the glyph.  
> 
> They look identical here (using the default Courier New font).  With
> what font did you think they will look wrong?

In the Courier New font in Windows 10 of 2017 (+ automatic updates),
U+FCCA looks like the image in the Unicode code chart, and bears little
resemblance to the righthand two thirds of <U+0644, U+062D, U+062C>.
In keeping with its Latin part, the sequence of three characters looks
as one would expect from a typewriter when one enters text letter by
letter.  I must admit I'm having trouble laying my hand on a font which
does these ligatures.  I wanted to find a font that would render the
three characters to look the same as ﳊﺞ <U+FCCA, U+FE9E>.  (Sticking
them together isn't working in the email client I'm using, but does
work in some fallback font.)

Richard.






^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-15 21:06                 ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-16 13:15                   ` Eli Zaretskii
  2022-02-16 19:01                     ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  0 siblings, 1 reply; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-16 13:15 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, larsi

> Date: Tue, 15 Feb 2022 21:06:05 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: larsi@gnus.org, 20140@debbugs.gnu.org
> 
> > > Not off the top of my head, but compare لحج with the presentation
> > > form ‎ﳊ U+FCCA ARABIC LIGATURE LAM WITH HAH INITIAL FORM for the
> > > first two letters.  The lam part is a vertical line in the middle
> > > of the glyph; the 'hah' part forms the lower part of the glyph.  
> > 
> > They look identical here (using the default Courier New font).  With
> > what font did you think they will look wrong?
> 
> In the Courier New font in Windows 10 of 2017 (+ automatic updates),
> U+FCCA looks like the image in the Unicode code chart, and bears little
> resemblance to the righthand two thirds of <U+0644, U+062D, U+062C>.
> In keeping with its Latin part, the sequence of three characters looks
> as one would expect from a typewriter when one enters text letter by
> letter.

It sounds like Courier New in Windows 10 was "improved" by removing
the capability of ligating those 2 characters.  On Windows XP, their
standard Courier New shows the first 2 characters ligate into a single
glyph, which looks just like U+FCCA, but on Windows 10 they don't
ligate.  I don't know why that is; perhaps Arabic typesetting experts
decided these should not ligate?

> I must admit I'm having trouble laying my hand on a font which
> does these ligatures.

Try the Arabic Typesetting font, there I see on Windows 10 that the
first 2 characters look like U+FCCA.

IOW, this is a font issue, not an Emacs or HarfBuzz issue.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-13 20:53         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-14 13:19           ` Eli Zaretskii
@ 2022-02-16 15:11           ` Eli Zaretskii
  1 sibling, 0 replies; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-16 15:11 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, larsi

> Date: Sun, 13 Feb 2022 20:53:10 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: larsi@gnus.org, 20140@debbugs.gnu.org
> 
> > Btw, is there a way to get all the examples from your
> > https://wrdingham.co.uk/lanna/renderer_test.htm as a UTF-8 encoded
> > text file?  I'd like to test the Emacs rendering with all of the
> > examples, but copy-pasting each example separately from the browser is
> > not my idea of useful time investment.  So if you could provide the
> > examples as a downloadable text file, I'd appreciate.
> 
> As buried (you're not the only one to have overlooked it) in the
> penultimate paragraph of 'Content and Layout' section, "The test words
> may, in principle, be extracted quite simply from this web page. Each
> test 'word' is the content of the first cell in each row whose class is
> tst1. For convenience*, I have extracted the first two cells in such
> rows, along with titles, to a CSV file."  The file is rt.csv in the
> same directory.  I included the meaning and pronunciation as those who
> don't know the script may find it easier to refer to the words by
> translation or transcription.  You may prefer to use the file more or
> less as it is, but one can easily knock up an Emacs macro sequence to
> delete the first comma and the rest of the line.  I left the
> section titles in for easier navigation to the renderer test file.

Thanks, I've reviewed the results of rendering that file, and it looks
reasonably good: some examples don't show correctly, but most do.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-14 22:14             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-15  1:27               ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-16 15:12               ` Eli Zaretskii
  1 sibling, 0 replies; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-16 15:12 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, larsi

> Date: Mon, 14 Feb 2022 22:14:27 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: 20140@debbugs.gnu.org, larsi@gnus.org
> 
> > > You should also add CGJ and ZWNJ, and some people may appreciate
> > > ZWJ - the Khottabun font has ligatures involving ZWJ, though it may
> > > just be an experimental feature - and ultimately WJ, for when
> > > someone writes a Tai Tham word breaker.  
> > 
> > How should I add CGJ and ZWNJ?  What are the rules?
> > 
> > > Oh, and Thai and Lao mai t(r)i and mai chat(t)awa and U+0324
> > > COMBINING DIAERESIS BELOW turn up occasionally - U+0324 is supported
> > > in Thep's Khottabun font, and my Da Lekh series supports Thai mai
> > > tri and mai chattawa. These characters seem to work with HarfBuzz.  
> > 
> > Not sure I understand: what patterns/rules should be added for these?
> 
> Add them all to "M" in the definition of tai-tham-composable-pattern.
> Strictly, U+0324 should also be added to "S", but I'd be surprised to
> see it in a genuine spelling.

Thanks, done.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-15  1:27               ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-16 15:13                 ` Eli Zaretskii
  0 siblings, 0 replies; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-16 15:13 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140-done, larsi

> Date: Tue, 15 Feb 2022 01:27:34 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: 20140@debbugs.gnu.org, larsi@gnus.org
> 
> On Mon, 14 Feb 2022 22:14:27 +0000
> Richard Wordingham <richard.wordingham@ntlworld.com> wrote:
> 
> > On Mon, 14 Feb 2022 15:19:36 +0200
> > Eli Zaretskii <eliz@gnu.org> wrote:
> > 
> > > > Date: Sun, 13 Feb 2022 20:53:10 +0000
> > > > From: Richard Wordingham <richard.wordingham@ntlworld.com>
> > > > Cc: larsi@gnus.org, 20140@debbugs.gnu.org
> 
> > > > You should also add CGJ and ZWNJ, and some people may appreciate
> > > > ZWJ - the Khottabun font has ligatures involving ZWJ, though it
> > > > may just be an experimental feature - and ultimately WJ, for when
> > > > someone writes a Tai Tham word breaker.    
> > > 
> > > How should I add CGJ and ZWNJ?  What are the rules?
> > >   
> > > > Oh, and Thai and Lao mai t(r)i and mai chat(t)awa and U+0324
> > > > COMBINING DIAERESIS BELOW turn up occasionally - U+0324 is
> > > > supported in Thep's Khottabun font, and my Da Lekh series
> > > > supports Thai mai tri and mai chattawa. These characters seem to
> > > > work with HarfBuzz.    
> > > 
> > > Not sure I understand: what patterns/rules should be added for
> > > these?  
> > 
> > Add them all to "M" in the definition of tai-tham-composable-pattern.
> > Strictly, U+0324 should also be added to "S", but I'd be surprised to
> > see it in a genuine spelling.
> 
> In view of Wyn Owen's report (A Description and Linguistic Analysis of
> the Tai Khuen Writing System, JSEALS 10.1 (2017)
> https://evols.library.manoa.hawaii.edu/bitstream/10524/52403/1/09_Owen2017description.pdf)
> on Tai Khuen spelling, one should also add U+0E49 THAI CHARACTER MAI
> THO to "M". And, of course, as all 5 non-Tai Tham tone marks used with
> the Tai Tham script have canonical combining class greater than 9, they
> should be added to "S" - i.e. add U+0E49 to U+0E4B and U+0EC9 and
> U+0ECB to "S".

Thanks, done that as well, and installed the changes for Emacs 29.

And with that, I'm closing this bug report.  Thanks a lot for your
code and helpful discussions.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-16 13:15                   ` Eli Zaretskii
@ 2022-02-16 19:01                     ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
  2022-02-16 19:20                       ` Eli Zaretskii
  0 siblings, 1 reply; 35+ messages in thread
From: Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors @ 2022-02-16 19:01 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: 20140, larsi

On Wed, 16 Feb 2022 15:15:46 +0200
Eli Zaretskii <eliz@gnu.org> wrote:

> > Date: Tue, 15 Feb 2022 21:06:05 +0000
> > From: Richard Wordingham <richard.wordingham@ntlworld.com>
> > Cc: larsi@gnus.org, 20140@debbugs.gnu.org
> >   
> > > > Not off the top of my head, but compare لحج with the
> > > > presentation form ‎ﳊ U+FCCA ARABIC LIGATURE LAM WITH HAH
> > > > INITIAL FORM for the first two letters.  The lam part is a
> > > > vertical line in the middle of the glyph; the 'hah' part forms
> > > > the lower part of the glyph.    
> > > 
> > > They look identical here (using the default Courier New font).
> > > With what font did you think they will look wrong?  
> > 
> > In the Courier New font in Windows 10 of 2017 (+ automatic updates),
> > U+FCCA looks like the image in the Unicode code chart, and bears
> > little resemblance to the righthand two thirds of <U+0644, U+062D,
> > U+062C>. In keeping with its Latin part, the sequence of three
> > characters looks as one would expect from a typewriter when one
> > enters text letter by letter.  
> 
> It sounds like Courier New in Windows 10 was "improved" by removing
> the capability of ligating those 2 characters.  On Windows XP, their
> standard Courier New shows the first 2 characters ligate into a single
> glyph, which looks just like U+FCCA, but on Windows 10 they don't
> ligate.  I don't know why is that; perhaps Arabic typesetting experts
> decided these should not ligate?
> 
> > I must admit I'm having trouble laying my hand on a font which
> > does these ligatures.  
> 
> Try the Arabic Typesetting font, there I see on Windows 10 that the
> first 2 characters look like U+FCCA.
> 
> IOW, this is a font issue, not an Emacs or HarfBuzz issue.

Arabic Typesetting seems not to come in an evaluation copy of Windows
10.  And yes, the issue is that some fonts probably don't work well with
Emacs.  Irritating, but mostly not a big problem.

Richard.





^ permalink raw reply	[flat|nested] 35+ messages in thread

* bug#20140: 24.4; M17n shaper output rejected
  2022-02-16 19:01                     ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
@ 2022-02-16 19:20                       ` Eli Zaretskii
  0 siblings, 0 replies; 35+ messages in thread
From: Eli Zaretskii @ 2022-02-16 19:20 UTC (permalink / raw)
  To: Richard Wordingham; +Cc: 20140, larsi

> Date: Wed, 16 Feb 2022 19:01:12 +0000
> From: Richard Wordingham <richard.wordingham@ntlworld.com>
> Cc: larsi@gnus.org, 20140@debbugs.gnu.org
> 
> > It sounds like Courier New in Windows 10 was "improved" by removing
> > the capability of ligating those 2 characters.  On Windows XP, their
> > standard Courier New shows the first 2 characters ligate into a single
> > glyph, which looks just like U+FCCA, but on Windows 10 they don't
> > ligate.  I don't know why is that; perhaps Arabic typesetting experts
> > decided these should not ligate?
> > 
> > > I must admit I'm having trouble laying my hand on a font which
> > > does these ligatures.  
> > 
> > Try the Arabic Typesetting font, there I see on Windows 10 that the
> > first 2 characters look like U+FCCA.
> > 
> > IOW, this is a font issue, not an Emacs or HarfBuzz issue.
> 
> Arabic Typesetting seems not to come in an evaluation copy of Windows
> 10.

You can easily install it from the Internet.  I did.

> And yes, the issue is that some fonts probably don't work well with
> Emacs.

??? These issues with fonts have nothing to do with Emacs.  HarfBuzz
will produce the same results outside of Emacs; e.g., try hb-view.  Or
view your message with those characters in a Web browser (by pointing
it to the bug-gnu-emacs archives) -- you will see the same results.
AFAIU, the fonts simply don't want to produce a ligature from those
two characters.  Arabic Typesetting does, so the result is what you
expect, in Emacs and elsewhere.





^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2022-02-16 19:20 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-18 22:20 bug#20140: 24.4; M17n shaper output rejected Richard Wordingham
2015-03-19  3:43 ` Eli Zaretskii
2015-03-21  8:33 ` K. Handa
2015-03-21 17:20   ` Wolfgang Jenkner
2015-03-21 17:58   ` Richard Wordingham
2015-03-21 18:26     ` Eli Zaretskii
2015-03-25 14:25     ` K. Handa
2015-03-25 21:45       ` Richard Wordingham
2015-04-05 19:48       ` Richard Wordingham
2022-02-03 21:21 ` Lars Ingebrigtsen
2022-02-04  7:37   ` Eli Zaretskii
2022-02-05 22:52     ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-02-06  8:11       ` Eli Zaretskii
2022-02-06 22:09         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-02-07 14:04           ` Eli Zaretskii
2022-02-07 23:38             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-02-08 22:13         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-02-12 18:54           ` Eli Zaretskii
2022-02-13 16:04       ` Eli Zaretskii
2022-02-13 20:53         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-02-14 13:19           ` Eli Zaretskii
2022-02-14 22:14             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-02-15  1:27               ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-02-16 15:13                 ` Eli Zaretskii
2022-02-16 15:12               ` Eli Zaretskii
2022-02-16 15:11           ` Eli Zaretskii
2022-02-13 19:49       ` Eli Zaretskii
2022-02-13 21:11         ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-02-14 13:26           ` Eli Zaretskii
2022-02-14 23:26             ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-02-15 14:40               ` Eli Zaretskii
2022-02-15 21:06                 ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-02-16 13:15                   ` Eli Zaretskii
2022-02-16 19:01                     ` Richard Wordingham via Bug reports for GNU Emacs, the Swiss army knife of text editors
2022-02-16 19:20                       ` Eli Zaretskii

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).