"Raw" string literals for elisp

unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed

* "Raw" string literals for elisp
@ 2021-09-08  1:49 Anna Glasgall
  2021-09-08  7:10 ` Po Lu
                   ` (5 more replies)
  0 siblings, 6 replies; 120+ messages in thread
From: Anna Glasgall @ 2021-09-08  1:49 UTC (permalink / raw)
  To: emacs-devel; +Cc: Anna Glasgall

[-- Attachment #1: Type: text/plain, Size: 2419 bytes --]

[My previous message appears to have been eaten, or at least it's not
showing up in the archive; resending from a different From: address.
Apologies for any duplication]

Hello Emacs developers,

I've long been annoyed by the number of backslashes needed when using
string literals in elisp for certain things (regexes, UNC paths, etc),
so I started work on a patch (WIP attached) to implement support for
"raw" string literals, a la Python r-strings. These are string literals
that work exactly like normal string literals, with the exception that
backslash escapes (except for \") are not processed; \ may freely
appear in the string without need to escape. I've made good progress,
but unfortunately I've run into a roadblock and am not sure what to do
next.

I've successfully taught the elisp reader (read1 in lread.c) how to
read r-strings. I thought I had managed to make lisp-mode/elisp-mode
happy by allowing "r" to be a prefix character (C-x C-e and the
underlying forward-sexp/backward-sexp seemed to work fine at first),
but realized that I ran into trouble with strings containing the
sequence of characters '\\"'.

The reader correctly reads r"a\\"" as a string containing the sequence
of characters 'a', '\', '"', and M-: works. Unfortunately, if I try
sexp-based navigation or e.g. C-x C-e, it falls apart. The parser in
syntax.c, which afaict is what lisp-mode is using to try and find sexps
in buffer text, doesn't seem to know what to do with this expression.
I've spent some time staring at syntax.c, but I must confess that I'm
entirely defeated in terms of what changes need to be made here to
teach this other parser about prefixed strings where the prefix has
meaning that affects the interpretation of the characters between
string fences.

I've attached a copy of my WIP patch; it's definitely not near final
code quality and doesn't have documentation yet, all of which I would
take care of before submitting for inclusion. I also haven't filled out
the copyright assignment paperwork yet, but should this work reach a
point where it was likely to be accepted, I'd be happy to do that.

I'd very much appreciate some pointers on what to try next here, or
some explanation of how syntax.c/syntax.el works beyond what's in the
reference manual. If this is a fool's errand I'm tilting at here, I'd
also appreciate being told that before I sink more time into it :)

thanks,

Anna Glasgall

[-- Attachment #2: rstrings.patch --]
[-- Type: text/x-patch, Size: 3044 bytes --]

diff --git a/lisp/progmodes/elisp-mode.el b/lisp/progmodes/elisp-mode.el
index 7ed2d3d08c..e91d81de6d 100644
--- a/lisp/progmodes/elisp-mode.el
+++ b/lisp/progmodes/elisp-mode.el
@@ -39,6 +39,7 @@ 'emacs-lisp-mode-abbrev-table

 (defvar emacs-lisp-mode-syntax-table
   (let ((table (make-syntax-table lisp-data-mode-syntax-table)))
+    (modify-syntax-entry ?r "_ p" table)
     ;; These are redundant, now.
     ;;(modify-syntax-entry ?\[ "(]  " table)
     ;;(modify-syntax-entry ?\] ")[  " table)
diff --git a/src/lread.c b/src/lread.c
index a6c2db5d99..8222c17d0b 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -2970,10 +2970,11 @@ read1 (Lisp_Object readcharfun, int *pch, bool first_in_list)
   bool multibyte;
   char stackbuf[stackbufsize];
   current_thread->stack_top = stackbuf;
-
+  bool raw_literal = false;
   *pch = 0;

  retry:
+  raw_literal = false;

   c = READCHAR_REPORT_MULTIBYTE (&multibyte);
   if (c < 0)
@@ -3564,7 +3565,23 @@ read1 (Lisp_Object readcharfun, int *pch, bool first_in_list)

 	invalid_syntax ("?", readcharfun);
       }
-
+      /* "raw" string literal syntax, a la Python; "raw" literals do
+	 not process escapes except for \" */
+    case 'r':
+      {
+	int next_ch;
+	next_ch = READCHAR;
+	if (next_ch == '\"')
+	  {
+	    raw_literal = true;
+	    /* fall through to string reading */
+	  }
+	else
+	  {
+	    UNREAD (next_ch);
+	    goto read_symbol;
+	  }
+      }
     case '"':
       {
 	ptrdiff_t count = SPECPDL_INDEX ();
@@ -3599,7 +3616,21 @@ read1 (Lisp_Object readcharfun, int *pch, bool first_in_list)
 	    if (ch == '\\')
 	      {
 		int modifiers;
-
+		if (raw_literal)
+		  {
+		    /* still have to handle backslash followed by
+		       double quote even in a raw literal */
+		    int next_ch = READCHAR;
+		    if (next_ch == '\"')
+		      {
+			ch = next_ch;
+		      }
+		    else
+		      {
+			UNREAD(next_ch);
+		      }
+		    goto read_normal_char;
+		  }
 		ch = read_escape (readcharfun, 1);

 		/* CH is -1 if \ newline or \ space has just been seen.  */
@@ -3653,6 +3684,7 @@ read1 (Lisp_Object readcharfun, int *pch, bool first_in_list)
 	      }
 	    else
 	      {
+	      read_normal_char:
 		p += CHAR_STRING (ch, (unsigned char *) p);
 		if (CHAR_BYTE8_P (ch))
 		  force_singlebyte = true;
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el
index dac8f95bc4..964f3da91b 100644
--- a/test/src/lread-tests.el
+++ b/test/src/lread-tests.el
@@ -262,5 +262,15 @@ lread-float
   (should (equal (read "-0.e-5") -0.0))
   )

+(ert-deftest lread-string-raw-syntax ()
+  ;; syntax r"a\bc" => string composed of ?a, ?\\, ?b, ?c
+  (should (equal (read "r\"a\\bc\"") "a\\bc"))
+  ;; syntax "a\bc" => string composed of ?a, ?\b, ?c 
+  (should (equal (read "\"a\\bc\"") "a\C-hc"))
+  ;; syntax r"a\"b\"a" => string composed of ?a, ?\", ?b, ?\", ?a
+  (should (equal (read "r\"a\\\"b\\\"a\"") "a\"b\"a"))
+  ;; syntax r"a\\b" => string composed of ?a, ?\\, ?\\, ?b
+  (should (equal (read "r\"a\\\\b\"") "a\\\\b"))
+  )

 ;;; lread-tests.el ends here

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08  1:49 "Raw" string literals for elisp Anna Glasgall
@ 2021-09-08  7:10 ` Po Lu
  2021-09-08 14:19   ` Anna Glasgall
  2021-09-08  7:12 ` Lars Ingebrigtsen
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 120+ messages in thread
From: Po Lu @ 2021-09-08  7:10 UTC (permalink / raw)
  To: Anna Glasgall; +Cc: emacs-devel

Anna Glasgall <anna@crossproduct.net> writes:

> I've successfully taught the elisp reader (read1 in lread.c) how to
> read r-strings.

Previously, (r"ab") would be read as a list of the atoms r and "ab".
Does your modification retain compatibility with that behaviour?
Thanks.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08  1:49 "Raw" string literals for elisp Anna Glasgall
  2021-09-08  7:10 ` Po Lu
@ 2021-09-08  7:12 ` Lars Ingebrigtsen
  2021-09-08 14:20   ` Anna Glasgall
  2021-09-08 11:30 ` Alan Mackenzie
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 120+ messages in thread
From: Lars Ingebrigtsen @ 2021-09-08  7:12 UTC (permalink / raw)
  To: Anna Glasgall; +Cc: emacs-devel

Anna Glasgall <anna@crossproduct.net> writes:

> I've long been annoyed by the number of backslashes needed when using
> string literals in elisp for certain things (regexes, UNC paths, etc),
> so I started work on a patch (WIP attached) to implement support for
> "raw" string literals, a la Python r-strings.

Great!  This would be very welcome, and has been proposed a number of
times before, but nobody has actually implemented it.  As you've found
out, making the reader support the syntax is pretty easy, but the
problem is with getting the rest of the Emacs tooling to understand the
new syntax.  (Which is, in general, the stumbling block when introducing
any new syntax in Emacs Lisp.)

> The reader correctly reads r"a\\"" as a string containing the sequence
> of characters 'a', '\', '"', and M-: works.

I think we'd prefer #r"..." -- # is used in Lisps to introduce most
special syntaxes (and it's more backwards-compatible, since "r" by
itself is a valid read syntax, but #r isn't today).

> Unfortunately, if I try sexp-based navigation or e.g. C-x C-e, it
> falls apart. The parser in syntax.c, which afaict is what lisp-mode is
> using to try and find sexps in buffer text, doesn't seem to know what
> to do with this expression.  I've spent some time staring at syntax.c,
> but I must confess that I'm entirely defeated in terms of what changes
> need to be made here to teach this other parser about prefixed strings
> in where the prefix has meaning that affects the interpretation of the
> characters between string fences.

Hopefully somebody else can give some insights here, because I'm not
overly familiar with syntax.c, either.

-- 
(domestic pets only, the antidote for overdose, milk.)
   bloggy blog: http://lars.ingebrigtsen.no

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08  1:49 "Raw" string literals for elisp Anna Glasgall
  2021-09-08  7:10 ` Po Lu
  2021-09-08  7:12 ` Lars Ingebrigtsen
@ 2021-09-08 11:30 ` Alan Mackenzie
  2021-09-08 14:27   ` Anna Glasgall
  2021-09-08 11:34 ` Adam Porter
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 120+ messages in thread
From: Alan Mackenzie @ 2021-09-08 11:30 UTC (permalink / raw)
  To: Anna Glasgall; +Cc: emacs-devel

Hello, Anna.

Just as a matter of context, I implemented C++ raw strings, and recently
enhanced the code also to handle other CC Mode derived languages such as
C# and Vala.

On Tue, Sep 07, 2021 at 21:49:33 -0400, Anna Glasgall wrote:
> [My previous message appears to have been eaten, or at least it's not
> showing up in the archive; resending from a different From: address.
> Apologies for any duplication]

> Hello Emacs developers,

> I've long been annoyed by the number of backslashes needed when using
> string literals in elisp for certain things (regexes, UNC paths, etc),
> so I started work on a patch (WIP attached) to implement support for
> "raw" string literals, a la Python r-strings. These are string literals
> that work exactly like normal string literals, with the exception that
> backslash escapes (except for \") are not processed; \ may freely
> appear in the string without need to escape. I've made good progress,
> but unfortunately I've run into a roadblock and am not sure what to do
> next.

One not so small point.  How do you put a backslash as the _last_
character in a raw string?

If this is difficult, it may well be worth comparing other languages
with raw strings.  C++ Mode has a complicated system of identifiers at
each end of the raw string (I'm sure you know this).  C# represents a "
inside a multi-line string as "".  Vala (and, I believe, Python) have
triple quote delimiters """ and cannot represent three quotes in a row
inside the multi-line string.

It is probably worth while stating explicitly that Elisp raw strings can
be continued across line breaks without having to escape the \n.

> I've successfully taught the elisp reader (read1 in lread.c) how to
> read r-strings. I thought I had managed to make lisp-mode/elisp-mode
> happy by allowing "r" to be a prefix character (C-x C-e and the
> underlying forward-sexp/backward-sexp seemed to work fine at first),
> but realized that I ran into trouble with strings containing the
> sequence of characters '\\"'.

> The reader correctly reads r"a\\"" as a string containing the sequence
> of characters 'a', '\', '"', and M-: works. Unfortunately, if I try
> sexp-based navigation or e.g. C-x C-e, it falls apart. The parser in
> syntax.c, which afaict is what lisp-mode is using to try and find sexps
> in buffer text, doesn't seem to know what to do with this expression.
> I've spent some time staring at syntax.c, but I must confess that I'm
> entirely defeated in terms of what changes need to be made here to
> teach this other parser about prefixed strings in where the prefix has
> meaning that affects the interpretation of the characters between
> string fences.

You probably want to use syntax-table text properties.  See the page
"Syntax Properties" in the Elisp manual.  In short, you would put, say,
a "punctuation" property on most backslashes to nullify their normal
action.  Possibly, you might want such a property on a double quote
inside the string.  You might also want a property on the linefeeds
inside a raw string.  With these properties, C-M-n and friends will work
properly.

Bear in mind that you will also need to apply and remove these
properties as the user changes the Lisp text, for example by removing a
\ before a ".  There is an established mechanism in Emacs for this sort
of action (which CC Mode doesn't use) which I would advise you to use.

> I've attached a copy of my WIP patch; it's definitely not near final
> code quality and doesn't have documentation yet, all of which I would
> take care of before submitting for inclusion. I also haven't filled out
> the copyright assignment paperwork yet, but should this work reach a
> point where it was likely to be accepted, I'd be happy to do that.

Thanks!

> I'd very much appreciate some pointers on what to try next here, or
> some explanation of how syntax.c/syntax.el works beyond what's in the
> reference manual. If this is a fool's errand I'm tilting at here, I'd
> also appreciate being told that before I sink more time into it :)

It is definitely NOT a fool's errand.  There may be some resistance to
the idea of raw strings from traditionalists, but I hope not.  It would
be worth your while really to understand the section in the Elisp manual
on syntax and all the things it can (and can't) do.

Help is always available on emacs-devel.

You're going to have quite a bit of Lisp programming to do.  For
example, font-lock needs to be taught how to fontify a raw string.

But at the end of the exercise, you will have learnt so much about Emacs
that you will qualify as a fully fledged contributor.  :-)

> thanks,

> Anna Glasgall

-- 
Alan Mackenzie (Nuremberg, Germany).

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08  1:49 "Raw" string literals for elisp Anna Glasgall
                   ` (2 preceding siblings ...)
  2021-09-08 11:30 ` Alan Mackenzie
@ 2021-09-08 11:34 ` Adam Porter
  2021-09-08 13:59   ` Clément Pit-Claudel
  2021-09-09  3:09   ` Richard Stallman
  2021-09-08 13:10 ` Stefan Monnier
  2021-09-08 20:40 ` Anna Glasgall
  5 siblings, 2 replies; 120+ messages in thread
From: Adam Porter @ 2021-09-08 11:34 UTC (permalink / raw)
  To: emacs-devel

This would be an exciting feature to have in Elisp.  Thanks for working
on it.

If I may make a somewhat bold, and hopefully not too bikesheddy
suggestion (and one that's probably been made before, when this topic
has come up): Rather than having double-quote characters requiring
escaping in raw strings, would it be possible to do something slightly
Perlescent (I know) and allow the delimiters to be specified?  That
would make them much more useful.  For example, maybe something like:

  #r"foo bar baz"  ;; => "foo bar baz"
  #r|foo "bar" baz|  ;; => "foo \"bar\" baz"

For extra bonus points, it would be nice if leading indentation at the
beginning of a line in a raw string were omitted, similar to Python's
triple-quoted strings.  That would allow docstrings to look something
like:

  (defun foo (arg &optional bar)
    #r|Frobnicate ARG.
       If BAR, require ARG to be "baz".|
    (if (and bar (equal arg "baz"))
        (frobnicate arg)
      (frobnicate arg)))

The delimiter could even be repeated, like:

  #r"""
    foo
    "bar"
    baz
    """
    ;; => "foo\n\"bar\"\nbaz"

Though perhaps I'm getting too far ahead.  :)




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08  1:49 "Raw" string literals for elisp Anna Glasgall
                   ` (3 preceding siblings ...)
  2021-09-08 11:34 ` Adam Porter
@ 2021-09-08 13:10 ` Stefan Monnier
  2021-09-08 14:31   ` Anna Glasgall
  2021-09-08 20:40 ` Anna Glasgall
  5 siblings, 1 reply; 120+ messages in thread
From: Stefan Monnier @ 2021-09-08 13:10 UTC (permalink / raw)
  To: Anna Glasgall; +Cc: emacs-devel

> I've long been annoyed by the number of backslashes needed when using
> string literals in elisp for certain things (regexes, UNC paths, etc),

In most other discussions around this in the past, regexps were the only
significant cases.  I don't know what you have in mind behind the
"etc.", but as for UNC: how often do you use them in ELisp and do you
really need backslashes there (I thought slashes work almost as well in
most of Windows)?

And AFAIC adding raw strings just to halve the number of backslashes in
regexps seems both too much and too little: you'd likely prefer a new
regexp syntax which doesn't require backslashes for grouping
and alternation.

        Stefan "not a big fan of raw strings in ELisp"

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 11:34 ` Adam Porter
@ 2021-09-08 13:59   ` Clément Pit-Claudel
  2021-09-08 14:12     ` Adam Porter
  2021-09-09  3:09   ` Richard Stallman
  1 sibling, 1 reply; 120+ messages in thread
From: Clément Pit-Claudel @ 2021-09-08 13:59 UTC (permalink / raw)
  To: emacs-devel

On 9/8/21 7:34 AM, Adam Porter wrote:
> For extra bonus points, it would be nice if leading indentation at the
> beginning of a line in a raw string were omitted, similar to Python's
> triple-quoted strings.

Python's triple quoted strings don't omit leading indentation, do they?

  $ python3
  >>> s = """
  ...     aaa
  ...     """
  >>> s
  '\n    aaa\n    '



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 13:59   ` Clément Pit-Claudel
@ 2021-09-08 14:12     ` Adam Porter
  0 siblings, 0 replies; 120+ messages in thread
From: Adam Porter @ 2021-09-08 14:12 UTC (permalink / raw)
  To: emacs-devel

Clément Pit-Claudel <cpitclaudel@gmail.com> writes:

> On 9/8/21 7:34 AM, Adam Porter wrote:
>> For extra bonus points, it would be nice if leading indentation at the
>> beginning of a line in a raw string were omitted, similar to Python's
>> triple-quoted strings.
>
> Python's triple quoted strings don't omit leading indentation, do they?
>
>   $ python3
>   >>> s = """
>   ...     aaa
>   ...     """
>   >>> s
>   '\n    aaa\n    '

You're right, they only have that feature when used as docstrings:
<https://www.python.org/dev/peps/pep-0257/#multi-line-docstrings>




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08  7:10 ` Po Lu
@ 2021-09-08 14:19   ` Anna Glasgall
  0 siblings, 0 replies; 120+ messages in thread
From: Anna Glasgall @ 2021-09-08 14:19 UTC (permalink / raw)
  To: Po Lu; +Cc: emacs-devel

On Wed, 2021-09-08 at 15:10 +0800, Po Lu wrote:
> Anna Glasgall <anna@crossproduct.net> writes:
> 
> > I've successfully taught the elisp reader (read1 in lread.c) how to
> > read r-strings.
> 
> Previously, (r"ab") would be read as a list of the atoms r and "ab".
> Does your modification retain compatibility with that behaviour?
> Thanks.

Oops, that's an edge case I hadn't considered. If we want to retain
that behavior I'm going to have to rethink the syntax here (which it
seems like I may want to do anyway based on some other mails on this
thread :/ ). Thanks for giving me another test case!

Anna




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08  7:12 ` Lars Ingebrigtsen
@ 2021-09-08 14:20   ` Anna Glasgall
  0 siblings, 0 replies; 120+ messages in thread
From: Anna Glasgall @ 2021-09-08 14:20 UTC (permalink / raw)
  To: Lars Ingebrigtsen; +Cc: emacs-devel

On Wed, 2021-09-08 at 09:12 +0200, Lars Ingebrigtsen wrote:
> Anna Glasgall <anna@crossproduct.net> writes:
> 
> > I've long been annoyed by the number of backslashes needed when
> > using
> > string literals in elisp for certain things (regexes, UNC paths,
> > etc),
> > so I started work on a patch (WIP attached) to implement support
> > for
> > "raw" string literals, a la Python r-strings.
> 
> Great!  This would be very welcome, and has been proposed a number of
> times before, but nobody has actually implemented it.  As you've
> found
> out, making the reader support the syntax is pretty easy, but the
> problem is with getting the rest of the Emacs tooling to understand
> the
> new syntax.  (Which is, in general, the stumbling block when
> introducing
> any new syntax in Emacs Lisp.)
> 
> > The reader correctly reads r"a\\"" as a string containing the
> > sequence
> > of characters 'a', '\', '"', and M-: works.
> 
> I think we'd prefer #r"..." -- # is used in Lisps to introduce most
> special syntaxes (and it's more backwards-compatible, since "r" by
> itself is a valid read syntax, but #r isn't today).
> 

That would be fine by me; I'll see about revising this to change the
syntax to that.

> > Unfortunately, if I try sexp-based navigation or e.g. C-x C-e, it
> > falls apart. The parser in syntax.c, which afaict is what lisp-mode
> > is
> > using to try and find sexps in buffer text, doesn't seem to know
> > what
> > to do with this expression.  I've spent some time staring at
> > syntax.c,
> > but I must confess that I'm entirely defeated in terms of what
> > changes
> > need to be made here to teach this other parser about prefixed
> > strings
> > in where the prefix has meaning that affects the interpretation of
> > the
> > characters between string fences.
> 
> Hopefully somebody else can give some insights here, because I'm not
> overly familiar with syntax.c, either.
> 

thanks,

Anna




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 11:30 ` Alan Mackenzie
@ 2021-09-08 14:27   ` Anna Glasgall
  0 siblings, 0 replies; 120+ messages in thread
From: Anna Glasgall @ 2021-09-08 14:27 UTC (permalink / raw)
  To: Alan Mackenzie; +Cc: emacs-devel

On Wed, 2021-09-08 at 11:30 +0000, Alan Mackenzie wrote:
> Hello, Anna.
> 
> Just as a matter of context, I implemented C++ raw strings, and
> recently
> enhanced the code also to handle other CC Mode derived languages such
> as
> C# and Vala.
> 

Great, I'll definitely take a look at that.

> On Tue, Sep 07, 2021 at 21:49:33 -0400, Anna Glasgall wrote:
> > [My previous message appears to have been eaten, or at least it's
> > not
> > showing up in the archive; resending from a different From:
> > address.
> > Apologies for any duplication]
> 
> > Hello Emacs developers,
> 
> > I've long been annoyed by the number of backslashes needed when
> > using
> > string literals in elisp for certain things (regexes, UNC paths,
> > etc),
> > so I started work on a patch (WIP attached) to implement support
> > for
> > "raw" string literals, a la Python r-strings. These are string
> > literals
> > that work exactly like normal string literals, with the exception
> > that
> > backslash escapes (except for \") are not processed; \ may freely
> > appear in the string without need to escape. I've made good
> > progress,
> > but unfortunately I've run into a roadblock and am not sure what to
> > do
> > next.
> 
> One not so small point.  How do you put a backslash as the _last_
> character in a raw string?

That is an excellent question. I'll need to take a look at how some
other languages handle that :/

Thanks for giving me another test case!

> 
> If this is difficult, it may well be worth comparing other languages
> with raw strings.  C++ Mode has a complicated system of identifiers
> at
> each end of the raw string (I'm sure you know this).  C# represents a
> "
> inside a multi-line string as "".  Vala (and, I believe, Python) have
> triple quote delimiters """ and cannot represent three quotes in a row
> inside the multi-line string.
> 
> It is probably worth while stating explicitly that Elisp raw strings
> can
> be continued across line breaks without having to escape the \n.
> 
> > I've successfully taught the elisp reader (read1 in lread.c) how to
> > read r-strings. I thought I had managed to make lisp-mode/elisp-
> > mode
> > happy by allowing "r" to be a prefix character (C-x C-e and the
> > underlying forward-sexp/backward-sexp seemed to work fine at
> > first),
> > but realized that I ran into trouble with strings containing the
> > sequence of characters '\\"'.
> 
> > The reader correctly reads r"a\\"" as a string containing the
> > sequence
> > of characters 'a', '\', '"', and M-: works. Unfortunately, if I try
> > sexp-based navigation or e.g. C-x C-e, it falls apart. The parser
> > in
> > syntax.c, which afaict is what lisp-mode is using to try and find
> > sexps
> > in buffer text, doesn't seem to know what to do with this
> > expression.
> > I've spent some time staring at syntax.c, but I must confess that
> > I'm
> > entirely defeated in terms of what changes need to be made here to
> > teach this other parser about prefixed strings in where the prefix
> > has
> > meaning that affects the interpretation of the characters between
> > string fences.
> 
> You probably want to use syntax-table text properties.  See the page
> "Syntax Properties" in the Elisp manual.  In short, you would put,
> say,
> a "punctuation" property on most backslashes to nullify their normal
> action.  Possibly, you might want such a property on a double quote
> inside the string.  You might also want a property on the linefeeds
> inside a raw string.  With these properties, C-M-n and friends will
> work
> properly.
> 
> Bear in mind that you will also need to apply and remove these
> properties as the user changes the Lisp text, for example by removing
> a
> \ before a ".  There is an established mechanism in Emacs for this
> sort
> of action (which CC Mode doesn't use) which I would advise you to
> use.
> 

It was unclear to me how much additional processing during typing would
be acceptable here as opposed to just running the existing C code.
Hopefully native compilation support will to some extent nullify any
penalty from adding additional logic in Lisp here?

> > I've attached a copy of my WIP patch; it's definitely not near
> > final
> > code quality and doesn't have documentation yet, all of which I
> > would
> > take care of before submitting for inclusion. I also haven't filled
> > out
> > the copyright assignment paperwork yet, but should this work reach
> > a
> > point where it was likely to be accepted, I'd be happy to do that.
> 
> Thanks!
> 
> > I'd very much appreciate some pointers on what to try next here, or
> > some explanation of how syntax.c/syntax.el works beyond what's in
> > the
> > reference manual. If this is a fool's errand I'm tilting at here,
> > I'd
> > also appreciate being told that before I sink more time into it :)
> 
> It is definitely NOT a fool's errand.  There may be some resistance
> to
> the idea of raw strings from traditionalists, but I hope not.  It
> would
> be worth your while really to understand the section in the Elisp
> manual
> on syntax and all the things it can (and can't) do.
> 
> Help is always available on emacs-devel.
> 
> You're going to have quite a bit of Lisp programming to do.  For
> example, font-lock needs to be taught how to fontify a raw string.
> 

I am already moderately familiar with writing elisp at this point, but
yes, I still have a lot to learn :)

> But at the end of the exercise, you will have learnt so much about
> Emacs
> that you will qualify as a fully fledged contributor.  :-)
> 

thanks,

Anna


> > thanks,
> 
> > Anna Glasgall
> 





^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 13:10 ` Stefan Monnier
@ 2021-09-08 14:31   ` Anna Glasgall
  2021-09-08 15:27     ` Mattias Engdegård
                       ` (2 more replies)
  0 siblings, 3 replies; 120+ messages in thread
From: Anna Glasgall @ 2021-09-08 14:31 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: emacs-devel

On Wed, 2021-09-08 at 09:10 -0400, Stefan Monnier wrote:
> > I've long been annoyed by the number of backslashes needed when
> > using
> > string literals in elisp for certain things (regexes, UNC paths,
> > etc),
> 
> In most other discussions around this in the past, regexps were the
> only
> significant cases.  I don't know what you have in mind behind the
> "etc.", but as for UNC: how often do you use them in ELisp and do you
> really need backslashes there (I thought slashes work almost as well
> in
> most of Windows)?
> 

Cards on the table here: yes, regexes are 99.999% of the motivation
here :) UNC paths were a somewhat contrived example.

During the course of working on this, I came across the following in I
think syntax.el:


               (while (re-search-forward
"\\(\\\\\\\\\\)\\(?:\\(\\\\\\\\\\)\\|\\((\\(?:\\?[0-
9]*:\\)?\\|[|)]\\)\\)" bound t)

which I feel by itself rather justifies this work.

> And AFAIC adding raw strings just to halve the number of backslashes
> in
> regexps seems both too much and too little: you'd likely prefer a new
> regexp syntax which doesn't require backslashes for grouping
> and alternation.
> 

I would be _thrilled_ to have that, but that seemed like it'd be even
_more_ work than this is already ballooning into. And it does seem to
me that raw-literal syntax is something that'd be generally useful even
outside the use case of regexes.

> 
>         Stefan "not a big fan of raw strings in ELisp"
> 

thanks,

Anna




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 14:31   ` Anna Glasgall
@ 2021-09-08 15:27     ` Mattias Engdegård
  2021-09-08 15:41       ` Stefan Kangas
  2021-09-08 16:01       ` Alan Mackenzie
  2021-09-08 15:54     ` Stefan Kangas
  2021-09-08 16:05     ` tomas
  2 siblings, 2 replies; 120+ messages in thread
From: Mattias Engdegård @ 2021-09-08 15:27 UTC (permalink / raw)
  To: Anna Glasgall; +Cc: Stefan Monnier, emacs-devel

8 sep. 2021 kl. 16.31 skrev Anna Glasgall <anna@crossproduct.net>:

> Cards on the table here: yes, regexes are 99.999% of the motivation
> here

Elisp actually has a much better regexp syntax than most other languages:

> "\\(\\\\\\\\\\)\\(?:\\(\\\\\\\\\\)\\|\\((\\(?:\\?[0-9]*:\\)?\\|[|)]\\)\\)"

Today that would be written

(rx (group "\\\\")
    (or (group "\\\\")
        (group
         (or (seq "("
                  (? "?" (* digit) ":"))
             (in ")|")))))

which is much more readable and maintainable and less error-prone than what you would get with a new string syntax.




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 15:27     ` Mattias Engdegård
@ 2021-09-08 15:41       ` Stefan Kangas
  2021-09-08 16:45         ` Mattias Engdegård
  2021-09-08 16:01       ` Alan Mackenzie
  1 sibling, 1 reply; 120+ messages in thread
From: Stefan Kangas @ 2021-09-08 15:41 UTC (permalink / raw)
  To: Mattias Engdegård; +Cc: Emacs developers, Anna Glasgall, Stefan Monnier

Mattias Engdegård <mattiase@acm.org> writes:

> Elisp actually has a much better regexp syntax than most other languages:
>
> > "\\(\\\\\\\\\\)\\(?:\\(\\\\\\\\\\)\\|\\((\\(?:\\?[0-9]*:\\)?\\|[|)]\\)\\)"
>
> Today that would be written
>
> (rx (group "\\\\")
>     (or (group "\\\\")
>         (group
>          (or (seq "("
>                   (? "?" (* digit) ":"))
>              (in ")|")))))
>
> which is much more readable and maintainable and less error-prone than what you would get with a new string syntax.

That's true.  I hope that we can preload rx and use it more.

But ELisp is also intended for end-users that want to hack together
some quick command.  They may be completely uninterested in spending
even a minimum of time to learn rx, and prefer instead to use the more
standard form they already know so they can just get their job done.

I'm not even sure that rx is the unequivocal first choice among
hardcore ELisp programmers.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 14:31   ` Anna Glasgall
  2021-09-08 15:27     ` Mattias Engdegård
@ 2021-09-08 15:54     ` Stefan Kangas
  2021-09-08 16:05     ` tomas
  2 siblings, 0 replies; 120+ messages in thread
From: Stefan Kangas @ 2021-09-08 15:54 UTC (permalink / raw)
  To: Anna Glasgall; +Cc: Stefan Monnier, Emacs developers

Anna Glasgall <anna@crossproduct.net> writes:

> During the course of working on this, I came across the following in I
> think syntax.el:
>
>
>                (while (re-search-forward
> "\\(\\\\\\\\\\)\\(?:\\(\\\\\\\\\\)\\|\\((\\(?:\\?[0-
> 9]*:\\)?\\|[|)]\\)\\)" bound t)
>
> which I feel by itself rather justifies this work.

I didn't study your patch, but I for one hope that this feature will
eventually get accepted.  IMHO, it would reduce a not insignificant
pain point for (some types of) programming in Emacs Lisp.

The argument against having "raw" strings in ELisp, if I understand
it, is that it will make some code in core more complex.  This is true,
but it will at the same time make even more code out there less
complex, or at least easier to read and understand.

Once it is fully implemented, there will of course be bugs, but over
time the feature will stabilize.  In the long run I think we will win
more by making Emacs Lisp more attractive (or at least less
unpleasant) to a generation that have already been spoiled by using
raw strings in other languages.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 15:27     ` Mattias Engdegård
  2021-09-08 15:41       ` Stefan Kangas
@ 2021-09-08 16:01       ` Alan Mackenzie
  2021-09-08 18:24         ` Mattias Engdegård
  2021-09-08 19:22         ` Philip Kaludercic
  1 sibling, 2 replies; 120+ messages in thread
From: Alan Mackenzie @ 2021-09-08 16:01 UTC (permalink / raw)
  To: Mattias Engdegård; +Cc: emacs-devel, Anna Glasgall, Stefan Monnier

Hello, Mattias.

On Wed, Sep 08, 2021 at 17:27:58 +0200, Mattias Engdegård wrote:
> 8 sep. 2021 kl. 16.31 skrev Anna Glasgall <anna@crossproduct.net>:

> > Cards on the table here: yes, regexes are 99.999% of the motivation
> > here

> Elisp actually has a much better regexp syntax than most other languages:

> > "\\(\\\\\\\\\\)\\(?:\\(\\\\\\\\\\)\\|\\((\\(?:\\?[0-9]*:\\)?\\|[|)]\\)\\)"

> Today that would be written

> (rx (group "\\\\")
>     (or (group "\\\\")
>         (group
>          (or (seq "("
>                   (? "?" (* digit) ":"))
>              (in ")|")))))

> which is much more readable and maintainable and less error-prone than
> what you would get with a new string syntax.

It is more readable in the same way Cobol was very readable; each small
grouping of text is immediately understandable.  But the thing as a
whole?  The rx form of that regexp takes up 6 lines, the string form 1
line.  If there are several regexps in a function rx can lead to a lot of
bloat.  Having the function fit entirely on one's screen contributes a
lot towards readability and maintainability.

It is true the rx form could be squashed onto 1 or 2 lines, but then that
readability is lost.

I have nothing against people who want to use rx, but personally, I
prefer the string form.  How much better a raw string form would be is
difficult to say.

-- 
Alan Mackenzie (Nuremberg, Germany).

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 14:31   ` Anna Glasgall
  2021-09-08 15:27     ` Mattias Engdegård
  2021-09-08 15:54     ` Stefan Kangas
@ 2021-09-08 16:05     ` tomas
  2021-09-08 16:42       ` Lars Ingebrigtsen
  2021-09-08 20:18       ` Stefan Monnier
  2 siblings, 2 replies; 120+ messages in thread
From: tomas @ 2021-09-08 16:05 UTC (permalink / raw)
  To: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 950 bytes --]

On Wed, Sep 08, 2021 at 10:31:26AM -0400, Anna Glasgall wrote:
> On Wed, 2021-09-08 at 09:10 -0400, Stefan Monnier wrote:

[...]

> > And AFAIC adding raw strings just to halve the number of backslashes
> > in
> > regexps seems both too much and too little: you'd likely prefer a new
> > regexp syntax which doesn't require backslashes for grouping
> > and alternation.
> > 
> 
> I would be _thrilled_ to have that, but that seemed like it'd be even
> _more_ work than this is already ballooning into [...]

I just think these are two separate dimensions which happen to align
in the "regexp and backslash" case. Fixing each of those has its own
ergonomic potential.

I for one would be very thrilled with raw strings.

> >         Stefan "not a big fan of raw strings in ELisp"

One of those rare cases I tend to disagree with Stefan. Which always
feels funny, because he's most of the time right ;-)

Cheers
 - t

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 16:05     ` tomas
@ 2021-09-08 16:42       ` Lars Ingebrigtsen
  2021-09-08 20:08         ` Stefan Monnier
  2021-09-08 20:18       ` Stefan Monnier
  1 sibling, 1 reply; 120+ messages in thread
From: Lars Ingebrigtsen @ 2021-09-08 16:42 UTC (permalink / raw)
  To: tomas; +Cc: emacs-devel

<tomas@tuxteam.de> writes:

> One of those rare cases I tend to disagree with Stefan. Which always
> feels funny, because he's most of the time right ;-)

Yes, it's very scary the first time it happens.

-- 
(domestic pets only, the antidote for overdose, milk.)
   bloggy blog: http://lars.ingebrigtsen.no



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 15:41       ` Stefan Kangas
@ 2021-09-08 16:45         ` Mattias Engdegård
  0 siblings, 0 replies; 120+ messages in thread
From: Mattias Engdegård @ 2021-09-08 16:45 UTC (permalink / raw)
  To: Stefan Kangas; +Cc: Emacs developers, Anna Glasgall, Stefan Monnier

8 sep. 2021 kl. 17.41 skrev Stefan Kangas <stefan@marxist.se>:

> But ELisp is also intended for end-users that want to hack together
> some quick command.  They may be completely uninterested in spending
> even a minimum of time to learn rx, and prefer instead to use the more
> standard form they already know so they can just get their job done.

New reader syntax isn't something to be added lightly. Shouldn't there be a better reason than a belief that some people may be unwilling to learn already existing facilities of the language, especially when those are superior to the proposed addition?

> I'm not even sure that rx is the unequivocal first choice among
> hardcore ELisp programmers.

Their loss then. (And good to know I'm not hardcore!)

It's unfortunate that we've spent the last 50 years teaching generations of programmers that regexps are strings and they use backslashes, to the point that many actually believe it. Failure of education, and lack of proper tools!

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 16:01       ` Alan Mackenzie
@ 2021-09-08 18:24         ` Mattias Engdegård
  2021-09-08 19:00           ` Alan Mackenzie
  2021-09-08 19:22         ` Philip Kaludercic
  1 sibling, 1 reply; 120+ messages in thread
From: Mattias Engdegård @ 2021-09-08 18:24 UTC (permalink / raw)
  To: Alan Mackenzie; +Cc: Anna Glasgall, Stefan Monnier, Emacs developers

8 sep. 2021 kl. 18.01 skrev Alan Mackenzie <acm@muc.de>:

> It is more readable in the same way Cobol was very readable

That comparison is absurdly wrong in many ways: Cobol is not considered readable at all and it was made back when nobody knew how to design languages. I could go on all day about a comparative design history of Cobol, Rx and conventional regexp syntax but will spare you the boredom.

Rx is actually not verbose, definitely not by Lisp standards. You can't use it in CC Mode for reasons of compatibility and that's fine -- engineers often work under constraints not of their own choosing.

I do suggest you give it an honest try in a project where you are permitted to do so. You will be better informed, better equipped to read other people's code, and may come to like it. Even if you don't, you may have something interesting to report from the attempt.

And I'll be there to answer questions!

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 18:24         ` Mattias Engdegård
@ 2021-09-08 19:00           ` Alan Mackenzie
  0 siblings, 0 replies; 120+ messages in thread
From: Alan Mackenzie @ 2021-09-08 19:00 UTC (permalink / raw)
  To: Mattias Engdegård; +Cc: Emacs developers, Anna Glasgall, Stefan Monnier

Hello, Mattias.

On Wed, Sep 08, 2021 at 20:24:34 +0200, Mattias Engdegård wrote:
> 8 sep. 2021 kl. 18.01 skrev Alan Mackenzie <acm@muc.de>:

> > It is more readable in the same way Cobol was very readable

> That comparison is absurdly wrong

Oh no it's not.

> Rx is actually not verbose .....

The example you gave showed a single line string regexp being equivalent
to a six line rx expression.  That's a factor of 6.  That's verbose by
any measure.

> I do suggest you give it an honest try in a project where you are
> permitted to do so.

rx has been in Emacs, I think, since 21.1.  That was around 20 years
ago.  It hasn't caught on, to any great extent.  If it were that good,
it would have caught on.

> You will be better informed, better equipped to read other people's
> code, and may come to like it. Even if you don't, you may have
> something interesting to report from the attempt.

There are lots of ways of broadening my horizons, even within the scope
of Emacs, but there are only so many hours in a day.

> And I'll be there to answer questions!

OK, here's one: why do you think rx is so little used, compared with the
string representation of regular expressions?

-- 
Alan Mackenzie (Nuremberg, Germany).

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 16:01       ` Alan Mackenzie
  2021-09-08 18:24         ` Mattias Engdegård
@ 2021-09-08 19:22         ` Philip Kaludercic
  2021-09-08 19:36           ` Alan Mackenzie
  2021-09-08 21:11           ` Stefan Kangas
  1 sibling, 2 replies; 120+ messages in thread
From: Philip Kaludercic @ 2021-09-08 19:22 UTC (permalink / raw)
  To: Alan Mackenzie
  Cc: Mattias Engdegård, Anna Glasgall, Stefan Monnier,
	emacs-devel

Alan Mackenzie <acm@muc.de> writes:

> It is more readable in the same way Cobol was very readable; each small
> grouping of text is immediately understandable.  But the thing as a
> whole?  The rx form of that regexp takes up 6 lines, the string form 1
> line.  If there are several regexps in a function rx can lead to a lot of
> bloat.  Having the function fit entirely on one's screen contributes a
> lot towards readability and maintainability.

The reason I use rx in a lot of my scripts is that I can add comments,
explanations, formatting, etc. when it gets complicated. I think that is
a significant advantage, that even raw strings wouldn't have (unless a
comment syntax were to be added into the regular expression language,
which is unlikely).

-- 
	Philip Kaludercic



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 19:22         ` Philip Kaludercic
@ 2021-09-08 19:36           ` Alan Mackenzie
  2021-09-08 21:11           ` Stefan Kangas
  1 sibling, 0 replies; 120+ messages in thread
From: Alan Mackenzie @ 2021-09-08 19:36 UTC (permalink / raw)
  To: Philip Kaludercic
  Cc: Mattias Engdegård, Anna Glasgall, Stefan Monnier,
	emacs-devel

Hello, Philip.

On Wed, Sep 08, 2021 at 19:22:18 +0000, Philip Kaludercic wrote:
> Alan Mackenzie <acm@muc.de> writes:

> > It is more readable in the same way Cobol was very readable; each small
> > grouping of text is immediately understandable.  But the thing as a
> > whole?  The rx form of that regexp takes up 6 lines, the string form 1
> > line.  If there are several regexps in a function rx can lead to a lot of
> > bloat.  Having the function fit entirely on one's screen contributes a
> > lot towards readability and maintainability.

> The reason I use rx in a lot of my scripts is that I can add comments,
> explanations, formatting, etc. when it gets complicated. I think that is
> a significant advantage, that even raw strings wouldn't have (unless a
> comment syntax were to be added into the regular expression language,
> which is unlikely).

Yes, I can see that.  rx certainly has advantages.  But it has
disadvantages too, which Mattias appears not to want to admit exist.  To
be honest, I suspect the differences in readability/maintainability
between the two forms will be small.  Maintaining Emacs, it is most
helpful to have at least a reading competence with both forms.

> -- 
> 	Philip Kaludercic

-- 
Alan Mackenzie (Nuremberg, Germany).



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 16:42       ` Lars Ingebrigtsen
@ 2021-09-08 20:08         ` Stefan Monnier
  0 siblings, 0 replies; 120+ messages in thread
From: Stefan Monnier @ 2021-09-08 20:08 UTC (permalink / raw)
  To: Lars Ingebrigtsen; +Cc: tomas, emacs-devel

Lars Ingebrigtsen [2021-09-08 18:42:41] wrote:
> <tomas@tuxteam.de> writes:
>> One of those rare cases I tend to disagree with Stefan. Which always
>> feels funny, because he's most of the time right ;-)
> Yes, it's very scary the first time it happens.

Indeed, but I got used to it,


        Stefan




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 16:05     ` tomas
  2021-09-08 16:42       ` Lars Ingebrigtsen
@ 2021-09-08 20:18       ` Stefan Monnier
  2021-09-09  7:04         ` tomas
  2021-09-09 10:30         ` Mattias Engdegård
  1 sibling, 2 replies; 120+ messages in thread
From: Stefan Monnier @ 2021-09-08 20:18 UTC (permalink / raw)
  To: tomas; +Cc: emacs-devel

> I just think these are two separate dimensions which happen to align
> in the "regexp and backslash" case.

BTW, they can align in somewhat funny ways sometimes.
E.g. the raw-string version of the regexp "[ \t\n]" turns into something like

    #r"[
]"

which is not ideal in terms of clarity.  Similarly a regexp that matches
the NUL character will be problematic when written as a raw string
because it will need to embed the NUL character in the source code,
which in turn will cause tools like `grep` to treat the file as binary.

For the first problem above we can/should extend our regexp syntax to
include \t and \n as regexps that match TAB and LF respectively (that
would also be handy when writing regexps in the minibuffer).
But \0 is already used for other things so there's no such "obvious"
workaround for the second case :-(

        Stefan

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08  1:49 "Raw" string literals for elisp Anna Glasgall
                   ` (4 preceding siblings ...)
  2021-09-08 13:10 ` Stefan Monnier
@ 2021-09-08 20:40 ` Anna Glasgall
  2021-09-08 21:28   ` Alan Mackenzie
  2021-10-02 21:03   ` Daniel Brooks
  5 siblings, 2 replies; 120+ messages in thread
From: Anna Glasgall @ 2021-09-08 20:40 UTC (permalink / raw)
  To: emacs-devel

On Tue, 2021-09-07 at 21:49 -0400, Anna Glasgall wrote:
> 
[snip]
> I've successfully taught the elisp reader (read1 in lread.c) how to
> read r-strings. I thought I had managed to make lisp-mode/elisp-mode
> happy by allowing "r" to be a prefix character (C-x C-e and the
> underlying forward-sexp/backward-sexp seemed to work fine at first),
> but realized that I ran into trouble with strings containing the
> sequence of characters '\\"'.
> 
> The reader correctly reads r"a\\"" as a string containing the
> sequence
> of characters 'a', '\', '"', and M-: works. Unfortunately, if I try
> sexp-based navigation or e.g. C-x C-e, it falls apart. The parser in
> syntax.c, which afaict is what lisp-mode is using to try and find
> sexps
> in buffer text, doesn't seem to know what to do with this expression.
> I've spent some time staring at syntax.c, but I must confess that I'm
> entirely defeated in terms of what changes need to be made here to
> teach this other parser about prefixed strings in where the prefix
> has
> meaning that affects the interpretation of the characters between
> string fences.
> 

I've taken the suggestion Lars Ingebrigtsen made further up-thread and
implemented another draft of this using #r"" instead of r""; not only
does this not break existing elisp syntax (as noted by Po Lu earlier),
forward-sexp and backward-sexp understand the new syntax without
having to be taught about it, to my considerable surprise. Or at least
they understand it to the same extent that they understand #s() or
#^[], anyway.

Alan (Dr. Mackenzie? Forgive me, not sure what standards are here),
your point about strings ending in \ is very well taken and I'm frankly
not sure what the easiest path forward here is. Having "raw literals
cannot end in a \" is a weird and unpleasant restriction, although the
fact that it is one that Python places on r-strings (to my considerable
surprise; I've been using Python since the mid-00s and have never run
across this particular syntax oddity before) may mean that it is
perhaps not so bad. The C++ concept of allowing r-strings to specify
their own delimiters is perhaps maximally flexible, but is definitely
going to be a heavier lift to implement than any of the above. I'd love
to hear people's opinions on the merits of the various possible
approaches here.

You've all given me a great deal of food for thought, which I will
attempt to digest before spinning a new revision of this patch.

thanks,

Anna

> I've attached a copy of my WIP patch; it's definitely not near final
> code quality and doesn't have documentation yet, all of which I would
> take care of before submitting for inclusion. I also haven't filled
> out
> the copyright assignment paperwork yet, but should this work reach a
> point where it was likely to be accepted, I'd be happy to do that.
> 
> 
> I'd very much appreciate some pointers on what to try next here, or
> some explanation of how syntax.c/syntax.el works beyond what's in the
> reference manual. If this is a fool's errand I'm tilting at here, I'd
> also appreciate being told that before I sink more time into it :)
> 
> thanks,
> 
> Anna Glasgall
> 
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 19:22         ` Philip Kaludercic
  2021-09-08 19:36           ` Alan Mackenzie
@ 2021-09-08 21:11           ` Stefan Kangas
  2021-09-08 21:24             ` Philip Kaludercic
  2021-09-09  6:52             ` tomas
  1 sibling, 2 replies; 120+ messages in thread
From: Stefan Kangas @ 2021-09-08 21:11 UTC (permalink / raw)
  To: Philip Kaludercic
  Cc: Alan Mackenzie, Emacs developers, Anna Glasgall, Stefan Monnier,
	Mattias Engdegård

Philip Kaludercic <philipk@posteo.net> writes:

> The reason I use rx in a lot of my scripts is that I can add comments,
> explanations, formatting, etc. when it gets complicated. I think that is
> a significant advantage, that even raw strings wouldn't have (unless a
> comment syntax were to be added into the regular expression language,
> which is unlikely).

Perl has this:

    perl -e '$foo = "bar"; print "yes" if $foo =~ / bar # comment /x;'

is equivalent to

    perl -e '$foo = "bar"; print "yes" if $foo =~ /bar/;'



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 21:11           ` Stefan Kangas
@ 2021-09-08 21:24             ` Philip Kaludercic
  2021-09-09  6:52             ` tomas
  1 sibling, 0 replies; 120+ messages in thread
From: Philip Kaludercic @ 2021-09-08 21:24 UTC (permalink / raw)
  To: Stefan Kangas
  Cc: Alan Mackenzie, Emacs developers, Anna Glasgall, Stefan Monnier,
	Mattias Engdegård

Stefan Kangas <stefan@marxist.se> writes:

> Philip Kaludercic <philipk@posteo.net> writes:
>
>> The reason I use rx in a lot of my scripts is that I can add comments,
>> explanations, formatting, etc. when it gets complicated. I think that is
>> a significant advantage, that even raw strings wouldn't have (unless a
>> comment syntax were to be added into the regular expression language,
>> which is unlikely).
>
> Perl has this:
>
>     perl -e '$foo = "bar"; print "yes" if $foo =~ / bar # comment /x;'
>
> is equivalent to
>
>     perl -e '$foo = "bar"; print "yes" if $foo =~ /bar/;'

Yes, but taking backwards compatibility into account, you cannot just
define some character to be used for comments.

-- 
	Philip Kaludercic



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 20:40 ` Anna Glasgall
@ 2021-09-08 21:28   ` Alan Mackenzie
  2021-10-02 21:03   ` Daniel Brooks
  1 sibling, 0 replies; 120+ messages in thread
From: Alan Mackenzie @ 2021-09-08 21:28 UTC (permalink / raw)
  To: Anna Glasgall; +Cc: emacs-devel

Hello, Anna.

On Wed, Sep 08, 2021 at 16:40:09 -0400, Anna Glasgall wrote:
> On Tue, 2021-09-07 at 21:49 -0400, Anna Glasgall wrote:

[ .... ]

> Alan (Dr. Mackenzie? Forgive me, not sure what standards are here),

Just "Alan" is fine.  No, I don't have a PhD.  :-)  Young or old, novice
or experienced, we just use first names around here.  What is unwanted is
unkind or hostile language, and curse words are not accepted at all, and
never appear.  But you don't seem like you want to write that way anyhow.
;-)

> your point about strings ending in \ is very well taken and I'm frankly
> not sure what the easiest path forward here is. Having "raw literals
> cannot end in a \" is a weird and unpleasant restriction, although the
> fact that it is one that Python places on r-strings (to my considerable
> surprise; I've been using Python since the mid-00s and have never run
> across this particular syntax oddity before) may mean that it is
> perhaps not so bad.

I think it would be bad in Emacs.  Sooner or later somebody will be
writing a regexp to match another regexp, and not being able to end in \
could be quite awkward.

Maybe giving consideration to using the C# convention of representing a "
in a raw string by "" would be advantageous.  Even this isn't entirely
simple, since a raw string with two "s in it would look something like
#r"foo""""".  From after the #r", to find the end of the raw string,
you'd have to search for the first occurrence of an odd number of
consecutive "s, which isn't entirely trivial.  It might be a rare
occurrence, but you've still got to handle it.

Or, maybe something like the python convention: #r"......""", though this
looks and feels somewhat weird.

> The C++ concept of allowing r-strings to specify their own delimiters
> is perhaps maximally flexible, but is definitely going to be a heavier
> lift to implement than any of the above. I'd love to hear people's
> opinions on the merits of the various possible approaches here.

When implementing the C++ raw strings, that flexibility caused me a lot
of grief.  For example, changing text in the middle of a C++ raw string,
I had to check the new text didn't, by chance, form a closing delimiter
matching the opening one.  I would recommend not implementing anything
like the C++ raw string identifiers.

> You've all given me a great deal of food for thought, which I will
> attempt to digest before spinning a new revision of this patch.

> thanks,

> Anna

[ .... ]

-- 
Alan Mackenzie (Nuremberg, Germany).

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 11:34 ` Adam Porter
  2021-09-08 13:59   ` Clément Pit-Claudel
@ 2021-09-09  3:09   ` Richard Stallman
  1 sibling, 0 replies; 120+ messages in thread
From: Richard Stallman @ 2021-09-09  3:09 UTC (permalink / raw)
  To: Adam Porter; +Cc: emacs-devel

[[[ To any NSA and FBI agents reading my email: please consider    ]]]
[[[ whether defending the US Constitution against all enemies,     ]]]
[[[ foreign or domestic, requires you to follow Snowden's example. ]]]

    > #r"foo bar baz"  ;; => "foo bar baz"
    > #r|foo "bar" baz|  ;; => "foo \"bar\" baz"

To implement this, it is not enough to change the Lisp reader.
The code for parsing sexps and moving over them needs to handle
the new construct too.  With luck, maybe they will need no change,
but people have to verify that.

-- 
Dr Richard Stallman (https://stallman.org)
Chief GNUisance of the GNU Project (https://gnu.org)
Founder, Free Software Foundation (https://fsf.org)
Internet Hall-of-Famer (https://internethalloffame.org)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 21:11           ` Stefan Kangas
  2021-09-08 21:24             ` Philip Kaludercic
@ 2021-09-09  6:52             ` tomas
  1 sibling, 0 replies; 120+ messages in thread
From: tomas @ 2021-09-09  6:52 UTC (permalink / raw)
  To: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 822 bytes --]

On Wed, Sep 08, 2021 at 11:11:00PM +0200, Stefan Kangas wrote:
> Philip Kaludercic <philipk@posteo.net> writes:
> 
> > The reason I use rx in a lot of my scripts is that I can add comments,
> > explanations, formatting, etc. when it gets complicated. I think that is
> > a significant advantage, that even raw strings wouldn't have (unless a
> > comment syntax were to be added into the regular expression language,
> > which is unlikely).
> 
> Perl has this:
> 
>     perl -e '$foo = "bar"; print "yes" if $foo =~ / bar # comment /x;'

Yes, Perl's "extended" regexps. They are very handy for when the concise
language is too confusing.

I'd tend to the position that this "ecological niche" is already (very
well) covered by `rx'. But I am aware that this is a very subjective
topic :)

Cheers
 - t

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 20:18       ` Stefan Monnier
@ 2021-09-09  7:04         ` tomas
  2021-09-09 10:30         ` Mattias Engdegård
  1 sibling, 0 replies; 120+ messages in thread
From: tomas @ 2021-09-09  7:04 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 1560 bytes --]

On Wed, Sep 08, 2021 at 04:18:23PM -0400, Stefan Monnier wrote:
> > I just think these are two separate dimensions which happen to align
> > in the "regexp and backslash" case.
> 
> BTW, they can align in somewhat funny ways sometimes.
> E.g. the raw-string version of the regexp "[ \t\n]" turns into something like
> 
>     #r"[
> ]"

Actually I was using "align" in a rather metaphorical sense, but you
are making a very good point: one might want to have some of the
"classical C escapes" (\n, \r and some of its ilk, perhaps even \b),
but then `raw' wouldn't be raw anymore.

> which is not ideal in terms of clarity.  Similarly a regexp that matches
> the NUL character will be problematic when written as a raw string
> because it will need to embed the NUL character in the source code,
> which in turn will cause tools like `grep` to treat the file as binary.
> 
> For the first problem above we can/should extend our regexp syntax to
> include \t and \n as regexps that match TAB and LF respectively (that
> would also be handy when writing regexps in the minibuffer).

For regexps proper there's an escape hatch, since there is a language "on
top" that could be extended a bit (e.g. via the [:...:] character class
notation or something). But that would be unwieldy indeed.

> But \0 is already used for other things so there's no such "obvious"
> workaround for the second case :-(

Yes, the very handy `\x' notation has much history. Hard to move within
that cupboard without breaking anything :-)

Cheers
 - t

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 20:18       ` Stefan Monnier
  2021-09-09  7:04         ` tomas
@ 2021-09-09 10:30         ` Mattias Engdegård
  2021-09-09 11:36           ` Stefan Kangas
  1 sibling, 1 reply; 120+ messages in thread
From: Mattias Engdegård @ 2021-09-09 10:30 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Emacs developers

8 sep. 2021 kl. 22.18 skrev Stefan Monnier <monnier@iro.umontreal.ca>:

> For the first problem above we can/should extend our regexp syntax to
> include \t and \n as regexps that match TAB and LF respectively (that
> would also be handy when writing regexps in the minibuffer).

Wouldn't work in character alternatives without breaking compatibility.

I've used both Perl's /x and Python's re.X in raw triple-quoted strings extensively, and they are nowhere as nice as rx.

Lisp-syntax languages really have an advantage here -- I wish I could write an equally nice DSL in other languages. Some come close.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-09 10:30         ` Mattias Engdegård
@ 2021-09-09 11:36           ` Stefan Kangas
  2021-09-09 13:33             ` Mattias Engdegård
  0 siblings, 1 reply; 120+ messages in thread
From: Stefan Kangas @ 2021-09-09 11:36 UTC (permalink / raw)
  To: Mattias Engdegård; +Cc: Stefan Monnier, Emacs developers

Mattias Engdegård <mattiase@acm.org> writes:

> I've used both Perl's /x and Python's re.X in raw triple-quoted strings extensively, and they are nowhere as nice as rx.

Agreed.  I mostly only pointed out that it exists for completion.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-09 11:36           ` Stefan Kangas
@ 2021-09-09 13:33             ` Mattias Engdegård
  2021-09-09 14:32               ` tomas
  2021-09-14 10:43               ` Augusto Stoffel
  0 siblings, 2 replies; 120+ messages in thread
From: Mattias Engdegård @ 2021-09-09 13:33 UTC (permalink / raw)
  To: Stefan Kangas; +Cc: Anna Glasgall, Stefan Monnier, Emacs developers

9 sep. 2021 kl. 13.36 skrev Stefan Kangas <stefan@marxist.se>:

>> I've used both Perl's /x and Python's re.X in raw triple-quoted strings extensively, and they are nowhere as nice as rx.
> 
> Agreed.  I mostly only pointed out that it exists for completion.

Yes, sorry -- it wasn't meant as a rebuttal to what you wrote.

It's also likely that regexps in the suggested "raw" string literals would be inferior to what Python and Perl offer for reasons Stefan Monnier already mentioned. The basic string regexp syntax would need a complete change to even come close -- and then we have two slightly different string regexp notations.

Anna, I'm ashamed for welcoming a new friendly contributor as Mr. No. In a weak attempt to compensate, here are some related areas in Emacs that might benefit from your energy and skill:

* [Easy] There is currently no built-in way to print the string " \t\r\n" exactly that way -- the best we can do (setting print-escape-newlines and print-escape-control-characters) is " \11\15\n" which isn't as readable. Introduce an option to the Lisp string printer so that common control characters print as their special backslash sequences: \t, \r, \b, \e, maybe \a and \e, maybe not \v.

* Improve interactive regexp use. Maybe a user could configure a preferred regexp syntax (standard Emacs, PCRE, rx, etc) to be used when doing interactive search and replace. It would also affect how regexps are displayed and edited in customisation dialogues; right now, they are often difficult to read. Ideally a user should be able to switch syntax during entry at any time and have the regexp translated. Can syntax colouring (font-lock) improve regexp entry and display?

I'm sure other people have more suggestions (and I have some more demanding items should you wish for something chewy.)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-09 13:33             ` Mattias Engdegård
@ 2021-09-09 14:32               ` tomas
  2021-09-14 10:43               ` Augusto Stoffel
  1 sibling, 0 replies; 120+ messages in thread
From: tomas @ 2021-09-09 14:32 UTC (permalink / raw)
  To: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 1333 bytes --]

On Thu, Sep 09, 2021 at 03:33:44PM +0200, Mattias Engdegård wrote:

[...]

> Anna, I'm ashamed for welcoming a new friendly contributor as Mr. No. In a weak attempt to compensate, here are some related areas in Emacs that might benefit from your energy and skill:

I wouldn't give up that quickly. Actually, I think this discussion
itself is invaluable, since it brings forward the issues involved
in the design of such a feature.

 - For one, as Stefan points out, you don't really want totally
   "raw" strings. Some escaping mechanism is desirable.
 - Then, it becomes clear that they will ease the biggest pain
   in regexps, but they won't "fix" everything. Some work in
   the regexp part seems desirable, too.
 - People have been looking at what other languages do. Some
   of them (e.g. Perl) let you choose the delimiter (the operators
   `q' and `qq').

There are a couple of design decisions: just "hard" raw strings, and
live with the limitation that tabs, newlines etc. are awkward? Use
an alternative escape char (e.g. tilde)? Other "crazy" ideas might
be examined ("here" documents? Perl has them, too).

The discussion comes up every now and then; even if no code ends up
being produced (I'd hope it does!) this might be a valuable resource
for the next courageous person giving it a try.

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-09 13:33             ` Mattias Engdegård
  2021-09-09 14:32               ` tomas
@ 2021-09-14 10:43               ` Augusto Stoffel
  2021-09-14 11:42                 ` Ihor Radchenko
  1 sibling, 1 reply; 120+ messages in thread
From: Augusto Stoffel @ 2021-09-14 10:43 UTC (permalink / raw)
  To: Mattias Engdegård
  Cc: Emacs developers, Stefan Kangas, Anna Glasgall, Stefan Monnier

On Thu,  9 Sep 2021 at 15:33, Mattias Engdegård <mattiase@acm.org> wrote:

> * Improve interactive regexp use. Maybe a user could configure a
> preferred regexp syntax (standard Emacs, PCRE, rx, etc) to be used
> when doing interactive search and replace.

IMO, interactive use is the one case where it's actually nice to have
parens etc interpreted literally by default.

> It would also affect how regexps are displayed and edited in
> customisation dialogues; right now, they are often difficult to
> read. Ideally a user should be able to switch syntax during entry at
> any time and have the regexp translated. Can syntax colouring
> (font-lock) improve regexp entry and display?

Maybe there could be a "glasses mode" for regexps, similar to the
existing one for camel case?  One could display "(x)" and "\\(x\\)" in a
way that is indistinguishable except for the colors of the parentheses.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-14 10:43               ` Augusto Stoffel
@ 2021-09-14 11:42                 ` Ihor Radchenko
  2021-09-14 13:18                   ` Stefan Monnier
  2021-09-14 17:49                   ` Jose E. Marchesi
  0 siblings, 2 replies; 120+ messages in thread
From: Ihor Radchenko @ 2021-09-14 11:42 UTC (permalink / raw)
  To: Augusto Stoffel
  Cc: Mattias Engdegård, Stefan Kangas, Anna Glasgall,
	Stefan Monnier, Emacs developers

Augusto Stoffel <arstoffel@gmail.com> writes:

> Maybe there could be a "glasses mode" for regexps, similar to the
> existing one for camel case?  One could display "(x)" and "\\(x\\)" in a
> way that is indistinguishable except for the colors of the parentheses.

And indeed there is such (on MELPA though):
https://github.com/cpitclaudel/easy-escape



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-14 11:42                 ` Ihor Radchenko
@ 2021-09-14 13:18                   ` Stefan Monnier
  2021-09-14 13:22                     ` Stefan Kangas
  2021-09-14 17:49                   ` Jose E. Marchesi
  1 sibling, 1 reply; 120+ messages in thread
From: Stefan Monnier @ 2021-09-14 13:18 UTC (permalink / raw)
  To: Ihor Radchenko
  Cc: Augusto Stoffel, Mattias Engdegård, Emacs developers,
	Stefan Kangas, Anna Glasgall

>> Maybe there could be a "glasses mode" for regexps, similar to the
>> existing one for camel case?  One could display "(x)" and "\\(x\\)" in a
>> way that is indistinguishable except for the colors of the parentheses.
> And indeed there is such (on MELPA though):
> https://github.com/cpitclaudel/easy-escape

Clément signed the paperwork, so maybe we could easily add it to
GNU ELPA?


        Stefan




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-14 13:18                   ` Stefan Monnier
@ 2021-09-14 13:22                     ` Stefan Kangas
  2021-09-14 14:01                       ` Ihor Radchenko
  2021-09-14 14:39                       ` Clément Pit-Claudel
  0 siblings, 2 replies; 120+ messages in thread
From: Stefan Kangas @ 2021-09-14 13:22 UTC (permalink / raw)
  To: Stefan Monnier
  Cc: Mattias Engdegård, Ihor Radchenko, Anna Glasgall,
	Clément Pit--Claudel, Emacs developers, Augusto Stoffel

Hi Clément,

Stefan Monnier <monnier@iro.umontreal.ca> writes:

> >> Maybe there could be a "glasses mode" for regexps, similar to the
> >> existing one for camel case?  One could display "(x)" and "\\(x\\)" in a
> >> way that is indistinguishable except for the colors of the parentheses.
> > And indeed there is such (on MELPA though):
> > https://github.com/cpitclaudel/easy-escape
>
> Clément signed the paperwork, so maybe we could easily add it to
> GNU ELPA?

What do you think about this?



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-14 13:22                     ` Stefan Kangas
@ 2021-09-14 14:01                       ` Ihor Radchenko
  2021-09-14 14:39                       ` Clément Pit-Claudel
  1 sibling, 0 replies; 120+ messages in thread
From: Ihor Radchenko @ 2021-09-14 14:01 UTC (permalink / raw)
  To: Stefan Kangas
  Cc: Mattias Engdegård, Anna Glasgall, Emacs developers,
	Clément Pit--Claudel, Stefan Monnier, Augusto Stoffel

Stefan Kangas <stefan@marxist.se> writes:

>> >> Maybe there could be a "glasses mode" for regexps, similar to the
>> >> existing one for camel case?  One could display "(x)" and "\\(x\\)" in a
>> >> way that is indistinguishable except for the colors of the parentheses.
>> > And indeed there is such (on MELPA though):
>> > https://github.com/cpitclaudel/easy-escape
>>
>> Clément signed the paperwork, so maybe we could easily add it to
>> GNU ELPA?

Sounds like a good idea. I asked the author on Github [1].

[1] https://github.com/cpitclaudel/easy-escape/issues/5



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-14 13:22                     ` Stefan Kangas
  2021-09-14 14:01                       ` Ihor Radchenko
@ 2021-09-14 14:39                       ` Clément Pit-Claudel
  2021-09-14 15:33                         ` Amin Bandali
  2021-09-14 16:05                         ` Eli Zaretskii
  1 sibling, 2 replies; 120+ messages in thread
From: Clément Pit-Claudel @ 2021-09-14 14:39 UTC (permalink / raw)
  To: Stefan Kangas, Stefan Monnier
  Cc: Mattias Engdegård, Augusto Stoffel, Ihor Radchenko,
	Anna Glasgall, Emacs developers

On 9/14/21 9:22 AM, Stefan Kangas wrote:
> Hi Clément,
> 
> Stefan Monnier <monnier@iro.umontreal.ca> writes:
> 
>>>> Maybe there could be a "glasses mode" for regexps, similar to the
>>>> existing one for camel case?  One could display "(x)" and "\\(x\\)" in a
>>>> way that is indistinguishable except for the colors of the parentheses.
>>> And indeed there is such (on MELPA though):
>>> https://github.com/cpitclaudel/easy-escape
>>
>> Clément signed the paperwork, so maybe we could easily add it to
>> GNU ELPA?
> 
> What do you think about this?

Sounds good to me; please feel free to add it to ELPA (I would prefer core, but the last thread on this topic flopped: https://lists.gnu.org/archive/html/emacs-devel/2017-03/msg00266.html).



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-14 14:39                       ` Clément Pit-Claudel
@ 2021-09-14 15:33                         ` Amin Bandali
  2021-09-14 16:05                         ` Eli Zaretskii
  1 sibling, 0 replies; 120+ messages in thread
From: Amin Bandali @ 2021-09-14 15:33 UTC (permalink / raw)
  To: Clément Pit-Claudel
  Cc: Mattias Engdegård, Stefan Kangas, Anna Glasgall,
	Ihor Radchenko, Stefan Monnier, Emacs developers, Augusto Stoffel

Clément Pit-Claudel writes:

> On 9/14/21 9:22 AM, Stefan Kangas wrote:
>> Hi Clément,
>> 
>> Stefan Monnier <monnier@iro.umontreal.ca> writes:
>> 
>>>>> Maybe there could be a "glasses mode" for regexps, similar to the
>>>>> existing one for camel case?  One could display "(x)" and "\\(x\\)" in a
>>>>> way that is indistinguishable except for the colors of the parentheses.
>>>> And indeed there is such (on MELPA though):
>>>> https://github.com/cpitclaudel/easy-escape
>>>
>>> Clément signed the paperwork, so maybe we could easily add it to
>>> GNU ELPA?
>> 
>> What do you think about this?
>
> Sounds good to me; please feel free to add it to ELPA (I would prefer
> core, but the last thread on this topic flopped:
> https://lists.gnu.org/archive/html/emacs-devel/2017-03/msg00266.html).

Thanks, Clément.  I'd also very much like to see this in core.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-14 14:39                       ` Clément Pit-Claudel
  2021-09-14 15:33                         ` Amin Bandali
@ 2021-09-14 16:05                         ` Eli Zaretskii
  1 sibling, 0 replies; 120+ messages in thread
From: Eli Zaretskii @ 2021-09-14 16:05 UTC (permalink / raw)
  To: Clément Pit-Claudel
  Cc: mattiase, stefan, anna, yantar92, monnier, emacs-devel, arstoffel

> From: Clément Pit-Claudel <cpitclaudel@gmail.com>
> Date: Tue, 14 Sep 2021 10:39:52 -0400
> Cc: Mattias Engdegård <mattiase@acm.org>,
>  Augusto Stoffel <arstoffel@gmail.com>, Ihor Radchenko <yantar92@gmail.com>,
>  Anna Glasgall <anna@crossproduct.net>, Emacs developers <emacs-devel@gnu.org>
> Sounds good to me; please feel free to add it to ELPA (I would prefer core, but the last thread on this topic flopped: https://lists.gnu.org/archive/html/emacs-devel/2017-03/msg00266.html).

FWIW, it makes very little sense to me to have only on ELPA code that
adds a new Emacs Lisp data type.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-14 11:42                 ` Ihor Radchenko
  2021-09-14 13:18                   ` Stefan Monnier
@ 2021-09-14 17:49                   ` Jose E. Marchesi
  1 sibling, 0 replies; 120+ messages in thread
From: Jose E. Marchesi @ 2021-09-14 17:49 UTC (permalink / raw)
  To: Ihor Radchenko
  Cc: Mattias Engdegård, Stefan Kangas, Anna Glasgall,
	Emacs developers, Stefan Monnier, Augusto Stoffel


> Augusto Stoffel <arstoffel@gmail.com> writes:
>
>> Maybe there could be a "glasses mode" for regexps, similar to the
>> existing one for camel case?  One could display "(x)" and "\\(x\\)" in a
>> way that is indistinguishable except for the colors of the parentheses.
>
> And indeed there is such (on MELPA though):
> https://github.com/cpitclaudel/easy-escape

This is awesome.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-09-08 20:40 ` Anna Glasgall
  2021-09-08 21:28   ` Alan Mackenzie
@ 2021-10-02 21:03   ` Daniel Brooks
  2021-10-04  0:13     ` Richard Stallman
  1 sibling, 1 reply; 120+ messages in thread
From: Daniel Brooks @ 2021-10-02 21:03 UTC (permalink / raw)
  To: Anna Glasgall; +Cc: emacs-devel

Anna Glasgall <anna@crossproduct.net> writes:

> Alan (Dr. Mackenzie? Forgive me, not sure what standards are here),
> your point about strings ending in \ is very well taken and I'm frankly
> not sure what the easiest path forward here is. Having "raw literals
> cannot end in a \" is a weird and unpleasant restriction, although the
> fact that it is one that Python places on r-strings (to my considerable
> surprise; I've been using Python since the mid-00s and have never run
> across this particular syntax oddity before) may mean that it is
> perhaps not so bad. The C++ concept of allowing r-strings to specify
> their own delimiters is perhaps maximally flexible, but is definitely
> going to be a heavier lift to implement than any of the above. I'd love
> to hear people's opinions on the merits of the various possible
> approaches here.

I’ve written a little about raw strings on this mailing list. You might
read 87zgzqz6mu.fsf@db48x.net, but I can summarize or restate the parts
dealing with delimiters.

I happen to love Raku’s choice: you can use any matched pair of
nonalphanumeric unicode characters. U+2603 SNOWMAN is a perfectly
cromulent choice of delimiter as far as Raku is concerned; an example
would be q☃foo☃. Since you can always choose a character that will not
appear in your string, this essentially eliminates all need for escaping
of the delimiter. Raku also lets you use characters that come in left–
and right–handed versions, as long as you order them correctly. For
example q«foo» is allowed, while q»foo« is not. There are unicode
properties that allow this to work without enumerating all of the
possibilities, making it future–proof. (There are only a couple of dozen
pairs, so enumerating them is not hard either.)

Then of course there are languages where the delimiters can be chosen by
the programmer but from a much more constrained set of
possibilities. C++ and Rust seem like good ones that we could mimic.

All of these delimiter styles are quite easy to implement in the reader,
but as Alan points out they can cause some complexity in the
corresponding language modes:

Alan Mackenzie <acm@muc.de> writes:

> When implementing the C++ raw strings, that flexibility caused me a lot
> of grief.  For example, changing text in the middle of a C++ raw string,
> I had to check the new text didn't, by chance, form a closing delimiter
> matching the opening one.  I would recommend not implementing anything
> like the C++ raw string identifiers.

As such, if we go this route I would recommend Rust–style over C++ style
raw strings. The Rust style is a lot like the C++ style, except that the
extra delimiter must be a sequence of # characters, matching on both
sides, rather than arbitrary source characters. Modes that want to check
for this will have an easier time with Rust–style than C++–style raw
strings.

But ultimately I prefer the exuberance and whimsy of Raku’s approach
over the more staid and pedestrian approaches taken by C++ and Rust.

db48x

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-10-02 21:03   ` Daniel Brooks
@ 2021-10-04  0:13     ` Richard Stallman
  2021-10-04  0:36       ` Daniel Brooks
  0 siblings, 1 reply; 120+ messages in thread
From: Richard Stallman @ 2021-10-04  0:13 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: anna, emacs-devel

[[[ To any NSA and FBI agents reading my email: please consider    ]]]
[[[ whether defending the US Constitution against all enemies,     ]]]
[[[ foreign or domestic, requires you to follow Snowden's example. ]]]

"Use any delimiters you like" has a major problem: most of them can't
be displayed on Linux ttys.

Indeed, using any non-ASCII characters in source files can have some
problems, which are very easy to avoid.  We should not give syntactic
roles to them.

-- 
Dr Richard Stallman (https://stallman.org)
Chief GNUisance of the GNU Project (https://gnu.org)
Founder, Free Software Foundation (https://fsf.org)
Internet Hall-of-Famer (https://internethalloffame.org)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-10-04  0:13     ` Richard Stallman
@ 2021-10-04  0:36       ` Daniel Brooks
  2021-10-04 12:00         ` Eli Zaretskii
  2021-10-04 22:29         ` "Raw" " Richard Stallman
  0 siblings, 2 replies; 120+ messages in thread
From: Daniel Brooks @ 2021-10-04  0:36 UTC (permalink / raw)
  To: Richard Stallman; +Cc: anna, emacs-devel

Richard Stallman <rms@gnu.org> writes:

> [[[ To any NSA and FBI agents reading my email: please consider    ]]]
> [[[ whether defending the US Constitution against all enemies,     ]]]
> [[[ foreign or domestic, requires you to follow Snowden's example. ]]]
>
> "Use any delimiters you like" has a major problem: most of them can't
> be displayed on Linux ttys.
>
> Indeed, using any non-ASCII characters in source files can have some
> problems, which are very easy to avoid.  We should not give syntactic
> roles to them.

We should be improving the terminal then, rather than constraining
everything to the lowest common denominator. Unicode exists for an
important accessibility reason, even if some of it is frivolous
(emoji). Limiting Emacs source code to English and ASCII will ultimately
only limit the acceptability of Emacs rather than improve it.

For example, if someone contributes a mode it will normally be accepted
as–is. But if they write that mode using Japanese characters, would we
turn them away? I think that we should not.

db48x

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-10-04  0:36       ` Daniel Brooks
@ 2021-10-04 12:00         ` Eli Zaretskii
  2021-10-04 15:36           ` character sets as they relate to “Raw” " Daniel Brooks
  2021-10-04 22:29         ` "Raw" " Richard Stallman
  1 sibling, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-04 12:00 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: emacs-devel, rms, anna

> From: Daniel Brooks <db48x@db48x.net>
> Date: Sun, 03 Oct 2021 17:36:45 -0700
> Cc: anna@crossproduct.net, emacs-devel@gnu.org
> 
> Richard Stallman <rms@gnu.org> writes:
> 
> > Indeed, using any non-ASCII characters in source files can have some
> > problems, which are very easy to avoid.  We should not give syntactic
> > roles to them.
> 
> We should be improving the terminal then, rather than constraining
> everything to the lowest common denominator. Unicode exists for an
> important accessibility reason, even if some of it is frivolous
> (emoji). Limiting Emacs source code to English and ASCII will ultimately
> only limit the acceptability of Emacs rather than improve it.

We can only do this much.  We don't develop any terminal emulators
here, except the two built into Emacs.  Given that even the Linux
console turns out to have staggering gaps in its support for Unicode,
I see no reason for us to pretend Unicode is supported well enough on
the terminals to ignore this issue.

> For example, if someone contributes a mode it will normally be accepted
> as–is. But if they write that mode using Japanese characters, would we
> turn them away? I think that we should not.

Why is Japanese different from any other script in this context?  I
think unnecessary use of non-ASCII characters, any non-ASCII
characters, should be avoided, for the reasons mentioned above.  See
bug#50865 for a recent example that left me astonished.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* character sets as they relate to “Raw” string literals for elisp
  2021-10-04 12:00         ` Eli Zaretskii
@ 2021-10-04 15:36           ` Daniel Brooks
  2021-10-04 16:34             ` Stefan Monnier
  2021-10-04 18:57             ` Eli Zaretskii
  0 siblings, 2 replies; 120+ messages in thread
From: Daniel Brooks @ 2021-10-04 15:36 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: anna, rms, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

> We can only do this much.  We don't develop any terminal emulators
> here, except the two built into Emacs.

I was referring broadly to the whole GNU project, not trying to assign
the work specifically to the Emacs project. :)

I was even pondering what it would take to do the work myself, now that
Rust is allowed in kernel modules…

> Given that even the Linux console turns out to have staggering gaps in
> its support for Unicode, I see no reason for us to pretend Unicode is
> supported well enough on the terminals to ignore this issue.

The Linux console is not representative of most terminal emulators. It
is neglected and rarely used, since it is intended only as a fall–back
in case X Windows (or sshd) fails to start. Ideally we should fix it
(again speaking broadly), but we (emacs) shouldn’t limit ourselves to
only what it can support.

>> For example, if someone contributes a mode it will normally be accepted
>> as–is. But if they write that mode using Japanese characters, would we
>> turn them away? I think that we should not.
>
> Why is Japanese different from any other script in this context?

It isn’t; I simply picked one at random.

> I think unnecessary use of non-ASCII characters, any non-ASCII
> characters, should be avoided, for the reasons mentioned above.  See
> bug#50865 for a recent example that left me astonished.

I think that your suggestion to set the terminal-coding-system to
latin-1 or us-ascii on the Linux console is the right one. Perhaps that
ought to be the default behavior when Emacs detects that it is running
in the Linux console, even if the LANG variable indicates that we should
be using utf-8. Or perhaps Emacs should instead issue a warning in that
case, since for all we know the Linux console could be fixed next week.

But in any case, back to my question:

Suppose our hypothetical contributor wanted to contribute a new mode
with this type of code in it:

    (defun 日本 () (message "日本"))

That is, all of the identifiers in the source code for this mode are
named in some horrible foreign script that you cannot read. Is it so
much more unreadable if it sometimes has to be displayed like this?

    (defun \u65E5\u672C () (message "\u65E5\u672C"))

More to the point, do we turn away this contributor or ask them to
rewrite their code? My preference is that we simply accept the
contribution as–is.

If we could see our way to accepting such code, then I don’t see why we
couldn’t accept code that uses Unicode in much smaller ways, such as
this:

    (defvar variable-containing-html #r｢<a href="foo.html">click here</a>｣)

db48x

PS: it occurs to me to wonder if my use of Unicode in the prose of this
message, outside of the examples, detracted from its readability in any
way?

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-04 15:36           ` character sets as they relate to “Raw” " Daniel Brooks
@ 2021-10-04 16:34             ` Stefan Monnier
  2021-10-04 20:49               ` Daniel Brooks
  2021-10-04 18:57             ` Eli Zaretskii
  1 sibling, 1 reply; 120+ messages in thread
From: Stefan Monnier @ 2021-10-04 16:34 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: Eli Zaretskii, anna, rms, emacs-devel

> Suppose our hypothetical contributor wanted to contribute a new mode
> with this type of code in it:
>
>     (defun 日本 () (message "日本"))
>
> That is, all of the identifiers in the source code for this mode are
> named in some horrible foreign script that you cannot read. Is it so
> much more unreadable if it sometimes has to be displayed like this?
>
>     (defun \u65E5\u672C () (message "\u65E5\u672C"))

FWIW, I consider this case quite different from your raw-string case,
because here the main issue for me is whether the code is maintainable
and reviewable by someone else.  So, in the context of Emacs, GNU
ELPA, and NonGNU ELPA, I find such uses problematic.  If I could count
on having someone else I trust do the reviewing, I might reconsider.

>     (defvar variable-containing-html #r｢<a href="foo.html">click here</a>｣)

I have no serious objection against uses of non-ASCII in a language's
syntax, as in the example above.  That's orthogonal to my opinion that
raw strings are better left out of ELisp.

        Stefan

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-04 15:36           ` character sets as they relate to “Raw” " Daniel Brooks
  2021-10-04 16:34             ` Stefan Monnier
@ 2021-10-04 18:57             ` Eli Zaretskii
  2021-10-04 19:14               ` Yuri Khan
  1 sibling, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-04 18:57 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: anna, rms, emacs-devel

> From: Daniel Brooks <db48x@db48x.net>
> Cc: emacs-devel@gnu.org,  rms@gnu.org,  anna@crossproduct.net
> Date: Mon, 04 Oct 2021 08:36:40 -0700
> 
> Eli Zaretskii <eliz@gnu.org> writes:
> 
> > We can only do this much.  We don't develop any terminal emulators
> > here, except the two built into Emacs.
> 
> I was referring broadly to the whole GNU project, not trying to assign
> the work specifically to the Emacs project. :)

Then this is not necessarily the best place to raise these issues.

> Suppose our hypothetical contributor wanted to contribute a new mode
> with this type of code in it:
> 
>     (defun 日本 () (message "日本"))
> 
> That is, all of the identifiers in the source code for this mode are
> named in some horrible foreign script that you cannot read. Is it so
> much more unreadable if it sometimes has to be displayed like this?
> 
>     (defun \u65E5\u672C () (message "\u65E5\u672C"))
> 
> More to the point, do we turn away this contributor or ask them to
> rewrite their code? My preference is that we simply accept the
> contribution as–is.

It would be very inconvenient to have such code.  We have that where
it's inevitable (like in some packages that define features specific
to some languages), but even there we prefer to use the likes of
\u672c instead of the literal characters.  At the very least, that
avoids the problem with not having a suitable font to display them.

> If we could see our way to accepting such code, then I don’t see why we
> couldn’t accept code that uses Unicode in much smaller ways, such as
> this:
> 
>     (defvar variable-containing-html #r｢<a href="foo.html">click here</a>｣)

If we avoid non-ASCII characters, we avoid some problems, so all else
being equal, it's better.

> PS: it occurs to me to wonder if my use of Unicode in the prose of this
> message, outside of the examples, detracted from its readability in any
> way?

If someone is reading this on a text-mode terminal, it could.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-04 18:57             ` Eli Zaretskii
@ 2021-10-04 19:14               ` Yuri Khan
  2021-10-05 21:20                 ` Richard Stallman
  0 siblings, 1 reply; 120+ messages in thread
From: Yuri Khan @ 2021-10-04 19:14 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Daniel Brooks, Emacs developers, anna, Richard Stallman

On Tue, 5 Oct 2021 at 01:58, Eli Zaretskii <eliz@gnu.org> wrote:
> > PS: it occurs to me to wonder if my use of Unicode in the prose of this
> > message, outside of the examples, detracted from its readability in any
> > way?
>
> If someone is reading this on a text-mode terminal, it could.

We should probably invent a term more accurate than “text-mode
terminal” for things that fail to display text.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-04 16:34             ` Stefan Monnier
@ 2021-10-04 20:49               ` Daniel Brooks
  2021-10-04 21:19                 ` Alan Mackenzie
                                   ` (3 more replies)
  0 siblings, 4 replies; 120+ messages in thread
From: Daniel Brooks @ 2021-10-04 20:49 UTC (permalink / raw)
  To: Stefan Monnier, Eli Zaretskii; +Cc: emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> From: Daniel Brooks <db48x@db48x.net>
>> Cc: emacs-devel@gnu.org,  rms@gnu.org,  anna@crossproduct.net
>> Date: Mon, 04 Oct 2021 08:36:40 -0700
>> 
>> Eli Zaretskii <eliz@gnu.org> writes:
>> 
>> > We can only do this much.  We don't develop any terminal emulators
>> > here, except the two built into Emacs.
>> 
>> I was referring broadly to the whole GNU project, not trying to assign
>> the work specifically to the Emacs project. :)
>
> Then this is not necessarily the best place to raise these issues.

I was replying directly to RMS concerning his statement about non–ascii
characters. RMS is known to have opinions with a wider scope than will
fit in any single mailing list, and I was responding in kind. I
apologize for using “we” so broadly without thinking; it is certainly
the kind of thing that is confusing, so I should have been much more
explicit.

>> Suppose our hypothetical contributor wanted to contribute a new mode
>> with this type of code in it:
>> 
>>     (defun 日本 () (message "日本"))
>
> It would be very inconvenient to have such code.

Absolutely! Possibly almost as inconvenient as having to learn some
English in order to develop the thing. But it doesn’t answer my
question.

I see that prolog-mode only gets a few commits per year (9 last year and
5 so far this year; the high water mark is 10 in a single year). It
imposes a pretty minimal support burden and if it has bugs you can
simply ignore them until a Prolog user brings you a patch, because those
bugs can only affect Prolog users. There is a lot of code in Emacs which
fits this description.

Suppose this hypothetical contribution were a language mode for a
Japanese programming language, and thus had the same support profile?
Suppose also that all messages to the user have already been localized
into English, and that there is an English alias for the mode name (that
is, `日本-mode' toggles the mode, but there’s an alias like `ja-mode' or
something), while the rest of the identifiers are in Japanese.

Would there be any reason to turn away that contribution, or to make the
contributor rewrite it?

Stefan Monnier <monnier@iro.umontreal.ca> writes:

> FWIW, I consider this case quite different from your raw-string case,
> because here the main issue for me is whether the code is maintainable
> and reviewable by someone else.  So, in the context of Emacs, GNU
> ELPA, and NonGNU ELPA, I find such uses problematic.  If I could count
> on having someone else I trust do the reviewing, I might reconsider.

I think that if I read between the lines, you are saying that the Emacs
project _could_ grow to become multi–lingual at all levels, with a
sufficient number of invested contributors who could each review and
maintain different parts of the code. Also that like Eli, you would find
it inconvenient or problematic in the short term. Is that a fair
reading?

> We have that where it's inevitable (like in some packages that define
> features specific to some languages), but even there we prefer to use
> the likes of \u672c instead of the literal characters.  At the very
> least, that avoids the problem with not having a suitable font to
> display them.

As an aside, I think that this is a sensible enough choice, though I
would prefer to choose a more automatic solution. That is, relying on
particular viewers of the source code to tweak their Emacs settings to
present the source differently instead of relying on contributors to use
the codepoint numbers directly. As you suggested in bug#50865, changing
the encoding will automatically render those characters with their
codepoint numbers, which is nicer than forcing a human to type them in
before committing. This has the advantage of working on identifiers as
well as string literals.

>> If we could see our way to accepting such code, then I don’t see why we
>> couldn’t accept code that uses Unicode in much smaller ways, such as
>> this:
>> 
>>     (defvar variable-containing-html #r｢<a href="foo.html">click here</a>｣)
>
> If we avoid non-ASCII characters, we avoid some problems, so all else
> being equal, it's better.

Hmm. If we (speaking as broadly as possible!) avoid a problem forever,
how will the problem ever get fixed?

Personally, I think that the problems are now mostly fixed. Emacs has
very complete support for character sets, better than virtually all
other applications. Outside of Emacs, support for Unicode is practically
omnipresent as well. There are still notable gaps, like the Linux
console, but they are the exception rather than the rule. I don’t think
that there is much of a problem left to avoid!

>> PS: it occurs to me to wonder if my use of Unicode in the prose of this
>> message, outside of the examples, detracted from its readability in any
>> way?
>
> If someone is reading this on a text-mode terminal, it could.

I am asking if anyone reading my messages, either this one or any of the
last dozen I have sent to the list, has noticed any specific
problems. I have used non–ascii characters in all of them. I’m wondering
if anyone even noticed. If nobody noticed, or if they didn’t detract
from readability, then it is unlikely that Unicode is a problem in
general.

Yuri Khan <yuri.v.khan@gmail.com> writes:

> On Tue, 5 Oct 2021 at 01:58, Eli Zaretskii <eliz@gnu.org> wrote:
>
>> If someone is reading this on a text-mode terminal, it could.
>
> We should probably invent a term more accurate than “text-mode
> terminal” for things that fail to display text.

True! :D

I prefer to say “Linux console” in reference to the one terminal
emulator that we know has severe problems with Unicode. There are many
terminal emulators out there, and I’m sure a few of them have problems,
but for the most part I think all of them can handle Unicode pretty well
primarily because they all rely on OS libraries to do the heavy
lifting. The Linux console is handicapped in this area primarily because
it is inside the kernel, and thus cannot dynamically load libharfbuzz
and libfreetype. (But I can imagine a hypothetical future kernel module
which statically links against them in order to provide a full–featured
terminal in the console.)

db48x

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-04 20:49               ` Daniel Brooks
@ 2021-10-04 21:19                 ` Alan Mackenzie
  2021-10-04 22:19                   ` Daniel Brooks
  2021-10-05  8:55                 ` Yuri Khan
                                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 120+ messages in thread
From: Alan Mackenzie @ 2021-10-04 21:19 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: Eli Zaretskii, Stefan Monnier, emacs-devel

Hello, Daniel.

On Mon, Oct 04, 2021 at 13:49:53 -0700, Daniel Brooks wrote:
> Eli Zaretskii <eliz@gnu.org> writes:

> >> From: Daniel Brooks <db48x@db48x.net>
> >> Cc: emacs-devel@gnu.org,  rms@gnu.org,  anna@crossproduct.net
> >> Date: Mon, 04 Oct 2021 08:36:40 -0700

> >> Eli Zaretskii <eliz@gnu.org> writes:

[ .... ]

> >> PS: it occurs to me to wonder if my use of Unicode in the prose of this
> >> message, outside of the examples, detracted from its readability in any
> >> way?

> > If someone is reading this on a text-mode terminal, it could.

It does for me.

> I am asking if anyone reading my messages, either this one or any of the
> last dozen I have sent to the list, have noticed any specific
> problems. I have used non–ascii characters in all of them. I’m wondering
> if anyone even noticed. If nobody noticed, or if they didn’t detract
> from readability, then it is unlikely that Unicode is a problem in
> general.

These characters displayed as inverse question marks on my Linux console.

I can understand people wanting their non-ascii names to be properly
spelt (just as I prefer my non-ascii home city Nürnberg to be correctly
spelt).

What I don't really understand is including punctuation characters which
can't be typed on the writer's keyboard, except by awkward workarounds.

> Yuri Khan <yuri.v.khan@gmail.com> writes:

> > On Tue, 5 Oct 2021 at 01:58, Eli Zaretskii <eliz@gnu.org> wrote:

> >> If someone is reading this on a text-mode terminal, it could.

> > We should probably invent a term more accurate than “text-mode
> > terminal” for things that fail to display text.

> True! :D

> I prefer to say “Linux console” in reference to the one terminal
> emulator that we know has severe problems with Unicode. There are many
> terminal emulators out there, and I’m sure a few of them have problems,
> but for the most part I think all of them can handle Unicode pretty well
> primarily because they all rely on OS libraries to do the heavy
> lifting. The Linux console is handicapped in this area primarily because
> it is inside the kernel, and thus cannot dynamically load libharfbuzz
> and libfreetype.

One of the reasons I use Linux is because I have a 16 x 8 dot fontset,
and don't have to cope with all the vagaries of fancy, sometimes blurred,
fonts used on GUIs.  There are quite a few others.  Why use a graphical
environment for doing text work?

> (But I can imagine a hypothetical future kernel module which statically
> links against them in order to provide a full–featured terminal in the
> console.)

I can't.  The Linux console has got to work to bring up a new machine,
should one be doing this from scratch rather than installing a
distribution with ready made X.  For this, it's _got_ to work directly in
the kernel.

> db48x

-- 
Alan Mackenzie (Nuremberg, Germany).



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-04 21:19                 ` Alan Mackenzie
@ 2021-10-04 22:19                   ` Daniel Brooks
  2021-10-05 11:20                     ` Alan Mackenzie
  0 siblings, 1 reply; 120+ messages in thread
From: Daniel Brooks @ 2021-10-04 22:19 UTC (permalink / raw)
  To: Alan Mackenzie; +Cc: Eli Zaretskii, Stefan Monnier, emacs-devel

Alan Mackenzie <acm@muc.de> writes:

>> PS: it occurs to me to wonder if my use of Unicode in the prose of this
>> message, outside of the examples, detracted from its readability in any
>> way?
>
> It does for me.

Aha! I’m rather astounded that this is the case, but happy to know that
we are talking about a use–case that actually affects real users, as
opposed to merely hypothetical ones. Thank you!

>> I am asking if anyone reading my messages, either this one or any of the
>> last dozen I have sent to the list, have noticed any specific
>> problems. I have used non–ascii characters in all of them. I’m wondering
>> if anyone even noticed. If nobody noticed, or if they didn’t detract
>> from readability, then it is unlikely that Unicode is a problem in
>> general.
>
> These characters displayed as inverse question marks on my Linux console.
>
> I can understand people wanting their non-ascii names to be properly
> spelt (just as I prefer my non-ascii home city Nürnberg to be correctly
> spelt).
>
> What I don't really understand is including punctuation characters which
> can't be typed on the writer's keyboard, except by awkward workarounds.

You are making unwarranted assumptions about my keyboard :D

But alas, it’s fairly ordinary; I don’t actually have the keyboard of my
dreams. Instead, there are some xkb options that I turn on to make it
more capable. To type a ｢"｣ I have to press S-', while to type ｢“｣ I
press Level3-k; it’s a different pair of fingers, but not really any
more difficult or awkward to type.

> One of the reasons I use Linux is because I have a 16 x 8 dot fontset,
> and don't have to cope with all the vagaries of fancy, sometimes blurred,
> fonts used on GUIs.  There are quite a few others.  Why use a graphical
> environment for doing text work?

I use a GUI precisely because the range of characters is so much wider,
making the text work more fun. Also, because the fonts aren’t blurry to
me, ever since I adjusted the font hinting slightly and bumped up the
minimum font sizes significantly (I agree that blurriness is somewhat
subjective).

>> (But I can imagine a hypothetical future kernel module which statically
>> links against them in order to provide a full–featured terminal in the
>> console.)
>
> I can't.  The Linux console has got to work to bring up a new machine,
> should one be doing this from scratch rather than installing a
> distribution with ready made X.  For this, it's _got_ to work directly in
> the kernel.

Yea, that’s why I said that it would need to be statically linked. The
console already uses the framebuffer, it just needs support for reading
TTF fonts (libfreetype) and shaping the text properly (libharfbuzz). I’m
sure some other handwavium would be needed too, but in principle there’s
no reason the Linux console shouldn’t be able to completely support
Unicode text display. It’s just that nobody has done the work.

Of course I hadn’t been thinking of input handling, but xkb does already
exist. While it is a problem that the name starts with an ‘x’, the core
logic of translating keycodes into characters via a keymap is all
there. Presumably with sufficient elbow grease the X protocol stuff
could be filed off and the important bits reused.

I can hear the laughter already, as we propose adding a 2 or 3 megabyte
kernel module. It would be hilarious. Can you imagine it now?

db48x

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-10-04  0:36       ` Daniel Brooks
  2021-10-04 12:00         ` Eli Zaretskii
@ 2021-10-04 22:29         ` Richard Stallman
  2021-10-05  5:39           ` Daniel Brooks
  1 sibling, 1 reply; 120+ messages in thread
From: Richard Stallman @ 2021-10-04 22:29 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: anna, emacs-devel

[[[ To any NSA and FBI agents reading my email: please consider    ]]]
[[[ whether defending the US Constitution against all enemies,     ]]]
[[[ foreign or domestic, requires you to follow Snowden's example. ]]]

  > > Indeed, using any non-ASCII characters in source files can have some
  > > problems, which are very easy to avoid.  We should not give syntactic
  > > roles to them.

  > We should be improving the terminal then, rather than constraining
  > everything to the lowest common denominator.

That would be a useful thing to do.  The code you would need to
improve is in Linux.  If and when it handles more of Unicode, and once
that change has had time to get into the system versions people use,
Emacs could take advantage of it.

  >  Limiting Emacs source code to English and ASCII will ultimately
  > only limit the acceptability of Emacs rather than improve it.

The GNU Coding Standards says that code should be in English, and
comments too.  See nodes Names and Comments.  This is because Emacs is
the worldwide language of programming.  Using any other language for
identifiers and comments makes the program incomprehensible for most
of the world.

It's ok to include non-ASCII characters in strings and comments in
special circumstances, when you're talking about particular
characters or operating on them.

-- 
Dr Richard Stallman (https://stallman.org)
Chief GNUisance of the GNU Project (https://gnu.org)
Founder, Free Software Foundation (https://fsf.org)
Internet Hall-of-Famer (https://internethalloffame.org)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-10-04 22:29         ` "Raw" " Richard Stallman
@ 2021-10-05  5:39           ` Daniel Brooks
  2021-10-05  5:43             ` Jean-Christophe Helary
  0 siblings, 1 reply; 120+ messages in thread
From: Daniel Brooks @ 2021-10-05  5:39 UTC (permalink / raw)
  To: Richard Stallman; +Cc: emacs-devel

Richard Stallman <rms@gnu.org> writes:

>   >  Limiting Emacs source code to English and ASCII will ultimately
>   > only limit the acceptability of Emacs rather than improve it.
>
> The GNU Coding Standards says that code should be in English, and
> comments too.  See nodes Names and Comments.  This is because Emacs is
> the worldwide language of programming.  Using any other language for
> identifiers and comments makes the program incomprehensible for most
> of the world.
>
> It's ok to include non-ASCII characters in strings and comments in
> special circumstances, when you're talking about particular
> characters or operating on them.

Oops, I should have expected that this would be written down
somewhere. Thanks for pointing it out for me.

db48x



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-10-05  5:39           ` Daniel Brooks
@ 2021-10-05  5:43             ` Jean-Christophe Helary
  2021-10-05  8:24               ` Richard Stallman
  2021-10-05 12:23               ` Eli Zaretskii
  0 siblings, 2 replies; 120+ messages in thread
From: Jean-Christophe Helary @ 2021-10-05  5:43 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: Richard Stallman, emacs-devel



> On Oct 5, 2021, at 14:39, Daniel Brooks <db48x@db48x.net> wrote:
> 
>> The GNU Coding Standards says that code should be in English, and
>> comments too.  See nodes Names and Comments.  This is because *Emacs* is
>> the worldwide language of programming.

A very nice slip of the tongue/keyboard :-)

-- 
Jean-Christophe Helary @brandelune
https://mac4translators.blogspot.com
https://sr.ht/~brandelune/omegat-as-a-book/




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-10-05  5:43             ` Jean-Christophe Helary
@ 2021-10-05  8:24               ` Richard Stallman
  2021-10-05 12:23               ` Eli Zaretskii
  1 sibling, 0 replies; 120+ messages in thread
From: Richard Stallman @ 2021-10-05  8:24 UTC (permalink / raw)
  To: Jean-Christophe Helary; +Cc: db48x, emacs-devel

[[[ To any NSA and FBI agents reading my email: please consider    ]]]
[[[ whether defending the US Constitution against all enemies,     ]]]
[[[ foreign or domestic, requires you to follow Snowden's example. ]]]

  > >> The GNU Coding Standards says that code should be in English, and
  > >> comments too.  See nodes Names and Comments.  This is because *Emacs* is
  > >> the worldwide language of programming.

  > A very nice slip of the tongue/keyboard :-)

Yes, I meant to write "English."

-- 
Dr Richard Stallman (https://stallman.org)
Chief GNUisance of the GNU Project (https://gnu.org)
Founder, Free Software Foundation (https://fsf.org)
Internet Hall-of-Famer (https://internethalloffame.org)





^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-04 20:49               ` Daniel Brooks
  2021-10-04 21:19                 ` Alan Mackenzie
@ 2021-10-05  8:55                 ` Yuri Khan
  2021-10-05 16:25                   ` Juri Linkov
  2021-10-05 17:13                   ` Daniel Brooks
  2021-10-05 12:04                 ` Eli Zaretskii
  2021-10-05 21:20                 ` Richard Stallman
  3 siblings, 2 replies; 120+ messages in thread
From: Yuri Khan @ 2021-10-05  8:55 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: Eli Zaretskii, Stefan Monnier, Emacs developers

On Tue, 5 Oct 2021 at 03:51, Daniel Brooks <db48x@db48x.net> wrote:

> I prefer to say “Linux console” in reference to the one terminal
> emulator that we know has severe problems with Unicode. There are many
> terminal emulators out there, and I’m sure a few of them have problems,
> but for the most part I think all of them can handle Unicode pretty well
> primarily because they all rely on OS libraries to do the heavy
> lifting. The Linux console is handicapped in this area primarily because
> it is inside the kernel, and thus cannot dynamically load libharfbuzz
> and libfreetype. (But I can imagine a hypothetical future kernel module
> which statically links against them in order to provide a full–featured
> terminal in the console.)

fbterm already provides at least basic Unicode support. As in, it is
not limited to displaying 256 or even 512 characters, and it uses
libfreetype to draw glyphs. I have not tested whether it supports
complex text shaping or color emoji.

(It is not a kernel module, just a normal userspace binary talking to /dev/fbN.)

[By the way, you’re hypercorrecting a little bit. “full-featured” and
other hyphenated words are not normally spelt with an en dash. From
Wikipedia:

    In English, an en dash, –, sometimes replaces the hyphen
    in hyphenated compounds if either of its constituent parts
    is already hyphenated or contains a space (for example,
    San Francisco–area residents, hormone receptor–positive cells,
    cell cycle–related factors, and public-school–private-school
    rivalries).

]



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-04 22:19                   ` Daniel Brooks
@ 2021-10-05 11:20                     ` Alan Mackenzie
  2021-10-05 17:08                       ` Daniel Brooks
  0 siblings, 1 reply; 120+ messages in thread
From: Alan Mackenzie @ 2021-10-05 11:20 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: Eli Zaretskii, Stefan Monnier, emacs-devel

Hello, Daniel.

On Mon, Oct 04, 2021 at 15:19:22 -0700, Daniel Brooks wrote:
> Alan Mackenzie <acm@muc.de> writes:

> >> PS: it occurs to me to wonder if my use of Unicode in the prose of this
> >> message, outside of the examples, detracted from its readability in any
> >> way?

> > It does for me.

> Aha! I’m rather astounded that this is the case, but happy to know that
> we are talking about a use–case that actually affects real users, as
> opposed to merely hypothetical ones. Thank you!

> >> I am asking if anyone reading my messages, either this one or any of the
> >> last dozen I have sent to the list, have noticed any specific
> >> problems. I have used non–ascii characters in all of them. I’m wondering
> >> if anyone even noticed. If nobody noticed, or if they didn’t detract
> >> from readability, then it is unlikely that Unicode is a problem in
> >> general.

> > These characters displayed as inverse question marks on my Linux console.

> > I can understand people wanting their non-ascii names to be properly
> > spelt (just as I prefer my non-ascii home city Nürnberg to be correctly
> > spelt).

> > What I don't really understand is including punctuation characters which
> > can't be typed on the writer's keyboard, except by awkward workarounds.

> You are making unwarranted assumptions about my keyboard :D

Indeed, yes.  ;-)

> But alas, it’s fairly ordinary; I don’t actually have the keyboard of my
> dreams. Instead, there are some xkb options that I turn on to make it
> more capable. To type a ｢"｣ I have to press S-', while to type ｢“｣ I
> press Level3-k; it’s a different pair of fingers, but not really any
> more difficult or awkward to type.

So, you've set up your keyboard specially to be able to type these things
easily.  I think we can be sure that a lot of your readers won't have
done the same.  I've set up mine to be able to type ä, Ä, ö, Ö, ü, Ü, and
ß.

> > One of the reasons I use Linux is because I have a 16 x 8 dot fontset,
> > and don't have to cope with all the vagaries of fancy, sometimes blurred,
> > fonts used on GUIs.  There are quite a few others.  Why use a graphical
> > environment for doing text work?

> I use a GUI precisely because the range of characters is so much wider,
> making the text work more fun.

OK.  If the kernel's console also had the same range of characters, would
you then use that console for text work?

> Also, because the fonts aren’t blurry to me, ever since I adjusted the
> font hinting slightly and bumped up the minimum font sizes
> significantly (I agree that blurriness is somewhat subjective).

> >> (But I can imagine a hypothetical future kernel module which statically
> >> links against them in order to provide a full–featured terminal in the
> >> console.)

> > I can't.  The Linux console has got to work to bring up a new machine,
> > should one be doing this from scratch rather than installing a
> > distribution with ready made X.  For this, it's _got_ to work directly in
> > the kernel.

> Yea, that’s why I said that it would need to be statically linked. The
> console already uses the framebuffer, it just needs support for reading
> TTF fonts (libfreetype) and shaping the text properly (libharfbuzz). I’m
> sure some other handwavium would be needed too, but in principle there’s
> no reason the Linux console shouldn’t be able to completely support
> Unicode text display. It’s just that nobody has done the work.

Exactly so.  The Linux console code is exceptionally old gnarled code,
which is difficult to work with.  (I know, having recently hacked it to
restore "soft scrolling" via the S-<PgUp>, S-<PgDn> keys.)

> Of course I hadn’t been thinking of input handling, but xkb does already
> exist. While it is a problem that the name starts with an ‘x’, the core
> logic of translating keycodes into characters via a keymap is all
> there.

Actually, no it's not.  I had to hack its source a few years ago to be
able to configure C-A-S-Fn in X.  The corresponding program for the
console, loadkeys, can pretty much do anything.

> Presumably with sufficient elbow grease the X protocol stuff could be
> filed off and the important bits reused.

> I can hear the laughter already, as we propose adding a 2 or 3 megabyte
> kernel module. It would be hilarious. Can you imagine it now?

Actually, if it was an optional feature, I think it would be welcome in
the kernel, although most kernel hackers would probably continue not to
use it.  2 or 3 Mb isn't a large amount of RAM on a desktop machine any
more.

> db48x

-- 
Alan Mackenzie (Nuremberg, Germany).



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-04 20:49               ` Daniel Brooks
  2021-10-04 21:19                 ` Alan Mackenzie
  2021-10-05  8:55                 ` Yuri Khan
@ 2021-10-05 12:04                 ` Eli Zaretskii
  2021-10-05 21:20                 ` Richard Stallman
  3 siblings, 0 replies; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-05 12:04 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: monnier, emacs-devel

> From: Daniel Brooks <db48x@db48x.net>
> Cc: emacs-devel@gnu.org
> Date: Mon, 04 Oct 2021 13:49:53 -0700
> 
> I see that prolog-mode only gets a few commits per year (9 last year and
> 5 so far this year; the high water mark is 10 in a single year). It
> imposes a pretty minimal support burden and if it has bugs you can
> simply ignore them until a Prolog user brings you a patch, because those
> bugs can only affect Prolog users. There is a lot of code in Emacs which
> fits this description.
> 
> Suppose this hypothetical contribution were a language mode for a
> Japanese programming language, and thus had the same support profile?
> Suppose also that all messages to the user have already been localized
> into English, and that there is an English alias for the mode name (that
> is, `日本-mode' toggles the mode, but there’s an alias like `ja-mode' or
> something), while the rest of the identifiers are in Japanese.
> 
> Would there be any reason to turn away that contribution, or to make the
> contributor rewrite it?

I'm sorry, this is too abstract and theoretical an issue, with many
important details missing.  So I don't think it will be useful to
seriously consider such a theoretical example.

> >>     (defvar variable-containing-html #r｢<a href="foo.html">click here</a>｣)
> >
> > If we avoid non-ASCII characters, we avoid some problems, so all else
> > being equal, it's better.
> 
> Hmm. If we (speaking as broadly as possible!) avoid a problem forever,
> how will the problem ever get fixed?

I don't think it needs fixing.

> Personally, I think that the problems are now mostly fixed. Emacs has
> very complete support for character sets, better than virtually all
> other applications. Outside of Emacs, support for Unicode is practically
> omnipresent as well. There are still notable gaps, like the Linux
> console, but they are the exception rather than the rule. I don’t think
> that there is much of a problem left to avoid!

It turns out there are more exceptions than we imagine.  We just now
had another bug report, this time about Kitty terminal emulator, which
has yet another set of issues with displaying non-ASCII characters
from Emacs.  So much so that I was prompted to add an entry in
etc/PROBLEMS with some workarounds for users of Kitty.  Granted, their
problems are not that they don't support recently added Unicode
characters, it's that they support them "too well".  But still, it
doesn't help when the result is a messed-up display.

> I prefer to say “Linux console” in reference to the one terminal
> emulator that we know has severe problems with Unicode. There are many
> terminal emulators out there, and I’m sure a few of them have problems,
> but for the most part I think all of them can handle Unicode pretty well
> primarily because they all rely on OS libraries to do the heavy
> lifting.

Unicode is not a static target, it's a moving one.  They issue a new
version of the standard twice a year, and each new version adds new
codepoints with new attributes.  If a new version of Unicode adds
double-width characters, and some terminal emulator doesn't keep up,
you will have problems displaying those new codepoints.  (AFAIK,
that's in essence the problem with the Linux console: they last
updated when Unicode 5.0 was released.)

So it might be possible to say that many terminals support substantial
portions of Unicode, but it definitely is NOT right to say that we can
freely use any character we want and think they will work everywhere.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: "Raw" string literals for elisp
  2021-10-05  5:43             ` Jean-Christophe Helary
  2021-10-05  8:24               ` Richard Stallman
@ 2021-10-05 12:23               ` Eli Zaretskii
  1 sibling, 0 replies; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-05 12:23 UTC (permalink / raw)
  To: Jean-Christophe Helary; +Cc: db48x, rms, emacs-devel

> From: Jean-Christophe Helary <lists@traduction-libre.org>
> Date: Tue, 5 Oct 2021 14:43:33 +0900
> Cc: Richard Stallman <rms@gnu.org>, emacs-devel@gnu.org
> 
> >> The GNU Coding Standards says that code should be in English, and
> >> comments too.  See nodes Names and Comments.  This is because *Emacs* is
> >> the worldwide language of programming.
> 
> A very nice slip of the tongue/keyboard :-)

Who says it's a slip?



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05  8:55                 ` Yuri Khan
@ 2021-10-05 16:25                   ` Juri Linkov
  2021-10-05 17:15                     ` Eli Zaretskii
  2021-10-05 18:23                     ` [External] : " Drew Adams
  2021-10-05 17:13                   ` Daniel Brooks
  1 sibling, 2 replies; 120+ messages in thread
From: Juri Linkov @ 2021-10-05 16:25 UTC (permalink / raw)
  To: Yuri Khan; +Cc: Daniel Brooks, Eli Zaretskii, Stefan Monnier, Emacs developers

> [By the way, you’re hypercorrecting a little bit. “full-featured” and
> other hyphenated words are not normally spelt with an en dash. From
> Wikipedia:
>
>     In English, an en dash, –, sometimes replaces the hyphen
>     in hyphenated compounds if either of its constituent parts
>     is already hyphenated or contains a space (for example,
>     San Francisco–area residents, hormone receptor–positive cells,
>     cell cycle–related factors, and public-school–private-school
>     rivalries).

Such ugly writing style where an en dash is not separated from the
nearby words by whitespace makes the Info manual less readable.
For example, in (info "(emacs) After a Crash"):

     As a last resort, if you had buffers with content which were not
  associated with any files, or if the autosave was not recent enough to
  have recorded important changes, you can use the ‘etc/emacs-buffer.gdb’
  script with GDB (the GNU Debugger) to retrieve them from a core
  dump–provided that a core dump was saved, and that the Emacs executable
  =============
  was not stripped of its debugging symbols.

This leaves one to wonder what does this word mean:
"dump-provided"?

But the dash surrounded by whitespace on both sides makes the text
much more readable:

     As a last resort, if you had buffers with content which were not
  associated with any files, or if the autosave was not recent enough to
  have recorded important changes, you can use the ‘etc/emacs-buffer.gdb’
  script with GDB (the GNU Debugger) to retrieve them from a core dump –
  provided that a core dump was saved, and that the Emacs executable
  was not stripped of its debugging symbols.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 11:20                     ` Alan Mackenzie
@ 2021-10-05 17:08                       ` Daniel Brooks
  2021-10-06 20:54                         ` Richard Stallman
  0 siblings, 1 reply; 120+ messages in thread
From: Daniel Brooks @ 2021-10-05 17:08 UTC (permalink / raw)
  To: Alan Mackenzie; +Cc: Eli Zaretskii, Stefan Monnier, emacs-devel

Alan Mackenzie <acm@muc.de> writes:

> Hello, Daniel.
>
> On Mon, Oct 04, 2021 at 15:19:22 -0700, Daniel Brooks wrote:
>> But alas, it’s fairly ordinary; I don’t actually have the keyboard of my
>> dreams. Instead, there are some xkb options that I turn on to make it
>> more capable. To type a ｢"｣ I have to press S-', while to type ｢“｣ I
>> press Level3-k; it’s a different pair of fingers, but not really any
>> more difficult or awkward to type.
>
> So, you've set up your keyboard specially to be able to type these things
> easily.  I think we can be sure that a lot of your readers won't have
> done the same.  I've set up mine to be able to type ä, Ä, ö, Ö, ü, Ü, and
> ß.

Ah, I think I understand what you are getting at. You are asking why raw
strings should use ｢｣? I’m suggesting that like Raku, the elisp parser
allow _any_ nonalphanumeric character as a delimiter, and that if that
opening delimiter is part of a pair, then the closing delimiter would be
the closing character of the pair. So #r｢foo｣ would be a raw string, but
so would #r(foo) or #r[foo] or #r@foo@, or whatever else you wanted to
use.

The idea is that you would never be required to escape the
delimiter, because you can always choose a delimiter that is not part of
your string. As long as your keyboard has at least one or two
punctuation characters in addition to the usual quotes, then you would
be in good shape. Well, I suppose you would also rarely want to use
period or comma as the delimiter either; they are likely to show up in
the strings you might want to type. But we can be fairly certain that
the user at least has parentheses close by.

>> I use a GUI precisely because the range of characters is so much wider,
>> making the text work more fun.
>
> OK.  If the kernel's console also had the same range of characters, would
> you then use that console for text work?

I would miss some other features, such as multiple monitor support, but
nearly so.

>> I can hear the laughter already, as we propose adding a 2 or 3 megabyte
>> kernel module. It would be hilarious. Can you imagine it now?
>
> Actually, if it was an optional feature, I think it would be welcome in
> the kernel, although most kernel hackers would probably continue not to
> use it.  2 or 3 Mb isn't a large amount of RAM on a desktop machine any
> more.

Yes, but look at the size of the rest of the kernel on disk. Mine is 11
megabytes :)

db48x

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05  8:55                 ` Yuri Khan
  2021-10-05 16:25                   ` Juri Linkov
@ 2021-10-05 17:13                   ` Daniel Brooks
  1 sibling, 0 replies; 120+ messages in thread
From: Daniel Brooks @ 2021-10-05 17:13 UTC (permalink / raw)
  To: Yuri Khan; +Cc: Eli Zaretskii, Stefan Monnier, Emacs developers

Yuri Khan <yuri.v.khan@gmail.com> writes:

> fbterm already provides at least basic Unicode support. As in, it is
> not limited to displaying 256 or even 512 characters, and it uses
> libfreetype to draw glyphs. I have not tested whether it supports
> complex text shaping or color emoji.
>
> (It is not a kernel module, just a normal userspace binary talking to /dev/fbN.)

Wow, I had no idea this existed. This would obviously be the place to
start, if anyone were crazy enough to try to make a better in–kernel
console. It’s already a lot more advanced than what I was imagining.

db48x



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 16:25                   ` Juri Linkov
@ 2021-10-05 17:15                     ` Eli Zaretskii
  2021-10-05 18:40                       ` [External] : " Drew Adams
  2021-10-06 20:54                       ` Richard Stallman
  2021-10-05 18:23                     ` [External] : " Drew Adams
  1 sibling, 2 replies; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-05 17:15 UTC (permalink / raw)
  To: Juri Linkov; +Cc: db48x, emacs-devel, monnier, yuri.v.khan

> From: Juri Linkov <juri@linkov.net>
> Cc: Daniel Brooks <db48x@db48x.net>,  Eli Zaretskii <eliz@gnu.org>,  Stefan
>  Monnier <monnier@iro.umontreal.ca>,  Emacs developers
>  <emacs-devel@gnu.org>
> Date: Tue, 05 Oct 2021 19:25:40 +0300
> 
> Such ugly writing style where an en dash is not separated from the
> nearby words by whitespace makes the Info manual less readable.
> For example, in (info "(emacs) After a Crash"):
> 
>      As a last resort, if you had buffers with content which were not
>   associated with any files, or if the autosave was not recent enough to
>   have recorded important changes, you can use the ‘etc/emacs-buffer.gdb’
>   script with GDB (the GNU Debugger) to retrieve them from a core
>   dump–provided that a core dump was saved, and that the Emacs executable
>   =============
>   was not stripped of its debugging symbols.
> 
> This leaves one to wonder what does this word mean:
> "dump-provided"?
> 
> But the dash surrounded by whitespace on both sides makes the text
> much more readable:

It's a cultural thing.  People who come from English cultures are more
likely to like the style without whitespace.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* RE: [External] : Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 16:25                   ` Juri Linkov
  2021-10-05 17:15                     ` Eli Zaretskii
@ 2021-10-05 18:23                     ` Drew Adams
  2021-10-05 19:13                       ` Stefan Kangas
  1 sibling, 1 reply; 120+ messages in thread
From: Drew Adams @ 2021-10-05 18:23 UTC (permalink / raw)
  To: Juri Linkov, Yuri Khan
  Cc: Daniel Brooks, Eli Zaretskii, Stefan Monnier, Emacs developers

> > “full-featured” and other hyphenated words are not
> > normally spelt with an en dash.

+1.

> > From Wikipedia:
> >
> >     In English, an en dash, –, sometimes replaces the hyphen
> >     in hyphenated compounds if either of its constituent parts
> >     is already hyphenated or contains a space (for example,
> >     San Francisco–area residents, hormone receptor–positive cells,
> >     cell cycle–related factors, and public-school–private-school
> >     rivalries).
> 
> Such ugly writing style where an en dash is not separated from the
> nearby words by whitespace makes the Info manual less readable.
> For example, in (info "(emacs) After a Crash"):
> 
>   ...to retrieve them from a core dump–provided that a...
> 
> This leaves one to wonder what does this word mean:
> "dump-provided"?

Good example.

Use of an en dash that way is inappropriate (irregular,
not conventional).  An em dash _would_ be appropriate
there, but not an en dash.

https://www.scribbr.com/language-rules/dashes/

https://www.merriam-webster.com/words-at-play/em-dash-en-dash-how-to-use

> But the dash surrounded by whitespace on both sides makes the text
> much more readable

Agreed.

Generally, an em dash is surrounded by either a thin
space or no space at all.  But yes, depending on the
font etc., it can look weird to not have any space.
In particular, with monospaced fonts a space char
helps (IMO).

^ permalink raw reply	[flat|nested] 120+ messages in thread

* RE: [External] : Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 17:15                     ` Eli Zaretskii
@ 2021-10-05 18:40                       ` Drew Adams
  2021-10-06 20:54                       ` Richard Stallman
  1 sibling, 0 replies; 120+ messages in thread
From: Drew Adams @ 2021-10-05 18:40 UTC (permalink / raw)
  To: Eli Zaretskii, Juri Linkov
  Cc: db48x@db48x.net, yuri.v.khan@gmail.com, monnier@iro.umontreal.ca,
	emacs-devel@gnu.org

> > But the dash surrounded by whitespace on both sides makes the text
> > much more readable:
> 
> It's a cultural thing.  People who come from English cultures are more
> likely to like the style without whitespace.

I doubt that that's true for monospaced fonts.

It's true for much typeset text - e.g. books,
but even there a thin space is often used.

And in any case, it's the em dash, not the en
dash, that's used in such contexts.  Use of
the en dash is quite different - see my other
mail.


^ permalink raw reply	[flat|nested] 120+ messages in thread

* RE: [External] : Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 18:23                     ` [External] : " Drew Adams
@ 2021-10-05 19:13                       ` Stefan Kangas
  2021-10-05 19:20                         ` Drew Adams
  0 siblings, 1 reply; 120+ messages in thread
From: Stefan Kangas @ 2021-10-05 19:13 UTC (permalink / raw)
  To: Drew Adams, Juri Linkov, Yuri Khan
  Cc: Daniel Brooks, Eli Zaretskii, Stefan Monnier, Emacs developers

Drew Adams <drew.adams@oracle.com> writes:

>> Such ugly writing style where an en dash is not separated from the
>> nearby words by whitespace makes the Info manual less readable.
>> For example, in (info "(emacs) After a Crash"):
>>
>>   ...to retrieve them from a core dump–provided that a...
>>
>> This leaves one to wonder what does this word mean:
>> "dump-provided"?
>
> Good example.
>
> Use of an en dash that way is inappropriate (irregular,
> not conventional).  An em dash _would_ be appropriate
> there, but not an en dash.

First, that just looks like a typo, there should be an em dash (—) above
as Drew says.  IOW, you need to use "---" to get the correct symbol.
See (info "(texinfo) Conventions").  (The em dash is correctly used in
other places in trouble.texi.)

    "...to retrieve them from a core dump—provided that a..."

That makes it a little bit better.

In print, an em dash will look okay without any spaces, as it will be
longer (in any serious typeface).  Having no spaces is the more
traditional style, while most newspapers, for example, put spaces around
em dash for clarity and ease of reading.

Still, if you are not a native English speaker it does take some getting
used to this style.  But it is correct; white-space is *not* needed.

That said, I agree that with monospace fonts the em dash would really
need to have some white-space on each side.  I don't know why Texinfo
doesn't insert a space for info files.  It probably should.

BTW, try this: visit "(emacs) After a Crash" and then
M-x variable-pitch-mode RET.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* RE: [External] : Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 19:13                       ` Stefan Kangas
@ 2021-10-05 19:20                         ` Drew Adams
  0 siblings, 0 replies; 120+ messages in thread
From: Drew Adams @ 2021-10-05 19:20 UTC (permalink / raw)
  To: Stefan Kangas, Juri Linkov, Yuri Khan
  Cc: Daniel Brooks, Eli Zaretskii, Stefan Monnier, Emacs developers

> BTW, try this: visit "(emacs) After a Crash" and then
> M-x variable-pitch-mode RET.

Yes, that makes the difference clear.  There are two
occurrences of em dash in that node, plus the one
(mistaken) occurrence of en dash.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-04 20:49               ` Daniel Brooks
                                   ` (2 preceding siblings ...)
  2021-10-05 12:04                 ` Eli Zaretskii
@ 2021-10-05 21:20                 ` Richard Stallman
  2021-10-05 22:13                   ` Daniel Brooks
                                     ` (2 more replies)
  3 siblings, 3 replies; 120+ messages in thread
From: Richard Stallman @ 2021-10-05 21:20 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: eliz, monnier, emacs-devel

[[[ To any NSA and FBI agents reading my email: please consider    ]]]
[[[ whether defending the US Constitution against all enemies,     ]]]
[[[ foreign or domestic, requires you to follow Snowden's example. ]]]

  > Suppose this hypothetical contribution were a language mode for a
  > Japanese programming language, and thus had the same support profile?

I have to guess what a "Japanese programming language" would mean, but
I think you're talking about a mode for editing programs written in a
language whose symbols are meaningful in Japanese and perhaps written
in kana and kanji.

If so, this is a different issue from the one I thought we were
talking about.  What I said is that a GNU program must be written in
English, including its symbol names and comments.

The reason to require writing GNU programs in English is so that the
program can be clear to many programmers in every country in the
world.  In every country, a substantial fraction of programmers can
read English.  No other human language comes close.

We could conceivably add to Emacs a library which implements a mode
to edit a "Japanese programming language", but its symbols and
comments should be written in English.  And we would need the comments
to explain in English what it does and how it works.  That would
follow our rules.

We could conceivably add such a program to Emacs, but should we?  I
think it is not worth the trouble; I'd say, let's not.

You can write and distribute the program, and people could run it.
But we should not distribute programs we can't read.

  > I think that if I read between the lines, you are saying that the Emacs
  > project _could_ grow to become multi–lingual at all levels, with a
  > sufficient number of invested contributors who could each review and
  > maintain different parts of the code.

It would be an enormous effort -- just consider translating the
manuals.  And updating the translations for each Emacs version.  It
would be a big burden.  We should urge volunteers to work on
other areas of improvement.

What might be worth doing is to implement multilingual output
messages.  Many GNU packages support that, and Emacs could too.  With
GNU gettext, the program's developers don't need to get involved in
the translation, so it would not be a burden on us.

The hard part of this is to develop a way for Emacs to use gettext
and support translations of non-preloaded libraries.  This would
require gettext to be extensible in a new way.  But I am sure that
can be done, with some work.

-- 
Dr Richard Stallman (https://stallman.org)
Chief GNUisance of the GNU Project (https://gnu.org)
Founder, Free Software Foundation (https://fsf.org)
Internet Hall-of-Famer (https://internethalloffame.org)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-04 19:14               ` Yuri Khan
@ 2021-10-05 21:20                 ` Richard Stallman
  2021-10-06  3:48                   ` character sets as they relate to “Raw” " Matthew Carter
  0 siblings, 1 reply; 120+ messages in thread
From: Richard Stallman @ 2021-10-05 21:20 UTC (permalink / raw)
  To: Yuri Khan; +Cc: db48x, eliz, anna, emacs-devel

[[[ To any NSA and FBI agents reading my email: please consider    ]]]
[[[ whether defending the US Constitution against all enemies,     ]]]
[[[ foreign or domestic, requires you to follow Snowden's example. ]]]

  > > > PS: it occurs to me to wonder if my use of Unicode in the prose of this
  > > > message, outside of the examples, detracted from its readability in any
  > > > way?
  > >
  > > If someone is reading this on a text-mode terminal, it could.

  > We should probably invent a term more accurate than “text-mode
  > terminal” for things that fail to display text.

The usual term for these terminals is "Linux consoles" or "ttys".
Let's use those -- they are well-known and specific.

-- 
Dr Richard Stallman (https://stallman.org)
Chief GNUisance of the GNU Project (https://gnu.org)
Founder, Free Software Foundation (https://fsf.org)
Internet Hall-of-Famer (https://internethalloffame.org)





^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 21:20                 ` Richard Stallman
@ 2021-10-05 22:13                   ` Daniel Brooks
  2021-10-06 12:13                     ` Eli Zaretskii
  2021-10-05 22:25                   ` character sets as they relate to “Raw” " Stefan Kangas
  2021-10-06 11:53                   ` character sets as they relate to “Raw” " Eli Zaretskii
  2 siblings, 1 reply; 120+ messages in thread
From: Daniel Brooks @ 2021-10-05 22:13 UTC (permalink / raw)
  To: Richard Stallman; +Cc: eliz, monnier, emacs-devel

Richard Stallman <rms@gnu.org> writes:

> [[[ To any NSA and FBI agents reading my email: please consider    ]]]
> [[[ whether defending the US Constitution against all enemies,     ]]]
> [[[ foreign or domestic, requires you to follow Snowden's example. ]]]
>
>   > Suppose this hypothetical contribution were a language mode for a
>   > Japanese programming language, and thus had the same support profile?
>
> I have to guess what a "Japanese programming language" would mean, but
> I think you're talking about a mode for editing programs written in a
> language whose symbols are meaningful in Japanese and perhaps written
> in kana and kanji.

Correct. The idea is that this hypothetical Emacs feature would be
useful primarily to people who could already read and write Japanese,
and who thus would not be inconvenienced because the software was also
written in Japanese.

> We could conceivably add such a program to Emacs, but should we?  I
> think it is not worth the trouble; I'd say, let's not.
>
> You can write and distribute the program, and people could run it.
> But we should not distribute programs we can't read.

Fair enough; thanks for answering the question!

>   > I think that if I read between the lines, you are saying that the Emacs
>   > project _could_ grow to become multi–lingual at all levels, with a
>   > sufficient number of invested contributors who could each review and
>   > maintain different parts of the code.
>
> It would be an enormous effort -- just consider translating the
> manuals.  And updating the translations for each Emacs version. It
> would be a big burden.

Yes, that’s certainly true; the cost of getting complete parity between
English and a second language would be significant. However, I don’t
think that the ongoing costs would be insurmountable, assuming the
project attracted additional trusted and proven maintainers along with
each additional language. A few docstrings and manual pages get changed
in most versions, but not enough to make it impossible to keep up.

Eli Zaretskii <eliz@gnu.org> writes:

>> From: Daniel Brooks <db48x@db48x.net>
>> Cc: emacs-devel@gnu.org
>> Date: Mon, 04 Oct 2021 13:49:53 -0700
>>
>> Would there be any reason to turn away that contribution, or to make the
>> contributor rewrite it?
>
> I'm sorry, this is too abstract and theoretical issue, with many
> important details missing.  So I don't think it will be useful to
> seriously consider such a theoretical example.

That, however, is not a useful answer. :)

What assumptions would you need to make before you could answer yes?

Note that this is a purely hypothetical situation; aside from a
smattering of Latin and Greek that are useful for English etymology, I
cannot read or write any other languages. I don’t have a pile of code
written in Japanese that I’m going to spring on you if you find a way to
say yes. Instead I am looking ahead and wondering what the conditions
would have to be like 20 years from now for non–English code to start
showing up.

> It turns out there are more exceptions than we imagine.  We just now
> had another bug report, this time about Kitty terminal emulator, which
> has yet another set of issues with displaying non-ASCII characters
> from Emacs.  So much so that I was prompted to add an entry in
> etc/PROBLEMS with some workarounds for users of Kitty.  Granted, their
> problems are not that they don't support recently added Unicode
> characters, it's that they support them "too well".  But still, it
> doesn't help when the result is a messed-up display.
>
> Unicode is not a static target, it's a moving one.  They issue a new
> version of the standard twice a year, and each new version adds new
> codepoints with new attributes.  If a new version of Unicode adds
> double-width characters, and some terminal emulator doesn't keep up,
> you will have problems displaying those new codepoints.  (AFAIK,
> that's in essence the problem with the Linux console: they last
> updated when Unicode 5.0 was released.)

That’s an interesting point. On the one hand, the fact that the Linux
console is still using Unicode 5.0 shows just how unmaintained it is
(released in July 2006; the next Emacs release was 22.1 in 2007). On the
other hand, perhaps if problems like this keep cropping up we will have
to add encodings for older unicode versions. People using the Linux
console could set their terminal encoding to
'utf-8-unicode5.0. Characters added after that would show up escaped,
and Emacs would know what width the terminal was going to use for each
character.

> So it might be possible to say that many terminals support substantial
> portions of Unicode, but it definitely is NOT right to say that we can
> freely use any character we want and think they will work everywhere.

So one assumption that you might make is that new source code being
added to Emacs must use characters from a version of Unicode which is
known to have wide compatibility, rather than immediately jumping to the
bleeding–edge version? That would be perfectly reasonable.

db48x

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 21:20                 ` Richard Stallman
  2021-10-05 22:13                   ` Daniel Brooks
@ 2021-10-05 22:25                   ` Stefan Kangas
  2021-10-06  6:21                     ` Daniel Brooks
  2021-10-06 12:29                     ` Eli Zaretskii
  2021-10-06 11:53                   ` character sets as they relate to “Raw” " Eli Zaretskii
  2 siblings, 2 replies; 120+ messages in thread
From: Stefan Kangas @ 2021-10-05 22:25 UTC (permalink / raw)
  To: rms, Daniel Brooks; +Cc: eliz, monnier, emacs-devel

Richard Stallman <rms@gnu.org> writes:

> It would be an enormous effort -- just consider translating the
> manuals.  And updating the translations for each Emacs version.  It
> would be a big burden.  We should urge volunteers to work on
> other areas of improvement.

My native language is Swedish, and I do work professionally among other
things as a translator from English to Swedish.  I have also attempted
translating technical documentation, but usually give up because it's
just too hard.

Let me just say this, with regards to a fully multi-lingual Emacs.

In almost all free software projects I have seen, the Swedish
translation is shockingly poor.  It is so bad, in fact, that I often
think it would be better to just delete it outright.  This work is
usually done by volunteers that are trying their best, out of genuine
love and commitment, which obviously makes such a conversation somewhat
delicate.

Let me also point out that translation is hard, especially so of
technical documentation with all the terms it contains.  If you are not
a professional translator, you are very likely to fail badly, unless you
are prepared to spend a very substantial amount of time.  Even then,
success is anything but guaranteed: even "professional" translators can and
do fail horribly at times.

So given my experience, I seriously doubt that volunteer translations
will reach a sufficiently high quality.  At the very least, such an
effort should be organized and coordinated in a serious way on a
cross-project level.  Perhaps GNU as a whole is big enough to do it as a
project.  Maybe.  But Emacs?  I doubt it.

(All of the above comes with the caveat that my experience is obviously
strictly limited to Swedish.)

> What might be worth doing is to implement multilingual output
> messages.  Many GNU packages support that, and Emacs could too.  With
> GNU gettext, the program's developers don't need to get involved in
> the translation, so it would not be a burden on us.

Even multilingual messages would be a large effort, and in the case of
Emacs it is still not clear that it would be of much use.  You would
immediately run into the brick-wall of English documentation, English
symbols, etc. etc.

That said, it sounds infinitely more doable than translating all
documentation, especially if we limit the scope to just some major
languages like Mandarin, Arabic and Spanish.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 21:20                 ` Richard Stallman
@ 2021-10-06  3:48                   ` Matthew Carter
  0 siblings, 0 replies; 120+ messages in thread
From: Matthew Carter @ 2021-10-06  3:48 UTC (permalink / raw)
  To: Richard Stallman; +Cc: db48x, eliz, emacs-devel, anna, Yuri Khan

Richard Stallman <rms@gnu.org> writes:

> [[[ To any NSA and FBI agents reading my email: please consider    ]]]
> [[[ whether defending the US Constitution against all enemies,     ]]]
> [[[ foreign or domestic, requires you to follow Snowden's example. ]]]
>
>   > > > PS: it occurs to me to wonder if my use of Unicode in the prose of this
>   > > > message, outside of the examples, detracted from its readability in any
>   > > > way?
>   > >
>   > > If someone is reading this on a text-mode terminal, it could.
>
>   > We should probably invent a term more accurate than “text-mode
>   > terminal” for things that fail to display text.
>
> The usual term for these terminals is "Linux consoles" or "ttys".
> Let's use those -- they are well-known and specific.

On my tty (rxvt-unicode) in which I am reading this via 'emacs -nw', I'm
unable to see most of the non-ASCII characters; they simply show up as
unfilled white rectangles due to lack of support in my current font set.

-- 
Matthew Carter (m@ahungry.com)
http://ahungry.com



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 22:25                   ` character sets as they relate to “Raw” " Stefan Kangas
@ 2021-10-06  6:21                     ` Daniel Brooks
  2021-10-07 22:20                       ` Richard Stallman
  2021-10-06 12:29                     ` Eli Zaretskii
  1 sibling, 1 reply; 120+ messages in thread
From: Daniel Brooks @ 2021-10-06  6:21 UTC (permalink / raw)
  To: Stefan Kangas; +Cc: eliz, emacs-devel, rms, monnier

Stefan Kangas <stefankangas@gmail.com> writes:

> My native language is Swedish, and I do work professionally among other
> things as a translator from English to Swedish.  I have also attempted
> translating technical documentation, but usually give up because it's
> just too hard.
>
> Let me just say this, with regards to a fully multi-lingual Emacs.
>
> In almost all free software projects I have seen, the Swedish
> translation is shockingly poor.  It is so bad, in fact, that I often
> think it would be better to just delete it outright.  This work is
> usually done by volunteers that are trying their best, out of genuine
> love and commitment, which obviously makes such a conversation somewhat
> delicate.

Ouch. Things do look rather nicer in my optimistic visions.

> Even multilingual messages would be a large effort, and in the case of
> Emacs it is still not clear that it would be of much use.  You would
> immediately run into the brick-wall of English documentation, English
> symbols, etc. etc.

This is certainly true. In the ideal multilingual Emacs in my head
everything is translated, even the source code (well, at least the elisp
source). Every symbol has an English name as well as translated names,
and when you visit a source file the translated names for your language
are transparently swapped in and everything is reindented so that it
looks right. Then when you save, the reverse transformation is
applied. Translated command names would be used in the UI and
substituted into the documentation in the same way we already do for key
bindings.

Obviously since that can’t all happen at once there would be a
significant period of time when the translations would be very
incomplete. After catching up there would always be a fringe of new
contributions that aren’t yet fully translated; new contributions to
ELPA and so on.

db48x

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 21:20                 ` Richard Stallman
  2021-10-05 22:13                   ` Daniel Brooks
  2021-10-05 22:25                   ` character sets as they relate to “Raw” " Stefan Kangas
@ 2021-10-06 11:53                   ` Eli Zaretskii
  2 siblings, 0 replies; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-06 11:53 UTC (permalink / raw)
  To: rms; +Cc: db48x, monnier, emacs-devel

> From: Richard Stallman <rms@gnu.org>
> Cc: monnier@iro.umontreal.ca, eliz@gnu.org, emacs-devel@gnu.org
> Date: Tue, 05 Oct 2021 17:20:40 -0400
> 
> The hard part of this is to develop a way for Emacs to use gettext
> and support translations of non-preloaded libraries.  This would
> require gettext to be extensible in a new way.  But I am sure that
> can be done, with some work.

This was discussed several times in the past.  The discussions
identified several problems of various complexity, and some ideas to
solve them.  Basically, this job waits for motivated volunteers to
come forward and implement at least some of what was discussed.

Patches will be welcome.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 22:13                   ` Daniel Brooks
@ 2021-10-06 12:13                     ` Eli Zaretskii
  2021-10-06 18:57                       ` Daniel Brooks
  0 siblings, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-06 12:13 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: emacs-devel, rms, monnier

> From: Daniel Brooks <db48x@db48x.net>
> Cc: eliz@gnu.org,  monnier@iro.umontreal.ca,  emacs-devel@gnu.org
> Date: Tue, 05 Oct 2021 15:13:20 -0700
> 
> > So it might be possible to say that many terminals support substantial
> > portions of Unicode, but it definitely is NOT right to say that we can
> > freely use any character we want and think they will work everywhere.
> 
> So one assumption that you might make is that new source code being
> added to Emacs must use characters from a version of Unicode which is
> known to have wide compatibility, rather than immediately jumping to the
> bleeding–edge version? That would be perfectly reasonable.

Except that it will only work if you use a very recent Emacs.  People
are still using Emacs 24 out there, and many are using Emacs 26, which
supports Unicode 11.0 (the latest version is 14.0).

And that's before we even begin to think about people who use other
programs to look at our files.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 22:25                   ` character sets as they relate to “Raw” " Stefan Kangas
  2021-10-06  6:21                     ` Daniel Brooks
@ 2021-10-06 12:29                     ` Eli Zaretskii
  2021-10-06 12:52                       ` Stefan Kangas
  1 sibling, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-06 12:29 UTC (permalink / raw)
  To: Stefan Kangas; +Cc: db48x, emacs-devel, rms, monnier

> From: Stefan Kangas <stefankangas@gmail.com>
> Date: Tue, 5 Oct 2021 18:25:07 -0400
> Cc: eliz@gnu.org, monnier@iro.umontreal.ca, emacs-devel@gnu.org
> 
> So given my experience, I seriously doubt that volunteer translations
> will reach a sufficiently high quality.  At the very least, such an
> effort should be organized and coordinated in a serious way on a
> cross-project level.  Perhaps GNU as a whole is big enough to do it as a
> project.  Maybe.  But Emacs?  I doubt it.

We are currently talking about providing infrastructure for
translation.  As long as there's no such infrastructure, talking about
translations and their quality is not useful.

> Even multilingual messages would be a large effort, and in the case of
> Emacs it is still not clear that it would be of much use.  You would
> immediately run into the brick-wall of English documentation, English
> symbols, etc. etc.

This has been discussed.  The conclusions were that yes, if you want
to be able to translate all of Emacs, it is probably impractical.  But
translating some of it should be doable.  For example, echo-area
messages in Emacs are not different from messages shown by text-mode
programs, and those all have message catalogs translated into many
languages.  There's no reason enthusiasts shouldn't be able to do the
same for Emacs.  There's no reason to require or expect that every
message will be translated, either.

Yes, translation of program messages is hard, and there are many bad
translations.  But it is not impossible to produce good translations.

> That said, it sounds infinitely more doable than translating all
> documentation, especially if we limit the scope to just some major
> languages like Mandarin, Arabic and Spanish.

No one seriously talks about translating all our documentation.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-06 12:29                     ` Eli Zaretskii
@ 2021-10-06 12:52                       ` Stefan Kangas
  2021-10-06 13:10                         ` Jean-Christophe Helary
  0 siblings, 1 reply; 120+ messages in thread
From: Stefan Kangas @ 2021-10-06 12:52 UTC (permalink / raw)
  To: Eli Zaretskii
  Cc: Daniel Brooks, Emacs developers, Richard Stallman, Stefan Monnier

Eli Zaretskii <eliz@gnu.org> writes:

> We are currently talking about providing infrastructure for
> translation.  As long as there's no such infrastructure, talking about
> translations and their quality is not useful.

We have translations in etc/tutorials and etc/refcards.

> Yes, translation of program messages is hard, and there are many bad
> translations.  But it is not impossible to produce good translations.

I don't recall having used the word "impossible".  It seems to me that
we agree on this point.

> No one seriously talks about translating all our documentation.

That's good.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-06 12:52                       ` Stefan Kangas
@ 2021-10-06 13:10                         ` Jean-Christophe Helary
  0 siblings, 0 replies; 120+ messages in thread
From: Jean-Christophe Helary @ 2021-10-06 13:10 UTC (permalink / raw)
  To: Stefan Kangas
  Cc: Daniel Brooks, Eli Zaretskii, Richard Stallman, Stefan Monnier,
	Emacs developers



> On Oct 6, 2021, at 21:52, Stefan Kangas <stefankangas@gmail.com> wrote:
> 
>> 
>> No one seriously talks about translating all our documentation.
> 
> That's good.

I do. It is something that takes time, but Emacs was not born yesterday either.
There are (free) tools that would make that much easier/faster/more efficient than editing strings in po-mode.
I'll make a short presentation about that in the near future.

-- 
Jean-Christophe Helary @brandelune
https://mac4translators.blogspot.com
https://sr.ht/~brandelune/omegat-as-a-book/




^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-06 12:13                     ` Eli Zaretskii
@ 2021-10-06 18:57                       ` Daniel Brooks
  2021-10-07  4:23                         ` Eli Zaretskii
                                           ` (2 more replies)
  0 siblings, 3 replies; 120+ messages in thread
From: Daniel Brooks @ 2021-10-06 18:57 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: rms, monnier, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> From: Daniel Brooks <db48x@db48x.net>
>> Cc: eliz@gnu.org,  monnier@iro.umontreal.ca,  emacs-devel@gnu.org
>> Date: Tue, 05 Oct 2021 15:13:20 -0700
>> 
>> So one assumption that you might make is that new source code being
>> added to Emacs must use characters from a version of Unicode which is
>> known to have wide compatibility, rather than immediately jumping to the
>> bleeding–edge version? That would be perfectly reasonable.
>
> Except that it will only work if you use a very recent Emacs.  People
> are still using Emacs 24 out there, and many are using Emacs 26, which
> supports Unicode 11.0 (the latest version is 14.0).
>
> And that's before we even begin to think about people who use other
> programs to look at our files.

I’m afraid that this is not very convincing, for it means that we can
never move an inch.

db48x



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 17:15                     ` Eli Zaretskii
  2021-10-05 18:40                       ` [External] : " Drew Adams
@ 2021-10-06 20:54                       ` Richard Stallman
  2021-10-07  6:54                         ` Eli Zaretskii
  1 sibling, 1 reply; 120+ messages in thread
From: Richard Stallman @ 2021-10-06 20:54 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: db48x, yuri.v.khan, emacs-devel, monnier, juri

[[[ To any NSA and FBI agents reading my email: please consider    ]]]
[[[ whether defending the US Constitution against all enemies,     ]]]
[[[ foreign or domestic, requires you to follow Snowden's example. ]]]

  > > Such ugly writing style where an en dash is not separated from the
  > > nearby words by whitespace makes the Info manual less readable.

  >      As a last resort, if you had buffers with content which were not
  >   associated with any files, or if the autosave was not recent enough to
  >   have recorded important changes, you can use the ‘etc/emacs-buffer.gdb’
  >   script with GDB (the GNU Debugger) to retrieve them from a core
  >   dump–provided that a core dump was saved, and that the Emacs executable

How exactly did you obtain that text?  That ought to be an em-dash,
written in Texinfo source as `---'.  Normally Texinfo represents an
em-dash in ASCII output with two dashes, not just one.  It would be
`...from a core dump--provided that a core dump...'

-- 
Dr Richard Stallman (https://stallman.org)
Chief GNUisance of the GNU Project (https://gnu.org)
Founder, Free Software Foundation (https://fsf.org)
Internet Hall-of-Famer (https://internethalloffame.org)





^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-05 17:08                       ` Daniel Brooks
@ 2021-10-06 20:54                         ` Richard Stallman
  2021-10-07  7:01                           ` Eli Zaretskii
  0 siblings, 1 reply; 120+ messages in thread
From: Richard Stallman @ 2021-10-06 20:54 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: acm, eliz, monnier, emacs-devel

[[[ To any NSA and FBI agents reading my email: please consider    ]]]
[[[ whether defending the US Constitution against all enemies,     ]]]
[[[ foreign or domestic, requires you to follow Snowden's example. ]]]

  > Ah, I think I understand what you are getting at. You are asking why raw
  > strings should use ｢｣?

On a Linux console, that shows as two diamonds followed by a question mark.
That's no good for special syntactic delimiters.

Indeed, to use non-ASCII characters for such jobs would be a radical
change.  In things like this, we must play safe.

-- 
Dr Richard Stallman (https://stallman.org)
Chief GNUisance of the GNU Project (https://gnu.org)
Founder, Free Software Foundation (https://fsf.org)
Internet Hall-of-Famer (https://internethalloffame.org)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-06 18:57                       ` Daniel Brooks
@ 2021-10-07  4:23                         ` Eli Zaretskii
  2021-10-07 22:27                         ` Richard Stallman
  2021-10-08 10:37                         ` Po Lu
  2 siblings, 0 replies; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-07  4:23 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: rms, monnier, emacs-devel

> From: Daniel Brooks <db48x@db48x.net>
> Cc: emacs-devel@gnu.org,  rms@gnu.org,  monnier@iro.umontreal.ca
> Date: Wed, 06 Oct 2021 11:57:32 -0700
> 
> Eli Zaretskii <eliz@gnu.org> writes:
> 
> >> From: Daniel Brooks <db48x@db48x.net>
> >> Cc: eliz@gnu.org,  monnier@iro.umontreal.ca,  emacs-devel@gnu.org
> >> Date: Tue, 05 Oct 2021 15:13:20 -0700
> >> 
> >> So one assumption that you might make is that new source code being
> >> added to Emacs must use characters from a version of Unicode which is
> >> known to have wide compatibility, rather than immediately jumping to the
> >> bleeding–edge version? That would be perfectly reasonable.
> >
> > Except that it will only work if you use a very recent Emacs.  People
> > are still using Emacs 24 out there, and many are using Emacs 26, which
> > supports Unicode 11.0 (the latest version is 14.0).
> >
> > And that's before we even begin to think about people who use other
> > programs to look at our files.
> 
> I’m afraid that this is not very convincing, for it means that we can
> never move an inch.

No, it means we can never go all the way and forget that some displays
with some versions of Emacs might not be able to display some of the
characters.  So we should only use them sparingly and where necessary.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-06 20:54                       ` Richard Stallman
@ 2021-10-07  6:54                         ` Eli Zaretskii
  2021-10-07 13:14                           ` Stefan Kangas
  0 siblings, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-07  6:54 UTC (permalink / raw)
  To: rms; +Cc: db48x, yuri.v.khan, emacs-devel, monnier, juri

> From: Richard Stallman <rms@gnu.org>
> Cc: juri@linkov.net, db48x@db48x.net, emacs-devel@gnu.org,
> 	monnier@iro.umontreal.ca, yuri.v.khan@gmail.com
> Date: Wed, 06 Oct 2021 16:54:16 -0400
> 
>   >      As a last resort, if you had buffers with content which were not
>   >   associated with any files, or if the autosave was not recent enough to
>   >   have recorded important changes, you can use the ‘etc/emacs-buffer.gdb’
>   >   script with GDB (the GNU Debugger) to retrieve them from a core
>   >   dump–provided that a core dump was saved, and that the Emacs executable
> 
> How exactly did you obtain that text?  That ought to be an em-dash,
> written in Texinfo source as `---'.

The original text used '--', which produced en-dash instead.  (Stefan
fixed that since then.)

> Normally Texinfo represents an em-dash in ASCII output with two
> dashes, not just one.  It would be `...from a core dump-–provided
> that a core dump...'

That has changed, since we nowadays by default use UTF-8 encoding in
our Info manuals.  With that, '---' produces the Unicode em-dash
character, displayed as a wide dash, and '--' produces a Unicode
en-dash character, displayed as somewhat more narrow dash (but still
wider than the ASCII dash).



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-06 20:54                         ` Richard Stallman
@ 2021-10-07  7:01                           ` Eli Zaretskii
  0 siblings, 0 replies; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-07  7:01 UTC (permalink / raw)
  To: rms; +Cc: db48x, acm, monnier, emacs-devel

> From: Richard Stallman <rms@gnu.org>
> Cc: acm@muc.de, eliz@gnu.org, monnier@iro.umontreal.ca,
> 	emacs-devel@gnu.org
> Date: Wed, 06 Oct 2021 16:54:17 -0400
> 
>   > Ah, I think I understand what you are getting at. You are asking why raw
>   > strings should use ｢｣?
> 
> On a Linux console, that shows as two diamonds followed by a question mark.
> That's no good for special syntactic delimiters.
> 
> Indeed, to use non-ASCII characters for such jobs would be a radical
> change.  In things like this, we must play safe.

We can use the display-table feature to avoid the problems with such
characters.  We already do that with displaying doc strings in *Help*
buffers, and when displaying Info manuals: we display those characters
as their ASCII equivalents or equivalent ASCII strings.  Of course,
for this to work, Someone(TM) should identify the problematic
characters and program Emacs to set up the display tables accordingly.

The point to take from this discussion, IMO, is that we should
carefully consider introduction of each such character, and arrange
for the text using them to be legible on all supported display types.
We cannot just willy-nilly add them to our sources and documentation
and assume they will always be displayed correctly.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-07  6:54                         ` Eli Zaretskii
@ 2021-10-07 13:14                           ` Stefan Kangas
  2021-10-07 13:34                             ` Eli Zaretskii
  0 siblings, 1 reply; 120+ messages in thread
From: Stefan Kangas @ 2021-10-07 13:14 UTC (permalink / raw)
  To: Eli Zaretskii, rms; +Cc: db48x, juri, emacs-devel, monnier, yuri.v.khan

Eli Zaretskii <eliz@gnu.org> writes:

>> Normally Texinfo represents an em-dash in ASCII output with two
>> dashes, not just one.  It would be `...from a core dump-–provided
>> that a core dump...'

This does not seem to happen in (info "(texinfo) Conventions"):

    * Use three hyphens in a row, '---', to produce a long dash--like
      this (called an "em dash"), used for punctuation in sentences.

They use two HYPHEN-MINUS characters to represent an em-dash.  I also
quickly checked the gcc manual in the Debian stable package, and that
also has the correct em dashes.

> That has changed, since we nowadays by default use UTF-8 encoding in
> our Info manuals.  With that, '---' produces the Unicode em-dash
> character, displayed as a wide dash, and '--' produces a Unicode
> en-dash character, displayed as somewhat more narrow dash (but still
> wider than the ASCII dash).

IMHO, this is a bug that we should look into, as the correct style used
in the texinfo manual is more readable.  As Juri points out, it is not
well suited for a monospace font.

I guess texinfo would need some way to produce the previous style em
dashes, while still using utf-8?  Or something?

Or perhaps we could add some code to info.el to add a space on each side
of an em dash, but that seems like a bit of a hack.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-07 13:14                           ` Stefan Kangas
@ 2021-10-07 13:34                             ` Eli Zaretskii
  2021-10-07 14:48                               ` Stefan Kangas
  0 siblings, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-07 13:34 UTC (permalink / raw)
  To: Stefan Kangas; +Cc: rms, yuri.v.khan, juri, db48x, monnier, emacs-devel

> From: Stefan Kangas <stefankangas@gmail.com>
> Date: Thu, 7 Oct 2021 09:14:47 -0400
> Cc: db48x@db48x.net, yuri.v.khan@gmail.com, emacs-devel@gnu.org, 
> 	monnier@iro.umontreal.ca, juri@linkov.net
> 
> Eli Zaretskii <eliz@gnu.org> writes:
> 
> >> Normally Texinfo represents an em-dash in ASCII output with two
> >> dashes, not just one.  It would be `...from a core dump-–provided
> >> that a core dump...'
> 
> This does not seem to happen in (info "(texinfo) Conventions"):
> 
>     * Use three hyphens in a row, '---', to produce a long dash--like
>       this (called an "em dash"), used for punctuation in sentences.

You mean, you expected to see em dash there?  They deliberately used
@samp{---} to prevent that, because otherwise the text would be
confusing: it talks about typing 3 dashes in the Texinfo sources.  And
texinfo.texi doesn't have "@documentencoding UTF-8" which AFAIR is
required for the generation of non-ASCII characters from these
multiple dashes.

> They use two HYPHEN-MINUS characters to represents an em-dash.

You mean, 3, not 2, right?

> > That has changed, since we nowadays by default use UTF-8 encoding in
> > our Info manuals.  With that, '---' produces the Unicode em-dash
> > character, displayed as a wide dash, and '--' produces a Unicode
> > en-dash character, displayed as somewhat more narrow dash (but still
> > wider than the ASCII dash).
> 
> IMHO, this is a bug that we should look into, as the correct style used
> in the texinfo manual is more readable.  As Juri points out, it is not
> well suited for a monospace font.

What is the bug that you want to fix here?  I'm not sure I understand.

> I guess texinfo would need some way to produce the previous style em
> dashes, while still using utf-8?  Or something?
> 
> Or perhaps we could add some code info.el to add a space on each side of
> an em dash, but that seems like bit of a hack.

I don't really see what needs to be fixed here.  The original Texinfo
source doesn't have the spaces, according to the US English
conventions we use.  And the produced text also doesn't have any
spaces.  So we get back what we asked for, and Texinfo isn't the one
to blame: it just did what we told it to do.

Or what am I missing?



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-07 13:34                             ` Eli Zaretskii
@ 2021-10-07 14:48                               ` Stefan Kangas
  2021-10-07 16:00                                 ` Eli Zaretskii
  0 siblings, 1 reply; 120+ messages in thread
From: Stefan Kangas @ 2021-10-07 14:48 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: rms, yuri.v.khan, juri, db48x, monnier, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> This does not seem to happen in (info "(texinfo) Conventions"):
>>
>>     * Use three hyphens in a row, '---', to produce a long dash--like
>>       this (called an "em dash"), used for punctuation in sentences.
>
> You mean, you expected to see em dash there?  They deliberately used
> @samp{---} to prevent that, because otherwise the text would be
> confusing: it talks about typing 3 dashes in the Texinfo sources.  And
> texinfo.texi doesn't have "@documentencoding UTF-8" which AFAIR is
> required for the generation of non-ASCII characters from these
> multiple dashes.

Sorry, I actually meant that I think the above looks like I think it
should.

- In the first case, they use "@samp{---}" and it displays correctly.
- In the second case, they use "---" and it displays correctly (as two
  HYPHEN-MINUS).

IOW, when I wrote "that does not seem to happen" what I was trying to
say was: "in the texinfo manual, the display looks correct to me", but I
managed to say what I meant in a rather confusing and unclear way.

> What is the bug that you want to fix here?  I'm not sure I understand.

Yes, that's my bad for not speaking clearly.  :-)

What I mean is that I think it would be better if our manuals displayed
em dash (written as "---") as they are displayed in the texinfo manual:
"--" (HYPHEN-MINUS, HYPHEN-MINUS), instead of as "—" (EM DASH).  I find
the former way to display this character easier to read in the monospace
fonts that we typically use.

Alternatively, I think it would also be easier to read if it was
displayed as " — " (SPACE, EM DASH, SPACE).

If we had variable-width fonts here, I expect that "—" (EM DASH) would be fine in
most commonly used fonts.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-07 14:48                               ` Stefan Kangas
@ 2021-10-07 16:00                                 ` Eli Zaretskii
  2021-10-08  0:37                                   ` Stefan Kangas
  0 siblings, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-07 16:00 UTC (permalink / raw)
  To: Stefan Kangas; +Cc: rms, yuri.v.khan, juri, db48x, monnier, emacs-devel

> From: Stefan Kangas <stefankangas@gmail.com>
> Date: Thu, 7 Oct 2021 10:48:09 -0400
> Cc: rms@gnu.org, db48x@db48x.net, yuri.v.khan@gmail.com, emacs-devel@gnu.org, 
> 	monnier@iro.umontreal.ca, juri@linkov.net
> 
> - In the first case, they use "@samp{---}" and it displays correctly.
> - In the second case, they use "---" and it displays correctly (as two
>   HYPHEN-MINUS).

The second one is because the Texinfo manual intentionally doesn't use
UTF-8 as @documentencoding.  Whereas we do (also intentionally).

> What I mean is that I think it would be better if our manuals displayed
> em dash (written as "---") as they are displayed in the texinfo manual:
> "--" (HYPHEN-MINUS, HYPHEN-MINUS), instead of as "—" (EM DASH).  I find
> the former way to display this character easier to read in the monospace
> fonts that we typically use.

Others disagreed at the time, and so we decided quite some time ago to
use @documentencoding UTF-8 in all our manuals.  (It was not only
about the dashes; UTF-8 encoding causes quite a lot of other Unicode
characters to be output by makeinfo.)  I see no reason to reverse that
decision (and start all those arguments all over again).



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-06  6:21                     ` Daniel Brooks
@ 2021-10-07 22:20                       ` Richard Stallman
  0 siblings, 0 replies; 120+ messages in thread
From: Richard Stallman @ 2021-10-07 22:20 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: eliz, emacs-devel, stefankangas, monnier

[[[ To any NSA and FBI agents reading my email: please consider    ]]]
[[[ whether defending the US Constitution against all enemies,     ]]]
[[[ foreign or domestic, requires you to follow Snowden's example. ]]]

  > This is certainly true. In the ideal multilingual Emacs in my head
  > everything is translated, even the source code (well, at least the elisp
  > source). Every symbol has an English name as well as translated names,
  > and when you visit a source file the translated names for your language
  > are transparently swapped in and everything is reindented so that it
  > looks right.

If the idea is that people write the code in English (symbols and
comments), and later provide translations for them, it would be fine
_in principle_.  The English version should be the real code, the code
that really runs.  This way, confusions in translation would not cause
actual bugs.  Thus, people changing the code would have to write in
English.

Providing the translations could provide a way to usefully employ the
good will of tens of thousands of eager contributors, if only we had
them.

Still, I am against actually trying this, because it would not improve
actual _use_ of Emacs.  Anyway, programmers need to learn English.

-- 
Dr Richard Stallman (https://stallman.org)
Chief GNUisance of the GNU Project (https://gnu.org)
Founder, Free Software Foundation (https://fsf.org)
Internet Hall-of-Famer (https://internethalloffame.org)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-06 18:57                       ` Daniel Brooks
  2021-10-07  4:23                         ` Eli Zaretskii
@ 2021-10-07 22:27                         ` Richard Stallman
  2021-10-08 10:37                         ` Po Lu
  2 siblings, 0 replies; 120+ messages in thread
From: Richard Stallman @ 2021-10-07 22:27 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: eliz, monnier, emacs-devel

[[[ To any NSA and FBI agents reading my email: please consider    ]]]
[[[ whether defending the US Constitution against all enemies,     ]]]
[[[ foreign or domestic, requires you to follow Snowden's example. ]]]

  > > And that's before we even begin to think about people who use other
  > > programs to look at our files.

  > I’m afraid that this is not very convincing, for it means that we can
  > never move an inch.

We're simply not convinced we want to move in that direction at all.

-- 
Dr Richard Stallman (https://stallman.org)
Chief GNUisance of the GNU Project (https://gnu.org)
Founder, Free Software Foundation (https://fsf.org)
Internet Hall-of-Famer (https://internethalloffame.org)





^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-07 16:00                                 ` Eli Zaretskii
@ 2021-10-08  0:37                                   ` Stefan Kangas
  2021-10-08  6:53                                     ` Eli Zaretskii
  0 siblings, 1 reply; 120+ messages in thread
From: Stefan Kangas @ 2021-10-08  0:37 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: rms, yuri.v.khan, juri, db48x, monnier, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

> The second one is because the Texinfo manual intentionally doesn't use
> UTF-8 as @documentencoding.  Whereas we do (also intentionally).

Right, thanks for confirming that.

>> What I mean is that I think it would be better if our manuals displayed
>> em dash (written as "---") as they are displayed in the texinfo manual:
>> "--" (HYPHEN-MINUS, HYPHEN-MINUS), instead of as "—" (EM DASH).  I find
>> the former way to display this character easier to read in the monospace
>> fonts that we typically use.
>
> Others disagreed at the time, and so we decided quite some time ago to
> use @documentencoding UTF-8 in all our manuals.  (It was not only
> about the dashes; UTF-8 encoding causes quite a lot of other Unicode
> characters to be output by makeinfo.)  I see no reason to reverse that
> decision (and start all those arguments all over again).

I also see no reason to reverse that decision, if the particular case of
how em dash is displayed was already considered in detail as part of
that discussion.

If that case was not considered in detail, perhaps we could discuss it
now.  I would hope that we could agree that how em dash is displayed is
not necessarily strictly connected to "@documentencoding UTF-8"; and
that it would be useful to continue using UTF-8 encoding, but also get
the "old" way of displaying em dash.

Maybe that would require us to use an existing option in texinfo, or
maybe this would need the texinfo developers to provide a new option
that could support it.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-08  0:37                                   ` Stefan Kangas
@ 2021-10-08  6:53                                     ` Eli Zaretskii
  2021-10-08 15:09                                       ` Display of em dashes in our documentation Stefan Kangas
  2021-10-08 17:17                                       ` character sets as they relate to “Raw” string literals for elisp Alan Mackenzie
  0 siblings, 2 replies; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-08  6:53 UTC (permalink / raw)
  To: Stefan Kangas; +Cc: rms, yuri.v.khan, juri, db48x, monnier, emacs-devel

> From: Stefan Kangas <stefankangas@gmail.com>
> Date: Thu, 7 Oct 2021 20:37:19 -0400
> Cc: rms@gnu.org, db48x@db48x.net, yuri.v.khan@gmail.com, emacs-devel@gnu.org, 
> 	monnier@iro.umontreal.ca, juri@linkov.net
> 
> >> What I mean is that I think it would be better if our manuals displayed
> >> em dash (written as "---") as they are displayed in the texinfo manual:
> >> "--" (HYPHEN-MINUS, HYPHEN-MINUS), instead of as "—" (EM DASH).  I find
> >> the former way to display this character easier to read in the monospace
> >> fonts that we typically use.
> >
> > Others disagreed at the time, and so we decided quite some time ago to
> > use @documentencoding UTF-8 in all our manuals.  (It was not only
> > about the dashes; UTF-8 encoding causes quite a lot of other Unicode
> > characters to be output by makeinfo.)  I see no reason to reverse that
> > decision (and start all those arguments all over again).
> 
> I also see no reason to reverse that decision, if the particular case of
> how em dash is displayed was already considered in detail as part of
> that discussion.
> 
> If that case was not considered in detail, perhaps we could discuss it
> now.

I'd rather not start another discussion of this, as opinions tend to
be polarized about it, and IME nothing can bridge over the differences
of opinions in this matter.  So I prefer a different way of handling
this, see below.

> I would hope that we could agree that how em dash is displayed is
> not necessarily strictly connected to "@documentencoding UTF-8"; and
> that it would be useful to continue using UTF-8 encoding, but also get
> the "old" way of displaying em dash.

Many people want to use and see Unicode punctuation characters in
human-readable text.  You can see that clearly in the Emacs mailing
lists: people use Unicode quotes “..”, dashes, Emoji, and other
special characters.  Since Info is largely such a human-readable text,
those people want to see the same there.  I don't see any way of
convincing them to change their views, nor do I think we should try.

> Maybe that would require us to use an existing option in texinfo, or
> maybe this would need the texinfo developers to provide a new option
> that could support it.

Even if such an option existed, it would still beg the question: how
to produce the Info manuals we provide as part of the Emacs release
tarballs?  The downside of any decision in this matter is that it is
imposed on everyone, no matter what their views on this.

So I'd prefer to deal with this differently: introduce a new
(buffer-local) minor mode, which will install a display-table, whereby
"problematic" Unicode characters will be displayed as their ASCII
equivalents or equivalent ASCII strings.  We already set that up
automatically on terminals that are incapable of displaying those
characters, but nothing precludes us from having such a feature on
demand for capable displays as well.  Then users who don't want the
effects of these characters on display could activate such a mode, and
solve their problems without affecting the actual contents of the Info
files.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-06 18:57                       ` Daniel Brooks
  2021-10-07  4:23                         ` Eli Zaretskii
  2021-10-07 22:27                         ` Richard Stallman
@ 2021-10-08 10:37                         ` Po Lu
  2021-10-08 10:53                           ` Basil L. Contovounesios
  2 siblings, 1 reply; 120+ messages in thread
From: Po Lu @ 2021-10-08 10:37 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: Eli Zaretskii, rms, monnier, emacs-devel

Daniel Brooks <db48x@db48x.net> writes:

>> Except that it will only work if you use a very recent Emacs.  People
>> are still using Emacs 24 out there, and many are using Emacs 26, which
>> supports Unicode 11.0 (the latest version is 14.0).
>>
>> And that's before we even begin to think about people who use other
>> programs to look at our files.
>
> I’m afraid that this is not very convincing, for it means that we can
> never move an inch.

I personally know a dozen or so people who are still running Emacs 26,
and a few others who use whatever Debian ships, which is typically 25 or
something to that effect.

Thanks.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-08 10:37                         ` Po Lu
@ 2021-10-08 10:53                           ` Basil L. Contovounesios
  2021-10-08 11:27                             ` tomas
  0 siblings, 1 reply; 120+ messages in thread
From: Basil L. Contovounesios @ 2021-10-08 10:53 UTC (permalink / raw)
  To: Po Lu; +Cc: Daniel Brooks, Eli Zaretskii, rms, monnier, emacs-devel

Po Lu [2021-10-08 18:37 +0800] wrote:

> I personally know a dozen or so people who are still running Emacs 26,
> and a few others who use whatever Debian ships, which is typically 25 or
> something to that effect.

FWIW, the previous Debian Stable release ships 26.1, and the current one
27.1.

-- 
Basil



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-08 10:53                           ` Basil L. Contovounesios
@ 2021-10-08 11:27                             ` tomas
  0 siblings, 0 replies; 120+ messages in thread
From: tomas @ 2021-10-08 11:27 UTC (permalink / raw)
  To: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 500 bytes --]

On Fri, Oct 08, 2021 at 11:53:23AM +0100, Basil L. Contovounesios wrote:
> Po Lu [2021-10-08 18:37 +0800] wrote:
> 
> > I personally know a dozen or so people who are still running Emacs 26,
> > and a few others who use whatever Debian ships, which is typically 25 or
> > something to that effect.
> 
> FWIW, the previous Debian Stable release ships 26.1, and the current one
> 27.1.

Yeah, had to look it up. Thanks for setting it straight.

That's how memes spread :-/

Cheers
 - t

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Display of em dashes in our documentation
  2021-10-08  6:53                                     ` Eli Zaretskii
@ 2021-10-08 15:09                                       ` Stefan Kangas
  2021-10-08 16:12                                         ` Eli Zaretskii
  2021-10-08 17:17                                       ` character sets as they relate to “Raw” string literals for elisp Alan Mackenzie
  1 sibling, 1 reply; 120+ messages in thread
From: Stefan Kangas @ 2021-10-08 15:09 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: rms, yuri.v.khan, juri, db48x, monnier, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

> Even if such an option existed, it would still beg the question: how
> to produce the Info manuals we provide as part of the Emacs release
> tarballs?  The downside of any decision in this matter is that it is
> imposed on everyone, no matter what their views on this.

OTOH, they already get the old/readable em dash rendering in many other
GNU manuals.  So perhaps not many people are very invested in the issue?

> So I'd prefer to deal with this differently: introduce a new
> (buffer-local) minor mode, which will install a display-table, whereby
> "problematic" Unicode characters will be displayed as their ASCII
> equivalents or equivalent ASCII strings.  We already set that up
> automatically on terminals that are incapable of displaying those
> characters, but nothing precludes us from having such a feature on
> demand for capable displays as well.  Then users who don't want the
> effects of these characters on display could activate such a mode, and
> solve their problems without affecting the actual contents of the Info
> files.

That's an interesting idea.  So you propose that this minor mode could
display many more problematic Unicode characters in a different way, and
that it would not necessarily be limited to info?

I guess it doesn't take much coding, but the interesting question is
which other Unicode characters it should cover.  I only know about em
dash; are there others?

One drawback is that em dash is only confirmed to be problematic in some
situations; that is when they are written "like—this" with no space in
between, whereas in situations "like — this" I think it is much
preferable to show the actual Unicode character.

Actually, this gives me another way of how we could do it: in our info
manuals, as the final step before displaying it, we do something like
this (here quickly coded up as a hook):

    (defun sk/Info-fix-em-dashes ()
      "Surround unspaced em dashes in the current buffer with spaces.
    An em dash is rewritten only when it sits directly between two
    characters that are each a letter or a quote character, so
    \"like—this\" becomes \"like — this\" while an em dash that already
    has whitespace around it is left alone.  This modifies the buffer
    text, temporarily overriding `buffer-read-only'."
      (save-excursion
        (goto-char (point-min))
        (let ((case-fold-search t)      ; let [a-z] match capitals too
              (buffer-read-only nil))
          ;; Group 1: a letter or closing quote before the dash.
          ;; Group 2: a letter or opening quote after the dash.  The quote
          ;; characters must be inside the bracket expression; outside it
          ;; they would be required as a literal sequence.
          (while (re-search-forward "\\([a-z”’']\\)—\\([a-z“‘`']\\)" nil t)
            (replace-match "\\1 — \\2" t)))))

    ;; Register the fixer on `Info-selection-hook' (presumably run each
    ;; time Info selects a node -- confirm against info.el).
    (add-hook 'Info-selection-hook #'sk/Info-fix-em-dashes)

    ;; Open a node whose text contains unspaced em dashes, to try it out.
    (info "(emacs) After a Crash")

This could be made optional, or even the default if we agree that it is
more readable than what we have now.

In any monospace font, I certainly prefer this:

       When ‘recover-session’ is done, the files you’ve chosen to recover
    are present in Emacs buffers.  You should then save them.  Only
    this — saving them — updates the files themselves.

To this:

       When ‘recover-session’ is done, the files you’ve chosen to recover
    are present in Emacs buffers.  You should then save them.  Only
    this—saving them—updates the files themselves.

That way, everyone gets readable text but also UTF-8 characters.
Win-win.  Perhaps this is even something the Texinfo developers would be
willing to consider.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: Display of em dashes in our documentation
  2021-10-08 15:09                                       ` Display of em dashes in our documentation Stefan Kangas
@ 2021-10-08 16:12                                         ` Eli Zaretskii
  2021-10-08 17:17                                           ` Stefan Kangas
                                                             ` (2 more replies)
  0 siblings, 3 replies; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-08 16:12 UTC (permalink / raw)
  To: Stefan Kangas; +Cc: rms, yuri.v.khan, juri, db48x, monnier, emacs-devel

> From: Stefan Kangas <stefankangas@gmail.com>
> Date: Fri, 8 Oct 2021 11:09:13 -0400
> Cc: rms@gnu.org, db48x@db48x.net, yuri.v.khan@gmail.com, emacs-devel@gnu.org, 
> 	monnier@iro.umontreal.ca, juri@linkov.net
> 
> > So I'd prefer to deal with this differently: introduce a new
> > (buffer-local) minor mode, which will install a display-table, whereby
> > "problematic" Unicode characters will be displayed as their ASCII
> > equivalents or equivalent ASCII strings.  We already set that up
> > automatically on terminals that are incapable of displaying those
> > characters, but nothing precludes us from having such a feature on
> > demand for capable displays as well.  Then users who don't want the
> > effects of these characters on display could activate such a mode, and
> > solve their problems without affecting the actual contents of the Info
> > files.
> 
> That's an interesting idea.  So you propose that this minor mode could
> display many more problematic Unicode characters in a different way, and
> that it would not necessarily be limited to info?

Yes.

> I guess it doesn't take much coding, but the interesting question is
> which other Unicode characters it should cover.  I only know about em
> dash; are there others?

We could let users customize the list in some way.

> One drawback is that em dash is only confirmed to be problematic in some
> situations; that is when they are written "like—this" with no space in
> between, whereas in situations "like — this" I think it is much
> preferable to show the actual Unicode character.

That's splitting hairs, IMO.  The latter should never happen in a
well-written manual.

> Actually, this gives me another way of how we could do it: in our info
> manuals, as the final step before displaying it, we do something like
> this (here quickly coded up as a hook):
> 
>     (defun sk/Info-fix-em-dashes ()
>       (save-excursion
>         (goto-char (point-min))
>         (let ((case-fold-search t)
>               (buffer-read-only nil))
>           (while (re-search-forward "\\([a-z”’']\\)—\\([a-z]“‘`'\\)" nil t)
>             (replace-match "\\1 — \\2" t)))))
> 
>     (add-hook 'Info-selection-hook #'sk/Info-fix-em-dashes)

What will that do to byte offsets in Info tag tables?  I'd rather
avoid modifying the buffer contents.

> In any monospace font, I certainly prefer this:
> 
>        When ‘recover-session’ is done, the files you’ve chosen to recover
>     are present in Emacs buffers.  You should then save them.  Only
>     this — saving them — updates the files themselves.
> 
> To this:
> 
>        When ‘recover-session’ is done, the files you’ve chosen to recover
>     are present in Emacs buffers.  You should then save them.  Only
>     this—saving them—updates the files themselves.

But that's against our style of writing documents, isn't it?  I
believe the usual US English style is not to leave whitespace around
em dash.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: Display of em dashes in our documentation
  2021-10-08 16:12                                         ` Eli Zaretskii
@ 2021-10-08 17:17                                           ` Stefan Kangas
  2021-10-10  8:00                                             ` Juri Linkov
  2021-10-08 17:27                                           ` Daniel Brooks
  2021-10-08 18:26                                           ` [External] : " Drew Adams
  2 siblings, 1 reply; 120+ messages in thread
From: Stefan Kangas @ 2021-10-08 17:17 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: rms, yuri.v.khan, juri, db48x, monnier, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> One drawback is that em dash is only confirmed to be problematic in some
>> situations; that is when they are written "like—this" with no space in
>> between, whereas in situations "like — this" I think it is much
>> preferable to show the actual Unicode character.
>
> That's splitting hairs, IMO.  The latter should never happen in a
> well-written manual.

Even if your second claim is true, your proposal, IIUC, is that this
mode could be used even outside of Info-mode.  If we introduce a mode
that fixes this in some cases, there is a risk that it will lead to
suboptimal results in others.  I do not think that pointing this out is
unimportant or "splitting hairs".

> What will that do to byte offsets in Info tag tables?  I'd rather
> avoid modifying the buffer contents.

What do you mean by "byte offsets in Info tag tables"?  Do you mean that
this approach risks leaving a table misaligned?  If so, I think that is
correct, and clearly a drawback.  I don't see an easy way around it with
this approach (but I also don't see a scenario when you would properly
use an em dash in a table).

I agree that it would be better not to modify the buffer contents, but
IIUC that would require changes in Texinfo to support this use-case.

>> In any monospace font, I certainly prefer this:
>>
>>        When ‘recover-session’ is done, the files you’ve chosen to recover
>>     are present in Emacs buffers.  You should then save them.  Only
>>     this — saving them — updates the files themselves.
[...]
>
> But that's against our style of writing documents, isn't it?  I
> believe the usual US English style is not to leave whitespace around
> em dash.

We have discussed this up-thread, and the situation is clear: the most
common style in printed books is to not use whitespace, whereas in
papers and magazines the most common style is to use whitespace.  Both
approaches are valid and commonly used in properly written English.

AFAIK, there is no consensus about how this should be handled when you
render text in a monospace font for display on a screen.  No one has
presented any evidence that such a consensus exists.

I don't know, but I assume that the reason that Texinfo doesn't leave
space around em dash is because this is considered undesirable in
printed manuals.  But I believe treating the printed manual as exactly
analogous to the screen is a mistake here; the practical considerations
that made Texinfo render em dash as "--" in the past still apply when
using UTF-8, as long as the result is intended for display in a
mono-space font.  The better solution in that case is to render it as
" — " or "--", even when the rest of the text is UTF-8.

But with all this, I am actually beginning to wonder if this shouldn't
properly be fixed in Texinfo itself.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-08  6:53                                     ` Eli Zaretskii
  2021-10-08 15:09                                       ` Display of em dashes in our documentation Stefan Kangas
@ 2021-10-08 17:17                                       ` Alan Mackenzie
  2021-10-08 17:42                                         ` Eli Zaretskii
  1 sibling, 1 reply; 120+ messages in thread
From: Alan Mackenzie @ 2021-10-08 17:17 UTC (permalink / raw)
  To: Eli Zaretskii
  Cc: emacs-devel, rms, juri, db48x, Stefan Kangas, yuri.v.khan,
	monnier

Hello, Eli.

On Fri, Oct 08, 2021 at 09:53:54 +0300, Eli Zaretskii wrote:
> > From: Stefan Kangas <stefankangas@gmail.com>
> > Date: Thu, 7 Oct 2021 20:37:19 -0400
> > Cc: rms@gnu.org, db48x@db48x.net, yuri.v.khan@gmail.com, emacs-devel@gnu.org, 
> > 	monnier@iro.umontreal.ca, juri@linkov.net

[ .... ]

> I'd rather not start another discussion of this [How to display em dash
> in info], as opinions tend to be polarized about it, and IME nothing
> can bridge over the differences of opinions in this matter.  So I
> prefer a different way of handling this, see below.

> > I would hope that we could agree that how em dash is displayed is
> > not necessarily strictly connected to "@documentencoding UTF-8"; and
> > that it would be useful to continue using UTF-8 encoding, but also get
> > the "old" way of displaying em dash.

> Many people want to use and see Unicode punctuation characters in
> human-readable text.

That's a deliciously ambiguous sentence, with two opposite meanings.  :-)

I belong to the group of people who would rather see Unicode punctuation
rendered in human-readable (i.e. ASCII) text.

[ .... ]

> So I'd prefer to deal with this differently: introduce a new
> (buffer-local) minor mode, which will install a display-table, whereby
> "problematic" Unicode characters will be displayed as their ASCII
> equivalents or equivalent ASCII strings.  We already set that up
> automatically on terminals that are incapable of displaying those
> characters, but nothing precludes us from having such a feature on
> demand for capable displays as well.  Then users who don't want the
> effects of these characters on display could activate such a mode, and
> solve their problems without affecting the actual contents of the Info
> files.

I would suggest something slightly different which will solve the entire
problem rather than just part of it.  Have the minor mode translate the
buffer text (temporarily) into ASCII rather than just displaying it thus.
That way the user can search for `foo' or ... simply by using C-s.

-- 
Alan Mackenzie (Nuremberg, Germany).



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: Display of em dashes in our documentation
  2021-10-08 16:12                                         ` Eli Zaretskii
  2021-10-08 17:17                                           ` Stefan Kangas
@ 2021-10-08 17:27                                           ` Daniel Brooks
  2021-10-08 18:26                                           ` [External] : " Drew Adams
  2 siblings, 0 replies; 120+ messages in thread
From: Daniel Brooks @ 2021-10-08 17:27 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel, rms, juri, Stefan Kangas, yuri.v.khan, monnier

Eli Zaretskii <eliz@gnu.org> writes:

>> From: Stefan Kangas <stefankangas@gmail.com>
>> In any monospace font, I certainly prefer this:
>> 
>>        When ‘recover-session’ is done, the files you’ve chosen to recover
>>     are present in Emacs buffers.  You should then save them.  Only
>>     this — saving them — updates the files themselves.
>> 
>> To this:
>> 
>>        When ‘recover-session’ is done, the files you’ve chosen to recover
>>     are present in Emacs buffers.  You should then save them.  Only
>>     this—saving them—updates the files themselves.
>
> But that's against our style of writing documents, isn't it?  I
> believe the usual US English style is not to leave whitespace around
> em dash.

I agree that there is no universally consistent style here, but that’s
only because half the world is demonstrably mad. In a monospaced font,
putting spaces around the dashes is the only thing that signals that
they aren’t being used as a hyphen. In a variable-pitch font then it is
probably fine not to put any spaces. But if you’re using an en dash in
this role instead, then spaces are probably necessary again. Personally
I prefer _not_ to use dashes of either width for this purpose;
semicolons are less ambiguous.

On the gripping hand, that particular sentence could easily be rewritten
to avoid the use of either. Might even be an improvement.

db48x

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-08 17:17                                       ` character sets as they relate to “Raw” string literals for elisp Alan Mackenzie
@ 2021-10-08 17:42                                         ` Eli Zaretskii
  2021-10-08 18:47                                           ` Eli Zaretskii
  0 siblings, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-08 17:42 UTC (permalink / raw)
  To: Alan Mackenzie
  Cc: emacs-devel, rms, juri, db48x, stefankangas, yuri.v.khan, monnier

> Date: Fri, 8 Oct 2021 17:17:36 +0000
> Cc: Stefan Kangas <stefankangas@gmail.com>, rms@gnu.org,
>   yuri.v.khan@gmail.com, juri@linkov.net, db48x@db48x.net,
>   monnier@iro.umontreal.ca, emacs-devel@gnu.org
> From: Alan Mackenzie <acm@muc.de>
> 
> I would suggest something slightly different which will solve the entire
> problem rather than just part of it.  Have the minor mode translate the
> buffer text (temporarily) into ASCII rather than just displaying it thus.
> That way the user can search for `foo' or ... simply by using C-s.

Modifying the text of an Info manual is a bad idea for the reasons I
explained in an earlier message.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* RE: [External] : Re: Display of em dashes in our documentation
  2021-10-08 16:12                                         ` Eli Zaretskii
  2021-10-08 17:17                                           ` Stefan Kangas
  2021-10-08 17:27                                           ` Daniel Brooks
@ 2021-10-08 18:26                                           ` Drew Adams
  2 siblings, 0 replies; 120+ messages in thread
From: Drew Adams @ 2021-10-08 18:26 UTC (permalink / raw)
  To: Eli Zaretskii, Stefan Kangas
  Cc: rms@gnu.org, yuri.v.khan@gmail.com, juri@linkov.net,
	db48x@db48x.net, monnier@iro.umontreal.ca, emacs-devel@gnu.org

> I believe the usual US English style is not to leave
> whitespace around  em dash.

I already spoke to this, back in the original
thread, "character sets as they relate to “Raw”
string literals for elisp".  Thin space OR no
space is what's used, in typesetting.

But we're not typesetting, here.  Texinfo can
produce typeset print-ready output also, but
this is about on-screen use of Info in Emacs.

And for fixed-width fonts it makes sense to use
a (normal, full) space char.  It makes no sense
to not show any space for a fixed-width font,
especially since the em dash is itself the same
width as other chars.  There's zero difference
in width between en and em dash in a fixed-width
font, AFAICT.

By default, Info uses a fixed-width font.

Would it hurt for our manuals to just use a
regular  space to surround em dash (which, as
you say, is not typical for typeset text with
variable-width fonts)?  I don't think so.

And given that the default is fixed-width, and
most users will not customize the Info fonts to
use variable-width, surrounding em dash with a
regular space char in Emacs Info just makes sense.
IMHO.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-08 17:42                                         ` Eli Zaretskii
@ 2021-10-08 18:47                                           ` Eli Zaretskii
  2021-10-08 20:01                                             ` Alan Mackenzie
  0 siblings, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-08 18:47 UTC (permalink / raw)
  To: acm; +Cc: emacs-devel, rms, yuri.v.khan, db48x, stefankangas, juri, monnier

> Date: Fri, 08 Oct 2021 20:42:44 +0300
> From: Eli Zaretskii <eliz@gnu.org>
> Cc: emacs-devel@gnu.org, rms@gnu.org, juri@linkov.net, db48x@db48x.net,
>  stefankangas@gmail.com, yuri.v.khan@gmail.com, monnier@iro.umontreal.ca
> 
> > Date: Fri, 8 Oct 2021 17:17:36 +0000
> > Cc: Stefan Kangas <stefankangas@gmail.com>, rms@gnu.org,
> >   yuri.v.khan@gmail.com, juri@linkov.net, db48x@db48x.net,
> >   monnier@iro.umontreal.ca, emacs-devel@gnu.org
> > From: Alan Mackenzie <acm@muc.de>
> > 
> > I would suggest something slightly different which will solve the entire
> > problem rather than just part of it.  Have the minor mode translate the
> > buffer text (temporarily) into ASCII rather than just displaying it thus.
> > That way the user can search for `foo' or ... simply by using C-s.
> 
> Modifying the text of an Info manual is a bad idea for the reasons I
> explained in an earlier message.

And, of course, if you must have only ASCII characters in the Info
files themselves, you can always regenerate them from the sources,
without using the --enable-encoding switch.  It's so simple that it's
hardly worth another round of futile "discussions" that never go
anywhere.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-08 18:47                                           ` Eli Zaretskii
@ 2021-10-08 20:01                                             ` Alan Mackenzie
  2021-10-09  6:18                                               ` Eli Zaretskii
  0 siblings, 1 reply; 120+ messages in thread
From: Alan Mackenzie @ 2021-10-08 20:01 UTC (permalink / raw)
  To: Eli Zaretskii
  Cc: emacs-devel, rms, yuri.v.khan, db48x, stefankangas, juri, monnier

Hello, Eli.

On Fri, Oct 08, 2021 at 21:47:15 +0300, Eli Zaretskii wrote:
> > Date: Fri, 08 Oct 2021 20:42:44 +0300
> > From: Eli Zaretskii <eliz@gnu.org>
> > Cc: emacs-devel@gnu.org, rms@gnu.org, juri@linkov.net, db48x@db48x.net,
> >  stefankangas@gmail.com, yuri.v.khan@gmail.com, monnier@iro.umontreal.ca

> > > Date: Fri, 8 Oct 2021 17:17:36 +0000
> > > Cc: Stefan Kangas <stefankangas@gmail.com>, rms@gnu.org,
> > >   yuri.v.khan@gmail.com, juri@linkov.net, db48x@db48x.net,
> > >   monnier@iro.umontreal.ca, emacs-devel@gnu.org
> > > From: Alan Mackenzie <acm@muc.de>

> > > I would suggest something slightly different which will solve the entire
> > > problem rather than just part of it.  Have the minor mode translate the
> > > buffer text (temporarily) into ASCII rather than just displaying it thus.
> > > That way the user can search for `foo' or ... simply by using C-s.

> > Modifying the text of an Info manual is a bad idea for the reasons I
> > explained in an earlier message.

> And, of course, if you must have only ASCII characters in the Info
> files themselves, you can always regenerate them from the sources,
> without using the --enable-encoding switch.  It's so simple that it's
> hardly worth another round of futile "discussions" that never go
> anywhere.

That's not what I want.  I want UTF-8 characters, so that people's names,
etc., can be displayed correctly.  I simply don't want the PUNCTUATION in
texi files to get translated to Unicode characters not on my keyboard,
and which can't be cleanly displayed on my Linux console.  makeinfo and
friends don't have options for this.

I realise I'm not going to get this unless I write it myself, whether in
Texinfo or Emacs.

I'm also not asking you to carry on this discussion, which clearly isn't
going to get anywhere.  But please take note of what it is I want and why
- that is, precisely, punctuation characters that I can type, display,
and search for with C-s.

-- 
Alan Mackenzie (Nuremberg, Germany).



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-08 20:01                                             ` Alan Mackenzie
@ 2021-10-09  6:18                                               ` Eli Zaretskii
  2021-10-09 10:57                                                 ` Alan Mackenzie
  0 siblings, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-09  6:18 UTC (permalink / raw)
  To: Alan Mackenzie
  Cc: emacs-devel, rms, yuri.v.khan, db48x, stefankangas, juri, monnier

> Date: Fri, 8 Oct 2021 20:01:15 +0000
> Cc: emacs-devel@gnu.org, rms@gnu.org, juri@linkov.net, db48x@db48x.net,
>   stefankangas@gmail.com, yuri.v.khan@gmail.com, monnier@iro.umontreal.ca
> From: Alan Mackenzie <acm@muc.de>
> 
> > And, of course, if you must have only ASCII characters in the Info
> > files themselves, you can always regenerate them from the sources,
> > without using the --enable-encoding switch.  It's so simple that it's
> > hardly worth another round of futile "discussions" that never go
> > anywhere.
> 
> That's not what I want.  I want UTF-8 characters, so that people's names,
> etc., can be displayed correctly.  I simply don't want the PUNCTUATION in
> texi files to get translated to Unicode characters not on my keyboard,
> and which can't be cleanly displayed on my Linux console.  makeinfo and
> friends don't have options for this.

But Emacs already automatically translates those punctuation
characters at display time in your case, so what exactly is the
problem you want to solve?



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-09  6:18                                               ` Eli Zaretskii
@ 2021-10-09 10:57                                                 ` Alan Mackenzie
  2021-10-09 11:49                                                   ` Eli Zaretskii
  2021-10-10  8:03                                                   ` character sets as they relate to “Raw” string literals for elisp Juri Linkov
  0 siblings, 2 replies; 120+ messages in thread
From: Alan Mackenzie @ 2021-10-09 10:57 UTC (permalink / raw)
  To: Eli Zaretskii
  Cc: emacs-devel, rms, yuri.v.khan, db48x, stefankangas, juri, monnier

Hello, Eli.

On Sat, Oct 09, 2021 at 09:18:43 +0300, Eli Zaretskii wrote:
> > Date: Fri, 8 Oct 2021 20:01:15 +0000
> > Cc: emacs-devel@gnu.org, rms@gnu.org, juri@linkov.net, db48x@db48x.net,
> >   stefankangas@gmail.com, yuri.v.khan@gmail.com, monnier@iro.umontreal.ca
> > From: Alan Mackenzie <acm@muc.de>

> > > And, of course, if you must have only ASCII characters in the Info
> > > files themselves, you can always regenerate them from the sources,
> > > without using the --enable-encoding switch.  It's so simple that it's
> > > hardly worth another round of futile "discussions" that never go
> > > anywhere.

> > That's not what I want.  I want UTF-8 characters, so that people's names,
> > etc., can be displayed correctly.  I simply don't want the PUNCTUATION in
> > texi files to get translated to Unicode characters not on my keyboard,
> > and which can't be cleanly displayed on my Linux console.  makeinfo and
> > friends don't have options for this.

> But Emacs already automatically translates those punctuation
> characters at display time in your case, ....

Not satisfactorily.  All these characters have homoglyph face on them,
which is ugly and I don't like.  (It would be confusing, and thus worse,
without this face.)

--- (EM DASH) appears as an inverse question mark on my screen.  So do
several other punctuation marks, I think.

> so what exactly is the problem you want to solve?

I want to be able to search for @code{foo} by typing the six keys:

    C-s ` f o o '

, like I could with previous versions of Texinfo.  I want this by having
these characters in the buffer, not by some clumsy workaround in isearch
(which I think was tried some time ago, but wasn't really satisfactory).

I want to be able to _use_ info buffers without restriction.  For
example, if I wanted to copy `foo' to a doc string in a file.el, I
should be able just to do that.  At the moment, I'd have to copy just
foo, then type in the quote marks by hand.  Not a big thing, but
annoying all the same.

-- 
Alan Mackenzie (Nuremberg, Germany).



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-09 10:57                                                 ` Alan Mackenzie
@ 2021-10-09 11:49                                                   ` Eli Zaretskii
  2021-10-09 13:08                                                     ` Alan Mackenzie
  2021-10-10  8:03                                                   ` character sets as they relate to “Raw” string literals for elisp Juri Linkov
  1 sibling, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-09 11:49 UTC (permalink / raw)
  To: Alan Mackenzie
  Cc: emacs-devel, rms, yuri.v.khan, db48x, stefankangas, juri, monnier

> Date: Sat, 9 Oct 2021 10:57:53 +0000
> Cc: emacs-devel@gnu.org, rms@gnu.org, juri@linkov.net, db48x@db48x.net,
>   stefankangas@gmail.com, yuri.v.khan@gmail.com, monnier@iro.umontreal.ca
> From: Alan Mackenzie <acm@muc.de>
> 
> > But Emacs already automatically translates those punctuation
> > characters at display time in your case, ....
> 
> Not satisfactorily.  All these characters have homoglyph face on them,
> which is ugly and I don't like.  (It would be confusing, and thus worse,
> without this face.)
> 
> --- (EM DASH) appears as an inverse question mark on my screen.  So do
> several other punctuation marks, I think.

That means the display-time replacement doesn't happen, I think.  What
is your terminal-coding-system?

> > so what exactly is the problem you want to solve?
> 
> I want to be able to search for @code{foo} by typing the six keys:
> 
>     C-s ` f o o '
> 
> , like I could with previous versions of Texinfo.  I want this by having
> these characters in the buffer, not by some clumsy workaround in isearch
> (which I think was tried some time ago, but wasn't really satisfactory).

Then please talk to the Texinfo developers to provide a kind of output
that leaves the quotes and other punctuation intact, while keeping the
non-ASCII characters in names in their UTF-8 encoding (which will, of
course, show as inverted question marks on your console).  Emacs
cannot do anything to satisfy your request, as long as the Info files
are as they are, and I will object to us changing the contents of the
Info files in the buffer.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-09 11:49                                                   ` Eli Zaretskii
@ 2021-10-09 13:08                                                     ` Alan Mackenzie
  2021-10-09 13:15                                                       ` Eli Zaretskii
  0 siblings, 1 reply; 120+ messages in thread
From: Alan Mackenzie @ 2021-10-09 13:08 UTC (permalink / raw)
  To: Eli Zaretskii
  Cc: emacs-devel, rms, yuri.v.khan, db48x, stefankangas, juri, monnier

Hello, Eli.

On Sat, Oct 09, 2021 at 14:49:28 +0300, Eli Zaretskii wrote:
> > Date: Sat, 9 Oct 2021 10:57:53 +0000
> > Cc: emacs-devel@gnu.org, rms@gnu.org, juri@linkov.net, db48x@db48x.net,
> >   stefankangas@gmail.com, yuri.v.khan@gmail.com, monnier@iro.umontreal.ca
> > From: Alan Mackenzie <acm@muc.de>

> > > But Emacs already automatically translates those punctuation
> > > characters at display time in your case, ....

> > Not satisfactorily.  All these characters have homoglyph face on them,
> > which is ugly and I don't like.  (It would be confusing, and thus worse,
> > without this face.)

> > --- (EM DASH) appears as an inverse question mark on my screen.  So do
> > several other punctuation marks, I think.

> That means the display-time replacement doesn't happen, I think.  What
> is your terminal-coding-system?

M-: default-terminal-coding-system says utf-8-unix.  I haven't set this
in my site-start.el or .emacs.

I have my font set to Latin-1.  More precisely,

    consolefont="lat1-16"

in my /etc/conf.d/consolefont.

> > > so what exactly is the problem you want to solve?

> > I want to be able to search for @code{foo} by typing the six keys:

> >     C-s ` f o o '

> > , like I could with previous versions of Texinfo.  I want this by having
> > these characters in the buffer, not by some clumsy workaround in isearch
> > (which I think was tried some time ago, but wasn't really satisfactory).

> Then please talk to the Texinfo developers to provide a kind of output
> that leaves the quotes and other punctuation intact, ....

I will try, but I doubt that will bring anything.  It's such an obvious
thing to want that it must have been brought up in the Texinfo mailing
lists lots of times in the last few years.  I think the maintainer is
hostile to ASCII punctuation characters.

> .... while keeping the non-ASCII characters in names in their UTF-8
> encoding (which will, of course, show as inverted question marks on
> your console).  Emacs cannot do anything to satisfy your request, as
> long as the Info files are as they are, and I will object to us
> changing the contents of the Info files in the buffer.

Like I said, if I want this fixed I'll probably need to fix it myself,
even if only for me personally.

-- 
Alan Mackenzie (Nuremberg, Germany).



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-09 13:08                                                     ` Alan Mackenzie
@ 2021-10-09 13:15                                                       ` Eli Zaretskii
  2021-10-09 15:07                                                         ` Alan Mackenzie
  0 siblings, 1 reply; 120+ messages in thread
From: Eli Zaretskii @ 2021-10-09 13:15 UTC (permalink / raw)
  To: Alan Mackenzie
  Cc: emacs-devel, rms, yuri.v.khan, db48x, stefankangas, juri, monnier

> Date: Sat, 9 Oct 2021 13:08:35 +0000
> Cc: emacs-devel@gnu.org, rms@gnu.org, juri@linkov.net, db48x@db48x.net,
>   stefankangas@gmail.com, yuri.v.khan@gmail.com, monnier@iro.umontreal.ca
> From: Alan Mackenzie <acm@muc.de>
> 
> > > --- (EM DASH) appears as an inverse question mark on my screen.  So do
> > > several other punctuation marks, I think.
> 
> > That means the display-time replacement doesn't happen, I think.  What
> > is your terminal-coding-system?
> 
> M-: default-terminal-coding-system says utf-8-unix.  I haven't set this
> in my site-start.el or .emacs.

That explains it, I think: info.el thinks your console can display
those characters.

> I have my font set to Latin-1.  More precisely,
> 
>     consolefont="lat1-16"
> 
> in my /etc/conf.d/consolefont.

Are there any other possible values that will cause these characters
to display correctly?  Or is the Linux console unable to display them no
matter what?

Regardless, we could have a customizable option in info.el to force
display of the Unicode punctuation as their ASCII equivalents, even if
the terminal seems capable of the Unicode display.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-09 13:15                                                       ` Eli Zaretskii
@ 2021-10-09 15:07                                                         ` Alan Mackenzie
  2021-10-11  0:45                                                           ` linux console limitations Daniel Brooks
  0 siblings, 1 reply; 120+ messages in thread
From: Alan Mackenzie @ 2021-10-09 15:07 UTC (permalink / raw)
  To: Eli Zaretskii
  Cc: emacs-devel, rms, yuri.v.khan, db48x, stefankangas, juri, monnier

Hello, Eli.

On Sat, Oct 09, 2021 at 16:15:50 +0300, Eli Zaretskii wrote:
> > Date: Sat, 9 Oct 2021 13:08:35 +0000
> > Cc: emacs-devel@gnu.org, rms@gnu.org, juri@linkov.net, db48x@db48x.net,
> >   stefankangas@gmail.com, yuri.v.khan@gmail.com, monnier@iro.umontreal.ca
> > From: Alan Mackenzie <acm@muc.de>

> > > > --- (EM DASH) appears as an inverse question mark on my screen.  So do
> > > > several other punctuation marks, I think.

> > > That means the display-time replacement doesn't happen, I think.  What
> > > is your terminal-coding-system?

> > M-: default-terminal-coding-system says utf-8-unix.  I haven't set this
> > in my site-start.el or .emacs.

> That explains it, I think: info.el thinks your console can display
> those characters.

I vaguely remember looking into this before.  I think the problem was
that there was no facility in Linux for determining whether a particular
character could be displayed on its console.  The low level interface
simply wasn't there.

> > I have my font set to Latin-1.  More precisely,

> >     consolefont="lat1-16"

> > in my /etc/conf.d/consolefont.

> Are there any other possible values that will cause these characters
> to display correctly?  Or is the Linux console unable to display them no
> matter what?

The Linux console is limited to 256 glyphs, some of which are used by two
or several Unicode characters.  It would be possible but time consuming
to amend the font to display the EM-DASH as, say, a minus sign.

> Regardless, we could have a customizable option in info.el to force
> display of the Unicode punctuation as their ASCII equivalents, even if
> the terminal seems capable of the Unicode display.

Again, this would only solve half of the problem, the display half, but
might be worthwhile.  I would prefer, though, to find a way of preventing
these awkward Unicode punctuation characters from being in info buffers
in the first place.

-- 
Alan Mackenzie (Nuremberg, Germany).



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: Display of em dashes in our documentation
  2021-10-08 17:17                                           ` Stefan Kangas
@ 2021-10-10  8:00                                             ` Juri Linkov
  0 siblings, 0 replies; 120+ messages in thread
From: Juri Linkov @ 2021-10-10  8:00 UTC (permalink / raw)
  To: Stefan Kangas
  Cc: rms, emacs-devel, db48x, monnier, Eli Zaretskii, yuri.v.khan

>> What will that do to byte offsets in Info tag tables?  I'd rather
>> avoid modifying the buffer contents.
>
> What do you mean by "byte-offsets in Info tag tables"?  Do you mean that
> this approach risks leaving a table misaligned?  If so, I think that is
> correct, and clearly a drawback.  I don't see an easy way around it with
> this approach (but I also don't see a scenario when you would properly
> use an em dash in a table).

You can put a string with spaces using the ‘display’ text property over
the em-dash char the same way as Info-hide-note-references is implemented.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: character sets as they relate to “Raw” string literals for elisp
  2021-10-09 10:57                                                 ` Alan Mackenzie
  2021-10-09 11:49                                                   ` Eli Zaretskii
@ 2021-10-10  8:03                                                   ` Juri Linkov
  1 sibling, 0 replies; 120+ messages in thread
From: Juri Linkov @ 2021-10-10  8:03 UTC (permalink / raw)
  To: Alan Mackenzie
  Cc: rms, yuri.v.khan, db48x, stefankangas, Eli Zaretskii, emacs-devel,
	monnier

>> so what exactly is the problem you want to solve?
>
> I want to be able to search for @code{foo} by typing the six keys:
>
>     C-s ` f o o '
>
> , like I could with previous versions of Texinfo.  I want this by having
> these characters in the buffer, not by some clumsy workaround in isearch
> (which I think was tried some time ago, but wasn't really satisfactory).

  C-s M-s '
  ` f o o '

works fine especially with ‘char-fold-symmetric’ customized to ‘t’.



^ permalink raw reply	[flat|nested] 120+ messages in thread

* linux console limitations
  2021-10-09 15:07                                                         ` Alan Mackenzie
@ 2021-10-11  0:45                                                           ` Daniel Brooks
  2021-10-12 10:18                                                             ` Alan Mackenzie
  0 siblings, 1 reply; 120+ messages in thread
From: Daniel Brooks @ 2021-10-11  0:45 UTC (permalink / raw)
  To: Alan Mackenzie; +Cc: emacs-devel

Alan Mackenzie <acm@muc.de> writes:

> The Linux console is limited to 256 glyphs, some of which are used by two
> or several Unicode characters.  It would be possible but time consuming
> to amend the font to display the EM-DASH as, say, a minus sign.

I’ve never tried it (I only heard about it last week), but can you run
fbterm?

db48x



^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: linux console limitations
  2021-10-11  0:45                                                           ` linux console limitations Daniel Brooks
@ 2021-10-12 10:18                                                             ` Alan Mackenzie
  2021-10-14  4:05                                                               ` Daniel Brooks
  0 siblings, 1 reply; 120+ messages in thread
From: Alan Mackenzie @ 2021-10-12 10:18 UTC (permalink / raw)
  To: Daniel Brooks; +Cc: emacs-devel

Hello, Daniel.

On Sun, Oct 10, 2021 at 17:45:28 -0700, Daniel Brooks wrote:
> Alan Mackenzie <acm@muc.de> writes:

> > The Linux console is limited to 256 glyphs, some of which are used by two
> > or several Unicode characters.  It would be possible but time consuming
> > to amend the font to display the EM-DASH as, say, a minus sign.

> I’ve never tried it (I only heard about it last week), but can you run
> fbterm?

I've also barely heard of it.  On a web search, the only articles I
could find about it were over 10 years old (I couldn't find anything
meaningful on GitHub where it's hosted, either).

Some of these articles described how fbterm has "features".  Two of
these random features took the key sequences C-M-d and C-M-k.  I think
it uses C-<right> and C-<left> to move to the next/previous terminal.
No doubt there are several (or even many) more such features, making the
program not very useful for running Emacs, which uses pretty much all
the key sequences, certainly all the alphabetical keys + modifiers.

So, I've never tried it either.

> db48x

-- 
Alan Mackenzie (Nuremberg, Germany).

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: linux console limitations
  2021-10-12 10:18                                                             ` Alan Mackenzie
@ 2021-10-14  4:05                                                               ` Daniel Brooks
  0 siblings, 0 replies; 120+ messages in thread
From: Daniel Brooks @ 2021-10-14  4:05 UTC (permalink / raw)
  To: Alan Mackenzie; +Cc: emacs-devel

Alan Mackenzie <acm@muc.de> writes:

> Some of these articles described how fbterm has "features".  Two of
> these random features took the key sequences C-M-d and C-M-k.  I think
> it uses C-<right> and C-<left> to move to the next/previous terminal.
> No doubt there are several (or even many) more such features, making the
> program not very useful for running Emacs, which uses pretty much all
> the key sequences, certainly all the alphabetical keys + modifiers.
>
> So, I've never tried it either.

Ah, I see. That is certainly a nuisance. I guess your choice was between
accepting a very limited character set and commenting out or changing
fbterm’s dumb key bindings, which is not an appealing option to me
either.

db48x



^ permalink raw reply	[flat|nested] 120+ messages in thread

end of thread, other threads:[~2021-10-14  4:05 UTC | newest]

Thread overview: 120+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-08  1:49 "Raw" string literals for elisp Anna Glasgall
2021-09-08  7:10 ` Po Lu
2021-09-08 14:19   ` Anna Glasgall
2021-09-08  7:12 ` Lars Ingebrigtsen
2021-09-08 14:20   ` Anna Glasgall
2021-09-08 11:30 ` Alan Mackenzie
2021-09-08 14:27   ` Anna Glasgall
2021-09-08 11:34 ` Adam Porter
2021-09-08 13:59   ` Clément Pit-Claudel
2021-09-08 14:12     ` Adam Porter
2021-09-09  3:09   ` Richard Stallman
2021-09-08 13:10 ` Stefan Monnier
2021-09-08 14:31   ` Anna Glasgall
2021-09-08 15:27     ` Mattias Engdegård
2021-09-08 15:41       ` Stefan Kangas
2021-09-08 16:45         ` Mattias Engdegård
2021-09-08 16:01       ` Alan Mackenzie
2021-09-08 18:24         ` Mattias Engdegård
2021-09-08 19:00           ` Alan Mackenzie
2021-09-08 19:22         ` Philip Kaludercic
2021-09-08 19:36           ` Alan Mackenzie
2021-09-08 21:11           ` Stefan Kangas
2021-09-08 21:24             ` Philip Kaludercic
2021-09-09  6:52             ` tomas
2021-09-08 15:54     ` Stefan Kangas
2021-09-08 16:05     ` tomas
2021-09-08 16:42       ` Lars Ingebrigtsen
2021-09-08 20:08         ` Stefan Monnier
2021-09-08 20:18       ` Stefan Monnier
2021-09-09  7:04         ` tomas
2021-09-09 10:30         ` Mattias Engdegård
2021-09-09 11:36           ` Stefan Kangas
2021-09-09 13:33             ` Mattias Engdegård
2021-09-09 14:32               ` tomas
2021-09-14 10:43               ` Augusto Stoffel
2021-09-14 11:42                 ` Ihor Radchenko
2021-09-14 13:18                   ` Stefan Monnier
2021-09-14 13:22                     ` Stefan Kangas
2021-09-14 14:01                       ` Ihor Radchenko
2021-09-14 14:39                       ` Clément Pit-Claudel
2021-09-14 15:33                         ` Amin Bandali
2021-09-14 16:05                         ` Eli Zaretskii
2021-09-14 17:49                   ` Jose E. Marchesi
2021-09-08 20:40 ` Anna Glasgall
2021-09-08 21:28   ` Alan Mackenzie
2021-10-02 21:03   ` Daniel Brooks
2021-10-04  0:13     ` Richard Stallman
2021-10-04  0:36       ` Daniel Brooks
2021-10-04 12:00         ` Eli Zaretskii
2021-10-04 15:36           ` character sets as they relate to “Raw” " Daniel Brooks
2021-10-04 16:34             ` Stefan Monnier
2021-10-04 20:49               ` Daniel Brooks
2021-10-04 21:19                 ` Alan Mackenzie
2021-10-04 22:19                   ` Daniel Brooks
2021-10-05 11:20                     ` Alan Mackenzie
2021-10-05 17:08                       ` Daniel Brooks
2021-10-06 20:54                         ` Richard Stallman
2021-10-07  7:01                           ` Eli Zaretskii
2021-10-05  8:55                 ` Yuri Khan
2021-10-05 16:25                   ` Juri Linkov
2021-10-05 17:15                     ` Eli Zaretskii
2021-10-05 18:40                       ` [External] : " Drew Adams
2021-10-06 20:54                       ` Richard Stallman
2021-10-07  6:54                         ` Eli Zaretskii
2021-10-07 13:14                           ` Stefan Kangas
2021-10-07 13:34                             ` Eli Zaretskii
2021-10-07 14:48                               ` Stefan Kangas
2021-10-07 16:00                                 ` Eli Zaretskii
2021-10-08  0:37                                   ` Stefan Kangas
2021-10-08  6:53                                     ` Eli Zaretskii
2021-10-08 15:09                                       ` Display of em dashes in our documentation Stefan Kangas
2021-10-08 16:12                                         ` Eli Zaretskii
2021-10-08 17:17                                           ` Stefan Kangas
2021-10-10  8:00                                             ` Juri Linkov
2021-10-08 17:27                                           ` Daniel Brooks
2021-10-08 18:26                                           ` [External] : " Drew Adams
2021-10-08 17:17                                       ` character sets as they relate to “Raw” string literals for elisp Alan Mackenzie
2021-10-08 17:42                                         ` Eli Zaretskii
2021-10-08 18:47                                           ` Eli Zaretskii
2021-10-08 20:01                                             ` Alan Mackenzie
2021-10-09  6:18                                               ` Eli Zaretskii
2021-10-09 10:57                                                 ` Alan Mackenzie
2021-10-09 11:49                                                   ` Eli Zaretskii
2021-10-09 13:08                                                     ` Alan Mackenzie
2021-10-09 13:15                                                       ` Eli Zaretskii
2021-10-09 15:07                                                         ` Alan Mackenzie
2021-10-11  0:45                                                           ` linux console limitations Daniel Brooks
2021-10-12 10:18                                                             ` Alan Mackenzie
2021-10-14  4:05                                                               ` Daniel Brooks
2021-10-10  8:03                                                   ` character sets as they relate to “Raw” string literals for elisp Juri Linkov
2021-10-05 18:23                     ` [External] : " Drew Adams
2021-10-05 19:13                       ` Stefan Kangas
2021-10-05 19:20                         ` Drew Adams
2021-10-05 17:13                   ` Daniel Brooks
2021-10-05 12:04                 ` Eli Zaretskii
2021-10-05 21:20                 ` Richard Stallman
2021-10-05 22:13                   ` Daniel Brooks
2021-10-06 12:13                     ` Eli Zaretskii
2021-10-06 18:57                       ` Daniel Brooks
2021-10-07  4:23                         ` Eli Zaretskii
2021-10-07 22:27                         ` Richard Stallman
2021-10-08 10:37                         ` Po Lu
2021-10-08 10:53                           ` Basil L. Contovounesios
2021-10-08 11:27                             ` tomas
2021-10-05 22:25                   ` character sets as they relate to “Raw” " Stefan Kangas
2021-10-06  6:21                     ` Daniel Brooks
2021-10-07 22:20                       ` Richard Stallman
2021-10-06 12:29                     ` Eli Zaretskii
2021-10-06 12:52                       ` Stefan Kangas
2021-10-06 13:10                         ` Jean-Christophe Helary
2021-10-06 11:53                   ` character sets as they relate to “Raw” " Eli Zaretskii
2021-10-04 18:57             ` Eli Zaretskii
2021-10-04 19:14               ` Yuri Khan
2021-10-05 21:20                 ` Richard Stallman
2021-10-06  3:48                   ` character sets as they relate to “Raw” " Matthew Carter
2021-10-04 22:29         ` "Raw" " Richard Stallman
2021-10-05  5:39           ` Daniel Brooks
2021-10-05  5:43             ` Jean-Christophe Helary
2021-10-05  8:24               ` Richard Stallman
2021-10-05 12:23               ` Eli Zaretskii

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).