Re: Tree-sitter integration on feature/tree-sitter

unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed

* Re: Tree-sitter integration on feature/tree-sitter
@ 2022-05-19  1:35 Kiong-Ge Liau
  2022-05-20  2:01 ` Yuan Fu
  0 siblings, 1 reply; 64+ messages in thread
From: Kiong-Ge Liau @ 2022-05-19  1:35 UTC (permalink / raw)
  To: casouri, emacs-devel

Can you please share the mentioned "treesit-demo.el" file? I cannot see
it attached to any messages on emacs-devel mailing list. 

Thanks.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-19  1:35 Tree-sitter integration on feature/tree-sitter Kiong-Ge Liau
@ 2022-05-20  2:01 ` Yuan Fu
  2022-06-16 19:03   ` Yuan Fu
  0 siblings, 1 reply; 64+ messages in thread
From: Yuan Fu @ 2022-05-20  2:01 UTC (permalink / raw)
  To: Kiong-Ge Liau; +Cc: emacs-devel

[-- Attachment #1: Type: text/plain, Size: 275 bytes --]



> On May 18, 2022, at 6:35 PM, Kiong-Ge Liau <lkg.ch@pm.me> wrote:
> 
> Can you please share the mentioned "treesit-demo.el" file? I cannot see
> it attached to any messages on emacs-devel mailing list. 
> 
> Thanks.
> 

Here it is, take it with a grain of salt :-)

Yuan


[-- Attachment #2: treesit-demo.el --]
[-- Type: application/octet-stream, Size: 6624 bytes --]

;; -*- lexical-binding: t -*-

(require 'treesit)
(require 'pp)
(require 'rx)

(defun treesit-show-buffer-tree ()
  "Show the AST in a popup buffer."
  (interactive)
  (let ((root-node (treesit-parser-root-node
                    (treesit-get-parser-create 'c)))
        (pp-use-max-width t))
    (pop-to-buffer (get-buffer-create "*treesit-show-tree*"))
    (erase-buffer)
    (insert (treesit-node-string root-node))
    ;; Format the output.
    (goto-char (point-min))
    (while (re-search-forward (rx (or "(" (seq (+ word) ":"))) nil t)
      (goto-char (match-beginning 0))
      (insert "\n")
      (goto-char (1+ (match-end 0))))
    (setq indent-line-function #'lisp-indent-line)
    (indent-region (point-min) (point-max))))

(defun ts-c-fontify-system-lib (beg end _)
  "Fortify a #include <lib>.
Fortify the angled brackets in preprocessor-face,
and the lib name in string-face."
  (put-text-property beg (1+ beg) 'face 'font-lock-preprocessor-face)
  (put-text-property (1- end) end 'face 'font-lock-preprocessor-face)
  (put-text-property (1+ beg) (1- end)
                     'face 'font-lock-string-face))

;; Please compiler.
(defvar ts-c-treesit-indent-rules)
(define-derived-mode ts-c-mode prog-mode "TS C"
  "C mode with tree-sitter support."
  (if (treesit-should-enable-p)
      (progn
        (setq-local treesit-font-lock-defaults
                    '((ts-c-treesit-settings-1))

                    font-lock-defaults
                    '(nil t)

                    indent-line-function
                    #'treesit-indent

                    treesit-simple-indent-rules
                    ts-c-treesit-indent-rules)
        (treesit-font-lock-enable))
    ;; Copied from cc-mode.
    (setq-local font-lock-defaults
                '((c-font-lock-keywords
                   c-font-lock-keywords-1
                   c-font-lock-keywords-2
                   c-font-lock-keywords-3)
                  nil nil
                  ((95 . "w")
                   (36 . "w"))
                  c-beginning-of-syntax
                  (font-lock-mark-block-function . c-mark-function)))))

(defvar ts-c-treesit-indent-rules
  `((c
     ;; Empty line.
     (no-node prev-line 0)

     ;; Function/struct definition body {}.
     ((match nil "function_definition" "body") parent 0)
     ((node-is "field_declaration_list") parent 0)

     ;; Call expression.
     ((parent-is "call_expression") parent 2)

     ;; If-else.
     ((match nil "if_statement" "condition") parent 2)
     ((match nil "if_statement" "consequence") parent 2)
     ((match nil "if_statement" "alternative") parent 2)
     ((match nil "switch_statement" "condition")  parent 2)
     ((node-is "else") parent 0)

     ;; Switch case.
     ((parent-is "case_statement") parent 2)
     ((node-is "case_statement") parent 0)

     ;; { and }.
     ((node-is "compound_statement") parent 2)
     ((node-is "}") parent 0)

     ;; Multi-line string.
     ((parent-is "string_literal") no-indent 0)

     ;; List.
     ,@(cl-loop for type in '("compound_statement" "initializer_list"
                              "argument_list" "parameter_list"
                              "field_declaration_list")
                collect `((match nil ,type nil 0 0) parent 2)
                collect `((match nil ,type nil 1) first-sibling 0)))))

(defvar ts-c-treesit-settings-1
  `((c
     ,(treesit-expand-query
       '((null) @font-lock-constant-face
         (true) @font-lock-constant-face
         (false) @font-lock-constant-face

         (comment) @font-lock-comment-face

         (system_lib_string) @ts-c-fontify-system-lib

         (unary_expression
          operator: _ @font-lock-negation-char-face)

         (string_literal) @font-lock-string-face
         (char_literal) @font-lock-string-face



         (function_definition
          declarator: (identifier) @font-lock-function-name-face)

         (declaration
          declarator: (identifier) @font-lock-function-name-face)

         (function_declarator
          declarator: (identifier) @font-lock-function-name-face)



         (init_declarator
          declarator: (identifier) @font-lock-variable-name-face)

         (parameter_declaration
          declarator: (identifier) @font-lock-variable-name-face)

         (preproc_def
          name: (identifier) @font-lock-variable-name-face)

         (enumerator
          name: (identifier) @font-lock-variable-name-face)

         (field_identifier) @font-lock-variable-name-face

         (parameter_list
          (parameter_declaration
           (identifier) @font-lock-variable-name-face))

         (pointer_declarator
          declarator: (identifier) @font-lock-variable-name-face)

         (array_declarator
          declarator: (identifier) @font-lock-variable-name-face)

         (preproc_function_def
          name: (identifier) @font-lock-variable-name-face
          parameters: (preproc_params
                       (identifier) @font-lock-variable-name-face))



         (type_identifier) @font-lock-type-face
         (primitive_type) @font-lock-type-face

         "auto" @font-lock-keyword-face
         "break" @font-lock-keyword-face
         "case" @font-lock-keyword-face
         "const" @font-lock-keyword-face
         "continue" @font-lock-keyword-face
         "default" @font-lock-keyword-face
         "do" @font-lock-keyword-face
         "else" @font-lock-keyword-face
         "enum" @font-lock-keyword-face
         "extern" @font-lock-keyword-face
         "for" @font-lock-keyword-face
         "goto" @font-lock-keyword-face
         "if" @font-lock-keyword-face
         "register" @font-lock-keyword-face
         "return" @font-lock-keyword-face
         "sizeof" @font-lock-keyword-face
         "static" @font-lock-keyword-face
         "struct" @font-lock-keyword-face
         "switch" @font-lock-keyword-face
         "typedef" @font-lock-keyword-face
         "union" @font-lock-keyword-face
         "volatile" @font-lock-keyword-face
         "while" @font-lock-keyword-face

         "long" @font-lock-type-face
         "short" @font-lock-type-face
         "signed" @font-lock-type-face
         "unsigned" @font-lock-type-face

         "#include" @font-lock-preprocessor-face
         "#define" @font-lock-preprocessor-face
         "#ifdef" @font-lock-preprocessor-face
         "#ifndef" @font-lock-preprocessor-face
         "#endif" @font-lock-preprocessor-face
         "#else" @font-lock-preprocessor-face
         "#elif" @font-lock-preprocessor-face
         )))))

;;; Config

(add-to-list 'auto-mode-alist '("\\.tsc\\'" . ts-c-mode))

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-05-20  2:01 ` Yuan Fu
@ 2022-06-16 19:03   ` Yuan Fu
  2022-06-16 19:25     ` [External] : " Drew Adams
                       ` (4 more replies)
  0 siblings, 5 replies; 64+ messages in thread
From: Yuan Fu @ 2022-06-16 19:03 UTC (permalink / raw)
  To: Emacs Devel

Hey,

I’ve just finished with Real Life and got back to tree-sitter. I’ll reply to individual messages separately, but here is a summary of all the latest changes pushed to feature/tree-sitter

- Now one can compile a query, compiled query is much faster than uncompiled queries.
- Traversal functions now have a parameter that controls how deep to traverse.
- Removed the ltree-sitter setting in configure.ac
- Consolidated all the parser creation functions into one: treesit-parser-create, that means treesit-get-parser and treesit-get-parser-create are removed.

I think these are all the pending requests (sans highlight-paren), please let me know if I missed anything.

Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list. Because I’m not comfortable letting users to remove and re-add parsers into the list anymore. Previously we determined that if a user wants to do the wrong thing, so be it. But now I realized that there could be danger in crashing Emacs if user fiddle with treesit-parser-list incorrectly (and violates some of the assertions I put in).

Can I just add a new Lisp_Object field in struct buffer? I assume that’s how you add an internal buffer-local data?

Yuan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* RE: [External] : Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:03   ` Yuan Fu
@ 2022-06-16 19:25     ` Drew Adams
  2022-06-17  1:11       ` Yuan Fu
  2022-06-17  1:24     ` Po Lu
                       ` (3 subsequent siblings)
  4 siblings, 1 reply; 64+ messages in thread
From: Drew Adams @ 2022-06-16 19:25 UTC (permalink / raw)
  To: Yuan Fu, Emacs Devel

> I’ve just finished with Real Life and got back to tree-sitter. 

The afterlife revealed, finally. ;-)


^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [External] : Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:25     ` [External] : " Drew Adams
@ 2022-06-17  1:11       ` Yuan Fu
  2022-06-17 14:22         ` Drew Adams
  0 siblings, 1 reply; 64+ messages in thread
From: Yuan Fu @ 2022-06-17  1:11 UTC (permalink / raw)
  To: Drew Adams; +Cc: Emacs Devel



> On Jun 16, 2022, at 12:25 PM, Drew Adams <drew.adams@oracle.com> wrote:
> 
>> I’ve just finished with Real Life and got back to tree-sitter. 
> 
> The afterlife revealed, finally. ;-)
> 

I’ll be very glad if my afterlife includes hacking Emacs.

Yuan


^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:03   ` Yuan Fu
  2022-06-16 19:25     ` [External] : " Drew Adams
@ 2022-06-17  1:24     ` Po Lu
  2022-06-18  0:09       ` Yuan Fu
  2022-06-17  2:00     ` Ihor Radchenko
                       ` (2 subsequent siblings)
  4 siblings, 1 reply; 64+ messages in thread
From: Po Lu @ 2022-06-17  1:24 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Emacs Devel

Yuan Fu <casouri@gmail.com> writes:

> Can I just add a new Lisp_Object field in struct buffer? I assume
> that’s how you add an internal buffer-local data?

Yes.  Make sure the field is placed before
`cursor_in_non_selected_windows_', or it won't be traced by GC.

Also make sure to access it using the `BVAR' macro and add a
corresponding `DEFVAR_PER_BUFFER' form in syms_of_buffer if it's
supposed to be a buffer local variable.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:03   ` Yuan Fu
  2022-06-16 19:25     ` [External] : " Drew Adams
  2022-06-17  1:24     ` Po Lu
@ 2022-06-17  2:00     ` Ihor Radchenko
  2022-06-17  2:25       ` Lower-level change hook immune to with-silent-modifications Yuan Fu
  2022-06-17  5:23       ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
  2022-06-17  6:15     ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
  2022-06-17 11:06     ` Jostein Kjønigsen
  4 siblings, 2 replies; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-17  2:00 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Emacs Devel

Yuan Fu <casouri@gmail.com> writes:

> I’ve just finished with Real Life and got back to tree-sitter. I’ll reply to individual messages separately, but here is a summary of all the latest changes pushed to feature/tree-sitter

Would it be possible to expose ts_record_change to Elisp?

I am asking in the interest of Org mode parser that is also parsing the
buffer AST and tracks buffer modifications.

The built-in after-change-functions are not reliable because they can be
(and often are) easily suppressed by with-silent-modifications macro.
See bug#46982 and bug#51766.

Best,
Ihor




^ permalink raw reply	[flat|nested] 64+ messages in thread

* Lower-level change hook immune to with-silent-modifications
  2022-06-17  2:00     ` Ihor Radchenko
@ 2022-06-17  2:25       ` Yuan Fu
  2022-06-17  2:55         ` Stefan Monnier
  2022-06-17  5:23       ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
  1 sibling, 1 reply; 64+ messages in thread
From: Yuan Fu @ 2022-06-17  2:25 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: Emacs Devel

> 
> Would it be possible to expose ts_record_change to Elisp?
> 
> I am asking in the interest of Org mode parser that is also parsing the
> buffer AST and tracks buffer modifications.
> 
> The built-in after-change-functions are not reliable because they can be
> (and often are) easily suppressed by with-silent-modifications macro.
> See bug#46982 and bug#51766.

I think you probably want a separate hook just for this purpose, rather than repurposing ts_record_change. We could have a lower-level after-change-functions that is immune to with-silent-modifications. Whether we should add such hook is probably another discussion. (So I opened a new thread.) I think it will be handy, but I don’t know that problem it might cause.

Yuan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Lower-level change hook immune to with-silent-modifications
  2022-06-17  2:25       ` Lower-level change hook immune to with-silent-modifications Yuan Fu
@ 2022-06-17  2:55         ` Stefan Monnier
  2022-06-17  5:28           ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Stefan Monnier @ 2022-06-17  2:55 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Ihor Radchenko, Emacs Devel

> I think you probably want a separate hook just for this purpose, rather than
> repurposing ts_record_change. We could have a lower-level
> after-change-functions that is immune to with-silent-modifications. Whether
> we should add such hook is probably another discussion. (So I opened a new
> thread.) I think it will be handy, but I don’t know that problem it
> might cause.

As I just argued in bug#51766, I don't think it makes sense to try to
have such "a lower-level after-change-functions that is immune to
with-silent-modifications".


        Stefan




^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  2:00     ` Ihor Radchenko
  2022-06-17  2:25       ` Lower-level change hook immune to with-silent-modifications Yuan Fu
@ 2022-06-17  5:23       ` Eli Zaretskii
  2022-06-17 10:40         ` Ihor Radchenko
  1 sibling, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-17  5:23 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: Emacs Devel <emacs-devel@gnu.org>
> Date: Fri, 17 Jun 2022 10:00:04 +0800
> 
> Would it be possible to expose ts_record_change to Elisp?
> 
> I am asking in the interest of Org mode parser that is also parsing the
> buffer AST and tracks buffer modifications.

Please tell more about the need.  I'm not happy with exposing this to
Lisp, and don't understand why the low-level parts of parsing the
buffer AST should be written in Lisp in the first place.  The
tree-sitter branch does this in C for that very reason.

We could rename ts_record_change to something more general, of course,
and make it available even if Emacs is not compiled with TS, if it can
be useful for other needs.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Lower-level change hook immune to with-silent-modifications
  2022-06-17  2:55         ` Stefan Monnier
@ 2022-06-17  5:28           ` Eli Zaretskii
  2022-06-17 10:10             ` Ihor Radchenko
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-17  5:28 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: casouri, yantar92, emacs-devel

> From: Stefan Monnier <monnier@iro.umontreal.ca>
> Cc: Ihor Radchenko <yantar92@gmail.com>,  Emacs Devel <emacs-devel@gnu.org>
> Date: Thu, 16 Jun 2022 22:55:50 -0400
> 
> > I think you probably want a separate hook just for this purpose, rather than
> > repurposing ts_record_change. We could have a lower-level
> > after-change-functions that is immune to with-silent-modifications. Whether
> > we should add such hook is probably another discussion. (So I opened a new
> > thread.) I think it will be handy, but I don’t know that problem it
> > might cause.
> 
> As I just argued in bug#51766, I don't think it makes sense to try to
> have such "a lower-level after-change-functions that is immune to
> with-silent-modifications".

I tend to agree.  We can discuss the specific needs that triggered
that request, but by and large, we have a good reason to have
inhibit-modification-hooks that affects any Lisp program that wants to
know about buffer modifications.  That's the difference between the
Lisp level and the lower-level code in C, which "knows everything",
including when it isn't safe to use some data or some objects.



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:03   ` Yuan Fu
                       ` (2 preceding siblings ...)
  2022-06-17  2:00     ` Ihor Radchenko
@ 2022-06-17  6:15     ` Eli Zaretskii
  2022-06-17  7:17       ` Yuan Fu
  2022-06-17 11:06     ` Jostein Kjønigsen
  4 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-17  6:15 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Thu, 16 Jun 2022 12:03:08 -0700
> 
> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.

And a function to add to the list, right?  Or does it already exist?



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  6:15     ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
@ 2022-06-17  7:17       ` Yuan Fu
  2022-06-17 10:37         ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Yuan Fu @ 2022-06-17  7:17 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 16, 2022, at 11:15 PM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Thu, 16 Jun 2022 12:03:08 -0700
>> 
>> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.
> 
> And a function to add to the list, right?  Or does it already exist?

Creating a parser automatically adds it to the parser list of a buffer. The purpose of the parser list is to update each parser when buffer content changed. So you don’t want to remove a parser from the list and add it back: it would be out-of-sync.

Yuan


^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Lower-level change hook immune to with-silent-modifications
  2022-06-17  5:28           ` Eli Zaretskii
@ 2022-06-17 10:10             ` Ihor Radchenko
  2022-06-17 11:03               ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-17 10:10 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Stefan Monnier, casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> As I just argued in bug#51766, I don't think it makes sense to try to
>> have such "a lower-level after-change-functions that is immune to
>> with-silent-modifications".
>
> I tend to agree.  We can discuss the specific needs that triggered
> that request, but by and large, we have a good reason to have
> inhibit-modification-hooks that affects any Lisp program that wants to
> know about buffer modifications.  That's the difference between the
> Lisp level and the lower-level code in C, which "knows everything",
> including when it isn't safe to use some data or some objects.

Now I am wondering why tree-sitter should be any different.
Apparently the existing after-change-functions functionality was not
sufficient for tree-sitter. Probably because of issues similar to
bug#51766. Can more fine-grained modification info be exposed to Elisp?

Best,
Ihor





^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  7:17       ` Yuan Fu
@ 2022-06-17 10:37         ` Eli Zaretskii
  2022-06-18  0:14           ` Yuan Fu
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-17 10:37 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Fri, 17 Jun 2022 00:17:54 -0700
> Cc: emacs-devel@gnu.org
> 
> >> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.
> > 
> > And a function to add to the list, right?  Or does it already exist?
> 
> Creating a parser automatically adds it to the parser list of a buffer.

Then removing a parser means we actually delete it?



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  5:23       ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
@ 2022-06-17 10:40         ` Ihor Radchenko
  2022-06-17 11:42           ` Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter) Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-17 10:40 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> I am asking in the interest of Org mode parser that is also parsing the
>> buffer AST and tracks buffer modifications.
>
> Please tell more about the need.  I'm not happy with exposing this to
> Lisp, and don't understand why the low-level parts of parsing the
> buffer AST should be written in Lisp in the first place.  The
> tree-sitter branch does this in C for that very reason.

AFAIK, tree-sitter branch does not do anything related to _writing_
parsers. Parsers are implemented via tree-sitter modules.

Org mode parses Org markup elements in buffer into AST structure.
This AST structure is used to fontify Org buffers, modify various
elements, query element properties, build lists of matching elements
according to user queries (agenda), etc

The Org mode parser is implementing pretty much the same features
tree-sitter provides (except that the relevant Org code was in place
before tree-sitter became a thing): Only parts of Org buffer are parsed
as needed; buffer modifications trigger updates only within the affected
parts of the AST.

Thanks to the parser, Org is able to handle quite large buffers. Our
parser written in Lisp and yet it can parse a 15Mb Org file within 17sec
vs. 8sec if parsed using the available incomplete tree-sitter Org parser
(https://github.com/milisims/tree-sitter-org).

Note that unlike tree-sitter, Org parser is able to change syntax using
Elisp. For example, adding new link element types is trivial with a
number of ol-*.el libraries provided by Org and third-party packages.

Moreover, the on-demand parsing makes even 15Mb Org files responsive on
runtime with little issues. I was able to get a bearable performance even
in 100Mb Org file.

Org mode parser with all its flexibility would be difficult to implement
using tree-sitter.

As for implementing in C, I am not even sure how to approach this. Emacs
does provide external module, but AFAIU modules communicate with Emacs
process via print-ing/read-ing strings and the internal Emacs-C
functions are not available. I am not convinced that the speed
difference will be worth it to bother rewriting the whole parser in
Emacs-C.

Best,
Ihor

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Lower-level change hook immune to with-silent-modifications
  2022-06-17 10:10             ` Ihor Radchenko
@ 2022-06-17 11:03               ` Eli Zaretskii
  0 siblings, 0 replies; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-17 11:03 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: monnier, casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: Stefan Monnier <monnier@iro.umontreal.ca>,  casouri@gmail.com,
>   emacs-devel@gnu.org
> Date: Fri, 17 Jun 2022 18:10:46 +0800
> 
> Eli Zaretskii <eliz@gnu.org> writes:
> 
> >> As I just argued in bug#51766, I don't think it makes sense to try to
> >> have such "a lower-level after-change-functions that is immune to
> >> with-silent-modifications".
> >
> > I tend to agree.  We can discuss the specific needs that triggered
> > that request, but by and large, we have a good reason to have
> > inhibit-modification-hooks that affects any Lisp program that wants to
> > know about buffer modifications.  That's the difference between the
> > Lisp level and the lower-level code in C, which "knows everything",
> > including when it isn't safe to use some data or some objects.
> 
> Now I am wondering why tree-sitter should be any different.
> Apparently the existing after-change-functions functionality was not
> sufficient for tree-sitter. Probably because of issues similar to
> bug#51766. Can more fine-grained modification info be exposed to Elisp?

tree-sitter isn't different: it does that in C, as part of the
low-level Emacs code which manipulates changes in buffer text.

My response above was about exposing that to Lisp, not about letting
features access buffer text in general.

after-change-functions is not the right tool for accessing buffer
text, they are a means to signal to Lisp that _some_ change happened
in buffer text which Lisp program _may_ wish to know about, and the
core reserves the right not to tell Lisp about some of the changes via
that hook.  Programs that _must_ know about each and every change in
buffer text cannot be written in Lisp, because there are changes that
I don't even know how to tell to a Lisp program in terms it will
understand.  For example, what if the buffer was changed from
multibyte to unibyte, or vice versa?  Or how to describe efficiently a
change in text properties?

Asking to have every aspect of the Emacs internals be exposed to Lisp
is NOT the right way of implementing features in Emacs!  Instead,
whenever the existing facilities are insufficient or don't allow you
to do the job, please describe the job you need to do, and let's
discuss how best to divide the implementation between the C primitives
(whether existing or new) and the Lisp application code.

Most of Emacs is written in Lisp to allow flexibility and safety, not
because we don't like C.  So the line that divides the C from the Lisp
parts of the implementation should consider which parts need to be
easily modified and which don't, and also which internals, if exposed
to Lisp, could easily lead to runaway applications wedging or crashing
Emacs.  These are non-trivial aspects, and the decision is not always
self-evident (though sometimes it is).

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-16 19:03   ` Yuan Fu
                       ` (3 preceding siblings ...)
  2022-06-17  6:15     ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
@ 2022-06-17 11:06     ` Jostein Kjønigsen
  2022-06-18  0:28       ` Yuan Fu
  4 siblings, 1 reply; 64+ messages in thread
From: Jostein Kjønigsen @ 2022-06-17 11:06 UTC (permalink / raw)
  To: Yuan Fu, Emacs Devel

[-- Attachment #1: Type: text/plain, Size: 9435 bytes --]

On 16.06.2022 21:03, Yuan Fu wrote:
> Hey,
>
> I’ve just finished with Real Life and got back to tree-sitter. I’ll reply to individual messages separately, but here is a summary of all the latest changes pushed to feature/tree-sitter
>
> - Now one can compile a query, compiled query is much faster than uncompiled queries.
> - Traversal functions now have a parameter that controls how deep to traverse.
> - Removed the ltree-sitter setting in configure.ac
> - Consolidated all the parser creation functions into one: treesit-parser-create, that means treesit-get-parser and treesit-get-parser-create are removed.
>
> I think these are all the pending requests (sans highlight-paren), please let me know if I missed anything.
>
> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list. Because I’m not comfortable letting users to remove and re-add parsers into the list anymore. Previously we determined that if a user wants to do the wrong thing, so be it. But now I realized that there could be danger in crashing Emacs if user fiddle with treesit-parser-list incorrectly (and violates some of the assertions I put in).
>
> Can I just add a new Lisp_Object field in struct buffer? I assume that’s how you add an internal buffer-local data?
>
> Yuan

Nice update! Good work!

Trying latest source from emacs feature/tree-sitter branch though, and 
updating my code to use treesite-parser-create rather than 
treesit-get-parser-create... I have emacs segfaulting because of a 
double-free.

    jostein@ThinkPad-T14s:~/build/emacs$ emacs
    double free or corruption (out)
    Fatal error 6: Aborted

Running it through gdb gets me this result:

    jostein@ThinkPad-T14s:~/build/emacs$ gdb
    /home/jostein/build/emacs/src/emacs
    GNU gdb (Ubuntu 12.0.90-0ubuntu1) 12.0.90
    Copyright (C) 2022 Free Software Foundation, Inc.
    License GPLv3+: GNU GPL version 3 or later
    <http://gnu.org/licenses/gpl.html>
    This is free software: you are free to change and redistribute it.
    There is NO WARRANTY, to the extent permitted by law.
    Type "show copying" and "show warranty" for details.
    This GDB was configured as "x86_64-linux-gnu".
    Type "show configuration" for configuration details.
    For bug reporting instructions, please see:
    <https://www.gnu.org/software/gdb/bugs/>.
    Find the GDB manual and other documentation resources online at:
    <http://www.gnu.org/software/gdb/documentation/>.

    For help, type "help".
    Type "apropos word" to search for commands related to "word"...
    Reading symbols from /home/jostein/build/emacs/src/emacs...
    (gdb) r
    Starting program: /home/jostein/build/emacs/src/emacs
    [Thread debugging using libthread_db enabled]
    Using host libthread_db library
    "/lib/x86_64-linux-gnu/libthread_db.so.1".
    [New Thread 0x7ffff11bc640 (LWP 54902)]
    [New Thread 0x7ffff086d640 (LWP 54903)]
    [New Thread 0x7fffebf3d640 (LWP 54904)]
    [New Thread 0x7fffeb5b4640 (LWP 54905)]
    [New Thread 0x7fffeadb3640 (LWP 54906)]
    [New Thread 0x7fffea50c640 (LWP 54907)]
    [Thread 0x7fffea50c640 (LWP 54907) exited]
    [New Thread 0x7fffea50c640 (LWP 54908)]
    [New Thread 0x7fffe9d0b640 (LWP 54909)]
    [Thread 0x7fffea50c640 (LWP 54908) exited]
    [Thread 0x7fffe9d0b640 (LWP 54909) exited]
    [New Thread 0x7fffe9d0b640 (LWP 54910)]
    [New Thread 0x7fffea50c640 (LWP 54911)]
    [Thread 0x7fffe9d0b640 (LWP 54910) exited]
    [Thread 0x7fffea50c640 (LWP 54911) exited]
    [Thread 0x7fffeadb3640 (LWP 54906) exited]
    [Detaching after vfork from child process 54913]
    [Detaching after vfork from child process 54914]
    [Detaching after vfork from child process 54922]
    [Detaching after vfork from child process 54924]
    [Detaching after vfork from child process 54929]
    [Detaching after vfork from child process 54930]
    [Detaching after vfork from child process 54964]
    [Detaching after vfork from child process 54965]
    [Detaching after vfork from child process 54994]
    [Detaching after vfork from child process 54995]
    [Detaching after vfork from child process 54996]
    [Detaching after vfork from child process 54997]
    [Detaching after vfork from child process 54998]
    [Detaching after vfork from child process 54999]
    [Detaching after vfork from child process 55001]
    [Detaching after vfork from child process 55003]
    [Detaching after vfork from child process 55004]
    [Thread 0x7fffeb5b4640 (LWP 54905) exited]
    [Detaching after vfork from child process 55044]
    [Detaching after vfork from child process 55045]
    [Detaching after vfork from child process 55046]
    [Detaching after vfork from child process 55047]
    [Detaching after vfork from child process 55048]

    Thread 1 "emacs" received signal SIGSEGV, Segmentation fault.
    0x00007ffff58de39f in ts_query_delete () from
    /usr/local/lib/libtree-sitter.so.0
    (gdb) bt
    #0  0x00007ffff58de39f in ts_query_delete () at
    /usr/local/lib/libtree-sitter.so.0
    #1  0x000055555573e849 in cleanup_vector (vector=<optimized out>) at
    alloc.c:3184
    #2  sweep_vectors () at alloc.c:3259
    #3  0x0000555555743a50 in gc_sweep () at alloc.c:7413
    #4  garbage_collect () at alloc.c:6259
    #5  0x0000555555743f11 in maybe_garbage_collect () at alloc.c:6108
    #6  0x0000555555765e85 in maybe_gc () at
    /home/jostein/build/emacs/src/lisp.h:5539
    #7  Ffuncall (nargs=nargs@entry=2, args=args@entry=0x7fffffff9310)
    at eval.c:2961
    #8  0x0000555555764751 in internal_condition_case_n
          (bfun=0x555555765cf0 <Ffuncall>, nargs=nargs@entry=2,
    args=args@entry=0x7fffffff9310, handlers=handlers@entry=0x30,
    hfun=hfun@entry=0x5555555ded20 <safe_eval_handler>) at eval.c:1565
    #9  0x00005555555c9e13 in safe__call
    (inhibit_quit=inhibit_quit@entry=false, nargs=nargs@entry=2,
    func=func@entry=0xb160, ap=ap@entry=0x7fffffff9390) at xdisp.c:3015
    #10 0x00005555555dd3b6 in safe_call (nargs=nargs@entry=2,
    func=func@entry=0xb160) at xdisp.c:3030
    #11 0x00005555555fed32 in safe_call1 (arg=0x55555643771d, fn=0xb160)
    at xdisp.c:3041
    #12 display_mode_lines (w=w@entry=0x555556437718) at xdisp.c:26098
    #13 0x0000555555614869 in redisplay_window (window=<optimized out>,
    just_this_one_p=<optimized out>) at xdisp.c:19894
    #14 0x0000555555618063 in redisplay_window_0
    (window=window@entry=0x55555643771d) at xdisp.c:17148
    #15 0x00005555557645fc in internal_condition_case_1
         (bfun=bfun@entry=0x555555618030 <redisplay_window_0>,
    arg=arg@entry=0x55555643771d, handlers=<optimized out>,
    hfun=hfun@entry=0x5555555c8ee0 <redisplay_window_error>) at eval.c:1509
    #16 0x00005555555caf49 in redisplay_windows (window=0x55555643771d)
    at xdisp.c:17128
    #17 0x00005555555ffe0d in redisplay_internal () at xdisp.c:16595
    #18 0x0000555555601414 in redisplay_preserve_echo_area
    (from_where=from_where@entry=9) at xdisp.c:16944
    #19 0x00005555557bdc4f in wait_reading_process_output
         (time_limit=time_limit@entry=0, nsecs=nsecs@entry=0,
    read_kbd=read_kbd@entry=-1, do_display=true,
    wait_for_cell=wait_for_cell@entry=0x0,
    wait_proc=wait_proc@entry=0x0, just_wait_proc=0) at process.c:5334
    #20 0x00005555556df7a7 in kbd_buffer_get_event (end_time=0x0,
    used_mouse_menu=0x7fffffffdb2b, kbp=<synthetic pointer>) at
    keyboard.c:3953
    #21 read_event_from_main_queue (end_time=<optimized out>,
    local_getcjmp=0x7fffffffd810, used_mouse_menu=0x7fffffffdb2b) at
    keyboard.c:2225
    #22 0x00005555556e55bb in read_decoded_event_from_main_queue
    (used_mouse_menu=<optimized out>, prev_event=<optimized out>,
    local_getcjmp=<optimized out>, end_time=<optimized out>) at
    keyboard.c:2288
    #23 read_char (commandflag=1, map=0x555558bfd8f3, prev_event=0x0,
    used_mouse_menu=0x7fffffffdb2b, end_time=0x0) at keyboard.c:2919
    #24 0x00005555556e7326 in read_key_sequence (keybuf=<optimized out>,
    prompt=0x0, dont_downcase_last=<optimized out>,
    can_return_switch_frame=true, fix_current_buffer=true,
    prevent_redisplay=<optimized out>)
         at keyboard.c:9965
    #25 0x00005555556e8fbc in command_loop_1 () at keyboard.c:1391
    #26 0x0000555555764567 in internal_condition_case
    (bfun=bfun@entry=0x5555556e8d70 <command_loop_1>,
    handlers=handlers@entry=0x90, hfun=hfun@entry=0x5555556dc5b0
    <cmd_error>) at eval.c:1485
    #27 0x00005555556d4c7e in command_loop_2
    (handlers=handlers@entry=0x90) at keyboard.c:1132
    #28 0x00005555557644a9 in internal_catch (tag=tag@entry=0xf6c0,
    func=func@entry=0x5555556d4c50 <command_loop_2>, arg=arg@entry=0x90)
    at eval.c:1208
    #29 0x00005555556d4c19 in command_loop () at keyboard.c:1110
    #30 0x00005555556dc108 in recursive_edit_1 () at keyboard.c:719
    #31 0x00005555556dc4b0 in Frecursive_edit () at keyboard.c:802
    #32 0x00005555555adf54 in main (argc=<optimized out>,
    argv=<optimized out>) at emacs.c:2518
    (gdb)


This is on "plain" Ubuntu 22.04, x86_64.

Emacs compiled using: ./configure --with-tree-sitter && make -j4

No other features or "experimental" things enabled.



-- 
Vennlig hilsen
*Jostein Kjønigsen*

jostein@kjonigsen.net 🍵 jostein@gmail.com
https://jostein.kjønigsen.no <https://jostein.kjønigsen.no>

[-- Attachment #2: Type: text/html, Size: 12337 bytes --]

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-17 10:40         ` Ihor Radchenko
@ 2022-06-17 11:42           ` Eli Zaretskii
  2022-06-18  5:52             ` Ihor Radchenko
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-17 11:42 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

[I've changed the Subject, since this is not longer about tree-sitter.]

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Fri, 17 Jun 2022 18:40:52 +0800
> 
> AFAIK, tree-sitter branch does not do anything related to _writing_
> parsers. Parsers are implemented via tree-sitter modules.

That is correct.  However, tree-sitter support is called for certain
changes in buffer text because tree-sitter needs direct and efficient
access to buffer text when those certain changes happen, and that
cannot be provided in Lisp.  There was a long discussion several
months ago where we came to this conclusion; the original design ideas
were different, and indeed at least some of them were based on
buffer-substring, which IMO is a terrible idea for this class of
features.

> Org mode parses Org markup elements in buffer into AST structure.
> This AST structure is used to fontify Org buffers, modify various
> elements, query element properties, build lists of matching elements
> according to user queries (agenda), etc
> 
> The Org mode parser is implementing pretty much the same features
> tree-sitter provides (except that the relevant Org code was in place
> before tree-sitter became a thing): Only parts of Org buffer are parsed
> as needed; buffer modifications trigger updates only within the affected
> parts of the AST.

OK, but that still doesn't tell what you need from the Emacs core.
Can you describe those needs?  I presume that modification hooks (of
any kind) are just the means; the real need is something else.  What
is it?  If (as I presume) you need to know about changes to the
buffer, then can you enumerate the changes that are of interest?  For
example, are changes in text properties and overlays of interest, and
if so, what kind of properties/overlays?  (But please don't limit your
answers to just text properties and overlays, because I asked about
them explicitly.)

Next, what kind of ASTs do you want to build, and how do you
represent text as AST?  In particular, is the AST defined by regexps
or some other Lisp data structures?

> As for implementing in C, I am not even sure how to approach this.

This is what needs to be discussed.  Emacs does have features
implemented partially in Lisp and partially in C, so this is not
impossible, far from that.  One example that comes to mind is
character composition -- a feature of the display engine that is
completely controlled by Lisp data structures that can be easily
changed at run time.  So, once we understand the needs and the
requirements, I'm quite sure ideas about the possible implementations
will not have us waiting for long.

> Emacs does provide external module, but AFAIU modules communicate
> with Emacs process via print-ing/read-ing strings and the internal
> Emacs-C functions are not available. I am not convinced that the
> speed difference will be worth it to bother rewriting the whole
> parser in Emacs-C.

I wasn't suggesting using modules.  Modules are intentionally limited
in their access to the Emacs internals.  For a core feature like the
one you are describing, using modules makes no sense at all.  No, I
was talking about providing new primitives and/or extending existing
primitives in order to support these features you want to provide in
Org (and also potentially to enable implementation of other similar
features by other packages).

As for speed, I suggest to delay the discussion of that until we have
a better understanding of the requirements and their various aspects,
and have some ideas regarding the possible implementations.  Even if
eventually there will be no gain in speed (and I find that hard to
believe), the safety of keeping some of the implementation un-exposed
to Lisp could well be worth our while.  Speed alone is not a
good-enough reason to implement something in C, especially if Lisp
performance is acceptable.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* RE: [External] : Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  1:11       ` Yuan Fu
@ 2022-06-17 14:22         ` Drew Adams
  0 siblings, 0 replies; 64+ messages in thread
From: Drew Adams @ 2022-06-17 14:22 UTC (permalink / raw)
  To: Yuan Fu; +Cc: Emacs Devel

> >> I’ve just finished with Real Life and got back to tree-sitter.
> >
> > The afterlife revealed, finally. ;-)
> 
> I’ll be very glad if my afterlife includes hacking Emacs.

You may need to autoload package AfterLife,
or perhaps add a suitable function to
`after-life-hook'.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17  1:24     ` Po Lu
@ 2022-06-18  0:09       ` Yuan Fu
  0 siblings, 0 replies; 64+ messages in thread
From: Yuan Fu @ 2022-06-18  0:09 UTC (permalink / raw)
  To: Po Lu; +Cc: Emacs Devel



> On Jun 16, 2022, at 6:24 PM, Po Lu <luangruo@yahoo.com> wrote:
> 
> Yuan Fu <casouri@gmail.com> writes:
> 
>> Can I just add a new Lisp_Object field in struct buffer? I assume
>> that’s how you add an internal buffer-local data?
> 
> Yes.  Make sure the field is placed before
> `cursor_in_non_selected_windows_', or it won't be traced by GC.
> 
> Also make sure to access it using the `BVAR' macro and add a
> corresponding `DEFVAR_PER_BUFFER' form in syms_of_buffer if it's
> supposed to be a buffer local variable.

I don’t plan to expose it as a variable, so I don’t need DEFVAR_PER_BUFFER, is that correct?

Yuan


^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17 10:37         ` Eli Zaretskii
@ 2022-06-18  0:14           ` Yuan Fu
  2022-06-18  6:22             ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Yuan Fu @ 2022-06-18  0:14 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 17, 2022, at 3:37 AM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Fri, 17 Jun 2022 00:17:54 -0700
>> Cc: emacs-devel@gnu.org
>> 
>>>> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.
>>> 
>>> And a function to add to the list, right?  Or does it already exist?
>> 
>> Creating a parser automatically adds it to the parser list of a buffer.
> 
> Then removing a parser means we actually delete it?

Not sure what do you men “delete”. Treesit-parser-delete removes the parser from the parser list of a buffer, so it is never kept up-to-date with that buffer again. But you can still do stuff with it until it is gc’ed. I probably should add checks that prohibit using a parser after it has been deleted.

Yuan


^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-17 11:06     ` Jostein Kjønigsen
@ 2022-06-18  0:28       ` Yuan Fu
  2022-06-18 20:57         ` Jostein Kjønigsen
  0 siblings, 1 reply; 64+ messages in thread
From: Yuan Fu @ 2022-06-18  0:28 UTC (permalink / raw)
  To: jostein; +Cc: Emacs Devel, Yoav Marco


> I tried to run the benchmarks again real quick, and ran into a segfault.
> It occurs in the call to ts_query_delete in cleanup_vector when
> garbage collecting.
> 
> I'll try to gather more info tomorrow, going to bed now.
> 
> Yoav


> 
> Nice update! Good work!
> 
> Trying latest source from emacs feature/tree-sitter branch though, and updating my code to use treesite-parser-create rather than treesit-get-parser-create... I have emacs segfaulting because of a double-free.


I’ve figure out the problem. It is due to my misunderstanding of how gc works. I’ve pushed a fix.

On a separate note, I also pushed the change that makes treesit-parser-list a function (rather than a variable).

Yuan


^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-17 11:42           ` Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter) Eli Zaretskii
@ 2022-06-18  5:52             ` Ihor Radchenko
  2022-06-18  7:01               ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-18  5:52 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

> [I've changed the Subject, since this is not longer about tree-sitter.]

Well. I had some hope that we can generalize the tree-sitter interface
to allow Elisp-based parsers, but it is just a wish.

> OK, but that still doesn't tell what you need from the Emacs core.
> Can you describe those needs?  I presume that modification hooks (of
> any kind) are just the means; the real need is something else.  What
> is it?  If (as I presume) you need to know about changes to the
> buffer, then can you enumerate the changes that are of interest?  For
> example, are changes in text properties and overlays of interest, and
> if so, what kind of properties/overlays?  (But please don't limit your
> answers to just text properties and overlays, because I asked about
> them explicitly.)

Valid question. I am a bit too familiar with Org parser code and assume
that some things are "obvious" when they are not.

I will first answer about AST.

> Next, what kind of ASTs do you want to build, and how do you
> represent text as AST?  In particular, is the AST defined by regexps
> or some other Lisp data structures?

Org AST represents semantic objects using nested lists.
Similar to tree-sitter (AFAIU), each object in the tree is represented
by

(object-type (object-plist) object-children ...)

for example:

* test headline :tag:

is represented as

(headline
  (:raw-value "test headline" :begin 292 :end 314 ... :tags ("tag") ... :parent (...))
  ;; no children
   )

Upon modifying text inside the headline, we need to update :begin/:end
properties to reflect the new headline boundaries in buffer and possibly
update headline properties (e.g. :tags).

The same should be done for all the elements containing the headline.

Updating the elements require the following information:

1. Whether modified text contained terminal symbols or text contributing
   to object-plist _before_ modification.
2. The boundaries of the edited text in buffer and change in the text
   length.
3. Whether the modified text contain terminal symbols/text contributing
   to object-plist _after_ modification.

Org does not care about text property changes or overlay changes.
We just perform a series of regexp searches over the changed parts of
buffer (possibly with extended boundaries) before and after the
modification + know which region of text has been modified (its begin,
end, and change in length).

Missing any significant change (the one involving terminal symbols or
changing region length) will make the AST invalid.

Hope it clarifies the needs.

Best,
Ihor

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  0:14           ` Yuan Fu
@ 2022-06-18  6:22             ` Eli Zaretskii
  2022-06-18  8:25               ` Yuan Fu
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-18  6:22 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Fri, 17 Jun 2022 17:14:58 -0700
> Cc: emacs-devel@gnu.org
> 
> >>>> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.
> >>> 
> >>> And a function to add to the list, right?  Or does it already exist?
> >> 
> >> Creating a parser automatically adds it to the parser list of a buffer.
> > 
> > Then removing a parser means we actually delete it?
> 
> Not sure what do you men “delete”.

If creating a parser adds it to the list, then I guessed the semantics
of removing from the list is the opposite: having the parser no longer
exist, i.e. "delete" it.

But now I'm confused by what you say here:

> Treesit-parser-delete removes the parser from the parser list of a buffer, so it is never kept up-to-date with that buffer again. But you can still do stuff with it until it is gc’ed.

If we already have treesit-parser-delete, and that call removes the
parser from the list, then why would we need a function "to remove a
parser from the list"?  It sounds like treesit-parser-delete already
does it?



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-18  5:52             ` Ihor Radchenko
@ 2022-06-18  7:01               ` Eli Zaretskii
  2022-06-18  7:23                 ` Ihor Radchenko
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-18  7:01 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Sat, 18 Jun 2022 13:52:59 +0800
> 
> Org AST represents semantic objects using nested lists.
> Similar to tree-sitter (AFAIU), each object in the tree is represented
> by
> 
> (object-type (object-plist) object-children ...)
> 
> for example:
> 
> * test headline :tag:
> 
> is represented as
> 
> (headline
>   (:raw-value "test headline" :begin 292 :end 314 ... :tags ("tag") ... :parent (...))
>   ;; no children
>    )
> 
> Upon modifying text inside the headline, we need to update :begin/:end
> properties to reflect the new headline boundaries in buffer and possibly
> update headline properties (e.g. :tags).
> 
> The same should be done for all the elements containing the headline.

Where you care about changes in buffer positions of the AST elements,
markers should take care of most, if not all, of them.  I presume you
already do use markers wherever possible? if not, why not?  Or what am
I missing?

> Updating the elements require the following information:
> 
> 1. Whether modified text contained terminal symbols or text contributing
>    to object-plist _before_ modification.
> 2. The boundaries of the edited text in buffer and change in the text
>    length.
> 3. Whether the modified text contain terminal symbols/text contributing
>    to object-plist _after_ modification.
> 
> Org does not care about text property changes or overlay changes.
> We just perform a series of regexp searches over the changed parts of
> buffer (possibly with extended boundaries) before and after the
> modification + know which region of text has been modified (its begin,
> end, and change in length).
> 
> Missing any significant change (the one involving terminal symbols or
> changing region length) will make the AST invalid.

Why would you miss significant changes if you base your implementation
on buffer-modification hooks?  If there are some situations where
buffer text is modified in ways that are significant for the update
of the AST, but buffer-modification hooks are NOT called, please
describe some of those situations, so we will have something concrete
to talk about.

IOW, I still don't see from the above description why markers and
buffer-modification hooks couldn't do the job, and you would need a
lower-level hook into buffer text change machinery.  I guess that
would require a more detailed description of the job at hand?



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-18  7:01               ` Eli Zaretskii
@ 2022-06-18  7:23                 ` Ihor Radchenko
  2022-06-18  7:44                   ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-18  7:23 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> (headline
>>   (:raw-value "test headline" :begin 292 :end 314 ... :tags ("tag") ... :parent (...))
>>   ;; no children
>>    )
>> 
>> Upon modifying text inside the headline, we need to update :begin/:end
>> properties to reflect the new headline boundaries in buffer and possibly
>> update headline properties (e.g. :tags).
>> 
>> The same should be done for all the elements containing the headline.
>
> Where you care about changes in buffer positions of the AST elements,
> markers should take care of most, if not all, of them.  I presume you
> already do use markers wherever possible? if not, why not?  Or what am
> I missing?

Because lots of markers degrade Emacs regex search performance
tremendously.

See https://list.orgmode.org/orgmode/scedec$2g0$1@ciao.gmane.io/
and https://orgmode.org/list/87y21wkdwu.fsf@localhost

>> Missing any significant change (the one involving terminal symbols or
>> changing region length) will make the AST invalid.
>
> Why would you miss significant changes if you base your implementation
> on buffer-modification hooks?  If there are some situations where
> buffer text is modified in ways that are significant for the update
> of the AST, but buffer-modification hooks are NOT called, please
> describe some of those situations, so we will have something concrete
> to talk about.

The situation is third-party code doing bloody murder with

(with-silent-modifications
 (insert "Some text not triggering modification hooks))

Another scenario is modifying text in indirect buffers created with
make-indirect-buffer. (where there is no chance to install
before/after-change-functions via clone-indirect-buffer-hook).

Best,
Ihor



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-18  7:23                 ` Ihor Radchenko
@ 2022-06-18  7:44                   ` Eli Zaretskii
  2022-06-18  8:13                     ` Ihor Radchenko
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-18  7:44 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Sat, 18 Jun 2022 15:23:51 +0800
> 
> > Where you care about changes in buffer positions of the AST elements,
> > markers should take care of most, if not all, of them.  I presume you
> > already do use markers wherever possible? if not, why not?  Or what am
> > I missing?
> 
> Because lots of markers degrade Emacs regex search performance
> tremendously.
> 
> See https://list.orgmode.org/orgmode/scedec$2g0$1@ciao.gmane.io/
> and https://orgmode.org/list/87y21wkdwu.fsf@localhost

AFAIU, the right fix for this is to fix performance degradation when a
buffer has many markers, not avoiding the use of markers.

Here's one conclusion from this discussion that indicates changes
required to be done in core (other than a low-level modification hook
for buffer text) to take care of your AST implementation.

We already have a TODO item for making markers more efficient; any
takers?

> > Why would you miss significant changes if you base your implementation
> > on buffer-modification hooks?  If there are some situations where
> > buffer text is modified in ways that are significant for the update
> > of the AST, but buffer-modification hooks are NOT called, please
> > describe some of those situations, so we will have something concrete
> > to talk about.
> 
> The situation is third-party code doing bloody murder with
> 
> (with-silent-modifications
>  (insert "Some text not triggering modification hooks))
> 
> Another scenario is modifying text in indirect buffers created with
> make-indirect-buffer. (where there is no chance to install
> before/after-change-functions via clone-indirect-buffer-hook).

In at least the latter case the idea for a proper solution was
outlined by Stefan.

For other cases, I think a careful discussion on a case by case basis
will show the path towards solving each one of them.  It is possible
that some of them require further changes in core, but we won't know
until we discuss the details of each case.



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter)
  2022-06-18  7:44                   ` Eli Zaretskii
@ 2022-06-18  8:13                     ` Ihor Radchenko
  2022-06-18  8:47                       ` Exposing buffer text modifications to Lisp Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-18  8:13 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> Because lots of markers degrade Emacs regex search performance
>> tremendously.
>> 
>> See https://list.orgmode.org/orgmode/scedec$2g0$1@ciao.gmane.io/
>> and https://orgmode.org/list/87y21wkdwu.fsf@localhost
>
> AFAIU, the right fix for this is to fix performance degradation when a
> buffer has many markers, not avoiding the use of markers.
>
> Here's one conclusion from this discussion that indicates changes
> required to be done in core (other than a low-level modification hook
> for buffer text) to take care of your AST implementation.
>
> We already have a TODO item for making markers more efficient; any
> takers?

This is trickier than it may appear.
Each element in Org AST has 3-7 markers.
My real-life large org buffer contains ~200k Org syntax elements
(actually more, but not all the elements are ever queried).
So, we are talking about 600k-1.4M markers in buffer if Org AST were to
use markers.

Now, imagine an edit somewhere near the beginning of Org buffer. Such
edit means that Emacs will have to shift positions of nearly all the
markers in the buffer. All the >1M markers. On every
self-insert-command.

Org parser goes around this issue by updating AST positions on idle and
maintaining asynchronous request queue. This works relatively well
because AST queries are skewed to be near the buffer region being
edited. I am not sure if similar approach (not trivial to start with)
can be efficiently utilized by Emacs. IDK the typical marker access
pattern in Emacs core.

Probably, Emacs may need to implement an alternative data structure to
store markers and allow efficient batch-shifting of the markers. Again,
not trivial.

>> The situation is third-party code doing bloody murder with
>> 
>> (with-silent-modifications
>>  (insert "Some text not triggering modification hooks))
>> 
>> Another scenario is modifying text in indirect buffers created with
>> make-indirect-buffer. (where there is no chance to install
>> before/after-change-functions via clone-indirect-buffer-hook).
>
> In at least the latter case the idea for a proper solution was
> outlined by Stefan.

I haven't read through his email carefully yet. A quick response is that
I have seen a lot of code in the wild that simply uses
make-indirect-buffer. Expecting compliance is unreliable in practice. (I
may need to think more about this though)

Best,
Ihor

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  6:22             ` Eli Zaretskii
@ 2022-06-18  8:25               ` Yuan Fu
  2022-06-18  8:50                 ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Yuan Fu @ 2022-06-18  8:25 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 17, 2022, at 11:22 PM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Fri, 17 Jun 2022 17:14:58 -0700
>> Cc: emacs-devel@gnu.org
>> 
>>>>>> Moving forward, I want to make treesit-parser-list internal and turn it into a function that returns the parser list. And add a function to remove a parser from the parser list.
>>>>> 
>>>>> And a function to add to the list, right?  Or does it already exist?
>>>> 
>>>> Creating a parser automatically adds it to the parser list of a buffer.
>>> 
>>> Then removing a parser means we actually delete it?
>> 
>> Not sure what do you men “delete”.
> 
> If creating a parser adds it to the list, then I guessed the semantics
> of removing from the list is the opposite: having the parser no longer
> exist, i.e. "delete" it.
> 
> But now I'm confused by what you say here:
> 
>> Treesit-parser-delete removes the parser from the parser list of a buffer, so it is never kept up-to-date with that buffer again. But you can still do stuff with it until it is gc’ed.
> 
> If we already have treesit-parser-delete, and that call removes the
> parser from the list, then why would we need a function "to remove a
> parser from the list"?  It sounds like treesit-parser-delete already
> does it?

Yeah. There is no other function, treesit-parser-delete deletes and removes the parser. Though I don’t know how can you make a Lisp_Object “no longer exist”. Normally we just turn #<stuff> into #<deleted stuff>, right?

Yuan


^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-18  8:13                     ` Ihor Radchenko
@ 2022-06-18  8:47                       ` Eli Zaretskii
  2022-06-20 11:58                         ` Ihor Radchenko
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-18  8:47 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Sat, 18 Jun 2022 16:13:13 +0800
> 
> > AFAIU, the right fix for this is to fix performance degradation when a
> > buffer has many markers, not avoiding the use of markers.
> >
> > Here's one conclusion from this discussion that indicates changes
> > required to be done in core (other than a low-level modification hook
> > for buffer text) to take care of your AST implementation.
> >
> > We already have a TODO item for making markers more efficient; any
> > takers?
> 
> This is trickier than it may appear.
> Each element in Org AST has 3-7 markers.
> My real-life large org buffer contains ~200k Org syntax elements
> (actually more, but not all the elements are ever queried).
> So, we are talking about 600k-1.4M markers in buffer if Org AST were to
> use markers.
> 
> Now, imagine an edit somewhere near the beginning of Org buffer. Such
> edit means that Emacs will have to shift positions of nearly all the
> markers in the buffer. All the >1M markers. On every
> self-insert-command.

The inner loop of adjust_markers_for_insert is just 40 machine
instructions.  (This is in unoptimized code; it could be fewer
instruction in an optimized build.)  Assuming a 3GHz CPU clock, 40
instructions should take just 13 nsec, and 1 million of these should
take 13 milliseconds -- a very short time indeed.  I expect that to be
between 5 and 7 msec in an optimized build.

(Compare that with inserting the characters itself: the first
insertion could potentially mean moving the gap, which in a large
buffer means moving megabytes of bytes -- not a negligible feat.)

So I don't think the performance degradation due to markers is because
the insert/delete operations on buffer text need to update many
markers.  I think the real slowdown comes from the functions which
convert character positions to byte positions and vice versa: these
use markers.  There are a lot of such calls all over our code, and
that's where the current linear-linked-list implementation of markers
slows us down.

Of course, the right method to show the bottleneck(s) is to profile
the code with a tool like 'prof', and take it from there.  So here's
one more interesting job for someone to volunteer.

> Org parser goes around this issue by updating AST positions on idle and
> maintaining asynchronous request queue. This works relatively well
> because AST queries are skewed to be near the buffer region being
> edited. I am not sure if similar approach (not trivial to start with)
> can be efficiently utilized by Emacs. IDK the typical marker access
> pattern in Emacs core.

If you already have a workaround for marker-related problems, then why
do you need to hook into insertion and deletion on the lowest level?

> >> The situation is third-party code doing bloody murder with
> >> 
> >> (with-silent-modifications
> >>  (insert "Some text not triggering modification hooks))
> >> 
> >> Another scenario is modifying text in indirect buffers created with
> >> make-indirect-buffer. (where there is no chance to install
> >> before/after-change-functions via clone-indirect-buffer-hook).
> >
> > In at least the latter case the idea for a proper solution was
> > outlined by Stefan.
> 
> I haven't read through his email carefully yet. A quick response is that
> I have seen a lot of code in the wild that simply uses
> make-indirect-buffer. Expecting compliance is unreliable in practice. (I
> may need to think more about this though)

If this is a frequent problem, perhaps we should introduce a special
variant of add-hook which would cater to the indirect-buffer case.
Discussion of several such cases could point to that conclusion, or it
can point to something else.

And that is my long-standing gripe aimed at developers of 3rd party
packages: they should come here (or bug-gnu-emacs@gnu.org) and present
the cases where they needed some missing infrastructure, instead of
trying to jump through hoops to work around what they perceive as
Emacs restrictions that (they think) cannot be possibly lifted.  Doing
the former will have at least two benefits: (a) it will facilitate
Emacs development into a better platform, and (b) it will avoid giving
birth to some of the horrible kludges out there, which eventually
don't work well enough, and thus make Emacs seem less professional
than it should be.

And if that is my expectation from developers of 3rd party packages, I
definitely expect that from packages that are bundled, such as Org.
Since Org is basically part of the core Emacs, it makes little sense
to me to realize that it goes to such lengths trying to work around
the limitations, instead of asking the core team to improve the
existing implementation or add some missing ones.  I could perhaps
understand if the request existed, but no one volunteered to work on
it, but not having the requests in the first place I cannot
understand.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  8:25               ` Yuan Fu
@ 2022-06-18  8:50                 ` Eli Zaretskii
  2022-06-18 20:07                   ` Yuan Fu
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-18  8:50 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Sat, 18 Jun 2022 01:25:01 -0700
> Cc: emacs-devel@gnu.org
> 
> >>> Then removing a parser means we actually delete it?
> >> 
> >> Not sure what do you men “delete”.
> > 
> > If creating a parser adds it to the list, then I guessed the semantics
> > of removing from the list is the opposite: having the parser no longer
> > exist, i.e. "delete" it.
> > 
> > But now I'm confused by what you say here:
> > 
> >> Treesit-parser-delete removes the parser from the parser list of a buffer, so it is never kept up-to-date with that buffer again. But you can still do stuff with it until it is gc’ed.
> > 
> > If we already have treesit-parser-delete, and that call removes the
> > parser from the list, then why would we need a function "to remove a
> > parser from the list"?  It sounds like treesit-parser-delete already
> > does it?
> 
> Yeah. There is no other function, treesit-parser-delete deletes and removes the parser.

So you agree with me that a function to remove from the list is not
needed?  Once the list is no longer exposed to Lisp, the way Lisp
programs should manipulate the list is by adding and deleting parsers,
and by asking Emacs to show the list of existing parsers.  Right?

> Though I don’t know how can you make a Lisp_Object “no longer exist”. Normally we just turn #<stuff> into #<deleted stuff>, right?

Yes, we leave the actual deleting to GC.



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  8:50                 ` Eli Zaretskii
@ 2022-06-18 20:07                   ` Yuan Fu
  2022-06-19  5:39                     ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Yuan Fu @ 2022-06-18 20:07 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 18, 2022, at 1:50 AM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Sat, 18 Jun 2022 01:25:01 -0700
>> Cc: emacs-devel@gnu.org
>> 
>>>>> Then removing a parser means we actually delete it?
>>>> 
>>>> Not sure what do you men “delete”.
>>> 
>>> If creating a parser adds it to the list, then I guessed the semantics
>>> of removing from the list is the opposite: having the parser no longer
>>> exist, i.e. "delete" it.
>>> 
>>> But now I'm confused by what you say here:
>>> 
>>>> Treesit-parser-delete removes the parser from the parser list of a buffer, so it is never kept up-to-date with that buffer again. But you can still do stuff with it until it is gc’ed.
>>> 
>>> If we already have treesit-parser-delete, and that call removes the
>>> parser from the list, then why would we need a function "to remove a
>>> parser from the list"?  It sounds like treesit-parser-delete already
>>> does it?
>> 
>> Yeah. There is no other function, treesit-parser-delete deletes and removes the parser.
> 
> So you agree with me that a function to remove from the list is not
> needed?  Once the list is no longer exposed to Lisp, the way Lisp
> programs should manipulate the list is by adding and deleting parsers,
> and by asking Emacs to show the list of existing parsers.  Right?

I don’t think we have any disagreement here, it’s just my miscommunication. We have three functions:
- treesit-parser-create that creates a parser and adds it to the parser list
- treesit-parser-delete that deletes a parser and removes it from the parser list
- treesit-parser-list that returns the parser list

Yuan


^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18  0:28       ` Yuan Fu
@ 2022-06-18 20:57         ` Jostein Kjønigsen
  0 siblings, 0 replies; 64+ messages in thread
From: Jostein Kjønigsen @ 2022-06-18 20:57 UTC (permalink / raw)
  To: Yuan Fu, jostein; +Cc: Emacs Devel, Yoav Marco

[-- Attachment #1: Type: text/plain, Size: 759 bytes --]


On 18.06.2022 02:28, Yuan Fu wrote:
>
> I’ve figure out the problem. It is due to my misunderstanding of how gc works. I’ve pushed a fix.
>
> On a separate note, I also pushed the change that makes treesit-parser-list a function (rather than a variable).
>
> Yuan

Just FYI: Getting latest sources and compiling from scratch, I get the 
following build warning:

    In treesit-traverse-forward:
    treesit.el:291:2: Warning: docstring has wrong usage of unescaped
    single quotes (use \= or different quoting)

Besides that, I've tested your changes and they definitely fix the 
segfault. Great stuff!

-- 
Vennlig hilsen
*Jostein Kjønigsen*

jostein@kjonigsen.net 🍵 jostein@gmail.com
https://jostein.kjønigsen.no <https://jostein.kjønigsen.no>

[-- Attachment #2: Type: text/html, Size: 1592 bytes --]

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-18 20:07                   ` Yuan Fu
@ 2022-06-19  5:39                     ` Eli Zaretskii
  2022-06-20  3:00                       ` Yuan Fu
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-19  5:39 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Sat, 18 Jun 2022 13:07:07 -0700
> Cc: emacs-devel@gnu.org
> 
> > So you agree with me that a function to remove from the list is not
> > needed?  Once the list is no longer exposed to Lisp, the way Lisp
> > programs should manipulate the list is by adding and deleting parsers,
> > and by asking Emacs to show the list of existing parsers.  Right?
> 
> I don’t think we have any disagreement here, it’s just my miscommunication. We have three functions:
> - treesit-parser-create that creates a parser and adds it to the parser list
> - treesit-parser-delete that deletes a parser and removes it from the parser list
> - treesit-parser-list that returns the parser list

Right, that's exactly what I meant.



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-19  5:39                     ` Eli Zaretskii
@ 2022-06-20  3:00                       ` Yuan Fu
  2022-06-20 11:44                         ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Yuan Fu @ 2022-06-20  3:00 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel

I added navigation functions like treesit-beginning/end-of-defun, and added search functions like treesit-search-beginning/end. Now I wonder where should I put them in the manual, do I put them under the treesit section (Parsing Program Source), or under the relevant existing sections in the manual? By revenant sections I mean put treesit-beginning/end-of-defun in the same section as beginning/end-of-defun, etc.

Treesit-beginning/end-of-defun jumps to the beginning/end of the current defun form, treesit-search-beginning searches for a query and stops at the beginning/end of the node that matched the query.

Yuan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-20  3:00                       ` Yuan Fu
@ 2022-06-20 11:44                         ` Eli Zaretskii
  2022-06-20 20:01                           ` Yuan Fu
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-20 11:44 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Sun, 19 Jun 2022 20:00:49 -0700
> Cc: emacs-devel@gnu.org
> 
> 
> I added navigation functions like treesit-beginning/end-of-defun, and added search functions like treesit-search-beginning/end. Now I wonder where should I put them in the manual, do I put them under the treesit section (Parsing Program Source), or under the relevant existing sections in the manual?

The latter, please.

But why do we need a separate description for the tree-sitter
variants?  Shouldn't that be automatically supported by
beginning/end-of-defun, once some switch is thrown to enable
tree-sitter?

And if beginning/end-of-defun is for some reason too low-level/basic
for this role (but if you think so, please explain why), then I think
we need higher-level functions that by default are just thin wrappers
around beginning/end-of-defun, and will call tree-sitter versions when
Emacs is configured to do so.

I mean, it would be very cumbersome to request that each and every
major mode which wants to use tree-sitter will have to explicitly call
treesit-SOMETHING everywhere.

> Treesit-beginning/end-of-defun jumps to the beginning/end of the current defun form, treesit-search-beginning searches for a query and stops at the beginning/end of the node that matched the query.

So you are saying treesit-beginning/end-of-defun don't actually look
for beginning and end of a function, but for beginning and end of a
more abstract entity?  Then perhaps it would be wrong to have "defun"
in their names?  And in that case, maybe a separate section (under
"Motion") is better after all, since this is no longer "List Motion",
strictly speaking.

Thanks.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-18  8:47                       ` Exposing buffer text modifications to Lisp Eli Zaretskii
@ 2022-06-20 11:58                         ` Ihor Radchenko
  2022-06-20 12:32                           ` Eli Zaretskii
  2022-06-20 14:33                           ` Alan Mackenzie
  0 siblings, 2 replies; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-20 11:58 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> > We already have a TODO item for making markers more efficient; any
>> > takers?
>> 
>> This is trickier than it may appear.
>> Each element in Org AST has 3-7 markers.
>> My real-life large org buffer contains ~200k Org syntax elements
>> (actually more, but not all the elements are ever queried).
>> So, we are talking about 600k-1.4M markers in buffer if Org AST were to
>> use markers.
>> 
>> Now, imagine an edit somewhere near the beginning of Org buffer. Such
>> edit means that Emacs will have to shift positions of nearly all the
>> markers in the buffer. All the >1M markers. On every
>> self-insert-command.
>
> The inner loop of adjust_markers_for_insert is just 40 machine
> instructions.  (This is in unoptimized code; it could be fewer
> instruction in an optimized build.)  Assuming a 3GHz CPU clock, 40
> instructions should take just 13 nsec, and 1 million of these should
> take 13 milliseconds -- a very short time indeed.  I expect that to be
> between 5 and 7 msec in an optimized build.
>
> (Compare that with inserting the characters itself: the first
> insertion could potentially mean moving the gap, which in a large
> buffer means moving megabytes of bytes -- not a negligible feat.)

Noted.
Does Emacs C code provide any generic tree structure implementation?

> So I don't think the performance degradation due to markers is because
> the insert/delete operations on buffer text need to update many
> markers.  I think the real slowdown comes from the functions which
> convert character positions to byte positions and vice versa: these
> use markers.  There are a lot of such calls all over our code, and
> that's where the current linear-linked-list implementation of markers
> slows us down.
>
> Of course, the right method to show the bottleneck(s) is to profile
> the code with a tool like 'prof', and take it from there.  So here's
> one more interesting job for someone to volunteer.

That's what I did in https://orgmode.org/list/87y21wkdwu.fsf@localhost:

>>> The bottleneck appears to be buf_bytepos_to_charpos, called by
>>> BYTE_TO_CHAR macro, which, in turn, is used by set_search_regs

>>> buf_bytepos_to_charpos contains the following loop:
>>> 
>>>   for (tail = BUF_MARKERS (b); tail; tail = tail->next)
>>>     {
>>>       CONSIDER (tail->bytepos, tail->charpos);
>>> 
>>>       /* If we are down to a range of 50 chars,
>>> 	 don't bother checking any other markers;
>>> 	 scan the intervening chars directly now.  */
>>>       if (best_above - bytepos < distance
>>>           || bytepos - best_below < distance)
>>> 	break;
>>>       else
>>>         distance += BYTECHAR_DISTANCE_INCREMENT;
>>>     }
>>> 
>>> I am not sure if I understand the code correctly, but that loop is
>>> clearly scaling performance with the number of markers

>> Org parser goes around this issue by updating AST positions on idle and
>> maintaining asynchronous request queue. This works relatively well
>> because AST queries are skewed to be near the buffer region being
>> edited. I am not sure if similar approach (not trivial to start with)
>> can be efficiently utilized by Emacs. IDK the typical marker access
>> pattern in Emacs core.
>
> If you already have a workaround for marker-related problems, then why
> do you need to hook into insertion and deletion on the lowest level?

Because the workaround relies on before/after-change-functions that may
be suppressed by bad third-party code.

Also, markers will not solve all the needs of Org parser even when they
become more efficient. As I mentioned earlier, we also need to keep
track whether terminal symbols appear in the changed text before/after
modification. It boils down to matching regexps around changed region in
buffer before/after each modification. Suppressed
before/after-change-functions ruin this logic as well.

> And that is my long-standing gripe aimed at developers of 3rd party
> packages: they should come here (or bug-gnu-emacs@gnu.org) and present
> the cases where they needed some missing infrastructure, instead of
> trying to jump through hoops to work around what they perceive as
> Emacs restrictions that (they think) cannot be possibly lifted.  Doing
> the former will have at least two benefits: (a) it will facilitate
> Emacs development into a better platform, and (b) it will avoid giving
> birth to some of the horrible kludges out there, which eventually
> don't work well enough, and thus make Emacs seem less professional
> than it should be.
>
> And if that is my expectation from developers of 3rd party packages, I
> definitely expect that from packages that are bundled, such as Org.
> Since Org is basically part of the core Emacs, it makes little sense
> to me to realize that it goes to such lengths trying to work around
> the limitations, instead of asking the core team to improve the
> existing implementation or add some missing ones.  I could perhaps
> understand if the request existed, but no one volunteered to work on
> it, but not having the requests in the first place I cannot
> understand.

I think I need to clarify my position here.

The important thing you need to know about Org is that it does not only
support Emacs version Org is bundled with.
We currently support Emacs >=26. See
https://orgmode.org/worg/org-maintenance.html#emacs-compatibility

So, any major feature implemented in the development version of Emacs
cannot be easily used. The new feature will mean doubling the relevant
code on Org side: (1) supporting the new feature; (2) compatibility
layer to support older Emacs versions. Which means extra maintenance.

When I am also asked to implement the patch for this new feature for
Emacs, I get triple work.

Moreover, my previous attempt to propose a patch required for Org was
sunk in the depths of emacs-devel threads. (It was a patch for
isearch.el and it does not apply anymore onto master. I plan to
re-submit it when I get more time and interest. Just FYI)

Having said that, I do know that it is a better thing to reach Emacs when
new feature is really beneficial. But I hope that my previous
explanation clarifies why there is a friction (at least, it is the case
for me personally) to contribute to Emacs. Emacs core-related items tend
to go down towards the end of todo lists.

Best,
Ihor




^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 11:58                         ` Ihor Radchenko
@ 2022-06-20 12:32                           ` Eli Zaretskii
  2022-06-20 14:14                             ` Stefan Kangas
                                               ` (2 more replies)
  2022-06-20 14:33                           ` Alan Mackenzie
  1 sibling, 3 replies; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-20 12:32 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Mon, 20 Jun 2022 19:58:31 +0800
> 
> Does Emacs C code provide any generic tree structure implementation?

We have interval trees and red-black trees, but they are used for
specific C-level features, and I wouldn't call them "generic".

OTOH, don't you want to create a Lisp structure to represent AST?  If
so, C-level tree will not really help you, would it?

And I'm not sure it would be a good idea to have the trees in C even
if it would help: that's not the right subdivision between C and Lisp,
IME.  I'm quite sure your Lisp-level AST is fine for your purposes,
you just want to be able to update it more efficiently and not as
painfully as you do now.  Right?  If so, what we should do in C is to
enable that more efficient and less painful implementation, not to
provide the implementation itself.

> > Of course, the right method to show the bottleneck(s) is to profile
> > the code with a tool like 'prof', and take it from there.  So here's
> > one more interesting job for someone to volunteer.
> 
> That's what I did in https://orgmode.org/list/87y21wkdwu.fsf@localhost:
> 
> >>> The bottleneck appears to be buf_bytepos_to_charpos, called by
> >>> BYTE_TO_CHAR macro, which, in turn, is used by set_search_regs
> 
> >>> buf_bytepos_to_charpos contains the following loop:
> >>> 
> >>>   for (tail = BUF_MARKERS (b); tail; tail = tail->next)
> >>>     {
> >>>       CONSIDER (tail->bytepos, tail->charpos);
> >>> 
> >>>       /* If we are down to a range of 50 chars,
> >>> 	 don't bother checking any other markers;
> >>> 	 scan the intervening chars directly now.  */
> >>>       if (best_above - bytepos < distance
> >>>           || bytepos - best_below < distance)
> >>> 	break;
> >>>       else
> >>>         distance += BYTECHAR_DISTANCE_INCREMENT;
> >>>     }
> >>> 
> >>> I am not sure if I understand the code correctly, but that loop is
> >>> clearly scaling performance with the number of markers

Thanks, this seems to confirm my guess that these conversions are the
bottleneck.  In which case making marker access and search more
efficient will go a long way toward helping Org's AST parsing to
become more performant and eventually more easily maintainable.

> > If you already have a workaround for marker-related problems, then why
> > do you need to hook into insertion and deletion on the lowest level?
> 
> Because the workaround relies on before/after-change-functions that may
> be suppressed by bad third-party code.

Understood.

> Also, markers will not solve all the needs of Org parser even when they
> become more efficient. As I mentioned earlier, we also need to keep
> track whether terminal symbols appear in the changed text before/after
> modification. It boils down to matching regexps around changed region in
> buffer before/after each modification. Suppressed
> before/after-change-functions ruin this logic as well.

I asked a question about that, but you said you wanted to answer the
AST-related parts first.  So can we now go back to this aspect to
understand it better?  Emacs inhibits buffer-modification hooks when
it is quite sure Lisp programs "don't need to know" about those
modifications.  One example you cited where this bites you is use of
input methods.  But Quail doesn't inhibit the hooks completely, it
only inhibits them enough to pretend that just one character was
inserted, when the user might have inserted more.  So why does this
get in the way of the Org parser, if the modification hooks are being
called "enough"?

> > And that is my long-standing gripe aimed at developers of 3rd party
> > packages: they should come here (or bug-gnu-emacs@gnu.org) and present
> > the cases where they needed some missing infrastructure, instead of
> > trying to jump through hoops to work around what they perceive as
> > Emacs restrictions that (they think) cannot be possibly lifted.  Doing
> > the former will have at least two benefits: (a) it will facilitate
> > Emacs development into a better platform, and (b) it will avoid giving
> > birth to some of the horrible kludges out there, which eventually
> > don't work well enough, and thus make Emacs seem less professional
> > than it should be.
> >
> > And if that is my expectation from developers of 3rd party packages, I
> > definitely expect that from packages that are bundled, such as Org.
> > Since Org is basically part of the core Emacs, it makes little sense
> > to me to realize that it goes to such lengths trying to work around
> > the limitations, instead of asking the core team to improve the
> > existing implementation or add some missing ones.  I could perhaps
> > understand if the request existed, but no one volunteered to work on
> > it, but not having the requests in the first place I cannot
> > understand.
> 
> I think I need to clarify my position here.
> 
> The important thing you need to know about Org is that it does not only
> support Emacs version Org is bundled with.
> We currently support Emacs >=26. See
> https://orgmode.org/worg/org-maintenance.html#emacs-compatibility
> 
> So, any major feature implemented in the development version of Emacs
> cannot be easily used. The new feature will mean doubling the relevant
> code on Org side: (1) supporting the new feature; (2) compatibility
> layer to support older Emacs versions. Which means extra maintenance.

That's true, but the same is true for any new feature added to Emacs:
they can only be used since the first version which added them.  And
yet you still are asking us to provide such new features, like that
super-buffer-modification hook.

Thus, the difference between these two approaches is not whether or
not to add new features to core (which understandably makes the job of
developers of packages like Org harder due to support of older
Emacsen), the difference is _which_ new features to add.  I'm saying
that it is much better to add features which will avoid your jumping
through hoops, instead of adding features that will allow you to jump
through hoops faster and better, so to say.  It is better also in the
long run, because it helps Emacs development as well, and it helps you
and other 3rd party packages that will be able to use those new
features for future implementations.

> Moreover, my previous attempt to propose a patch required for Org was
> sunk in the depths of emacs-devel threads. (It was a patch for
> isearch.el and it does not apply anymore onto master. I plan to
> re-submit it when I get more time and interest. Just FYI)

This can unfortunately happen with any discussion, and is not always
under our control.  Perseverance is the only way I know of to prevail
in those cases.

> Emacs core-related items tend to go down towards the end of todo
> lists.

We don't have enough resources, it's true.  Hopefully, this won't
prevent people from raising such issues.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 12:32                           ` Eli Zaretskii
@ 2022-06-20 14:14                             ` Stefan Kangas
  2022-06-21  3:56                               ` Ihor Radchenko
  2022-06-21  4:36                             ` Ihor Radchenko
  2022-06-22 15:45                             ` Exposing buffer text modifications to Lisp Basil L. Contovounesios
  2 siblings, 1 reply; 64+ messages in thread
From: Stefan Kangas @ 2022-06-20 14:14 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Ihor Radchenko, Yuan Fu, Emacs developers

Eli Zaretskii <eliz@gnu.org> writes:

> > Moreover, my previous attempt to propose a patch required for Org was
> > sunk in the depths of emacs-devel threads.
>
> This can unfortunately happen with any discussion, and is not always
> under our control.  Perseverance is the only way I know of to prevail
> in those cases.

I very much recommend sending patches to bug-gnu-emacs instead of
emacs-devel.  That decreases the risk of them getting forgotten or
lost in the noise by orders of magnitude.



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 11:58                         ` Ihor Radchenko
  2022-06-20 12:32                           ` Eli Zaretskii
@ 2022-06-20 14:33                           ` Alan Mackenzie
  2022-06-21  3:58                             ` Ihor Radchenko
  1 sibling, 1 reply; 64+ messages in thread
From: Alan Mackenzie @ 2022-06-20 14:33 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: Eli Zaretskii, casouri, emacs-devel

Hello, Ihor.

On Mon, Jun 20, 2022 at 19:58:31 +0800, Ihor Radchenko wrote:
> Eli Zaretskii <eliz@gnu.org> writes:

[ .... ]

> > If you already have a workaround for marker-related problems, then why
> > do you need to hook into insertion and deletion on the lowest level?

> Because the workaround relies on before/after-change-functions that may
> be suppressed by bad third-party code.

Why is that your (or our) problem?  Code which isn't the major mode
masking out the change functions is just invalid code.  Can't you just
document somewhere that before/after-change-functions are an essential
part of Org Mode, and that messing around with them will lead to
unpredictable results?

[ .... ]

> Best,
> Ihor

-- 
Alan Mackenzie (Nuremberg, Germany).



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-20 11:44                         ` Eli Zaretskii
@ 2022-06-20 20:01                           ` Yuan Fu
  2022-06-21  2:26                             ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Yuan Fu @ 2022-06-20 20:01 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 20, 2022, at 4:44 AM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Sun, 19 Jun 2022 20:00:49 -0700
>> Cc: emacs-devel@gnu.org
>> 
>> 
>> I added navigation functions like treesit-beginning/end-of-defun, and added search functions like treesit-search-beginning/end. Now I wonder where should I put them in the manual, do I put them under the treesit section (Parsing Program Source), or under the relevant existing sections in the manual?
> 
> The latter, please.
> 
> But why do we need a separate description for the tree-sitter
> variants?  Shouldn't that be automatically supported by
> beginning/end-of-defun, once some switch is thrown to enable
> tree-sitter?
> 
> And if beginning/end-of-defun is for some reason too low-level/basic
> for this role (but if you think so, please explain why), then I think
> we need higher-level functions that by default are just thin wrappers
> around beginning/end-of-defun, and will call tree-sitter versions when
> Emacs is configured to do so.
> 
> I mean, it would be very cumbersome to request that each and every
> major mode which wants to use tree-sitter will have to explicitly call
> treesit-SOMETHING everywhere.

Major mode should set beginning-of-defun-function to treesit-beginning-of-defun, not unlike what they already do with major mode-specific beginning-of-defun functions. This way major mode has the freedom to decide which treesit features it wants to leverage.

> 
>> Treesit-beginning/end-of-defun jumps to the beginning/end of the current defun form, treesit-search-beginning searches for a query and stops at the beginning/end of the node that matched the query.
> 
> So you are saying treesit-beginning/end-of-defun don't actually look
> for beginning and end of a function, but for beginning and end of a
> more abstract entity?  Then perhaps it would be wrong to have "defun"
> in their names?  And in that case, maybe a separate section (under
> "Motion") is better after all, since this is no longer "List Motion",
> strictly speaking.

The second sentence describes treesit-search-beginning, not treesit-beginning-of-defun, I think you confused the two?

Thanks,
Yuan




^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-20 20:01                           ` Yuan Fu
@ 2022-06-21  2:26                             ` Eli Zaretskii
  2022-06-21  4:39                               ` Yuan Fu
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-21  2:26 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Mon, 20 Jun 2022 13:01:32 -0700
> Cc: emacs-devel@gnu.org
> 
> Major mode should set beginning-of-defun-function to treesit-beginning-of-defun, not unlike what they already do with major mode-specific beginning-of-defun functions. This way major mode has the freedom to decide which treesit features it wants to leverage.

Then those tree-sitter functions should be described together with
beginning/end-of-defun, I think.

> >> Treesit-beginning/end-of-defun jumps to the beginning/end of the current defun form, treesit-search-beginning searches for a query and stops at the beginning/end of the node that matched the query.
> > 
> > So you are saying treesit-beginning/end-of-defun don't actually look
> > for beginning and end of a function, but for beginning and end of a
> > more abstract entity?  Then perhaps it would be wrong to have "defun"
> > in their names?  And in that case, maybe a separate section (under
> > "Motion") is better after all, since this is no longer "List Motion",
> > strictly speaking.
> 
> The second sentence describes treesit-search-beginning, not treesit-beginning-of-defun, I think you confused the two?

Maybe so, but why did you mention treesit-search-beginning in this
context to begin with?



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 14:14                             ` Stefan Kangas
@ 2022-06-21  3:56                               ` Ihor Radchenko
  0 siblings, 0 replies; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-21  3:56 UTC (permalink / raw)
  To: Stefan Kangas; +Cc: Eli Zaretskii, Yuan Fu, Emacs developers

Stefan Kangas <stefan@marxist.se> writes:

> I very much recommend sending patches to bug-gnu-emacs instead of
> emacs-devel.  That decreases the risk of them getting forgotten or
> lost in the noise by orders of magnitude.

Thanks! I will keep this in mind.



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 14:33                           ` Alan Mackenzie
@ 2022-06-21  3:58                             ` Ihor Radchenko
  0 siblings, 0 replies; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-21  3:58 UTC (permalink / raw)
  To: Alan Mackenzie; +Cc: Eli Zaretskii, casouri, emacs-devel

Alan Mackenzie <acm@muc.de> writes:

> On Mon, Jun 20, 2022 at 19:58:31 +0800, Ihor Radchenko wrote:
>> Eli Zaretskii <eliz@gnu.org> writes:
>
> [ .... ]
>
>> > If you already have a workaround for marker-related problems, then why
>> > do you need to hook into insertion and deletion on the lowest level?
>
>> Because the workaround relies on before/after-change-functions that may
>> be suppressed by bad third-party code.
>
> Why is that your (or our) problem?  Code which isn't the major mode
> masking out the change functions is just invalid code.  Can't you just
> document somewhere that before/after-change-functions are an essential
> part of Org Mode, and that messing around with them will lead to
> unpredictable results?

It is indeed possible. However, this particular issue can cause data
loss in user files. Which I'd prefer to avoid at all costs.

Best,
Ihor



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 12:32                           ` Eli Zaretskii
  2022-06-20 14:14                             ` Stefan Kangas
@ 2022-06-21  4:36                             ` Ihor Radchenko
  2022-06-21 12:27                               ` Eli Zaretskii
  2022-06-22 15:45                             ` Exposing buffer text modifications to Lisp Basil L. Contovounesios
  2 siblings, 1 reply; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-21  4:36 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

>> Does Emacs C code provide any generic tree structure implementation?
>
> We have interval trees and red-black trees, but they are used for
> specific C-level features, and I wouldn't call them "generic".
>
> OTOH, don't you want to create a Lisp structure to represent AST?  If
> so, C-level tree will not really help you, would it?

Clarification: I was asking about C-level trees to store marker list.
I did not have moving Org AST from Lisp to C-level in mind. We currently
use built-in Lisp implementation of AVL-tree to search across AST (which
is not ideal, but good enough for moderately large files).

>> Also, markers will not solve all the needs of Org parser even when they
>> become more efficient. As I mentioned earlier, we also need to keep
>> track whether terminal symbols appear in the changed text before/after
>> modification. It boils down to matching regexps around changed region in
>> buffer before/after each modification. Suppressed
>> before/after-change-functions ruin this logic as well.
>
> I asked a question about that, but you said you wanted to answer the
> AST-related parts first.  So can we now go back to this aspect to
> understand it better?

This is still somewhat related to AST. AST object properties that do not
refer to positions in buffer may still need to be updated upon buffer
modification.

For example, consider

* TODO headline

being changed into

* DONE headline

with (with-silent-modifications (search-foward "TODO") (replace-match "DONE"))
or even simply by (replace-match ...) inside indirect buffer created by
direct call to make-indirect-buffer.

The AST headline object will need to be updated from
(headline (... :todo-keyword "TODO" ...))
to
(headline (... :todo-keyword "DONE" ...))

> Emacs inhibits buffer-modification hooks when
> it is quite sure Lisp programs "don't need to know" about those
> modifications.  One example you cited where this bites you is use of
> input methods.  But Quail doesn't inhibit the hooks completely, it
> only inhibits them enough to pretend that just one character was
> inserted, when the user might have inserted more.  So why does this
> get in the way of the Org parser, if the modification hooks are being
> called "enough"?

It does not. Given that I implement the suggestion about using
buffer-size to track "missed" modifications, Quail will not be an issue
anymore.

The only potential problem that will remain is the type of buffer
modifications I described above (shielded by inhibit-modification-hooks
or by being done inside indirect buffer). If such modifications do not
change the buffer size (as above), we still get a problem that may
(although less likely) cause data loss on user side.

> Thus, the difference between these two approaches is not whether or
> not to add new features to core (which understandably makes the job of
> developers of packages like Org harder due to support of older
> Emacsen), the difference is _which_ new features to add.  I'm saying
> that it is much better to add features which will avoid your jumping
> through hoops, instead of adding features that will allow you to jump
> through hoops faster and better, so to say.  It is better also in the
> long run, because it helps Emacs development as well, and it helps you
> and other 3rd party packages that will be able to use those new
> features for future implementations.

I totally agree. Though additional consideration is LOC cost of adding
new features. As you can see, I took a lazy approach in this request.
Adding a new hook would not require much code change on Org side. In
contrast, changing implementation to markers will actually require
careful testing and a lot more LOC changes. So, we have a clash between
"faster" and "better" :)

In any case, I totally get your position and I do know that Emacs core
should not accept low-quality features just because they are going to be
easier for some single specific use-case. I would do the same if I were
to maintain Emacs.

> This can unfortunately happen with any discussion, and is not always
> under our control.  Perseverance is the only way I know of to prevail
> in those cases.

I understand. Unfortunately, it also creates mental friction on my side
despite this understanding. I will submit patches via debbugs in future
to make things more visible.

Best,
Ihor

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-21  2:26                             ` Eli Zaretskii
@ 2022-06-21  4:39                               ` Yuan Fu
  2022-06-21 10:18                                 ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Yuan Fu @ 2022-06-21  4:39 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 20, 2022, at 7:26 PM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Mon, 20 Jun 2022 13:01:32 -0700
>> Cc: emacs-devel@gnu.org
>> 
>> Major mode should set beginning-of-defun-function to treesit-beginning-of-defun, not unlike what they already do with major mode-specific beginning-of-defun functions. This way major mode has the freedom to decide which treesit features it wants to leverage.
> 
> Then those tree-sitter functions should be described together with
> beginning/end-of-defun, I think.

Cool.

> 
>>>> Treesit-beginning/end-of-defun jumps to the beginning/end of the current defun form, treesit-search-beginning searches for a query and stops at the beginning/end of the node that matched the query.
>>> 
>>> So you are saying treesit-beginning/end-of-defun don't actually look
>>> for beginning and end of a function, but for beginning and end of a
>>> more abstract entity?  Then perhaps it would be wrong to have "defun"
>>> in their names?  And in that case, maybe a separate section (under
>>> "Motion") is better after all, since this is no longer "List Motion",
>>> strictly speaking.
>> 
>> The second sentence describes treesit-search-beginning, not treesit-beginning-of-defun, I think you confused the two?
> 
> Maybe so, but why did you mention treesit-search-beginning in this
> context to begin with?

They are another set of functions that I wonder where to put manual entries in. I’ll probably put them in 35.1 Searching for Strings, after search-forward.

Yuan




^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-21  4:39                               ` Yuan Fu
@ 2022-06-21 10:18                                 ` Eli Zaretskii
  2022-06-22  0:34                                   ` Yuan Fu
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-21 10:18 UTC (permalink / raw)
  To: Yuan Fu; +Cc: emacs-devel

> From: Yuan Fu <casouri@gmail.com>
> Date: Mon, 20 Jun 2022 21:39:34 -0700
> Cc: emacs-devel@gnu.org
> 
> >> The second sentence describes treesit-search-beginning, not treesit-beginning-of-defun, I think you confused the two?
> > 
> > Maybe so, but why did you mention treesit-search-beginning in this
> > context to begin with?
> 
> They are another set of functions that I wonder where to put manual entries in. I’ll probably put them in 35.1 Searching for Strings, after search-forward.

Maybe a new section under Searching and Matching?



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-21  4:36                             ` Ihor Radchenko
@ 2022-06-21 12:27                               ` Eli Zaretskii
  2022-06-25  4:47                                 ` Optimizing performance of buffer markers (was: Exposing buffer text modifications to Lisp) Ihor Radchenko
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-21 12:27 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: casouri@gmail.com,  emacs-devel@gnu.org
> Date: Tue, 21 Jun 2022 12:36:14 +0800
> 
> > OTOH, don't you want to create a Lisp structure to represent AST?  If
> > so, C-level tree will not really help you, would it?
> 
> Clarification: I was asking about C-level trees to store marker list.
> I did not have moving Org AST from Lisp to C-level in mind. We currently
> use built-in Lisp implementation of AVL-tree to search across AST (which
> is not ideal, but good enough for moderately large files).

Ah, okay.  Sorry for my misunderstanding.

Trees could indeed be relevant, but maybe other data structures as
well?  E.g., why not hash tables?  Not that I consider myself an
expert on efficient search algorithms...

> For example, consider
> 
> * TODO headline
> 
> being changed into
> 
> * DONE headline
> 
> with (with-silent-modifications (search-foward "TODO") (replace-match "DONE"))
> or even simply by (replace-match ...) inside indirect buffer created by
> direct call to make-indirect-buffer.
> 
> The AST headline object will need to be updated from
> (headline (... :todo-keyword "TODO" ...))
> to
> (headline (... :todo-keyword "DONE" ...))
> 
> > Emacs inhibits buffer-modification hooks when
> > it is quite sure Lisp programs "don't need to know" about those
> > modifications.  One example you cited where this bites you is use of
> > input methods.  But Quail doesn't inhibit the hooks completely, it
> > only inhibits them enough to pretend that just one character was
> > inserted, when the user might have inserted more.  So why does this
> > get in the way of the Org parser, if the modification hooks are being
> > called "enough"?
> 
> It does not. Given that I implement the suggestion about using
> buffer-size to track "missed" modifications, Quail will not be an issue
> anymore.
> 
> The only potential problem that will remain is the type of buffer
> modifications I described above (shielded by inhibit-modification-hooks
> or by being done inside indirect buffer). If such modifications do not
> change the buffer size (as above), we still get a problem that may
> (although less likely) cause data loss on user side.

I'd consider such Lisp programs buggy.  Actually modifying a buffer
while concealing the entire modification is evil.

> I will submit patches via debbugs in future to make things more
> visible.

TIA.



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Tree-sitter integration on feature/tree-sitter
  2022-06-21 10:18                                 ` Eli Zaretskii
@ 2022-06-22  0:34                                   ` Yuan Fu
  0 siblings, 0 replies; 64+ messages in thread
From: Yuan Fu @ 2022-06-22  0:34 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel



> On Jun 21, 2022, at 3:18 AM, Eli Zaretskii <eliz@gnu.org> wrote:
> 
>> From: Yuan Fu <casouri@gmail.com>
>> Date: Mon, 20 Jun 2022 21:39:34 -0700
>> Cc: emacs-devel@gnu.org
>> 
>>>> The second sentence describes treesit-search-beginning, not treesit-beginning-of-defun, I think you confused the two?
>>> 
>>> Maybe so, but why did you mention treesit-search-beginning in this
>>> context to begin with?
>> 
>> They are another set of functions that I wonder where to put manual entries in. I’ll probably put them in 35.1 Searching for Strings, after search-forward.
> 
> Maybe a new section under Searching and Matching?

Sure.

Yuan


^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-20 12:32                           ` Eli Zaretskii
  2022-06-20 14:14                             ` Stefan Kangas
  2022-06-21  4:36                             ` Ihor Radchenko
@ 2022-06-22 15:45                             ` Basil L. Contovounesios
  2022-06-22 16:13                               ` Eli Zaretskii
  2 siblings, 1 reply; 64+ messages in thread
From: Basil L. Contovounesios @ 2022-06-22 15:45 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Ihor Radchenko, casouri, emacs-devel

Eli Zaretskii [2022-06-20 15:32 +0300] wrote:

>> From: Ihor Radchenko <yantar92@gmail.com>
>> Cc: casouri@gmail.com,  emacs-devel@gnu.org
>> Date: Mon, 20 Jun 2022 19:58:31 +0800
>> 
>> Does Emacs C code provide any generic tree structure implementation?
>
> We have interval trees and red-black trees, but they are used for
> specific C-level features, and I wouldn't call them "generic".

Would any of Gnulib's generic container types[0] be appropriate in this
case, and if so, fair game for using in Emacs' sources?

[0]: (info "(gnulib) Container data types")
https://gnu.org/s/gnulib/manual/html_node/Container-data-types.html

Thanks,

-- 
Basil



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-22 15:45                             ` Exposing buffer text modifications to Lisp Basil L. Contovounesios
@ 2022-06-22 16:13                               ` Eli Zaretskii
  2022-06-25  4:54                                 ` Ihor Radchenko
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-22 16:13 UTC (permalink / raw)
  To: Basil L. Contovounesios; +Cc: yantar92, casouri, emacs-devel

> From: "Basil L. Contovounesios" <contovob@tcd.ie>
> Cc: Ihor Radchenko <yantar92@gmail.com>,  casouri@gmail.com,
>   emacs-devel@gnu.org
> Date: Wed, 22 Jun 2022 18:45:53 +0300
> 
> Would any of Gnulib's generic container types[0] be appropriate in this
> case, and if so, fair game for using in Emacs' sources?
> 
> [0]: (info "(gnulib) Container data types")
> https://gnu.org/s/gnulib/manual/html_node/Container-data-types.html

Could be.  But I think we need more research before we decide.
Someone™ should study the usage patterns of the markers for
character-to-byte translation, and see which operations should be the
fastest and which could be slower.  Armed with that information, we
could then select the best data structure for the job.

Btw, do we have recipes for measuring the effects of changing the data
structures used for markers?  If we do have such recipes, did someone
try to compare the performance in plain-ASCII Org buffers (where the
conversion is trivial and shouldn't even access the markers) and
non-ASCII buffers?  Inserting a single non-ASCII character somewhere
in an otherwise plain-ASCII buffer should show the effect of many
markers on the likes of CHAR_TO_BYTE.

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Optimizing performance of buffer markers (was: Exposing buffer text modifications to Lisp)
  2022-06-21 12:27                               ` Eli Zaretskii
@ 2022-06-25  4:47                                 ` Ihor Radchenko
  2022-06-25  8:29                                   ` Optimizing performance of buffer markers Stefan Monnier
  2022-06-26 10:32                                   ` Robert Pluim
  0 siblings, 2 replies; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-25  4:47 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: casouri, emacs-devel

I think that we settled something workable regarding buffer
modifications. I will try the proposed solutions and see if issues pop
up on Org ML.

Changing subject to reflect the remaining point of the discussion better.

Eli Zaretskii <eliz@gnu.org> writes:

>> Clarification: I was asking about C-level trees to store marker list.
>> I did not have moving Org AST from Lisp to C-level in mind. We currently
>> use built-in Lisp implementation of AVL-tree to search across AST (which
>> is not ideal, but good enough for moderately large files).
>
> Ah, okay.  Sorry for my misunderstanding.
>
> Trees could indeed be relevant, but maybe other data structures as
> well?  E.g., why not hash tables?  Not that I consider myself an
> expert on efficient search algorithms...

AFAIU, buf_bytepos_to_charpos tries to search for the closest marker
near the requested bytepos. It currently does it using the following
heuristics (roughly):

(let ((threshold 50))
 (dolist (marker markers)
  (if (or (< (abs (- marker bytepos)) threshold)
          (< (abs (- nearest_previous_marker bytepos)) threshold))
     (throw 'found marker)
   (cl-incf threshold 50))))

If we store markers in a hash table, there will be no benefit - Hash
table will only allow to find marker at exact position, not nearby.

AFAIK, the most natural data structure to search for data
before/after given key is a binary tree. There are more exotic data
structures, like skip list, but I do not expect skip lists to be
implemented in Emacs C code.

Best,
Ihor

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-22 16:13                               ` Eli Zaretskii
@ 2022-06-25  4:54                                 ` Ihor Radchenko
  2022-06-25  5:46                                   ` Eli Zaretskii
  0 siblings, 1 reply; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-25  4:54 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: Basil L. Contovounesios, casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

> Btw, do we have recipes for measuring the effects of changing the data
> structures used for markers?  If we do have such recipes, did someone
> try to compare the performance in plain-ASCII Org buffers (where the
> conversion is trivial and shouldn't even access the markers) and
> non-ASCII buffers?  Inserting a single non-ASCII character somewhere
> in an otherwise plain-ASCII buffer should show the effect of many
> markers on the likes of CHAR_TO_BYTE.

AFAIK, buf_bytepos_to_charpos should take no time on plain-ASCII buffers
because

  /* If this buffer has as many characters as bytes,
     each character must be one byte.
     This takes care of the case where enable-multibyte-characters is nil.  */
  if (best_above == best_above_byte)
    return bytepos;

The recipe of measuring the effects is in
https://list.orgmode.org/orgmode/scedec$2g0$1@ciao.gmane.io/

That email literally provides Elisp code to run in order to measure the
effect.

Best,
Ihor



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-25  4:54                                 ` Ihor Radchenko
@ 2022-06-25  5:46                                   ` Eli Zaretskii
  2022-06-29 12:24                                     ` Ihor Radchenko
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-25  5:46 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: contovob, casouri, emacs-devel

> From: Ihor Radchenko <yantar92@gmail.com>
> Cc: "Basil L. Contovounesios" <contovob@tcd.ie>,  casouri@gmail.com,
>   emacs-devel@gnu.org
> Date: Sat, 25 Jun 2022 12:54:36 +0800
> 
> Eli Zaretskii <eliz@gnu.org> writes:
> 
> > Btw, do we have recipes for measuring the effects of changing the data
> > structures used for markers?  If we do have such recipes, did someone
> > try to compare the performance in plain-ASCII Org buffers (where the
> > conversion is trivial and shouldn't even access the markers) and
> > non-ASCII buffers?  Inserting a single non-ASCII character somewhere
> > in an otherwise plain-ASCII buffer should show the effect of many
> > markers on the likes of CHAR_TO_BYTE.
> 
> AFAIK, buf_bytepos_to_charpos should take no time on plain-ASCII buffers

Yes, that's what I said above.

> The recipe of measuring the effects is in
> https://list.orgmode.org/orgmode/scedec$2g0$1@ciao.gmane.io/
> 
> That email literally provides Elisp code to run in order to measure the
> effect.

Thanks.  However, that Lisp includes Org code, and I wonder why is
that relevant and how it could affect the results.  Using regexp
search shouldn't need any Org code, and is quite simple to write, I
think.  The main problem is to find a recipe that puts many markers in
a buffer.  If this is what the Org code in that recipe is about, then
it's one situation where benchmarking ASCII vs non-ASCII buffers will
help.  Another use case is a buffer with a lot of overlays.



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  4:47                                 ` Optimizing performance of buffer markers (was: Exposing buffer text modifications to Lisp) Ihor Radchenko
@ 2022-06-25  8:29                                   ` Stefan Monnier
  2022-06-25  8:44                                     ` Eli Zaretskii
  2022-06-26 10:32                                   ` Robert Pluim
  1 sibling, 1 reply; 64+ messages in thread
From: Stefan Monnier @ 2022-06-25  8:29 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: Eli Zaretskii, casouri, emacs-devel

> AFAIK, the most natural data structure to search for data
> before/after given key is a binary tree. There are more exotic data
> structures, like skip list, but I do not expect skip lists to be
> implemented in Emacs C code.

BTW, most markers are actually part of overlays.  And Andreas Politz
implemented an AA-tree based representation of overlays for Emacs (see
the branch `feature/noverlay`).

So if you have performance problems due to overlays, you might want to
check the branch, make sure it solves the problem, and see if you can
get it merged once and for all into `master`.


        Stefan




^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  8:29                                   ` Optimizing performance of buffer markers Stefan Monnier
@ 2022-06-25  8:44                                     ` Eli Zaretskii
  2022-06-25  9:07                                       ` Stefan Monnier
  0 siblings, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-25  8:44 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: yantar92, casouri, emacs-devel

> From: Stefan Monnier <monnier@iro.umontreal.ca>
> Cc: Eli Zaretskii <eliz@gnu.org>,  casouri@gmail.com,  emacs-devel@gnu.org
> Date: Sat, 25 Jun 2022 04:29:08 -0400
> 
> > AFAIK, the most natural data structure to search for data
> > before/after given key is a binary tree. There are more exotic data
> > structures, like skip list, but I do not expect skip lists to be
> > implemented in Emacs C code.
> 
> BTW, most markers are actually part of overlays.  And Andreas Politz
> implemented an AA-tree based representation of overlays for Emacs (see
> the branch `feature/noverlay`).
> 
> So if you have performance problems due to overlays, you might want to
> check the branch, make sure it solves the problem, and see if you can
> get it merged once and for all into `master`.

Landing that branch is indeed a very Good Thing, but AFAIU this
discussion revealed that Org adds quite a few markers of its own when
it parses the buffer, because it wants to track the positions of some
syntactic elements of the buffer.



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  8:44                                     ` Eli Zaretskii
@ 2022-06-25  9:07                                       ` Stefan Monnier
  2022-06-25  9:20                                         ` Eli Zaretskii
  2022-06-25  9:47                                         ` Ihor Radchenko
  0 siblings, 2 replies; 64+ messages in thread
From: Stefan Monnier @ 2022-06-25  9:07 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: yantar92, casouri, emacs-devel

> Landing that branch is indeed a very Good Thing, but AFAIU this
> discussion revealed that Org adds quite a few markers of its own when
> it parses the buffer, because it wants to track the positions of some
> syntactic elements of the buffer.

That's quite unusual, tho, and I suspect that Org's code could be
changed to use overlays instead (after all, we could define an
"efficient marker" as (make-overlay POS POS)).


        Stefan




^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  9:07                                       ` Stefan Monnier
@ 2022-06-25  9:20                                         ` Eli Zaretskii
  2022-06-25  9:27                                           ` Stefan Monnier
  2022-06-25  9:47                                         ` Ihor Radchenko
  1 sibling, 1 reply; 64+ messages in thread
From: Eli Zaretskii @ 2022-06-25  9:20 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: yantar92, casouri, emacs-devel

> From: Stefan Monnier <monnier@iro.umontreal.ca>
> Cc: yantar92@gmail.com,  casouri@gmail.com,  emacs-devel@gnu.org
> Date: Sat, 25 Jun 2022 05:07:59 -0400
> 
> > Landing that branch is indeed a very Good Thing, but AFAIU this
> > discussion revealed that Org adds quite a few markers of its own when
> > it parses the buffer, because it wants to track the positions of some
> > syntactic elements of the buffer.
> 
> That's quite unusual, tho, and I suspect that Org's code could be
> changed to use overlays instead (after all, we could define an
> "efficient marker" as (make-overlay POS POS)).

If that is going to be our advice, it would make sense to reimplement
markers as a kind of overlays, so that Lisp programs shouldn't bother
with this trick.



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  9:20                                         ` Eli Zaretskii
@ 2022-06-25  9:27                                           ` Stefan Monnier
  0 siblings, 0 replies; 64+ messages in thread
From: Stefan Monnier @ 2022-06-25  9:27 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: yantar92, casouri, emacs-devel

> If that is going to be our advice,

That wouldn't be a general advice, but rather for specific cases where
performance proves problematic.  In any case, this is quite
hypothetical: we don't know yet whether that branch would help Org's
specific use-case.  And the code needs some work before we can merge it
(it's about 5 years old).

> it would make sense to reimplement markers as a kind of overlays, so
> that Lisp programs shouldn't bother with this trick.

That's a possibility, but this would require further changes for the
charpos<->bytepos conversion.

        Stefan

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  9:07                                       ` Stefan Monnier
  2022-06-25  9:20                                         ` Eli Zaretskii
@ 2022-06-25  9:47                                         ` Ihor Radchenko
  2022-06-25  9:53                                           ` Stefan Monnier
  1 sibling, 1 reply; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-25  9:47 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Eli Zaretskii, casouri, emacs-devel

Stefan Monnier <monnier@iro.umontreal.ca> writes:

>> Landing that branch is indeed a very Good Thing, but AFAIU this
>> discussion revealed that Org adds quite a few markers of its own when
>> it parses the buffer, because it wants to track the positions of some
>> syntactic elements of the buffer.
>
> That's quite unusual, tho, and I suspect that Org's code could be
> changed to use overlays instead (after all, we could define an
> "efficient marker" as (make-overlay POS POS)).

Using overlays will put extra load on display engine, on top of the
performance related to pure markers.



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  9:47                                         ` Ihor Radchenko
@ 2022-06-25  9:53                                           ` Stefan Monnier
  0 siblings, 0 replies; 64+ messages in thread
From: Stefan Monnier @ 2022-06-25  9:53 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: Eli Zaretskii, casouri, emacs-devel

> Using overlays will put extra load on display engine, on top of the
> performance related to pure markers.

Could be, yes (but the tree storage of overlays should compensate to
some extent).  Hard to tell how significant that would be without trying
it out.


        Stefan




^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Optimizing performance of buffer markers
  2022-06-25  4:47                                 ` Optimizing performance of buffer markers (was: Exposing buffer text modifications to Lisp) Ihor Radchenko
  2022-06-25  8:29                                   ` Optimizing performance of buffer markers Stefan Monnier
@ 2022-06-26 10:32                                   ` Robert Pluim
  1 sibling, 0 replies; 64+ messages in thread
From: Robert Pluim @ 2022-06-26 10:32 UTC (permalink / raw)
  To: Ihor Radchenko; +Cc: Eli Zaretskii, casouri, emacs-devel

>>>>> On Sat, 25 Jun 2022 12:47:38 +0800, Ihor Radchenko <yantar92@gmail.com> said:
    Ihor> AFAIU, buf_bytepos_to_charpos tries to search for the closest marker
    Ihor> near the requested bytepos. It currently does it using the following
    Ihor> heuristics (roughly):

    Ihor> (let ((threshold 50))
    Ihor>  (dolist (marker markers)
    Ihor>   (if (or (< (abs (- marker bytepos)) threshold)
    Ihor>           (< (abs (- nearest_previous_marker bytepos)) threshold))
    Ihor>      (throw 'found marker)
    Ihor>    (cl-incf threshold 50))))

    Ihor> If we store markers in a hash table, there will be no benefit - Hash
    Ihor> table will only allow to find marker at exact position, not nearby.

We could use (charpos / 50) as the key to the hash table, and then
store a list of markers as the value(s). Of course you'd have to
update the hash table every time a marker is added, deleted, or
changes charpos, so I donʼt know if it would be a win.

Robert
-- 



^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: Exposing buffer text modifications to Lisp
  2022-06-25  5:46                                   ` Eli Zaretskii
@ 2022-06-29 12:24                                     ` Ihor Radchenko
  0 siblings, 0 replies; 64+ messages in thread
From: Ihor Radchenko @ 2022-06-29 12:24 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: contovob, casouri, emacs-devel

Eli Zaretskii <eliz@gnu.org> writes:

> Thanks.  However, that Lisp includes Org code, and I wonder why is
> that relevant and how it could affect the results.  Using regexp
> search shouldn't need any Org code, and is quite simple to write, I
> think.  The main problem is to find a recipe that puts many markers in
> a buffer.  If this is what the Org code in that recipe is about, then
> it's one situation where benchmarking ASCII vs non-ASCII buffers will
> help.  Another use case is a buffer with a lot of overlays.

It is harder than just putting many markers.
I was unable to create a simpler reproducer.

Best,
Ihor



^ permalink raw reply	[flat|nested] 64+ messages in thread

end of thread, other threads:[~2022-06-29 12:24 UTC | newest]

Thread overview: 64+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-05-19  1:35 Tree-sitter integration on feature/tree-sitter Kiong-Ge Liau
2022-05-20  2:01 ` Yuan Fu
2022-06-16 19:03   ` Yuan Fu
2022-06-16 19:25     ` [External] : " Drew Adams
2022-06-17  1:11       ` Yuan Fu
2022-06-17 14:22         ` Drew Adams
2022-06-17  1:24     ` Po Lu
2022-06-18  0:09       ` Yuan Fu
2022-06-17  2:00     ` Ihor Radchenko
2022-06-17  2:25       ` Lower-level change hook immune to with-silent-modifications Yuan Fu
2022-06-17  2:55         ` Stefan Monnier
2022-06-17  5:28           ` Eli Zaretskii
2022-06-17 10:10             ` Ihor Radchenko
2022-06-17 11:03               ` Eli Zaretskii
2022-06-17  5:23       ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
2022-06-17 10:40         ` Ihor Radchenko
2022-06-17 11:42           ` Exposing buffer text modifications to Lisp (was: Tree-sitter integration on feature/tree-sitter) Eli Zaretskii
2022-06-18  5:52             ` Ihor Radchenko
2022-06-18  7:01               ` Eli Zaretskii
2022-06-18  7:23                 ` Ihor Radchenko
2022-06-18  7:44                   ` Eli Zaretskii
2022-06-18  8:13                     ` Ihor Radchenko
2022-06-18  8:47                       ` Exposing buffer text modifications to Lisp Eli Zaretskii
2022-06-20 11:58                         ` Ihor Radchenko
2022-06-20 12:32                           ` Eli Zaretskii
2022-06-20 14:14                             ` Stefan Kangas
2022-06-21  3:56                               ` Ihor Radchenko
2022-06-21  4:36                             ` Ihor Radchenko
2022-06-21 12:27                               ` Eli Zaretskii
2022-06-25  4:47                                 ` Optimizing performance of buffer markers (was: Exposing buffer text modifications to Lisp) Ihor Radchenko
2022-06-25  8:29                                   ` Optimizing performance of buffer markers Stefan Monnier
2022-06-25  8:44                                     ` Eli Zaretskii
2022-06-25  9:07                                       ` Stefan Monnier
2022-06-25  9:20                                         ` Eli Zaretskii
2022-06-25  9:27                                           ` Stefan Monnier
2022-06-25  9:47                                         ` Ihor Radchenko
2022-06-25  9:53                                           ` Stefan Monnier
2022-06-26 10:32                                   ` Robert Pluim
2022-06-22 15:45                             ` Exposing buffer text modifications to Lisp Basil L. Contovounesios
2022-06-22 16:13                               ` Eli Zaretskii
2022-06-25  4:54                                 ` Ihor Radchenko
2022-06-25  5:46                                   ` Eli Zaretskii
2022-06-29 12:24                                     ` Ihor Radchenko
2022-06-20 14:33                           ` Alan Mackenzie
2022-06-21  3:58                             ` Ihor Radchenko
2022-06-17  6:15     ` Tree-sitter integration on feature/tree-sitter Eli Zaretskii
2022-06-17  7:17       ` Yuan Fu
2022-06-17 10:37         ` Eli Zaretskii
2022-06-18  0:14           ` Yuan Fu
2022-06-18  6:22             ` Eli Zaretskii
2022-06-18  8:25               ` Yuan Fu
2022-06-18  8:50                 ` Eli Zaretskii
2022-06-18 20:07                   ` Yuan Fu
2022-06-19  5:39                     ` Eli Zaretskii
2022-06-20  3:00                       ` Yuan Fu
2022-06-20 11:44                         ` Eli Zaretskii
2022-06-20 20:01                           ` Yuan Fu
2022-06-21  2:26                             ` Eli Zaretskii
2022-06-21  4:39                               ` Yuan Fu
2022-06-21 10:18                                 ` Eli Zaretskii
2022-06-22  0:34                                   ` Yuan Fu
2022-06-17 11:06     ` Jostein Kjønigsen
2022-06-18  0:28       ` Yuan Fu
2022-06-18 20:57         ` Jostein Kjønigsen

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).